Questo sito utilizza cookies solo per scopi di autenticazione sul sito e nient'altro. Nessuna informazione personale viene tracciata. Leggi l'informativa sui cookies.
Username: Password: oppure
C/C++ - Lexer scritto a mano
Forum - C/C++ - Lexer scritto a mano

Avatar
TheDarkJuster (Member)
Guru^2


Messaggi: 1620
Iscritto: 27/09/2013

Segnala al moderatore
Postato alle 15:17
Sabato, 16/08/2014
Sto scrivendo un lexer per un linguaggio di programmazione simile al C, ma sto avendo dei problemi strani. Vi posto i miei sorgenti:

Lexer.h
Codice sorgente - presumibilmente C++

  1. #ifndef __LEXER__
  2. #define __LEXER__
  3.  
  4. #include "Token.h"
  5.  
  6. #include <malloc.h>
  7. #include <string.h>
  8. #include <vector>
  9.  
  10. using namespace std;
  11.  
  12. #define EMPTY_BUFFER -1
  13. #define NULL_BUFFER -2
  14. #define NO_ERRORS 0
  15.  
  16. enum LexReading { // lexer state: the kind of input Lexer::Analyze is currently consuming
  17.         MultiLineComment, // inside a slash-star comment, until the closing star-slash pair
  18.         SingleLineComment, // inside a # comment, until the next newline
  19.         Number, // accumulating the characters of a numeric literal
  20.         String, // NOTE(review): no case for this state in the Analyze shown in this thread
  21.         Identifier, // NOTE(review): no case for this state in the Analyze shown in this thread
  22.         Source // plain source text, deciding what the next character starts
  23. };
  24.  
  25. class Lexer { // hand-written lexer over a caller-supplied, NUL-terminated character buffer
  26. public:
  27.         Lexer(char*); // stores the pointer and its strlen; the text itself is not copied
  28.         short Analyze(void); // scans the buffer; returns NO_ERRORS, EMPTY_BUFFER or NULL_BUFFER
  29.  
  30.         //lexer data
  31.         char* buffer; // input text exactly as passed to the constructor
  32.         unsigned long bufferLength; // strlen(buffer), computed once in the constructor
  33.  
  34.         //lex result
  35.         vector<Token> Tokens; // Analyze appends TNUMBER here for every number it finishes
  36.         vector<char*> Identifiers; // not written by the Analyze shown in this thread
  37.         vector<char*> Strings; // not written by the Analyze shown in this thread
  38.         vector<char*> Numbers; // malloc-allocated, NUL-terminated text of each number; no free is visible in the code shown
  39.         vector<char> Characters; // not written by the Analyze shown in this thread
  40.  
  41.         //statistics
  42.         unsigned long MultiLineFoundComments; // incremented when a multi line comment is closed
  43.         unsigned long SingleLineFoundComments; // incremented when a newline ends a # comment
  44. };
  45.  
  46. #endif



Lexer.cpp
Codice sorgente - presumibilmente C++

  1. #include "Lexer.h"
  2.  
  3. Lexer::Lexer(char* buffer) // binds the lexer to buffer and resets the comment counters
  4. {
  5.         this->buffer = buffer; // pointer is stored as-is, so the caller must keep the text alive
  6.         this->bufferLength = strlen(buffer); // NOTE(review): strlen runs before any null check, so the NULL_BUFFER test in Analyze cannot protect against a null argument
  7.         this->MultiLineFoundComments = 0L;
  8.         this->SingleLineFoundComments = 0L;
  9. }
  10.  
  11. short Lexer::Analyze(void) // walks buffer one character at a time as a small state machine; returns NULL_BUFFER, EMPTY_BUFFER or NO_ERRORS
  12. {
  13.         if (this->buffer == (char*)NULL)
  14.                 return NULL_BUFFER;
  15.         else if (this->bufferLength == 0)
  16.                 return EMPTY_BUFFER;
  17.  
  18.         //what is the lexer reading?
  19.         LexReading currentlyReading; // NOTE(review): declared without an initial value, yet the switch below reads it on the first iteration; presumably meant to start as Source
  20.  
  21.         unsigned long currentBufferCharacter = 0L; //the character that the lexer will read
  22.         vector<char> temp; //a temporary vector to store the characters of identifiers, numbers, strings and chars
  23.  
  24.         while (currentBufferCharacter < (this->bufferLength - 1)) // NOTE(review): with this bound the last character of the buffer is never visited, so a number that runs up to it is never flushed into Numbers
  25.         {
  26.                 switch (currentlyReading)
  27.                 {
  28.                         case MultiLineComment:
  29.                                 //if this is the end of the comment
  30.                                 if ((this->buffer[currentBufferCharacter] == '*') && (this->buffer[currentBufferCharacter + 1] == '/'))
  31.                                 {
  32.                                         currentlyReading = Source; //the lexer is going to read source code again
  33.                                         this->MultiLineFoundComments++; //update the number of multi line comments lexed
  34.                                         currentBufferCharacter++; //the lexer won't read the / character the next step
  35.                                 }
  36.                                 //else do nothing, I don't care about comments
  37.                                 break;
  38.  
  39.                         case SingleLineComment:
  40.                                 //if this is the end of the line
  41.                                 if (this->buffer[currentBufferCharacter] == '\n')
  42.                                 {
  43.                                         currentlyReading = Source; //the lexer is going to read source code again
  44.                                         this->SingleLineFoundComments++; //update the number of single line comments lexed
  45.                                 } //else do nothing, I don't care about comments
  46.                                 break;
  47.  
  48.                         case Number: // the test below is true for any character that is not a digit (ASCII 48-57), a dot, x or X, or a letter a-f or A-F; several comparisons in it are repeated
  49.                                 if (((this->buffer[currentBufferCharacter] < 48) || (this->buffer[currentBufferCharacter] > 57)) && ((this->buffer[currentBufferCharacter] != '.') && (this->buffer[currentBufferCharacter] != 'D') && (this->buffer[currentBufferCharacter] != 'X') && (this->buffer[currentBufferCharacter] != 'B') && (this->buffer[currentBufferCharacter] != 'd') && (this->buffer[currentBufferCharacter] != 'x') && (this->buffer[currentBufferCharacter] != 'b') && (this->buffer[currentBufferCharacter] != 'a')&& (this->buffer[currentBufferCharacter] != 'A') && (this->buffer[currentBufferCharacter] != 'b') && (this->buffer[currentBufferCharacter] != 'C') && (this->buffer[currentBufferCharacter] != 'c') && (this->buffer[currentBufferCharacter] != 'd') && (this->buffer[currentBufferCharacter] != 'D') && (this->buffer[currentBufferCharacter] != 'e') && (this->buffer[currentBufferCharacter] != 'E') && (this->buffer[currentBufferCharacter] != 'f') && (this->buffer[currentBufferCharacter] != 'F')))
  50.                                 { // the number has ended: copy temp into a heap C string and record it
  51.                                         size_t characters = temp.size();
  52.                                         char* numberStringFromTemp = (char*)malloc((unsigned long long int)(sizeof(char) * characters + 1)); // one extra byte for the terminator; the malloc result is not checked
  53.                                         numberStringFromTemp[characters - 1] = (char)0x00; // redundant: the copy loop below overwrites this byte (assumes characters is at least 1)
  54.                                         numberStringFromTemp[characters] = (char)0x00; // the actual NUL terminator
  55.                                         size_t currentChar;
  56.                                         for (currentChar = 0L; currentChar < characters; currentChar++)
  57.                                                 numberStringFromTemp[currentChar] = temp[currentChar];
  58.                                         this->Numbers.push_back(numberStringFromTemp); // ownership passes to the Numbers vector; no free is visible in the code shown
  59.                                         this->Tokens.push_back(TNUMBER); // TNUMBER is not defined in this listing, presumably it comes from Token.h
  60.                                         temp.clear();
  61.                                         currentlyReading = Source; // NOTE(review): the terminating character is skipped by the increment at the end of the loop, so Source never examines it
  62.                                 } else {
  63.                                         temp.push_back(this->buffer[currentBufferCharacter]); //store the read character
  64.                                 }
  65.                                 break;
  66.  
  67.                         case Source:
  68.                                 //if this is the beginning of a single line comment
  69.                                 if (this->buffer[currentBufferCharacter] == '#') //single line comments aren't C-like
  70.                                 {
  71.                                         currentlyReading = SingleLineComment; //the lexer is going to read a single line comment
  72.                                 }
  73.                                 else if ((this->buffer[currentBufferCharacter] == '/') && (this->buffer[currentBufferCharacter + 1] == '*')) //multi line comments are C-like
  74.                                 {
  75.                                         currentlyReading = MultiLineComment; //the lexer is going to read a multi line comment
  76.                                         currentBufferCharacter++; //the lexer won't read the * symbol
  77.                                 }
  78.                                 else if ((this->buffer[currentBufferCharacter] >= 48) && (this->buffer[currentBufferCharacter] <= 57)) // ASCII code of 0 is 48 and of 9 is 57
  79.                                 {
  80.                                         currentlyReading = Number; //the lexer is going to read a number
  81.                                         currentBufferCharacter--; //a little trick: i want the lexer to read this character again (when the lexer will expect a number); at index 0 the unsigned value wraps and the increment below wraps it back
  82.                                 }
  83.                                 break;
  84.  
  85.                         default: // String and Identifier states land here and consume characters without recording anything
  86.                                 break;
  87.                 }
  88.                 currentBufferCharacter++; //next time I'll read the next character
  89.         }
  90.         //the lexer's job is done
  91.         return NO_ERRORS;
  92. }



e qui arriva il bello.... Lo provo:

Codice sorgente - presumibilmente C++

  1. #include <stdio.h>
  2. #include <string.h>
  3. #include <vector>
  4.  
  5. #include "Lexer.h"
  6.  
  7. int main(int argc, char** argv) // test driver: lexes a fixed sample and prints the comment counters and the numbers found
  8. {
  9.         char* src = "/* sd */ # /*s5d*/\n/*cd*/#f\n55 6.5H\n5.9B /* 5.7 */ 5f\n";
  10.  
  11.         Lexer Analyzer(src); // NOTE(review): src is a non-const char* bound to a string literal; standard C++11 and later reject that conversion
  12.         if (Analyzer.Analyze() != 0) // 0 is NO_ERRORS
  13.         {
  14.                 printf("Si e' verificato un errore!");
  15.         } else {
  16.                 printf("Commenti multilinea: %u\nCommenti: %u\n", Analyzer.MultiLineFoundComments, Analyzer.SingleLineFoundComments ); // NOTE(review): both counters are unsigned long, which %u does not match (%lu would)
  17.         }
  18.  
  19.         size_t i;
  20.         printf("Numero di numeri: %u\nNumeri: ", Analyzer.Numbers.size()); // NOTE(review): size() yields a size_t, which %u does not match (%zu would); this section also runs when Analyze reported an error
  21.         for (i = 0; i < Analyzer.Numbers.size(); i++)
  22.         {
  23.                 puts(Analyzer.Numbers[i]); // puts appends a newline after each number
  24.                 printf("  ");
  25.         }
  26.         return 0;
  27. }



E mi accorgo che c'è un problema:
5f non è nella lista dei numeri... Tuttavia il lexer lo inserisce se dopo 5f aggiungo un altro carattere.
Qualcuno ha un'idea sul perché?

Utilizzare regex o generatori di lexer è una decisione che ho scartato e su cui non intendo tornare.

PM Quote
Avatar
pierotofy (Admin)
Guru^2


Messaggi: 6230
Iscritto: 04/12/2003

Segnala al moderatore
Postato alle 18:07
Sabato, 16/08/2014
Codice sorgente - presumibilmente C/C++

  1. while (currentBufferCharacter < this->bufferLength)



Invece di:

Codice sorgente - presumibilmente C/C++

  1. while (currentBufferCharacter < (this->bufferLength - 1))



?

Così ad occhio, non l'ho testato.


Il mio blog: https://piero.dev
PM Quote
Avatar
TheDarkJuster (Member)
Guru^2


Messaggi: 1620
Iscritto: 27/09/2013

Segnala al moderatore
Postato alle 18:32
Sabato, 16/08/2014
Funziona perfettamente ora, grazie mille. Era rimasto quel -1 da una prova antecedente :D

PM Quote
Avatar
pierotofy (Admin)
Guru^2


Messaggi: 6230
Iscritto: 04/12/2003

Segnala al moderatore
Postato alle 19:14
Sabato, 16/08/2014
:k:

Complimenti per l'impresa di scrivere un lexer a mano!


Il mio blog: https://piero.dev
PM Quote
Avatar
TheDarkJuster (Member)
Guru^2


Messaggi: 1620
Iscritto: 27/09/2013

Segnala al moderatore
Postato alle 20:25
Sabato, 16/08/2014
Imparare a usare le regex per me è peggio

PM Quote