using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
namespace Sloth
{
/// <summary>
/// Thrown by the <see cref="Lexer"/> when a lexeme does not match any tokenization rule.
/// </summary>
public class InvalidLexemeException : InterpreterException
{
/// <param name="message">Description of the invalid lexeme.</param>
/// <param name="lineNumber">1-based line number of the source line containing the lexeme.</param>
public InvalidLexemeException(String message, Int32 lineNumber) : base(message, lineNumber) { }
}
/// <summary>
/// Categories of tokens produced by the <see cref="Lexer"/>.
/// NOTE: member order is significant — it fixes the underlying ordinal values.
/// </summary>
public enum TokenType
{
Identifier,      // word, optionally prefixed with '#' and suffixed with .B/.W/.L
Label,           // word ending with ':'
Register,        // A0-A7 or D0-D7
LiteralConstant, // decimal digits or '$'-prefixed hex, optional .B/.W/.L suffix
OpenBracket,     // '('
ClosedBracket,   // ')'
Sign,            // '+' or '-'
Separator,       // ','
Sharp,           // '#'
NewLine,         // end of a source line
LiteralString,   // text enclosed in single quotes, or any lexeme matching no other rule
None             // sentinel used by Token.Empty
}
/// <summary>
/// A single lexical token: a <see cref="TokenType"/> paired with the raw lexeme text.
/// </summary>
[System.Diagnostics.DebuggerDisplay("{ToString()}")]
public struct Token : IEquatable<Token>
{
    private static Token empty = new Token(TokenType.None, String.Empty);

    /// <summary>Sentinel token (type None, empty lexeme) used to signal "no token".</summary>
    public static Token Empty { get { return empty; } }

    public TokenType Type { get; set; }
    public String Lexeme { get; set; }

    /// <param name="type">The token's category.</param>
    /// <param name="lexeme">The raw text of the token.</param>
    public Token(TokenType type, String lexeme) : this()
    {
        this.Type = type;
        this.Lexeme = lexeme;
    }

    public override string ToString()
    {
        return String.Format("[{0}, {1}]", this.Type.ToString(), this.Lexeme);
    }

    public bool Equals(Token other)
    {
        return (this.Lexeme == other.Lexeme) && (this.Type == other.Type);
    }

    // Fix: a type implementing IEquatable<T> must also override object.Equals and
    // GetHashCode, otherwise boxed comparisons and hash-based collections disagree
    // with the strongly-typed Equals above.
    public override bool Equals(object obj)
    {
        return (obj is Token) && this.Equals((Token)obj);
    }

    public override int GetHashCode()
    {
        unchecked
        {
            int hash = (int)this.Type * 397;
            return hash ^ (this.Lexeme != null ? this.Lexeme.GetHashCode() : 0);
        }
    }
}
/// <summary>Ordered sequence of tokens emitted by the <see cref="Lexer"/>.</summary>
public class TokenList : List<Token> { }
/// <summary>Maps a lexeme-matching regex to the token type it produces.</summary>
public class TokenizationRuleSet : Dictionary<Regex, TokenType> { }
/// <summary>Maps a single-character symbol to the token type it produces.</summary>
public class SymbolSet : Dictionary<Char, TokenType> { }
/// <summary>
/// Splits assembly-like source text into a flat <see cref="TokenList"/>.
/// Comments start at <see cref="commentChar"/> and run to end of line; string
/// literals are delimited by single quotes.
/// </summary>
public class Lexer
{
    protected Char commentChar;
    private Int32 lineNumber; // 1-based, advanced on each '\n'; reported in lexing errors

    protected TokenizationRuleSet Rules { get; set; }
    protected SymbolSet Symbols { get; set; }
    protected List<Char> IgnoredCharacters { get; set; }

    /// <summary>Tokens produced by the most recent call to <see cref="Tokenize"/>.</summary>
    public TokenList Output { get; protected set; }

    public Lexer()
    {
        this.Output = new TokenList();
        this.IgnoredCharacters = new List<Char>() { ' ', '\t', '\f', '\r', '\v' };
        this.commentChar = ';';
        this.Symbols = new SymbolSet()
        {
            { ',', TokenType.Separator },
            { '(', TokenType.OpenBracket },
            { ')', TokenType.ClosedBracket },
            { '+', TokenType.Sign },
            { '-', TokenType.Sign },
            { '#', TokenType.Sharp }
        };
        // Rules are tried first-to-last in ScanLexeme; the catch-all "^.+$" must stay last.
        // NOTE(review): this relies on Dictionary preserving insertion order, which the
        // runtime does not guarantee — a List of pairs would make precedence explicit.
        this.Rules = new TokenizationRuleSet()
        {
            { new Regex("^[\\w_]+:$", RegexOptions.IgnoreCase), TokenType.Label },
            { new Regex("^(A|D)[0-7]$", RegexOptions.IgnoreCase), TokenType.Register },
            { new Regex("^(\\d+|\\$[0-9A-F]+)(\\.(B|W|L))?$", RegexOptions.IgnoreCase), TokenType.LiteralConstant },
            { new Regex("^(#)?\\w+(\\.(B|W|L))?$", RegexOptions.IgnoreCase), TokenType.Identifier },
            { new Regex("^.+$"), TokenType.LiteralString }
        };
    }

    /// <summary>
    /// Tokenizes <paramref name="code"/> into <see cref="Output"/>, replacing any
    /// previous contents.
    /// </summary>
    /// <param name="code">Full source text to tokenize.</param>
    /// <exception cref="InvalidLexemeException">A lexeme matches no rule.</exception>
    public void Tokenize(String code)
    {
        StringBuilder buffer = new StringBuilder();
        this.Output.Clear();
        lineNumber = 1;

        String purgedCode = this.StripComments(code);

        Boolean stringOpen = false;
        foreach (Char c in purgedCode)
        {
            if (c == '\'')
            {
                stringOpen = !stringOpen;
                if (!stringOpen)
                {
                    // Closing quote: emit the buffered text as a string literal.
                    // The opening quote was appended to the buffer, so trim quotes here.
                    this.Output.Add(new Token(TokenType.LiteralString, buffer.ToString().Trim('\'')));
                    buffer.Clear();
                    continue;
                }
            }
            if (stringOpen)
            {
                // Inside a string literal: accumulate verbatim (whitespace, symbols and all).
                buffer.Append(c);
            }
            else
            {
                // Outside any string literal: split on whitespace, symbols and newlines.
                if (c == '\n')
                {
                    this.AddToken(buffer);
                    lineNumber++;
                    this.Output.Add(new Token(TokenType.NewLine, "\n"));
                }
                else if (this.IgnoredCharacters.Contains(c))
                {
                    this.AddToken(buffer); // whitespace terminates the pending lexeme
                }
                else
                {
                    // Fix: single dictionary lookup instead of ContainsKey + indexer.
                    TokenType symbolType;
                    if (this.Symbols.TryGetValue(c, out symbolType))
                    {
                        this.AddToken(buffer); // a symbol also terminates the pending lexeme
                        this.Output.Add(new Token(symbolType, c.ToString()));
                    }
                    else
                    {
                        buffer.Append(c);
                    }
                }
            }
        }
        this.AddToken(buffer); // flush a trailing lexeme when the source lacks a final newline
    }

    /// <summary>Reads the file at <paramref name="path"/> and tokenizes its contents.</summary>
    public void TokenizeFromFile(String path)
    {
        this.Tokenize(File.ReadAllText(path));
    }

    /// <summary>
    /// Removes comments: everything from <see cref="commentChar"/> to the end of the
    /// line is dropped; the newline itself is kept so line counting stays correct.
    /// NOTE(review): stripping happens before string literals are parsed, so a
    /// comment character inside a quoted string also starts a comment — confirm
    /// whether that is intended.
    /// </summary>
    private String StripComments(String code)
    {
        StringBuilder purged = new StringBuilder();
        Boolean commentOpen = false;
        foreach (Char c in code)
        {
            if (c == commentChar)
                commentOpen = true;
            else if (c == '\n')
                commentOpen = false;
            if (!commentOpen)
                purged.Append(c);
        }
        return purged.ToString();
    }

    /// <summary>
    /// Classifies <paramref name="buffer"/> against <see cref="Rules"/> in order.
    /// Returns <see cref="Token.Empty"/> for an empty buffer.
    /// </summary>
    /// <exception cref="InvalidLexemeException">No rule matches the lexeme.</exception>
    private Token ScanLexeme(String buffer)
    {
        if (String.IsNullOrEmpty(buffer))
            return Token.Empty;
        // Fix: iterate pairs directly instead of Keys + indexer (one lookup, not two).
        foreach (KeyValuePair<Regex, TokenType> rule in this.Rules)
            if (rule.Key.IsMatch(buffer))
                return new Token(rule.Value, buffer);
        throw new InvalidLexemeException(String.Format("Il lessema '{0}' non è valido.", buffer), lineNumber);
    }

    /// <summary>
    /// Scans the buffered lexeme, appends the resulting token to <see cref="Output"/>
    /// (unless the buffer was empty), then clears the buffer for reuse.
    /// </summary>
    private void AddToken(StringBuilder buffer)
    {
        String str = buffer.ToString();
        Token token = this.ScanLexeme(str);
        if (!token.Equals(Token.Empty))
            this.Output.Add(token);
        buffer.Clear();
    }
}
}