@gszauer
Created April 1, 2024 23:20
Tokenizer.cs
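// Hand-written scanner for a small C#-like language: turns a source string into a flat
// list of Token objects, or null if any error is encountered.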
using System;
using System.Collections.Generic;

namespace Compiler {
public delegate void PrintErrorFunction(string message);
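// Where a token came from: file name plus the line and column it was scanned at.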
public class Location {
public string File { get; protected set; }
public int Line { get; protected set; }
public int Column { get; protected set; }
public Location(string file, int line, int column) {
File = file;
Line = line;
Column = column;
}
}
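// A single lexical token: its raw text (Lexeme), where it was found (Location), and its kind (Symbol).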
public class Token {
public string Lexeme { get; protected set; }
public Location Location { get; protected set; }
public Symbol Symbol { get; protected set; }
public Token(string lexeme, Location location, Symbol symbol) {
this.Lexeme = lexeme;
this.Location = location;
this.Symbol = symbol;
}
public void AppendLexeme(string lexeme) {
Lexeme += lexeme;
}
public void ReplaceLexeme(string lexeme) {
Lexeme = lexeme;
}
public string AsString {
get {
return Symbol.ToString() + " `" + Lexeme + "` on line: " + Location.Line + " column: " + Location.Column;
}
}
}
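// Scans source text into Tokens. All scanning state lives in the nested State object,
// so the public surface is the single static Tokenize call.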
public class Tokenizer {
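// Mutable scanning state: the cursor (Start/Current), line and column tracking for
// diagnostics, the error callback, and the error flag.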
protected class State {
public PrintErrorFunction print;
public string Source;
public string File;
public int Current;
public int Start;
public int Line;
public int Column;
public string[]? ForceIdentifiers; // Stored from the caller; not consulted anywhere in this scanner.
public bool HadError;
public string? Error;
public State(string file, string source, PrintErrorFunction p, string[]? forceIdentifiers) {
print = p;
Source = source;
File = file;
Current = 0;
Start = 0;
Line = 1;
Column = 1;
ForceIdentifiers = forceIdentifiers;
HadError = false;
Error = null;
}
}
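// Shared keyword-to-Symbol table, built once on first use by InitKeywords().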
protected static Dictionary<string, Symbol>? Keywords = null;
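// Entry point: scans fileContent into a token list bracketed by FILE_START/FILE_END.
// Whitespace and comments are dropped, back-to-back string literals are merged,
// string/char lexemes are re-quoted at the end, and null is returned if any error occurred.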
public static List<Token>? Tokenize(string fileName, string fileContent, PrintErrorFunction print, string[]? forceIdentifiers = null) {
InitKeywords();
State state = new State(fileName, fileContent, print, forceIdentifiers);
List<Token> result = new List<Token>();
Token startOfFile = new Token(fileName, new Location(fileName, 0, 0), Symbol.FILE_START);
result.Add(startOfFile);
Token? lastToken = null;
while (!IsAtEnd(state)) {
state.Start = state.Current;
Token? token = ScanToken(state);
if (token == null) {
Error(state, "Could not scan token");
return null;
}
if (token.Symbol == Symbol.COMMENT) {
continue;
}
if (token.Symbol == Symbol.LIT_STRING) {
if (lastToken != null && lastToken.Symbol == Symbol.LIT_STRING) {
lastToken.AppendLexeme(token.Lexeme);
continue;
}
}
result.Add(token);
lastToken = token;
}
Token endOfFile = new Token(fileName, new Location(fileName, 0, 0), Symbol.FILE_END);
result.Add(endOfFile);
if (state.HadError) {
return null;
}
foreach (Token t in result) {
if (t.Symbol == Symbol.LIT_STRING) {
t.ReplaceLexeme("\"" + t.Lexeme + "\"");
}
else if (t.Symbol == Symbol.LIT_CHAR) {
t.ReplaceLexeme("'" + t.Lexeme + "'");
}
}
return result;
}
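// Populates the keyword table the first time it is needed. (Symbol.STATIC exists in the
// enum below, but no "static" entry is registered here.)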
protected static void InitKeywords() {
if (Keywords == null) {
Keywords = new Dictionary<string, Symbol>();
Keywords.Add("char", Symbol.TYPE_CHAR);
Keywords.Add("int", Symbol.TYPE_INT);
Keywords.Add("float", Symbol.TYPE_FLOAT);
Keywords.Add("bool", Symbol.TYPE_BOOL);
Keywords.Add("string", Symbol.TYPE_STRING);
Keywords.Add("object", Symbol.TYPE_OBJECT);
Keywords.Add("void", Symbol.TYPE_VOID);
Keywords.Add("delegate", Symbol.DELEGATE);
Keywords.Add("class", Symbol.CLASS);
Keywords.Add("interface", Symbol.INTERFACE);
Keywords.Add("extends", Symbol.EXTENDS);
Keywords.Add("implements", Symbol.IMPLEMENTS);
Keywords.Add("true", Symbol.LIT_BOOL);
Keywords.Add("false", Symbol.LIT_BOOL);
Keywords.Add("null", Symbol.LIT_NULL);
Keywords.Add("new", Symbol.NEW);
Keywords.Add("and", Symbol.AND);
Keywords.Add("or", Symbol.OR);
Keywords.Add("as", Symbol.AS);
Keywords.Add("if", Symbol.IF);
Keywords.Add("else", Symbol.ELSE);
Keywords.Add("for", Symbol.FOR);
Keywords.Add("while", Symbol.WHILE);
Keywords.Add("public", Symbol.PUBLIC);
Keywords.Add("protected", Symbol.PROTECTED);
Keywords.Add("private", Symbol.PRIVATE);
Keywords.Add("return", Symbol.RETURN);
Keywords.Add("continue", Symbol.CONTINUE);
Keywords.Add("break", Symbol.BREAK);
Keywords.Add("assert", Symbol.ASSERT);
Keywords.Add("this", Symbol.THIS);
Keywords.Add("base", Symbol.BASE);
Keywords.Add("set", Symbol.SET);
Keywords.Add("get", Symbol.GET);
}
}
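// Scans one token starting at s.Start. Runs of whitespace and both comment styles come
// back as Symbol.COMMENT so the caller can discard them; null signals a scan failure.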
protected static Token? ScanToken(State s) {
char c = Advance(s);
if (c == '(') { return MakeToken(s, Symbol.LPAREN); }
else if (c == ')') { return MakeToken(s, Symbol.RPAREN); }
else if (c == '{') { return MakeToken(s, Symbol.LBRACE); }
else if (c == '}') { return MakeToken(s, Symbol.RBRACE); }
else if (c == '[') { return MakeToken(s, Symbol.LBRACKET); }
else if (c == ']') { return MakeToken(s, Symbol.RBRACKET); }
else if (c == ',') { return MakeToken(s, Symbol.COMMA); }
else if (c == ';') { return MakeToken(s, Symbol.SEMICOLON); }
else if (c == '`') { return MakeToken(s, Symbol.TICK); }
else if (c == '!') { return MakeToken(s, Match(s, '=') ? Symbol.NOT_EQUAL : Symbol.NOT); }
else if (c == '~') { return MakeToken(s, Match(s, '=') ? Symbol.TILDE_EQUAL : Symbol.TILDE); }
else if (c == '*') { return MakeToken(s, Match(s, '=') ? Symbol.STAR_EQUAL : Symbol.STAR); }
else if (c == '%') { return MakeToken(s, Match(s, '=') ? Symbol.MOD_EQUAL : Symbol.MOD); }
else if (c == '^') { return MakeToken(s, Match(s, '=') ? Symbol.POW_EQUAL : Symbol.POW); }
else if (c == '>') { return MakeToken(s, Match(s, '=') ? Symbol.GREATER_EQUAL : Symbol.GREATER); }
else if (c == '<') { return MakeToken(s, Match(s, '=') ? Symbol.LESS_EQUAL : Symbol.LESS); }
else if (c == ':') { return MakeToken(s, Match(s, '=') ? Symbol.COLON_EQUAL : Symbol.COLON); }
else if (c == '?') { return MakeToken(s, Match(s, '=') ? Symbol.QUESTION_EQUAL : Symbol.QUESTION); }
else if (c == '@') { return MakeToken(s, Match(s, '=') ? Symbol.AT_EQUAL : Symbol.AT); }
else if (c == '#') { return MakeToken(s, Match(s, '=') ? Symbol.HASH_EQUAL : Symbol.HASH); }
else if (c == '$') { return MakeToken(s, Match(s, '=') ? Symbol.DOLLAR_EQUAL : Symbol.DOLLAR); }
else if (c == '&') { return MakeToken(s, Match(s, '=') ? Symbol.AMPER_EQUAL : Symbol.AMPER); }
else if (c == '|') { return MakeToken(s, Match(s, '=') ? Symbol.PIPE_EQUAL : Symbol.PIPE); }
else if (c == '.') {
if (Match(s, '.')) {
if (Match(s, '.')) {
return MakeToken(s, Symbol.DOT_DOT_DOT);
}
return MakeToken(s, Symbol.DOT_DOT);
}
return MakeToken(s, Symbol.DOT);
}
else if (c == '=') {
if (Match(s, '=')) {
if (Match(s, '=')) {
return MakeToken(s, Symbol.EQUAL_EQUAL_EQUAL);
}
return MakeToken(s, Symbol.EQUAL_EQUAL);
}
return MakeToken(s, Symbol.EQUAL);
}
else if (c == '+') {
if (Match(s, '=')) { return MakeToken(s, Symbol.PLUS_EQUAL); }
else if (Match(s, '+')) { return MakeToken(s, Symbol.PLUS_PLUS); }
else { return MakeToken(s, Symbol.PLUS); }
}
else if (c == '-') {
if (Match(s, '=')) { return MakeToken(s, Symbol.MINUS_EQUAL); }
else if (Match(s, '-')) { return MakeToken(s, Symbol.MINUS_MINUS); }
else { return MakeToken(s, Symbol.MINUS); }
}
else if (c == '/') {
if (Match(s, '=')) { return MakeToken(s, Symbol.SLASH_EQUAL); }
else if (Match(s, '/')) {
while (!IsAtEnd(s) && Peek(s, 0) != '\n') {
Advance(s);
}
return MakeToken(s, Symbol.COMMENT);
}
else if (Match(s, '*')) {
// Block comment: consume until the closing "*/".
while (true) {
if (IsAtEnd(s)) {
Error(s, "Unterminated comment");
return null;
}
if (Peek(s, 0) == '/' && Peek(s, -1) == '*') {
Advance(s); // Eat the closing slash
return MakeToken(s, Symbol.COMMENT);
}
Advance(s);
}
}
else {
return MakeToken(s, Symbol.SLASH);
}
}
else if (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f') {
// Consume the whole whitespace run; it is reported as COMMENT so Tokenize drops it.
while (!IsAtEnd(s)) {
c = Peek(s, 0);
if (c != ' ' && c != '\t' && c != '\r' && c != '\n' && c != '\f') {
break;
}
Advance(s);
}
return MakeToken(s, Symbol.COMMENT);
}
else if (c == '\'') {
char literal = Advance(s);
if (literal == '\\') {
literal = Advance(s);
if (literal != '0' && literal != 't' && literal != 'n' && literal != 'r' && literal != 'f' && literal != '\\' && literal != '\'') {
Error(s, "Unexpected char escape sequence: '" + literal + "'");
}
}
if (Peek(s, 0) != '\'') {
Error(s, "Unterminated character literal");
}
Advance(s); // Eat '
return MakeToken(s, Symbol.LIT_CHAR, literal.ToString());
}
else if (c == '"') {
string literal = "";
while (!IsAtEnd(s)) {
if (Peek(s, 0) == '"') {
if (Peek(s, -1) == '\\') {
// Just let it go
}
else {
break;
}
}
if (Peek(s, 0) == '\n') {
Error(s, "Newline is not supported inside string");
}
literal += Advance(s);
}
if (Peek(s, 0) != '"') {
Error(s, "Unterminated string");
}
Advance(s); // Eat "
return MakeToken(s, Symbol.LIT_STRING, literal);
}
else {
if (IsNumber(c)) {
while (MatchNumber(s)) ;
if (Match(s, '.')) {
while (MatchNumber(s)) ;
return MakeToken(s, Symbol.LIT_FLOAT);
}
/* else */
if (Match(s, 'f')) {
return MakeToken(s, Symbol.LIT_FLOAT);
}
return MakeToken(s, Symbol.LIT_INT);
}
else if (c == '_' || IsAlpha(c)) {
while (!IsAtEnd(s) && IsAlphaNumericWithUnderscore(Peek(s, 0))) {
Advance(s);
}
string lexeme = GetLexeme(s);
if (Keywords != null && Keywords.ContainsKey(lexeme)) {
return MakeToken(s, Keywords[lexeme]);
}
return MakeToken(s, Symbol.IDENTIFIER);
}
}
Error(s, "Encountered unexpected character: '" + c + "'");
return null;
}
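// The raw source text between Start and Current, i.e. the token currently being scanned.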
protected static string GetLexeme(State s) {
return s.Source.Substring(s.Start, s.Current - s.Start);
}
protected static bool IsNumber(char c) {
return c >= '0' && c <= '9';
}
protected static bool IsAlpha(char c) {
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}
protected static bool IsAlphaNumericWithUnderscore(char c) {
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || (c == '_');
}
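// Conditional consumption helpers: each advances only when the next character matches
// (any digit for MatchNumber, the expected character for Match).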
protected static bool MatchNumber(State s) {
if (IsAtEnd(s)) {
return false;
}
char peek = Peek(s, 0);
if (peek < '0' || peek > '9') {
return false;
}
Advance(s);
return true;
}
protected static bool Match(State s, char c) {
if (IsAtEnd(s)) {
return false;
}
if (Peek(s, 0) != c) {
return false;
}
Advance(s);
return true;
}
protected static char Peek(State s, int offset) {
int location = s.Current + offset;
// Out-of-range peeks report an error and return a sentinel instead of indexing past the buffer.
if (location < 0) {
Error(s, "Can't peek below zero");
return '\0';
}
else if (location >= s.Source.Length) {
Error(s, "Can't peek past end");
return '\0';
}
return s.Source[location];
}
protected static char Advance(State s) {
if (IsAtEnd(s)) {
Error(s, "Can't advance past end of token stream");
return '\0';
}
// Track line/column before consuming the character.
s.Column += 1;
if (s.Source[s.Current] == '\n') {
s.Line += 1;
s.Column = 1;
}
return s.Source[s.Current++];
}
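// Builds a Token for the span [Start, Current). optLex, when supplied, replaces the raw
// lexeme (used for char and string literals). Note the recorded line/column is where the token ends.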
protected static Token MakeToken(State s, Symbol symbol, string? optLex = null) {
string lexeme = s.Source.Substring(s.Start, s.Current - s.Start);
Location location = new Location(s.File, s.Line, s.Column);
return new Token(optLex ?? lexeme, location, symbol);
}
protected static bool IsAtEnd(State s) {
return s.Current >= s.Source.Length;
}
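// Records the error on the state and reports it through the print callback, or throws if no callback was supplied.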
protected static void Error(State s, string error) {
s.HadError = true;
error = "Error in Tokenizer: \n" + error;
error += "\nOn line: " + s.Line + ", column: " + s.Column + ", in file: " + s.File;
s.Error = error;
if (s.print != null) {
s.print(error);
}
else {
throw new Exception(error);
}
}
}
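// Every token kind the scanner can emit: punctuation, operators and their '=' compounds,
// keywords, literals, and the synthetic FILE_START, FILE_END, and COMMENT markers.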
public enum Symbol {
FILE_START, FILE_END,
NOT, AT, HASH, DOLLAR, MOD, POW, AMPER, STAR, PLUS, MINUS, TILDE, SLASH, QUESTION, COLON, LESS, GREATER, EQUAL, PIPE,
NOT_EQUAL, AT_EQUAL, HASH_EQUAL, DOLLAR_EQUAL, MOD_EQUAL, POW_EQUAL, LESS_EQUAL, GREATER_EQUAL, COLON_EQUAL,
AMPER_EQUAL, STAR_EQUAL, PLUS_EQUAL, MINUS_EQUAL, TILDE_EQUAL, QUESTION_EQUAL, SLASH_EQUAL, EQUAL_EQUAL,
PLUS_PLUS, MINUS_MINUS, PIPE_EQUAL, EQUAL_EQUAL_EQUAL,
SEMICOLON, COMMA, DOT, DOT_DOT, DOT_DOT_DOT, TICK,
TYPE_CHAR, TYPE_INT, TYPE_FLOAT, TYPE_BOOL, TYPE_STRING, TYPE_OBJECT, TYPE_VOID,
LIT_CHAR, LIT_INT, LIT_FLOAT, LIT_BOOL, LIT_STRING, LIT_NULL,
DELEGATE, CLASS, INTERFACE, EXTENDS, IMPLEMENTS, STATIC,
NEW, AND, OR, AS, IF, ELSE, FOR, WHILE, PUBLIC, PROTECTED, PRIVATE,
RETURN, CONTINUE, BREAK, ASSERT, THIS, BASE, GET, SET,
LBRACE, RBRACE, /* { } */
LBRACKET, RBRACKET, /* [ ] */
LPAREN, RPAREN, /* ( ) */
IDENTIFIER, COMMENT
}
}
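// Example usage (an illustrative sketch, not part of the gist; the file name, source
// snippet, and console print callback below are assumptions):
//
//   List<Compiler.Token>? tokens = Compiler.Tokenizer.Tokenize(
//       "example.src",
//       "int x = 1 + 2;",
//       msg => System.Console.WriteLine(msg));
//   if (tokens != null) {
//       foreach (Compiler.Token t in tokens) {
//           System.Console.WriteLine(t.AsString);
//       }
//   }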