Last active
January 24, 2024 19:15
-
-
Save polynomialherder/c9b081e0be39346abdfa724f397501f4 to your computer and use it in GitHub Desktop.
A toy lexer for an arithmetic language
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pprint | |
from enum import Enum, auto | |
class TokenType(Enum):
    """Token kinds that can label a lexeme.

    Values are assigned by ``auto()`` in declaration order, so the order of
    the members below determines their numeric values.
    """

    # Comments and statement terminator
    COMMENT = auto()
    SEMICOLON = auto()

    # Types, literals and names
    DATATYPE = auto()
    STRING = auto()
    INTEGER = auto()
    FLOAT = auto()
    IDENTIFIER = auto()

    # Generic categories
    OPERATOR = auto()
    VALUE = auto()

    # Individual arithmetic / assignment operators
    PLUS = auto()
    MINUS = auto()
    EQUALS = auto()
    TIMES = auto()
    DIVIDE = auto()
    EXPONENT = auto()

    # End-of-input marker
    EOF = auto()

    def __repr__(self):
        """Return the member name wrapped in angle brackets, e.g. ``<PLUS>``."""
        return "<" + self.name + ">"
class Lexer: | |
def __init__(self, text): | |
self.text = text | |
self.position = 0 | |
self.line_number = 0 | |
self.column = 0 | |
self.char = 0 | |
self.buffer = "" | |
self.kind = None | |
self.lexed = [] | |
def advance(self): | |
if self.is_eof: | |
self.dump_buffer() | |
self.lex_eof() | |
self.position += 1 | |
@property | |
def current_char(self): | |
return self.text[self.position] | |
@property | |
def is_eof(self): | |
try: | |
self.current_char | |
return False | |
except IndexError: | |
return True | |
@property | |
def is_end_of_statement(self): | |
return self.current_char == ";" | |
@property | |
def is_whitespace(self): | |
return self.current_char in " \r\n\t" | |
@property | |
def is_operator(self): | |
return self.current_char in "+-=*^></" | |
@property | |
def is_paren(self): | |
return self.current_char in "()" | |
@property | |
def is_square_bracket(self): | |
return self.current_char in "[]" | |
@property | |
def is_curly_bracket(self): | |
return self.current_char in "{}" | |
@property | |
def is_pointy_bracket(self): | |
return self.current_char in "<>" | |
@property | |
def is_bracket(self): | |
return self.is_paren or self.is_square_bracket or self.is_curly_bracket or self.is_pointy_bracket | |
@property | |
def is_comma(self): | |
return self.current_char == "," | |
@property | |
def is_colon(self): | |
return self.current_char == ":" | |
@property | |
def is_period(self): | |
return self.current_char == "." | |
@property | |
def is_punctuation(self): | |
return self.is_comma or self.is_colon or self.is_period or self.is_bracket or self.is_operator | |
def dump_buffer(self): | |
self.lexed.append( | |
(self.kind, self.buffer) | |
) | |
self.clear_buffer() | |
def lex_eof(self): | |
self.lexed.append( | |
(TokenType.EOF, "") | |
) | |
def clear_buffer(self): | |
self.buffer = "" | |
self.kind = None | |
def read_whitespace(self): | |
self.advance() | |
def read_comment(self): | |
self.kind = TokenType.COMMENT | |
self.advance() | |
while self.is_whitespace: | |
self.advance() | |
while self.current_char != "\n" and not self.is_eof: | |
self.buffer += self.current_char | |
self.advance() | |
self.dump_buffer() | |
def read_operator(self): | |
self.kind = { | |
"=": TokenType.EQUALS, | |
"+": TokenType.PLUS, | |
"-": TokenType.MINUS, | |
"*": TokenType.TIMES, | |
"/": TokenType.DIVIDE, | |
"^": TokenType.EXPONENT | |
}[self.current_char] | |
self.buffer += self.current_char | |
self.dump_buffer() | |
self.advance() | |
def read_semicolon(self): | |
self.kind = TokenType.SEMICOLON | |
self.buffer = self.current_char | |
self.dump_buffer() | |
self.advance() | |
def read_string(self): | |
quote_start = self.current_char | |
self.advance() | |
self.kind = TokenType.STRING | |
previous = "" | |
while not (self.current_char == quote_start and previous != "\\") and not self.is_eof and not self.is_end_of_statement: | |
self.buffer += self.current_char | |
previous = self.current_char | |
self.advance() | |
self.dump_buffer() | |
self.advance() | |
def read_identifier(self): | |
self.kind = TokenType.IDENTIFIER | |
while not (self.is_whitespace or self.is_punctuation or self.is_end_of_statement): | |
self.buffer += self.current_char | |
self.advance() | |
self.dump_buffer() | |
def read_number(self): | |
while not (self.is_end_of_statement or self.is_whitespace or self.is_punctuation) or self.is_period: | |
self.buffer += self.current_char | |
if self.is_period: | |
self.kind = TokenType.FLOAT | |
self.advance() | |
if not self.kind: | |
self.kind = TokenType.INTEGER | |
self.dump_buffer() | |
def lex(self): | |
while not self.is_eof: | |
match char := self.current_char: | |
case "#": | |
self.read_comment() | |
case "=" | "+" | "-" | "*" | "/" | "^": | |
self.read_operator() | |
case " " | "\r" | "\n" | "\t": | |
self.read_whitespace() | |
case "'" | '"': | |
self.read_string() | |
case ";": | |
self.read_semicolon() | |
case char if char.isdigit() or char == ".": | |
self.read_number() | |
case _: | |
self.read_identifier() | |
if __name__ == '__main__':
    # Demo: lex a two-line sample program and pretty-print the token list.
    PROGRAM = """
x = 2; y = 3.0; # this is a comment
z = .2; a = x + y/z
"""
    demo_lexer = Lexer(PROGRAM)
    demo_lexer.lex()
    pprint.pprint(demo_lexer.lexed)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment