Skip to content

Instantly share code, notes, and snippets.

@polynomialherder
Last active January 24, 2024 19:15
Show Gist options
  • Save polynomialherder/c9b081e0be39346abdfa724f397501f4 to your computer and use it in GitHub Desktop.
Save polynomialherder/c9b081e0be39346abdfa724f397501f4 to your computer and use it in GitHub Desktop.
A toy lexer for an arithmetic language
import pprint
from enum import Enum, auto
class TokenType(Enum):
    """Kinds of token the lexer can attach to a piece of source text.

    Values are assigned by ``auto()`` in declaration order, so the order of
    the members below is significant and must not be rearranged.
    """

    # Structural tokens.
    COMMENT = auto()
    SEMICOLON = auto()

    # Literals, names and other general categories.
    DATATYPE = auto()
    STRING = auto()
    INTEGER = auto()
    FLOAT = auto()
    IDENTIFIER = auto()
    OPERATOR = auto()
    VALUE = auto()

    # Individual arithmetic / assignment operators.
    PLUS = auto()
    MINUS = auto()
    EQUALS = auto()
    TIMES = auto()
    DIVIDE = auto()
    EXPONENT = auto()

    # End-of-input marker.
    EOF = auto()

    def __repr__(self):
        """Render the member as its bare name in angle brackets, e.g. ``<PLUS>``."""
        return "<" + self.name + ">"
class Lexer:
def __init__(self, text):
self.text = text
self.position = 0
self.line_number = 0
self.column = 0
self.char = 0
self.buffer = ""
self.kind = None
self.lexed = []
def advance(self):
if self.is_eof:
self.dump_buffer()
self.lex_eof()
self.position += 1
@property
def current_char(self):
return self.text[self.position]
@property
def is_eof(self):
try:
self.current_char
return False
except IndexError:
return True
@property
def is_end_of_statement(self):
return self.current_char == ";"
@property
def is_whitespace(self):
return self.current_char in " \r\n\t"
@property
def is_operator(self):
return self.current_char in "+-=*^></"
@property
def is_paren(self):
return self.current_char in "()"
@property
def is_square_bracket(self):
return self.current_char in "[]"
@property
def is_curly_bracket(self):
return self.current_char in "{}"
@property
def is_pointy_bracket(self):
return self.current_char in "<>"
@property
def is_bracket(self):
return self.is_paren or self.is_square_bracket or self.is_curly_bracket or self.is_pointy_bracket
@property
def is_comma(self):
return self.current_char == ","
@property
def is_colon(self):
return self.current_char == ":"
@property
def is_period(self):
return self.current_char == "."
@property
def is_punctuation(self):
return self.is_comma or self.is_colon or self.is_period or self.is_bracket or self.is_operator
def dump_buffer(self):
self.lexed.append(
(self.kind, self.buffer)
)
self.clear_buffer()
def lex_eof(self):
self.lexed.append(
(TokenType.EOF, "")
)
def clear_buffer(self):
self.buffer = ""
self.kind = None
def read_whitespace(self):
self.advance()
def read_comment(self):
self.kind = TokenType.COMMENT
self.advance()
while self.is_whitespace:
self.advance()
while self.current_char != "\n" and not self.is_eof:
self.buffer += self.current_char
self.advance()
self.dump_buffer()
def read_operator(self):
self.kind = {
"=": TokenType.EQUALS,
"+": TokenType.PLUS,
"-": TokenType.MINUS,
"*": TokenType.TIMES,
"/": TokenType.DIVIDE,
"^": TokenType.EXPONENT
}[self.current_char]
self.buffer += self.current_char
self.dump_buffer()
self.advance()
def read_semicolon(self):
self.kind = TokenType.SEMICOLON
self.buffer = self.current_char
self.dump_buffer()
self.advance()
def read_string(self):
quote_start = self.current_char
self.advance()
self.kind = TokenType.STRING
previous = ""
while not (self.current_char == quote_start and previous != "\\") and not self.is_eof and not self.is_end_of_statement:
self.buffer += self.current_char
previous = self.current_char
self.advance()
self.dump_buffer()
self.advance()
def read_identifier(self):
self.kind = TokenType.IDENTIFIER
while not (self.is_whitespace or self.is_punctuation or self.is_end_of_statement):
self.buffer += self.current_char
self.advance()
self.dump_buffer()
def read_number(self):
while not (self.is_end_of_statement or self.is_whitespace or self.is_punctuation) or self.is_period:
self.buffer += self.current_char
if self.is_period:
self.kind = TokenType.FLOAT
self.advance()
if not self.kind:
self.kind = TokenType.INTEGER
self.dump_buffer()
def lex(self):
while not self.is_eof:
match char := self.current_char:
case "#":
self.read_comment()
case "=" | "+" | "-" | "*" | "/" | "^":
self.read_operator()
case " " | "\r" | "\n" | "\t":
self.read_whitespace()
case "'" | '"':
self.read_string()
case ";":
self.read_semicolon()
case char if char.isdigit() or char == ".":
self.read_number()
case _:
self.read_identifier()
if __name__ == "__main__":
    # Demo: lex a tiny two-line program and pretty-print the resulting tokens.
    PROGRAM = (
        "\n"
        "x = 2; y = 3.0; # this is a comment\n"
        "z = .2; a = x + y/z\n"
    )
    lexer = Lexer(PROGRAM)
    lexer.lex()
    pprint.pprint(lexer.lexed)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment