Skip to content

Instantly share code, notes, and snippets.

@louisswarren

louisswarren/lexer.py

Last active Sep 27, 2020
Embed
What would you like to do?
BNF
from collections import namedtuple
import re
# Literal patterns only match themselves, but quack like regular expressions
class LiteralPattern(str):
    """A string that offers a regex-like ``match`` method for exact prefixes.

    The pattern matches only text that begins with the pattern's own
    characters; no regular-expression syntax is interpreted by ``match``.
    """

    def match(self, other):
        """Return a ``LiteralMatch`` if *other* starts with this literal.

        Returns ``None`` when *other* does not begin with the literal.
        """
        if not other.startswith(self):
            return None
        # Hand the match a plain ``str`` rather than this subclass instance.
        return LiteralMatch(str(self))
class LiteralMatch(namedtuple('LiteralMatch', ['literal'])):
    """Match result for a literal pattern, holding the matched text."""

    def group(self):
        """Return the matched literal text."""
        matched_text = self.literal
        return matched_text
class Lookahead:
    """Iterator wrapper that buffers one item so it can be peeked at.

    The next item is fetched eagerly (once at construction and again after
    every ``next``).  Any exception raised while fetching -- including the
    ``StopIteration`` that signals exhaustion -- is stored in
    ``lookahead_exception`` and re-raised only when the caller asks for the
    item that could not be produced.

    The wrapper is truthy while a buffered item is available.
    """

    def __init__(self, it):
        """Wrap *it*, which may be any iterable or iterator.

        A non-iterable argument raises ``TypeError`` immediately.
        """
        # iter() is a no-op for iterators but lets plain iterables (lists,
        # strings, ...) work; calling next() on those directly would raise a
        # TypeError that the broad handler below would silently store.
        self.it = iter(it)
        self.lookahead_exception = None
        self._set_lookahead()

    def __bool__(self):
        """Return True while a buffered item is available."""
        return self.lookahead_exception is None

    def _set_lookahead(self):
        """Fetch the next item into the buffer, deferring any exception."""
        try:
            self._lookahead = next(self.it)
        except Exception as e:
            # Deliberately broad: errors from the underlying iterator are
            # held back until the caller actually reaches the failing item.
            self._lookahead = None
            self.lookahead_exception = e

    def peek(self):
        """Return the next item without consuming it.

        Raises the stored exception (``StopIteration`` on exhaustion) when
        no item is available.
        """
        if not self:
            raise self.lookahead_exception
        return self._lookahead

    def __iter__(self):
        """Yield the remaining items, then surface any deferred error.

        Plain exhaustion ends the iteration normally; any other stored
        exception is raised after the last good item has been yielded.
        """
        while self:
            yield next(self)
        # StopIteration must not be raised inside a generator body, so only
        # genuine errors are re-raised here.
        if not isinstance(self.lookahead_exception, StopIteration):
            raise self.lookahead_exception

    def __next__(self):
        """Consume and return the buffered item, then refill the buffer."""
        if not self:
            raise self.lookahead_exception
        value = self._lookahead
        self._set_lookahead()
        return value
class LexError(Exception):
    """Raised when no token matches the source text at some position.

    Attributes:
        src: the full source text being lexed.
        index: offset into ``src`` at which lexing failed.
        message: the error message; when not supplied it reports the
            1-based line number of ``index``.
    """

    def __init__(self, src, index, message=None):
        if message is None:
            # Line number = newlines before the failure point, plus one.
            line_num = src[:index].count("\n") + 1
            message = f'Failed to lex on line {line_num}'
        self.message = message
        super().__init__(self.message)
        self.src = src
        self.index = index

    def pretty_print(self):
        """Print the message, the offending line, and a caret under the
        failing column."""
        # rfind gives -1 when there is no earlier newline, so +1 yields 0.
        start = self.src[:self.index].rfind('\n') + 1
        end = self.src.find('\n', start)
        # find() reports "not found" as -1; 0 is a valid position (a newline
        # at the very start of the source) and must still end the line there.
        line = self.src[start:end] if end != -1 else self.src[start:]
        print(self.message)
        print(line)
        print(' ' * (self.index - start) + '^')
class TokenMatch(namedtuple('TokenMatch', ['token', 'literal'])):
    """A token name paired with the source text it matched.

    ``>`` compares matches by the length of their literal text.
    """

    def __gt__(self, other):
        """Return True if this match's literal is longer than *other*'s."""
        own_length = len(self.literal)
        other_length = len(other.literal)
        return own_length > other_length
def matching_tokens(token_list, src):
    """Yield a ``TokenMatch`` for every token whose pattern matches *src*.

    Args:
        token_list: iterable of ``(token_name, pattern)`` pairs.  A pattern
            is either an object with a ``match(text)`` method (a compiled
            regex or a ``LiteralPattern``) or a plain regex string.
        src: text to match against, starting at its first character.

    Yields:
        ``TokenMatch(token_name, matched_text)`` for each successful match,
        in ``token_list`` order.
    """
    for token_name, token_re in token_list:
        # Use the pattern's own match() when it has one.  Passing it to
        # re.match() would compile a LiteralPattern (a str subclass) as a
        # regular expression, so e.g. 'a+b' would match 'aab'.
        matcher = getattr(token_re, 'match', None)
        if matcher is not None:
            m = matcher(src)
        else:
            m = re.match(token_re, src)
        if m:
            yield TokenMatch(token_name, m.group())
def tokenise(token_list, src):
    """Lazily split *src* into tokens, skipping whitespace between them.

    At each position the best candidate from ``matching_tokens`` is chosen
    with ``max``, i.e. the match with the longest literal according to
    ``TokenMatch.__gt__``.

    Args:
        token_list: ``(token_name, pattern)`` pairs, as accepted by
            ``matching_tokens``.
        src: the source text to lex.

    Yields:
        The chosen ``TokenMatch`` for each token, in source order.

    Raises:
        LexError: when no pattern matches at a non-whitespace position, or
            when the best match is empty.
    """
    i = 0
    while i < len(src):
        if src[i].isspace():
            i += 1
            continue
        best_match = max(matching_tokens(token_list, src[i:]), default=None)
        # A zero-length match (e.g. from a pattern like '[0-9]*') would not
        # advance i, and the generator would yield empty tokens forever;
        # treat it as a failure to lex at this position.
        if best_match is None or not best_match.literal:
            raise LexError(src, i)
        yield best_match
        i += len(best_match.literal)
def lex(token_list, src):
    """Return a ``Lookahead`` over the tokens that ``tokenise`` produces.

    *token_list* and *src* are passed to ``tokenise`` unchanged.
    """
    token_stream = tokenise(token_list, src)
    return Lookahead(token_stream)
if __name__ == '__main__':
    # Demo: lex a short string with one literal token and one regex token,
    # printing each token name with the repr of its matched text.
    demo_tokens = [
        ('HELLO', LiteralPattern('hello')),
        ('NUM', re.compile('[0-9]+')),
    ]
    try:
        stream = lex(demo_tokens, "hello 1241")
        for name, text in stream:
            print(name, repr(text))
    except LexError as failure:
        # Show the offending line with a caret under the failing column.
        failure.pretty_print()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment