Skip to content

Instantly share code, notes, and snippets.



Last active Jul 15, 2020
What would you like to do?
import re
from lexer import lex, LexError
def readlines_iter(f):
    """Yield each line of *f* with its final character stripped.

    Note: the final character is dropped unconditionally, so a last line
    without a trailing newline loses its last character (matches the
    original behaviour).
    """
    while True:
        line = f.readline()
        if not line:
            return
        yield line[:-1]
def main(lines):
    """Build a token-pattern table from grammar lines, then lex 'test.the'.

    Grammar line forms (reconstructed from the mangled paste):
      * ``name |= term term ...``  — production with space-separated terms
      * ``name ~~ regex``          — wildcard token defined by a regex
      * a line whose stripped text starts with ``|=`` continues the
        previous production (reuses ``last_prod``).

    Term syntax: ``<name>`` production reference, ``[name]`` literal token,
    ``{regex}`` inline regex for the current production.
    """
    tok_patterns = {}
    prods = {}
    last_prod = None
    for line in lines:
        if not line.strip():
            # Skip blank lines so line.split(maxsplit=2) can't underflow.
            continue
        if line.strip().startswith('|='):
            # Continuation line: reuse the production name from the line above.
            prod, _, *terms = last_prod, *line.split()
        else:
            prod, op, termsline = line.split(maxsplit=2)
            if op == '|=':
                terms = termsline.split()
            else:
                assert op == '~~'
                # A regex token: the whole remainder is one term.
                terms = (termsline,)
            last_prod = prod
        for term in terms:
            left, middle, right = term[0], term[1:-1], term[-1]
            if left == '<' and right == '>':
                # Production reference: ensure an entry exists.
                prods[middle] = prods.get(middle, [])
            elif left == '[' and right == ']':
                # Literal token: value filled in later (None = exact-match).
                assert middle not in tok_patterns
                tok_patterns[middle] = None
            elif left == '{' and right == '}':
                assert prod not in tok_patterns
                print("Adding regex for", prod, repr(middle))
                tok_patterns[prod] = re.compile(middle)
    with open('test.the') as f:
        try:
            for tok, lit in lex(tok_patterns, f.read()):
                # Parenthesised conditional: without it, `[tok]` was dropped
                # whenever lit was falsy (precedence bug in the original).
                print(f'[{tok}]' + (f' {lit}' if lit else ''))
        except LexError as e:
            e.pretty_print()
def handle(s):
    """Parse one grammar line of the ``name ← terms`` / ``name ≡ regex`` form.

    Returns False when the line head doesn't match; otherwise walks the
    term list, printing each match object for debugging, and returns None.
    """
    linere = re.compile(r'((?P<term>\w+)\s*←)|((?P<tok>\w+)\s*≡)')
    prodre = re.compile(r'\s*((⦃(?P<term>\w+)⦄)|(?P<lit>[^\s]+))')
    m = linere.match(s)
    if not m:
        return False
    if m.groupdict()['term']:
        while m:
            s = s[m.end():]
            m = prodre.match(s)
            print("m", m)
            # Guard: prodre.match returns None at end of input; calling
            # .groups() on None raised AttributeError in the original.
            if m:
                print("m.groups()", m.groups())
                print("m.groupdict()", m.groupdict())
handle("decl ← ⦃type⦄ declare ⦃space⦄ ⦃var⦄")
if __name__ == '__main__':
    # Feed the grammar file line-by-line into main(); the original paste
    # lost the call under the `with`, reconstructed here.
    with open('the.bnf') as f:
        main(readlines_iter(f))
from collections import namedtuple
import re
class LexError(Exception):
    """Raised when no token matches at some position of the source text."""

    def __init__(self, src, index, message=None):
        # Default message reports the 1-based line number of the failure.
        if message is None:
            line_num = src[:index].count("\n") + 1
            self.message = f'Failed to lex on line {line_num}'
        else:
            self.message = message
        super(LexError, self).__init__(self.message)
        self.src = src
        self.index = index

    def pretty_print(self):
        """Print the offending line, then carets up to the failure column."""
        line_start = self.src[:self.index].rfind('\n') + 1
        # find() (not index()) so a final line without '\n' doesn't raise.
        line_end = self.src.find('\n', line_start)
        if line_end == -1:
            line_end = len(self.src)
        line = self.src[line_start:line_end]
        # The original computed `line` but the print of it was lost in the
        # paste; restore it so the caret row has something to point at.
        print(line)
        print('^' * (1 + self.index - line_start))
class TokenMatch(namedtuple('TokenMatch', 'token literal is_wild')):
    """A candidate token match; ordering picks the best candidate.

    Longer literals win; on a tie, a non-wild (exact) token beats a
    wild (regex) one.
    """

    def __gt__(self, other):
        mine, theirs = len(self.literal), len(other.literal)
        if mine != theirs:
            return mine > theirs
        # Equal lengths: prefer the exact token over the regex token.
        return other.is_wild and not self.is_wild
def matching_tokens(token_dict, src):
    """Yield every TokenMatch possible at the start of *src*.

    token_dict maps token name -> compiled regex (wild token) or a falsy
    value (exact token matched by its own name as a literal prefix).
    """
    for token in token_dict:
        if token_dict[token] and (m := re.match(token_dict[token], src)):
            # The matched text was dropped in the original paste
            # (`TokenMatch(token,, ...)`); restore it via m.group().
            yield TokenMatch(token, m.group(), is_wild=True)
        elif src.startswith(token):
            yield TokenMatch(token, token, is_wild=False)
def tokenise(token_dict, src):
    """Generate (token, literal-or-False) pairs for *src*.

    Whitespace is skipped; at each position the longest candidate match
    wins (see TokenMatch ordering). Raises LexError when nothing matches.
    The second element is the matched literal for wild tokens and False
    for exact tokens (``is_wild and literal``).
    """
    i = 0
    while i < len(src):
        if src[i].isspace():
            # Skip whitespace; `continue` restores the loop structure the
            # paste flattened away.
            i += 1
            continue
        best_match = max(matching_tokens(token_dict, src[i:]), default=None)
        if not best_match:
            raise LexError(src, i)
        i += len(best_match.literal)
        yield best_match.token, best_match.is_wild and best_match.literal
class Lookahead:
    """Wrap an iterator with one-item lookahead and truthiness.

    ``self.lookahead`` holds the next value to be yielded; ``bool(la)``
    is True while items remain.
    """

    def __init__(self, it):
        # The paste lost `self.it` on this line (`def __init__(self, it): = it`).
        self.it = it
        self.is_empty = False
        # Guard the priming next(): an empty source iterator must yield an
        # empty (falsy) Lookahead, mirroring the try/except in __next__.
        try:
            self.lookahead = next(it)
        except StopIteration:
            self.is_empty = True

    def __iter__(self):
        while not self.is_empty:
            yield next(self)

    def __next__(self):
        if self.is_empty:
            raise StopIteration
        value = self.lookahead
        try:
            self.lookahead = next(self.it)
        except StopIteration:
            self.is_empty = True
        return value

    def __bool__(self):
        return not self.is_empty
def lex(token_dict, src):
    """Lex *src* into a Lookahead-wrapped stream of (token, literal) pairs."""
    token_stream = tokenise(token_dict, src)
    return Lookahead(token_stream)
byte foo
print foo
print foo + foo
type ← byte
← int
decl ← ⦃type⦄ ⦃space⦄ ⦃var⦄
print ← print ⦃expr⦄
expr ← ⦃var⦄
← ⦃expr⦄ + ⦃expr⦄
stmt ← ⦃print⦄
← ⦃decl⦄
var ≡ [_a-zA-Z][_0-9a-zA-Z]*
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment