@betafcc
Last active September 4, 2018 13:16
Tiny regex-based tokenizer in Python
from lexer import Lexer  # the Lexer class lives in lexer.py, shown below
code = '''
(define (fib x)
  (if (lt x 2)
      x
      (add (fib (sub x 1))
           (fib (sub x 2)))))
'''
lexer = Lexer(  # **kwargs preserve declaration order (Python 3.6+, PEP 468)
    WHITESPACE = r'\s+',
    LPAREN     = r'\(',
    RPAREN     = r'\)',
    DEFINE     = r'define',
    IF         = r'if',
    NAME       = r'[a-zA-Z]\w*',
    NUMBER     = r'\d+',
)
list(lexer.tokenize(code))[:10]
# [('WHITESPACE', '\n'),
#  ('LPAREN', '('),
#  ('DEFINE', 'define'),
#  ('WHITESPACE', ' '),
#  ('LPAREN', '('),
#  ('NAME', 'fib'),
#  ('WHITESPACE', ' '),
#  ('NAME', 'x'),
#  ('RPAREN', ')'),
#  ('WHITESPACE', '\n  ')]
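
Because the rules are tried in declaration order (first match wins), the keyword patterns shadow the more general NAME pattern, so an identifier that merely starts with a keyword gets split. A quick illustration of that quirk, using the lexer defined above:

# DEFINE is tried before NAME, so a longer identifier that merely
# starts with 'define' is cut into two tokens:
list(lexer.tokenize('definitely'))
# [('DEFINE', 'define'), ('NAME', 'itely')]

# A word boundary in the keyword patterns avoids this,
# e.g. DEFINE = r'define\b' and IF = r'if\b'.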
from re import compile


class Lexer:
    def __init__(self, **rules):
        # Prefix every pattern with '^' so it can only match
        # at the start of the remaining input
        self.rules = [(k, compile('^' + v)) for k, v in rules.items()]

    def tokenize(self, code):
        # Repeatedly bite one token off the front until nothing is left
        while code:
            match, code = self.tokenize_one(code)
            yield match

    def tokenize_one(self, code):
        # Try the rules in declaration order; the first match wins
        for name, rule in self.rules:
            m = rule.search(code)
            if m:
                return (name, code[:m.end()]), code[m.end():]
        raise Exception("Can't match:\n" + code)
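
Any input that no rule matches raises with the unconsumed remainder, which makes bad input easy to spot. A minimal sketch of that error path:

# No rule matches '.', so tokenization stops with the leftover text:
try:
    list(lexer.tokenize('(add 1 2.5)'))
except Exception as e:
    print(e)
# Can't match:
# .5)

Note that compiling '^' + pattern and calling .search is equivalent to calling .match with the bare pattern, since re.match only matches at the beginning of the string.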