Created
July 8, 2019 21:47
-
-
Save dwinston/5392a04c3312e8f1ba50439e3700c924 to your computer and use it in GitHub Desktop.
Code that takes some Lisp code and returns an abstract syntax tree (AST). The AST represents the structure of the code and the meaning of each token.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Code that takes some Lisp code and returns an abstract syntax tree (AST). | |
My lisp knows about the following types of tokens: | |
- the expression delimiters `(` and `)` | |
- strings (delimited by quote characters) | |
- numbers | |
- boolean values `true` (corresponding to Python's `True`) and `false` (corresponding to Python's `False`) | |
- the null value `null` (corresponding to Python's `None`) | |
- symbols, which are all tokens that are not one of the above. | |
A symbol can represent a function such as `list`, `first`, or `+`. | |
""" | |
import re | |
class NoMatch: | |
pass | |
_no_match = NoMatch() | |
class TokenPattern:
    """
    Determiner of whether code_fragment starts with pattern.
    Returns match and remains of code_fragment.
    """
    def __init__(self, pattern):
        # `[\s\S]` instead of `.` for the remainder: `.` stops at the first
        # newline, which silently truncated multi-line source code. `[\s\S]`
        # matches any character, so the full remainder survives.
        self.pattern = re.compile(rf"\s*({pattern})([\s\S]*)")

    def match_and_remains(self, code_fragment):
        """Return (token, remainder) on a match, else (_no_match, code_fragment).

        groups[0] is the whole token capture; groups[-1] is always the
        trailing remainder group, regardless of groups inside `pattern`.
        """
        match = self.pattern.match(code_fragment)
        if match:
            groups = match.groups()
            return groups[0], groups[-1]
        else:
            return _no_match, code_fragment
# Registry of token patterns, one per token type. Order of use matters and
# is defined by LispParser.token_readers, not by this dict.
token_pattern = {
    "paren": TokenPattern(r"\(|\)"),
    # See https://docs.python.org/3/library/re.html#simulating-scanf
    "number": TokenPattern(r"[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?"),
    # See https://www.metaltoad.com/blog/regex-quoted-string-escapable-quotes
    "string": TokenPattern(
        r"(?P<quote>\\?['\"])"  # Match a single or double quote (possibly escaped). Store this match as `quote`.
        r"(?:(?:."  # Continue matching any characters (but need not capture any internal groups)
        r"(?!(?<!\\)(?P=quote)))*"  # as long as they are not followed by (unescaped) `quote`.
        r".?)(?P=quote)"  # Finally, be sure to actually include the last character (if any) prior to the ending `quote`.
    ),
    # The keyword literals need a lookahead so they only match complete
    # tokens: without it, symbols such as `trueish` or `null?` would be
    # misread as the literal plus a leftover fragment.
    "true": TokenPattern(r"true(?=[\s()]|$)"),
    "false": TokenPattern(r"false(?=[\s()]|$)"),
    "null": TokenPattern(r"null(?=[\s()]|$)"),
    "symbol": TokenPattern(r"[^\s\(\)]+"),
}
def number_processor(token):
    """Convert a numeric token string to an int, falling back to float."""
    try:
        value = int(token)
    except ValueError:
        value = float(token)
    return value
class LispParser:
    """Reader for a tiny Lisp: turns source text into tokens, then an AST."""

    # Patterns are tried in this order; the processor (if any) converts the
    # raw matched text into a Python value.
    token_readers = (
        (token_pattern["paren"], None),
        (token_pattern["number"], number_processor),
        (token_pattern["string"], None),
        (token_pattern["true"], lambda _: True),
        (token_pattern["false"], lambda _: False),
        (token_pattern["null"], lambda _: None),
        (token_pattern["symbol"], None)
    )

    @classmethod
    def tokenize(cls, code):
        """Split `code` into a flat list of processed tokens, left to right."""
        tokens = []
        remaining = code.strip()
        while remaining:
            for pattern, processor in cls.token_readers:
                token, remaining = pattern.match_and_remains(remaining)
                if token is _no_match:
                    continue
                tokens.append(processor(token) if processor else token)
                break
            else:
                # No reader matched: the head of the fragment is not a token.
                raise Exception(f'Could not find token at head of code fragment "{remaining}"')
        return tokens

    @classmethod
    def astify(cls, tokens):
        """Nest a flat token list into lists; return (ast, leftover_tokens)."""
        subtree = []
        while tokens:
            head, tail = tokens[0], tokens[1:]
            if head == "(":
                # Recurse for the nested expression; resume after its `)`.
                nested, tokens = cls.astify(tail)
                subtree.append(nested)
            elif head == ")":
                return subtree, tail
            else:
                subtree.append(head)
                tokens = tail
        return subtree, []

    @classmethod
    def read(cls, code):
        """Parse `code` and return the AST of its first top-level expression."""
        ast_, _leftover = cls.astify(cls.tokenize(code))
        return ast_[0]

    @classmethod
    def evaluate(cls, ast_):
        """Evaluate an AST. Not implemented yet."""
        raise NotImplementedError("TBD")

    @classmethod
    def repl(cls, code):
        """Read, evaluate, and print a single expression."""
        print(cls.evaluate(cls.read(code)))
def test_tokenizing():
    """Tokenizer produces the expected flat token lists."""
    parser = LispParser()
    cases = (
        (
            "(first (list 1 (+ 2 3) 9))",
            ['(', 'first', '(', 'list', 1, '(', '+', 2, 3, ')', 9, ')', ')']
        ),
        (
            (r"(first (list (true false null symbol? .3 1e5 3.14 6.022e23 6.626e-34 -2.71828)"
             r" ('say your name' 'say \'your name\'' \"Quoth the Maven 'Rarelymore'.\")"
             r"))"),
            ['(', 'first', '(', 'list', '(', True, False, None, 'symbol?', 0.3, 100000.0, 3.14, 6.022e+23, 6.626e-34, -2.71828,
             ')', '(', "'say your name'", "'say \\'your name\\''", '\\"Quoth the Maven \'Rarelymore\'.\\"', ')', ')', ')']
        ),
    )
    for source, expected in cases:
        assert parser.tokenize(source) == expected
def test_astify():
    """astify nests tokens into lists and reports leftover tokens."""
    parser = LispParser()
    cases = (
        (
            ['(', 'first', '(', 'list', 1, '(', '+', 2, 3, ')', 9, ')', ')'],
            ([['first', ['list', 1, ['+', 2, 3], 9]]], [])
        ),
    )
    for tokens, expected in cases:
        assert parser.astify(tokens) == expected
def test_read():
    """read returns the AST of the first top-level expression."""
    parser = LispParser()
    cases = (
        (
            "(first (list 1 (+ 2 3) 9))",
            ['first', ['list', 1, ['+', 2, 3], 9]]
        ),
    )
    for source, expected in cases:
        assert parser.read(source) == expected
# Run the self-checks when the module is loaded.
for _self_check in (test_tokenizing, test_astify, test_read):
    _self_check()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment