Skip to content

Instantly share code, notes, and snippets.

@dwinston
Created July 8, 2019 21:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dwinston/5392a04c3312e8f1ba50439e3700c924 to your computer and use it in GitHub Desktop.
Save dwinston/5392a04c3312e8f1ba50439e3700c924 to your computer and use it in GitHub Desktop.
Code that takes some Lisp code and returns an abstract syntax tree (AST). The AST represents the structure of the code and the meaning of each token.
"""
Code that takes some Lisp code and returns an abstract syntax tree (AST).
My lisp knows about the following types of tokens:
- the expression delimiters `(` and `)`
- strings (delimited by quote characters)
- numbers
- boolean values `true` (corresponding to Python's `True`) and `false` (corresponding to Python's `False`)
- the null value `null` (corresponding to Python's `None`)
- symbols, which are all tokens that are not one of the above.
A symbol can represent a function such as `list`, `first`, or `+`.
"""
import re
class NoMatch:
pass
_no_match = NoMatch()
class TokenPattern:
"""
Determiner of whether code_fragment starts with pattern.
Returns match and remains of code_fragment.
"""
def __init__(self, pattern):
self.pattern = re.compile(rf"\s*({pattern})(.*)")
def match_and_remains(self, code_fragment):
match = self.pattern.match(code_fragment)
if match:
groups = match.groups()
return groups[0], groups[-1]
else:
return _no_match, code_fragment
token_pattern = {
"paren": TokenPattern(r"\(|\)"),
# See https://docs.python.org/3/library/re.html#simulating-scanf
"number": TokenPattern(r"[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?"),
# See https://www.metaltoad.com/blog/regex-quoted-string-escapable-quotes
"string": TokenPattern(
r"(?P<quote>\\?['\"])" # Match a single or double quote (possibly escaped). Store this match as `quote`.
r"(?:(?:." # Continue matching any characters (but need not capture any internal groups)
r"(?!(?<!\\)(?P=quote)))*" # as long as they are not followed by (unescaped) `quote`.
r".?)(?P=quote)" # Finally, be sure to actually include the last character (if any) prior to the ending `quote`.
),
"true": TokenPattern(r"true"),
"false": TokenPattern(r"false"),
"null": TokenPattern(r"null"),
"symbol": TokenPattern(r"[^\s\(\)]+"),
}
def number_processor(token):
try:
return int(token)
except ValueError:
return float(token)
class LispParser:
token_readers = (
(token_pattern["paren"], None),
(token_pattern["number"], number_processor),
(token_pattern["string"], None),
(token_pattern["true"], lambda _: True),
(token_pattern["false"], lambda _: False),
(token_pattern["null"], lambda _: None),
(token_pattern["symbol"], None)
)
@classmethod
def tokenize(cls, code):
tokens = []
code = code.strip()
while code:
for pattern, processor in cls.token_readers:
match, code = pattern.match_and_remains(code)
if match is _no_match:
continue
else:
if processor:
match = processor(match)
tokens.append(match)
break
else:
# loop fell through without finding a token pattern
raise Exception(f'Could not find token at head of code fragment "{code}"')
return tokens
@classmethod
def astify(cls, tokens):
ast_ = []
while tokens:
head, rest = tokens[0], tokens[1:]
if head == "(":
ast_element, tokens = cls.astify(rest)
ast_.append(ast_element)
elif head == ")":
return ast_, rest
else:
ast_.append(head)
tokens = rest
return ast_, []
@classmethod
def read(cls, code):
tokens = cls.tokenize(code)
return cls.astify(tokens)[0][0]
@classmethod
def evaluate(cls, ast_):
raise NotImplementedError("TBD")
@classmethod
def repl(cls, code):
print(cls.evaluate(cls.read(code)))
def test_tokenizing():
parser = LispParser()
expected_outcomes = (
(
"(first (list 1 (+ 2 3) 9))",
['(', 'first', '(', 'list', 1, '(', '+', 2, 3, ')', 9, ')', ')']
),
(
(r"(first (list (true false null symbol? .3 1e5 3.14 6.022e23 6.626e-34 -2.71828)"
r" ('say your name' 'say \'your name\'' \"Quoth the Maven 'Rarelymore'.\")"
r"))"),
['(', 'first', '(', 'list', '(', True, False, None, 'symbol?', 0.3, 100000.0, 3.14, 6.022e+23, 6.626e-34, -2.71828,
')', '(', "'say your name'", "'say \\'your name\\''", '\\"Quoth the Maven \'Rarelymore\'.\\"', ')', ')', ')']
),
)
for code, tokens in expected_outcomes:
assert parser.tokenize(code) == tokens
def test_astify():
parser = LispParser()
expected_outcomes = (
(
['(', 'first', '(', 'list', 1, '(', '+', 2, 3, ')', 9, ')', ')'],
([['first', ['list', 1, ['+', 2, 3], 9]]], [])
),
)
for tokens, ast_ in expected_outcomes:
assert parser.astify(tokens) == ast_
def test_read():
parser = LispParser()
expected_outcomes = (
(
"(first (list 1 (+ 2 3) 9))",
['first', ['list', 1, ['+', 2, 3], 9]]
),
)
for code, ast_ in expected_outcomes:
assert parser.read(code) == ast_
test_tokenizing()
test_astify()
test_read()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment