Created
July 8, 2019 21:47
-
-
Save dwinston/5392a04c3312e8f1ba50439e3700c924 to your computer and use it in GitHub Desktop.
Code that takes some Lisp code and returns an abstract syntax tree (AST). The AST represents the structure of the code and the meaning of each token.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Code that takes some Lisp code and returns an abstract syntax tree (AST). | |
My lisp knows about the following types of tokens: | |
- the expression delimiters `(` and `)` | |
- strings (delimited by quote characters) | |
- numbers | |
- boolean values `true` (corresponding to Python's `True`) and `false` (corresponding to Python's `False`) | |
- the null value `null` (corresponding to Python's `None`) | |
- symbols, which are all tokens that are not one of the above. | |
A symbol can represent a function such as `list`, `first`, or `+`. | |
""" | |
import re | |
class NoMatch: | |
pass | |
_no_match = NoMatch() | |
class TokenPattern:
    """
    Determiner of whether code_fragment starts with pattern.
    Returns match and remains of code_fragment.
    """
    def __init__(self, pattern):
        # `[\s\S]` instead of `.` for the remainder: `.` stops at the first
        # newline, which silently truncated multi-line source code. `[\s\S]`
        # matches any character, so the full remainder survives.
        self.pattern = re.compile(rf"\s*({pattern})([\s\S]*)")

    def match_and_remains(self, code_fragment):
        """Return (token, remainder) on a match, else (_no_match, code_fragment).

        groups[0] is the whole token capture; groups[-1] is always the
        trailing remainder group, regardless of groups inside `pattern`.
        """
        match = self.pattern.match(code_fragment)
        if match:
            groups = match.groups()
            return groups[0], groups[-1]
        else:
            return _no_match, code_fragment
# Registry of token patterns, one per token type. Order of use matters and
# is defined by LispParser.token_readers, not by this dict.
token_pattern = {
    "paren": TokenPattern(r"\(|\)"),
    # See https://docs.python.org/3/library/re.html#simulating-scanf
    "number": TokenPattern(r"[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?"),
    # See https://www.metaltoad.com/blog/regex-quoted-string-escapable-quotes
    "string": TokenPattern(
        r"(?P<quote>\\?['\"])"  # Match a single or double quote (possibly escaped). Store this match as `quote`.
        r"(?:(?:."  # Continue matching any characters (but need not capture any internal groups)
        r"(?!(?<!\\)(?P=quote)))*"  # as long as they are not followed by (unescaped) `quote`.
        r".?)(?P=quote)"  # Finally, be sure to actually include the last character (if any) prior to the ending `quote`.
    ),
    # The keyword literals need a lookahead so they only match complete
    # tokens: without it, symbols such as `trueish` or `null?` would be
    # misread as the literal plus a leftover fragment.
    "true": TokenPattern(r"true(?=[\s()]|$)"),
    "false": TokenPattern(r"false(?=[\s()]|$)"),
    "null": TokenPattern(r"null(?=[\s()]|$)"),
    "symbol": TokenPattern(r"[^\s\(\)]+"),
}
def number_processor(token):
    """Convert a numeric token string to an int, falling back to float."""
    try:
        value = int(token)
    except ValueError:
        value = float(token)
    return value
class LispParser:
    """Reader for a tiny Lisp: turns source text into tokens, then an AST."""

    # Patterns are tried in this order; the processor (if any) converts the
    # raw matched text into a Python value.
    token_readers = (
        (token_pattern["paren"], None),
        (token_pattern["number"], number_processor),
        (token_pattern["string"], None),
        (token_pattern["true"], lambda _: True),
        (token_pattern["false"], lambda _: False),
        (token_pattern["null"], lambda _: None),
        (token_pattern["symbol"], None)
    )

    @classmethod
    def tokenize(cls, code):
        """Split `code` into a flat list of processed tokens, left to right."""
        tokens = []
        remaining = code.strip()
        while remaining:
            for pattern, processor in cls.token_readers:
                token, remaining = pattern.match_and_remains(remaining)
                if token is _no_match:
                    continue
                tokens.append(processor(token) if processor else token)
                break
            else:
                # No reader matched: the head of the fragment is not a token.
                raise Exception(f'Could not find token at head of code fragment "{remaining}"')
        return tokens

    @classmethod
    def astify(cls, tokens):
        """Nest a flat token list into lists; return (ast, leftover_tokens)."""
        subtree = []
        while tokens:
            head, tail = tokens[0], tokens[1:]
            if head == "(":
                # Recurse for the nested expression; resume after its `)`.
                nested, tokens = cls.astify(tail)
                subtree.append(nested)
            elif head == ")":
                return subtree, tail
            else:
                subtree.append(head)
                tokens = tail
        return subtree, []

    @classmethod
    def read(cls, code):
        """Parse `code` and return the AST of its first top-level expression."""
        ast_, _leftover = cls.astify(cls.tokenize(code))
        return ast_[0]

    @classmethod
    def evaluate(cls, ast_):
        """Evaluate an AST. Not implemented yet."""
        raise NotImplementedError("TBD")

    @classmethod
    def repl(cls, code):
        """Read, evaluate, and print a single expression."""
        print(cls.evaluate(cls.read(code)))
def test_tokenizing():
    """Tokenizer produces the expected flat token lists."""
    parser = LispParser()
    cases = (
        (
            "(first (list 1 (+ 2 3) 9))",
            ['(', 'first', '(', 'list', 1, '(', '+', 2, 3, ')', 9, ')', ')']
        ),
        (
            (r"(first (list (true false null symbol? .3 1e5 3.14 6.022e23 6.626e-34 -2.71828)"
             r" ('say your name' 'say \'your name\'' \"Quoth the Maven 'Rarelymore'.\")"
             r"))"),
            ['(', 'first', '(', 'list', '(', True, False, None, 'symbol?', 0.3, 100000.0, 3.14, 6.022e+23, 6.626e-34, -2.71828,
             ')', '(', "'say your name'", "'say \\'your name\\''", '\\"Quoth the Maven \'Rarelymore\'.\\"', ')', ')', ')']
        ),
    )
    for source, expected in cases:
        assert parser.tokenize(source) == expected
def test_astify():
    """astify nests tokens into lists and reports leftover tokens."""
    parser = LispParser()
    cases = (
        (
            ['(', 'first', '(', 'list', 1, '(', '+', 2, 3, ')', 9, ')', ')'],
            ([['first', ['list', 1, ['+', 2, 3], 9]]], [])
        ),
    )
    for tokens, expected in cases:
        assert parser.astify(tokens) == expected
def test_read():
    """read returns the AST of the first top-level expression."""
    parser = LispParser()
    cases = (
        (
            "(first (list 1 (+ 2 3) 9))",
            ['first', ['list', 1, ['+', 2, 3], 9]]
        ),
    )
    for source, expected in cases:
        assert parser.read(source) == expected
# Run the self-checks when the module is loaded.
for _self_check in (test_tokenizing, test_astify, test_read):
    _self_check()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment