Skip to content

Instantly share code, notes, and snippets.

@cellularmitosis
Last active October 15, 2019 07:33
Show Gist options
  • Save cellularmitosis/fa41234f641ce55a737460e9f7ba0c30 to your computer and use it in GitHub Desktop.
Save cellularmitosis/fa41234f641ce55a737460e9f7ba0c30 to your computer and use it in GitHub Desktop.
Messing around with lexing and parsing (mathterp.py)

Blog 2019/7/1

<- previous | index | next ->

Messing around with lexing and parsing (mathterp.py)

Revisiting this again with some co-workers.

This time around I've tried to split up the code into some reusable chunks.

mathterp1.py

This grammar makes sense to us as humans, but demonstrates a weakness of recursive descent: that it can't do left recursion (it leads to infinite recursion).

# grammar:
#  expr:   infix | num
#  infix:  expr op expr
#  op:     plus | star
#  num:    /\d+/
#  plus:   /\+/
#  star:   /\*/
$ echo '1+1' | ./mathterp1.py
...
  File "./mathterp1.py", line 48, in parse_infix
    (ltree, tokens1) = parse_expr(tokens)
  File "./mathterp1.py", line 61, in parse_expr
    (tree, tokens1) = parse_infix(tokens)
  File "./mathterp1.py", line 48, in parse_infix
    (ltree, tokens1) = parse_expr(tokens)
  File "./mathterp1.py", line 61, in parse_expr
    (tree, tokens1) = parse_infix(tokens)
  File "./mathterp1.py", line 48, in parse_infix
    (ltree, tokens1) = parse_expr(tokens)
RuntimeError: maximum recursion depth exceeded

The way to address this is by modifying the grammar. This brings up a trade-off with recursive descent: that you often end up creating an unintuitive grammar in order to satisfy the limitations of recursive descent.

mathterp2.py

This grammar avoids the infinite loop because it is right-recursive.

# grammar:
#  expr:   infix | num
#  infix:  num op expr
#  op:     plus | star
#  num:    /\d+/
#  plus:   /\+/
#  star:   /\*/
$ echo '1+1' | ./mathterp2.py --graphviz
=> 2

m2_1p1

However, it can't deal with operator precedence rules (3+1 is evaluated first, which is incorrect):

$ echo '2*3+1' | ./mathterp2.py --graphviz
=> 8

m2_2t3p1

and it can't deal with associativity (2-1 is evaluated first, which is incorrect):

$ echo '3-2-1' | ./mathterp2.py --graphviz
=> 2

m2_3m2m1

mathterp3.py (TODO)

We can implement this grammar, which will address operator precedence, but still doesn't address associativity.

# grammar:
#  expr:    term
#  term:    factor plus term
#  factor:  num star factor
#  num:     /\d+/
#  plus:    /\+/
#  star:    /\*/
import sys
def evaluate(tree):
"""evaluate the parse tree (calculate the answer)."""
if tree is None:
return None
if "--ast" in sys.argv:
import pprint
pprint.pprint(tree)
if "--dot" in sys.argv:
print dot(tree),
if "--graphviz" in sys.argv:
run_graphviz(dot(tree))
def _eval(node):
ntype = node["ntype"]
children = node["children"]
if ntype == "expr":
return _eval(node["children"][0])
elif ntype == "num":
return int(node["token"]["text"])
elif ntype == "plus":
return lambda x,y: x + y
elif ntype == "minus":
return lambda x,y: x - y
elif ntype == "star":
return lambda x,y: x * y
elif ntype == "slash":
return lambda x,y: x / y
elif ntype == "infix":
l = _eval(children[0])
op = _eval(children[1])
r = _eval(children[2])
return op(l,r)
else:
raise Exception("Don't know how to eval %s" % node)
return _eval(tree)
### output
def myprint(result):
if result is not None:
print "=>", result
return result is not None
def dot(tree):
"""represent the tree in graphviz "dot" notation."""
n = [0] # we stuff 'n' into an array to easily pass by reference
leaves = []
def _dot(tree, i):
root = "n%i" % n[0]
out = ""
for child in tree["children"]:
n[0] += 1
nodenum = "n%i" % n[0]
if len(child["children"]):
out += "\n %s [label=\"%s\"]" % (nodenum, child["ntype"])
else:
out += "\n %s [label=\"%s\"]" % (nodenum, "%s(\'%s\')" % (child["ntype"], child["token"]["text"]))
out += "\n %s -> %s;" % (root, nodenum)
out += _dot(child, n)
if len(tree["children"]) == 0:
leaves.append(root)
return out
output = "// paste this into http://www.webgraphviz.com/"
output += "\ndigraph parse_tree {"
root = "n%i" % n[0]
output += "\n %s [label=\"%s\"]" % (root, tree["ntype"])
output += _dot(tree, n)
output += "\n {rank = same; %s}" % " ".join(["%s;" % l for l in leaves])
output += "\n}\n"
return output
def run_graphviz(dot):
with open("/tmp/ast.dot", "w") as f:
f.write(dot)
import subprocess
subprocess.check_call("dot -Tpng /tmp/ast.dot > /tmp/ast.png", shell=True)
subprocess.check_call("open /tmp/ast.png", shell=True)
import sys
import re
def load_token_defs(fname):
"""load the labelled token regex patterns from a file."""
re_table = {}
with open(fname) as f:
while True:
label = f.readline().rstrip()
if label == '':
# we reached the end of the file.
break
pattern = f.readline().rstrip()
re_table[label] = re.compile(pattern)
return re_table
def read():
"""read the next line of input."""
return sys.stdin.readline().rstrip()
def make_token(toktype, text):
"""create a token (using JSON structure)."""
return { "ttype": toktype, "text": text }
def tokenize(re_table, line):
"""use the regex table to turn the line of input into tokens."""
tokens = []
while len(line):
token = None
for toktype, regex in re_table.iteritems():
m = regex.match(line)
if m:
matched_text = m.group()
token = make_token(toktype, matched_text)
line = line[len(matched_text):]
break
if token:
tokens.append(token)
else:
raise ValueError("parse error near: '%s'" % line)
return tokens
def is_toktype(token, toktype):
if token is None:
return False
return token["ttype"] == toktype
#!/usr/bin/env python
import sys
from xlist import *
from lexer import *
from xast import *
from evaluator import *
### recursive descent parser
# grammar:
# expr: infix | num
# infix: expr op expr
# op: plus | star
# num: /\d+/
# plus: /\+/
# star: /\*/
def parse_num(tokens):
if is_toktype(first(tokens), 'num'):
return (make_leaf_node(first(tokens)), rest(tokens))
else:
return (None, tokens)
def parse_plus(tokens):
if is_toktype(first(tokens), 'plus'):
return (make_leaf_node(first(tokens)), rest(tokens))
else:
return (None, tokens)
def parse_star(tokens):
if is_toktype(first(tokens), 'star'):
return (make_leaf_node(first(tokens)), rest(tokens))
else:
return (None, tokens)
def parse_op(tokens):
(tree, tokens1) = parse_plus(tokens)
if tree is None:
(tree, tokens1) = parse_star(tokens)
if tree is None:
return (None, tokens)
return (tree, tokens1)
def parse_infix(tokens):
(ltree, tokens1) = parse_expr(tokens)
if ltree is None:
return (None, tokens)
(op, tokens2) = parse_op(tokens1)
if op is None:
return (None, tokens)
(rtree, tokens3) = parse_expr(tokens2)
if rtree is None:
return (None, tokens)
children = [ltree, op, rtree]
return (make_internal_node("infix", children), tokens3)
def parse_expr(tokens):
(tree, tokens1) = parse_infix(tokens)
if tree is None:
(tree, tokens1) = parse_num(tokens)
if tree is None:
return (None, tokens)
children = [tree]
return (make_internal_node("expr", children), tokens1)
def parse(tokens):
(tree, tokens1) = parse_expr(tokens)
if tree is None:
raise ValueError("unable to parse tokens: %s" % tokens)
if len(tokens1):
raise ValueError("leftover tokens: %s" % tokens1)
return tree
### main
if __name__ == "__main__":
re_table = load_token_defs('token-defs.txt')
while myprint(evaluate(parse(tokenize(re_table, read())))):
continue
#!/usr/bin/env python
import sys
from xlist import *
from lexer import *
from xast import *
from evaluator import *
### recursive descent parser
# grammar:
# expr: infix | num
# infix: num op expr
# op: plus | minus | star
# num: /\d+/
# plus: /\+/
# minus: /-/
# star: /\*/
def parse_num(tokens):
if is_toktype(first(tokens), 'num'):
return (make_leaf_node(first(tokens)), rest(tokens))
else:
return (None, tokens)
def parse_plus(tokens):
if is_toktype(first(tokens), 'plus'):
return (make_leaf_node(first(tokens)), rest(tokens))
else:
return (None, tokens)
def parse_minus(tokens):
if is_toktype(first(tokens), 'minus'):
return (make_leaf_node(first(tokens)), rest(tokens))
else:
return (None, tokens)
def parse_star(tokens):
if is_toktype(first(tokens), 'star'):
return (make_leaf_node(first(tokens)), rest(tokens))
else:
return (None, tokens)
def parse_op(tokens):
(tree, tokens1) = parse_plus(tokens)
if tree is None:
(tree, tokens1) = parse_minus(tokens)
if tree is None:
(tree, tokens1) = parse_star(tokens)
if tree is None:
return (None, tokens)
return (tree, tokens1)
def parse_infix(tokens):
(ltree, tokens1) = parse_num(tokens)
if ltree is None:
return (None, tokens)
(op, tokens2) = parse_op(tokens1)
if op is None:
return (None, tokens)
(rtree, tokens3) = parse_expr(tokens2)
if rtree is None:
return (None, tokens)
children = [ltree, op, rtree]
return (make_internal_node("infix", children), tokens3)
def parse_expr(tokens):
(tree, tokens1) = parse_infix(tokens)
if tree is None:
(tree, tokens1) = parse_num(tokens)
if tree is None:
return (None, tokens)
children = [tree]
return (make_internal_node("expr", children), tokens1)
def parse(tokens):
if len(tokens) == 0:
return None
(tree, tokens1) = parse_expr(tokens)
if tree is None:
raise ValueError("unable to parse tokens: %s" % tokens)
if len(tokens1):
raise ValueError("leftover tokens: %s" % tokens1)
return tree
### main
if __name__ == "__main__":
re_table = load_token_defs('token-defs.txt')
while myprint(evaluate(parse(tokenize(re_table, read())))):
continue
num
\d+
plus
\+
minus
-
star
\*
slash
/
carat
\^
def make_internal_node(nodetype, children):
"""create an internal AST node."""
return {"ntype": nodetype, "token": None, "children": children}
def make_leaf_node(token):
"""create an AST leaf node."""
return {"ntype": token["ttype"], "token": token, "children": []}
def first(l):
if len(l):
return l[0]
else:
return None
def rest(l):
return l[1:]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment