Skip to content
{{ message }}

Instantly share code, notes, and snippets.

# cellularmitosis/README.md

Last active Oct 15, 2019
Messing around with lexing and parsing (mathterp.py)

Blog 2019/7/1

# Messing around with lexing and parsing (mathterp.py)

Revisiting this again with some co-workers.

This time around I've tried to split up the code into some reusable chunks.

## mathterp1.py

This grammar makes sense to us as humans, but demonstrates a weakness of recursive descent: that it can't do left recursion (it leads to infinite recursion).

``````# grammar:
#  expr:   infix | num
#  infix:  expr op expr
#  op:     plus | star
#  num:    /\d+/
#  plus:   /\+/
#  star:   /\*/
``````
``````\$ echo '1+1' | ./mathterp1.py
...
File "./mathterp1.py", line 48, in parse_infix
(ltree, tokens1) = parse_expr(tokens)
File "./mathterp1.py", line 61, in parse_expr
(tree, tokens1) = parse_infix(tokens)
File "./mathterp1.py", line 48, in parse_infix
(ltree, tokens1) = parse_expr(tokens)
File "./mathterp1.py", line 61, in parse_expr
(tree, tokens1) = parse_infix(tokens)
File "./mathterp1.py", line 48, in parse_infix
(ltree, tokens1) = parse_expr(tokens)
RuntimeError: maximum recursion depth exceeded
``````

The way to address this is by modifying the grammar. This brings up a trade-off with recursive descent: that you often end up creating an unintuitive grammar in order to satisfy the limitations of recursive descent.

## mathterp2.py

This grammar avoids the infinite loop because it is right-recursive.

``````# grammar:
#  expr:   infix | num
#  infix:  num op expr
#  op:     plus | star
#  num:    /\d+/
#  plus:   /\+/
#  star:   /\*/
``````
``````\$ echo '1+1' | ./mathterp2.py --graphviz
=> 2
``````

However, it can't deal with operator precedence rules (`3+1` is evaluated first, which is incorrect):

``````\$ echo '2*3+1' | ./mathterp2.py --graphviz
=> 8
``````

and it can't deal with associativity (`2-1` is evaluated first, which is incorrect):

``````\$ echo '3-2-1' | ./mathterp2.py --graphviz
=> 2
``````

## mathterp3.py (TODO)

We can implement this grammar, which will address operator precedence, but still doesn't address associativity.

``````
# grammar:
#  expr:    term
#  term:    factor plus term | factor
#  factor:  num star factor | num
#  num:     /\d+/
#  plus:    /\+/
#  star:    /\*/
``````
 import sys def evaluate(tree): """evaluate the parse tree (calculate the answer).""" if tree is None: return None if "--ast" in sys.argv: import pprint pprint.pprint(tree) if "--dot" in sys.argv: print dot(tree), if "--graphviz" in sys.argv: run_graphviz(dot(tree)) def _eval(node): ntype = node["ntype"] children = node["children"] if ntype == "expr": return _eval(node["children"]) elif ntype == "num": return int(node["token"]["text"]) elif ntype == "plus": return lambda x,y: x + y elif ntype == "minus": return lambda x,y: x - y elif ntype == "star": return lambda x,y: x * y elif ntype == "slash": return lambda x,y: x / y elif ntype == "infix": l = _eval(children) op = _eval(children) r = _eval(children) return op(l,r) else: raise Exception("Don't know how to eval %s" % node) return _eval(tree) ### output def myprint(result): if result is not None: print "=>", result return result is not None def dot(tree): """represent the tree in graphviz "dot" notation.""" n =  # we stuff 'n' into an array to easily pass by reference leaves = [] def _dot(tree, i): root = "n%i" % n out = "" for child in tree["children"]: n += 1 nodenum = "n%i" % n if len(child["children"]): out += "\n %s [label=\"%s\"]" % (nodenum, child["ntype"]) else: out += "\n %s [label=\"%s\"]" % (nodenum, "%s(\'%s\')" % (child["ntype"], child["token"]["text"])) out += "\n %s -> %s;" % (root, nodenum) out += _dot(child, n) if len(tree["children"]) == 0: leaves.append(root) return out output = "// paste this into http://www.webgraphviz.com/" output += "\ndigraph parse_tree {" root = "n%i" % n output += "\n %s [label=\"%s\"]" % (root, tree["ntype"]) output += _dot(tree, n) output += "\n {rank = same; %s}" % " ".join(["%s;" % l for l in leaves]) output += "\n}\n" return output def run_graphviz(dot): with open("/tmp/ast.dot", "w") as f: f.write(dot) import subprocess subprocess.check_call("dot -Tpng /tmp/ast.dot > /tmp/ast.png", shell=True) subprocess.check_call("open /tmp/ast.png", shell=True)
 import sys import re def load_token_defs(fname): """load the labelled token regex patterns from a file.""" re_table = {} with open(fname) as f: while True: label = f.readline().rstrip() if label == '': # we reached the end of the file. break pattern = f.readline().rstrip() re_table[label] = re.compile(pattern) return re_table def read(): """read the next line of input.""" return sys.stdin.readline().rstrip() def make_token(toktype, text): """create a token (using JSON structure).""" return { "ttype": toktype, "text": text } def tokenize(re_table, line): """use the regex table to turn the line of input into tokens.""" tokens = [] while len(line): token = None for toktype, regex in re_table.iteritems(): m = regex.match(line) if m: matched_text = m.group() token = make_token(toktype, matched_text) line = line[len(matched_text):] break if token: tokens.append(token) else: raise ValueError("parse error near: '%s'" % line) return tokens def is_toktype(token, toktype): if token is None: return False return token["ttype"] == toktype
 #!/usr/bin/env python import sys from xlist import * from lexer import * from xast import * from evaluator import * ### recursive descent parser # grammar: # expr: infix | num # infix: expr op expr # op: plus | star # num: /\d+/ # plus: /\+/ # star: /\*/ def parse_num(tokens): if is_toktype(first(tokens), 'num'): return (make_leaf_node(first(tokens)), rest(tokens)) else: return (None, tokens) def parse_plus(tokens): if is_toktype(first(tokens), 'plus'): return (make_leaf_node(first(tokens)), rest(tokens)) else: return (None, tokens) def parse_star(tokens): if is_toktype(first(tokens), 'star'): return (make_leaf_node(first(tokens)), rest(tokens)) else: return (None, tokens) def parse_op(tokens): (tree, tokens1) = parse_plus(tokens) if tree is None: (tree, tokens1) = parse_star(tokens) if tree is None: return (None, tokens) return (tree, tokens1) def parse_infix(tokens): (ltree, tokens1) = parse_expr(tokens) if ltree is None: return (None, tokens) (op, tokens2) = parse_op(tokens1) if op is None: return (None, tokens) (rtree, tokens3) = parse_expr(tokens2) if rtree is None: return (None, tokens) children = [ltree, op, rtree] return (make_internal_node("infix", children), tokens3) def parse_expr(tokens): (tree, tokens1) = parse_infix(tokens) if tree is None: (tree, tokens1) = parse_num(tokens) if tree is None: return (None, tokens) children = [tree] return (make_internal_node("expr", children), tokens1) def parse(tokens): (tree, tokens1) = parse_expr(tokens) if tree is None: raise ValueError("unable to parse tokens: %s" % tokens) if len(tokens1): raise ValueError("leftover tokens: %s" % tokens1) return tree ### main if __name__ == "__main__": re_table = load_token_defs('token-defs.txt') while myprint(evaluate(parse(tokenize(re_table, read())))): continue
 #!/usr/bin/env python import sys from xlist import * from lexer import * from xast import * from evaluator import * ### recursive descent parser # grammar: # expr: infix | num # infix: num op expr # op: plus | minus | star # num: /\d+/ # plus: /\+/ # minus: /-/ # star: /\*/ def parse_num(tokens): if is_toktype(first(tokens), 'num'): return (make_leaf_node(first(tokens)), rest(tokens)) else: return (None, tokens) def parse_plus(tokens): if is_toktype(first(tokens), 'plus'): return (make_leaf_node(first(tokens)), rest(tokens)) else: return (None, tokens) def parse_minus(tokens): if is_toktype(first(tokens), 'minus'): return (make_leaf_node(first(tokens)), rest(tokens)) else: return (None, tokens) def parse_star(tokens): if is_toktype(first(tokens), 'star'): return (make_leaf_node(first(tokens)), rest(tokens)) else: return (None, tokens) def parse_op(tokens): (tree, tokens1) = parse_plus(tokens) if tree is None: (tree, tokens1) = parse_minus(tokens) if tree is None: (tree, tokens1) = parse_star(tokens) if tree is None: return (None, tokens) return (tree, tokens1) def parse_infix(tokens): (ltree, tokens1) = parse_num(tokens) if ltree is None: return (None, tokens) (op, tokens2) = parse_op(tokens1) if op is None: return (None, tokens) (rtree, tokens3) = parse_expr(tokens2) if rtree is None: return (None, tokens) children = [ltree, op, rtree] return (make_internal_node("infix", children), tokens3) def parse_expr(tokens): (tree, tokens1) = parse_infix(tokens) if tree is None: (tree, tokens1) = parse_num(tokens) if tree is None: return (None, tokens) children = [tree] return (make_internal_node("expr", children), tokens1) def parse(tokens): if len(tokens) == 0: return None (tree, tokens1) = parse_expr(tokens) if tree is None: raise ValueError("unable to parse tokens: %s" % tokens) if len(tokens1): raise ValueError("leftover tokens: %s" % tokens1) return tree ### main if __name__ == "__main__": re_table = load_token_defs('token-defs.txt') while 
myprint(evaluate(parse(tokenize(re_table, read())))): continue
num
\d+
plus
\+
minus
-
star
\*
slash
/
carat
\^
 def make_internal_node(nodetype, children): """create an internal AST node.""" return {"ntype": nodetype, "token": None, "children": children} def make_leaf_node(token): """create an AST leaf node.""" return {"ntype": token["ttype"], "token": token, "children": []}
 def first(l): if len(l): return l else: return None def rest(l): return l[1:]
to join this conversation on GitHub. Already have an account? Sign in to comment