Skip to content

Instantly share code, notes, and snippets.

@joekreu
Last active April 16, 2022 13:10
Show Gist options
  • Save joekreu/8bab0831b5f3c7ac8b640ff12e3233ef to your computer and use it in GitHub Desktop.
Save joekreu/8bab0831b5f3c7ac8b640ff12e3233ef to your computer and use it in GitHub Desktop.
Iterative and recursive precedence parsers for infix expressions, in Python. This gist contains four scripts with increasing complexity.
#! /usr/bin/env python3
''' Script for precedence climbing parsing for infix operators, based
on binding powers; iterative and recursive implementation. Binding
powers are in range 1 to 99. - Separate tokens in DEMOS by spaces.
'''
LBP = {"+": 14, "*": 17, "^": 21, ":=": 22, "$END": -1}
RBP = {"+": 15, "*": 18, "^": 20, ":=": 8}
DEMOS = ['x + 3.4 * y', 'a + b + c', 'n ^ m ^ k', '3 * x := v + 1']
def parse_expr(toks, min_rbp=0):
''' 'toks' is the list of tokens, it is used as stack. '''
toks.pop()
sub_ex = toks.pop()
while min_rbp < LBP[toks[-1]]: # toks[-1] is the top of the stack
sub_ex = [toks[-1], sub_ex, parse_expr(toks, RBP[toks[-1]])]
return sub_ex
print("Parse results:")
for demo in DEMOS:
tokens = list(reversed(["$BEGIN"] + demo.split() + ["$END"]))
print(demo + " "*(18-len(demo)) + "==> " +
str(parse_expr(tokens)))
#! /usr/bin/env python3
''' Precedence climbing parsing based on binding powers for prefix,
infix and postfix operators. An extension of '1_infix_parser.py'.
'''
# Binding powers (except LBP["$END"]) are integers in range 1 to 99.
LBP = {"+": 14, "*": 17, "^": 21, ":=": 22, "!": 19, "$END": -1}
RBP = {"+": 15, "*": 18, "^": 20, ":=": 8, "&": 13}
DEMOS = ['a * c ! + 3', 'a ! * b', 'm ^ k !',
'a * b + c', 'a * & b + c']
def parse_expr(toks, min_rbp=0):
''' This function is identical to 'parse_expr' in 'infix.py'. '''
toks.pop()
sub_ex = toks.pop()
while min_rbp < LBP[toks[-1]]:
sub_ex = [toks[-1], sub_ex, parse_expr(toks, RBP[toks[-1]])]
return sub_ex
# Add fake binding powers for unary operators
for operator in LBP:
RBP.setdefault(operator, 100)
for operator in RBP:
LBP.setdefault(operator, 100)
print("Parse results:")
for demo in DEMOS:
toklist = []
for tok in demo.split():
if LBP.get(tok) == 100: # Insert fake operand
toklist.append("$PRE") # before prefix operator
toklist.append(tok)
if RBP.get(tok) == 100: # Insert fake operand
toklist.append("$POST") # after postfix operator
tokl = list(reversed(["$BEGIN"] + toklist + ["$END"]))
print(demo + " "*(18-len(demo)) + "==> " + str(parse_expr(tokl)))
#! /usr/bin/env python3
''' Precedence climbing parsing based on binding powers for prefix,
infix, postfix operators and parenthesized subexpressions. This is
an extension of the script '2_prefix_infix_postfix_parser.py'.
Tokenizer is separate, the test code is in the
'if __name__ == "__main__":' - part now.
The function 's_expr' was added to improve parse tree printing.
'''
# Binding powers are integers in range 1 to 99.
LBP = {"+": 14, "*": 17, "/": 17, "^": 21, ":=": 22, "!": 19}
RBP = {"+": 15, "*": 18, "/": 18, "^": 20, ":=": 8, "&": 13}
def s_expr(n_list):
''' Format a nested Python list as Lisp-like S-expression. '''
return ("(" + " ".join([s_expr(p) for p in n_list]) + ")"
if isinstance(n_list, list) else str(n_list))
def tokens(code):
''' Very simple tokenizer. Split at spaces; insert tokens. '''
toklist = []
for tok in code.split():
if LBP.get(tok) == 100:
toklist.append("$PRE")
toklist.append(tok)
if RBP.get(tok) == 100:
toklist.append("$POST")
return list(reversed(["$BEGIN"] + toklist + ["$END"]))
def parse_expr(toks, min_rbp=0):
''' Parse expressions iteratively and recursively. Accept parenthe-
sized subexpressions. Return parse tree as nested Python list.
toks -- list of tokens, processed from the end (as stack).
'''
toks.pop()
sub_ex = parse_expr(toks) if toks[-1] == "(" else toks[-1]
toks.pop()
while min_rbp < LBP[toks[-1]]:
sub_ex = [toks[-1], sub_ex, parse_expr(toks, RBP[toks[-1]])]
return sub_ex
if __name__ == "__main__":
DEMOS = ['a * ( b + c ) ^ ( x := y )', '2 ^ ( a * ( b ! + c ) )',
'( a * & b ) + c', '2 * ( 3 + 4 ) ^ 4', '2 ^ ( ( a ) )']
for operator in LBP:
RBP.setdefault(operator, 100)
for operator in RBP:
LBP.setdefault(operator, 100)
LBP.update({"$END": -1, ")": -1})
print("Parse results:")
for demo in DEMOS:
print(demo + " "*(30-len(demo)) + "==> " +
s_expr(parse_expr(tokens(demo))))
#! /usr/bin/env python3
''' Precedence climbing parser based on binding powers, for prefix,
infix, postfix operators and primary (sub)expressions. This is
an extension of the script '3_pre_in_post_parens.py'.
'''
LBP = {"+": 14, "*": 17, "^": 21, ":=": 22, "!": 19}
RBP = {"+": 15, "*": 18, "^": 20, ":=": 8, "&": 13, ")": 0}
PRIMARIES = {"[": "]", "{": "}", "begin": "end"}
FLATOPS = {","} # Other left associative infix operators can be added.
def set_bp():
''' Set missing binding powers in LBP, RBP. '''
for operator in LBP:
RBP.setdefault(operator, 100)
for operator in RBP:
LBP.setdefault(operator, 100)
LBP.update({"$END": -1, ")": -1, ",": 4})
RBP.update({"$BEGIN": -1, "(": -1, ",": 5})
for dkey, dval in PRIMARIES.items():
RBP.setdefault(dkey, -1)
LBP.setdefault(dval, -1)
def s_expr(n_list):
''' Format a nested Python list as Lisp-like S-expression. '''
return ("(" + " ".join([s_expr(p) for p in n_list]) + ")"
if isinstance(n_list, list) else str(n_list))
def tokens(code):
''' Very simple tokenizer. Split at spaces; insert tokens. '''
toklist = []
for tok in code.split():
if LBP.get(tok) == 100:
toklist.append("$PRE")
toklist.append(tok)
if RBP.get(tok) == 100:
toklist.append("$POST")
return list(reversed(["$BEGIN"] + toklist + ["$END"]))
def parse_primary(toks):
''' Parse a primary expression (parenthesized, bracketed, ... ) '''
if (toksm1 := toks[-1]) == "(":
return parse_expr(toks)
if toksm1 in PRIMARIES:
prim = parse_expr(toks)
if isinstance(prim, list) and prim and prim[0] == ",":
return [toksm1] + prim[1:]
return [toksm1, prim]
return toks[-1]
def new_sub_ex(oator, firstarg, secondarg):
''' Create a new subexpression from opeator and arguments. '''
if (oator in FLATOPS and isinstance(firstarg, list) and firstarg
and firstarg[0] == oator):
return firstarg + [secondarg]
return [oator, firstarg, secondarg]
def parse_expr(toks, min_rbp=0):
''' Parse expressions iteratively and recursively. Accept primary
subexpressions. Return parse tree as nested Python list.
toks -- list of tokens, processed from the end (as stack).
'''
toks.pop()
sub_ex = parse_primary(toks)
toks.pop()
while min_rbp < LBP[toks[-1]]:
sub_ex = new_sub_ex(toks[-1], sub_ex,
parse_expr(toks, RBP[toks[-1]]))
return sub_ex
if __name__ == "__main__":
DEMOS = ['a * ( b + c ) ^ ( x := y )', '2 ^ ( a * ( b ! + c ) )',
'( a * & b ) + c', '2 * ( 3 + 4 ) ^ 4', '2 ^ ( ( a ) )',
'2 ^ [ [ a ] ]', '4 ^ begin 5 + [ 6 ] end * 3',
'4 * [ 5 + 6 ] ^ 3', '[ a * x , 2 ^ y , b , c * d ]', 'x']
set_bp()
print("Parse results:")
for demo in DEMOS:
print(demo + " "*(32-len(demo)) + "==> " +
s_expr(parse_expr(tokens(demo))))
@joekreu
Copy link
Author

joekreu commented Apr 5, 2022

Simple iterative and recursive precedence climbing parsers for expressions, based on binding powers. Binding powers can be any integers in range 1 to 99; for '4_pre_in_post_primary_parser.py' binding powers should be in range 6 to 99.
For some background, see joekreu/basic_pcp.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment