Last active
April 16, 2022 13:10
-
-
Save joekreu/8bab0831b5f3c7ac8b640ff12e3233ef to your computer and use it in GitHub Desktop.
Iterative and recursive precedence parsers for infix expressions, in Python. This gist contains four scripts with increasing complexity.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3 | |
''' Script for precedence climbing parsing for infix operators, based | |
on binding powers; iterative and recursive implementation. Binding | |
powers are in range 1 to 99. - Separate tokens in DEMOS by spaces. | |
''' | |
LBP = {"+": 14, "*": 17, "^": 21, ":=": 22, "$END": -1} | |
RBP = {"+": 15, "*": 18, "^": 20, ":=": 8} | |
DEMOS = ['x + 3.4 * y', 'a + b + c', 'n ^ m ^ k', '3 * x := v + 1'] | |
def parse_expr(toks, min_rbp=0): | |
''' 'toks' is the list of tokens, it is used as stack. ''' | |
toks.pop() | |
sub_ex = toks.pop() | |
while min_rbp < LBP[toks[-1]]: # toks[-1] is the top of the stack | |
sub_ex = [toks[-1], sub_ex, parse_expr(toks, RBP[toks[-1]])] | |
return sub_ex | |
print("Parse results:") | |
for demo in DEMOS: | |
tokens = list(reversed(["$BEGIN"] + demo.split() + ["$END"])) | |
print(demo + " "*(18-len(demo)) + "==> " + | |
str(parse_expr(tokens))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3 | |
''' Precedence climbing parsing based on binding powers for prefix, | |
infix and postfix operators. An extension of '1_infix_parser.py'. | |
''' | |
# Binding powers (except LBP["$END"]) are integers in range 1 to 99. | |
LBP = {"+": 14, "*": 17, "^": 21, ":=": 22, "!": 19, "$END": -1} | |
RBP = {"+": 15, "*": 18, "^": 20, ":=": 8, "&": 13} | |
DEMOS = ['a * c ! + 3', 'a ! * b', 'm ^ k !', | |
'a * b + c', 'a * & b + c'] | |
def parse_expr(toks, min_rbp=0): | |
''' This function is identical to 'parse_expr' in 'infix.py'. ''' | |
toks.pop() | |
sub_ex = toks.pop() | |
while min_rbp < LBP[toks[-1]]: | |
sub_ex = [toks[-1], sub_ex, parse_expr(toks, RBP[toks[-1]])] | |
return sub_ex | |
# Add fake binding powers for unary operators | |
for operator in LBP: | |
RBP.setdefault(operator, 100) | |
for operator in RBP: | |
LBP.setdefault(operator, 100) | |
print("Parse results:") | |
for demo in DEMOS: | |
toklist = [] | |
for tok in demo.split(): | |
if LBP.get(tok) == 100: # Insert fake operand | |
toklist.append("$PRE") # before prefix operator | |
toklist.append(tok) | |
if RBP.get(tok) == 100: # Insert fake operand | |
toklist.append("$POST") # after postfix operator | |
tokl = list(reversed(["$BEGIN"] + toklist + ["$END"])) | |
print(demo + " "*(18-len(demo)) + "==> " + str(parse_expr(tokl))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3 | |
''' Precedence climbing parsing based on binding powers for prefix, | |
infix, postfix operators and parenthesized subexpressions. This is | |
an extension of the script '2_prefix_infix_postfix_parser.py'. | |
Tokenizer is separate, the test code is in the | |
'if __name__ == "__main__":' - part now. | |
The function 's_expr' was added to improve parse tree printing. | |
''' | |
# Binding powers are integers in range 1 to 99. | |
LBP = {"+": 14, "*": 17, "/": 17, "^": 21, ":=": 22, "!": 19} | |
RBP = {"+": 15, "*": 18, "/": 18, "^": 20, ":=": 8, "&": 13} | |
def s_expr(n_list): | |
''' Format a nested Python list as Lisp-like S-expression. ''' | |
return ("(" + " ".join([s_expr(p) for p in n_list]) + ")" | |
if isinstance(n_list, list) else str(n_list)) | |
def tokens(code): | |
''' Very simple tokenizer. Split at spaces; insert tokens. ''' | |
toklist = [] | |
for tok in code.split(): | |
if LBP.get(tok) == 100: | |
toklist.append("$PRE") | |
toklist.append(tok) | |
if RBP.get(tok) == 100: | |
toklist.append("$POST") | |
return list(reversed(["$BEGIN"] + toklist + ["$END"])) | |
def parse_expr(toks, min_rbp=0): | |
''' Parse expressions iteratively and recursively. Accept parenthe- | |
sized subexpressions. Return parse tree as nested Python list. | |
toks -- list of tokens, processed from the end (as stack). | |
''' | |
toks.pop() | |
sub_ex = parse_expr(toks) if toks[-1] == "(" else toks[-1] | |
toks.pop() | |
while min_rbp < LBP[toks[-1]]: | |
sub_ex = [toks[-1], sub_ex, parse_expr(toks, RBP[toks[-1]])] | |
return sub_ex | |
if __name__ == "__main__": | |
DEMOS = ['a * ( b + c ) ^ ( x := y )', '2 ^ ( a * ( b ! + c ) )', | |
'( a * & b ) + c', '2 * ( 3 + 4 ) ^ 4', '2 ^ ( ( a ) )'] | |
for operator in LBP: | |
RBP.setdefault(operator, 100) | |
for operator in RBP: | |
LBP.setdefault(operator, 100) | |
LBP.update({"$END": -1, ")": -1}) | |
print("Parse results:") | |
for demo in DEMOS: | |
print(demo + " "*(30-len(demo)) + "==> " + | |
s_expr(parse_expr(tokens(demo)))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3 | |
''' Precedence climbing parser based on binding powers, for prefix, | |
infix, postfix operators and primary (sub)expressions. This is | |
an extension of the script '3_pre_in_post_parens.py'. | |
''' | |
LBP = {"+": 14, "*": 17, "^": 21, ":=": 22, "!": 19} | |
RBP = {"+": 15, "*": 18, "^": 20, ":=": 8, "&": 13, ")": 0} | |
PRIMARIES = {"[": "]", "{": "}", "begin": "end"} | |
FLATOPS = {","} # Other left associative infix operators can be added. | |
def set_bp(): | |
''' Set missing binding powers in LBP, RBP. ''' | |
for operator in LBP: | |
RBP.setdefault(operator, 100) | |
for operator in RBP: | |
LBP.setdefault(operator, 100) | |
LBP.update({"$END": -1, ")": -1, ",": 4}) | |
RBP.update({"$BEGIN": -1, "(": -1, ",": 5}) | |
for dkey, dval in PRIMARIES.items(): | |
RBP.setdefault(dkey, -1) | |
LBP.setdefault(dval, -1) | |
def s_expr(n_list): | |
''' Format a nested Python list as Lisp-like S-expression. ''' | |
return ("(" + " ".join([s_expr(p) for p in n_list]) + ")" | |
if isinstance(n_list, list) else str(n_list)) | |
def tokens(code): | |
''' Very simple tokenizer. Split at spaces; insert tokens. ''' | |
toklist = [] | |
for tok in code.split(): | |
if LBP.get(tok) == 100: | |
toklist.append("$PRE") | |
toklist.append(tok) | |
if RBP.get(tok) == 100: | |
toklist.append("$POST") | |
return list(reversed(["$BEGIN"] + toklist + ["$END"])) | |
def parse_primary(toks): | |
''' Parse a primary expression (parenthesized, bracketed, ... ) ''' | |
if (toksm1 := toks[-1]) == "(": | |
return parse_expr(toks) | |
if toksm1 in PRIMARIES: | |
prim = parse_expr(toks) | |
if isinstance(prim, list) and prim and prim[0] == ",": | |
return [toksm1] + prim[1:] | |
return [toksm1, prim] | |
return toks[-1] | |
def new_sub_ex(oator, firstarg, secondarg): | |
''' Create a new subexpression from opeator and arguments. ''' | |
if (oator in FLATOPS and isinstance(firstarg, list) and firstarg | |
and firstarg[0] == oator): | |
return firstarg + [secondarg] | |
return [oator, firstarg, secondarg] | |
def parse_expr(toks, min_rbp=0): | |
''' Parse expressions iteratively and recursively. Accept primary | |
subexpressions. Return parse tree as nested Python list. | |
toks -- list of tokens, processed from the end (as stack). | |
''' | |
toks.pop() | |
sub_ex = parse_primary(toks) | |
toks.pop() | |
while min_rbp < LBP[toks[-1]]: | |
sub_ex = new_sub_ex(toks[-1], sub_ex, | |
parse_expr(toks, RBP[toks[-1]])) | |
return sub_ex | |
if __name__ == "__main__": | |
DEMOS = ['a * ( b + c ) ^ ( x := y )', '2 ^ ( a * ( b ! + c ) )', | |
'( a * & b ) + c', '2 * ( 3 + 4 ) ^ 4', '2 ^ ( ( a ) )', | |
'2 ^ [ [ a ] ]', '4 ^ begin 5 + [ 6 ] end * 3', | |
'4 * [ 5 + 6 ] ^ 3', '[ a * x , 2 ^ y , b , c * d ]', 'x'] | |
set_bp() | |
print("Parse results:") | |
for demo in DEMOS: | |
print(demo + " "*(32-len(demo)) + "==> " + | |
s_expr(parse_expr(tokens(demo)))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Simple iterative and recursive precedence climbing parsers for expressions, based on binding powers. Binding powers can be any integers in range 1 to 99; for '4_pre_in_post_primary_parser.py' binding powers should be in range 6 to 99.
For some background, see joekreu/basic_pcp.