@Yoxem · Last active May 11, 2016
Simple recursive descent (?) parser with a tiny tokenizer function in Python 3
#!/usr/bin/env python3
import re
token_pattern = \
[
    [r'[+]', 'ADD'],
    [r'[-]', 'SUB'],
    [r'[+-]?[0-9]*\.[0-9]+(e[+-]?[0-9]+)?', 'FLOAT'],
    [r'[+-]?[0-9]+', 'INT'],
    [r'[*]', 'MUL'],
    [r'[/]', 'DIV'],
    [r'[a-zA-Z_][a-zA-Z0-9_]*', 'NAME'],
    [r'[=]', 'DEF'],
    [r'[(]', 'PATH_L'],
    [r'[)]', 'PATH_R'],
    [r'[\n\s\t]', ''],  # Ignored
]
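# Each row is [regex, token name]. tokenize() below tries the rows top to
# bottom against the start of the remaining input; a row whose token name is
# '' (the whitespace row above) is matched but dropped from the output.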
grammar_table = \
[
    ['START', 'NAME', 'DEF', 'EXP'],  # START -> NAME 'DEF' EXP
    ['START', 'EXP'],                 # START -> EXP
    ['EXP', 'A', ['ADD', 'SUB'], 'EXP'],
    ['EXP', 'A'],
    ['A', 'B', ['MUL', 'DIV'], 'A'],
    ['A', 'B'],
    ['B', 'PATH_L', 'C'],
    ['C', 'EXP', 'PATH_R'],
    ['B', 'D'],
    ['D', 'INT'],
    ['D', 'FLOAT'],
    ['D', 'NAME'],
    ['INT', 'TERM'],   # TERM = terminal
    ['FLOAT', 'TERM'],
    ['NAME', 'TERM'],
    ['DIV', 'TERM'],
    ['ADD', 'TERM'],
    ['SUB', 'TERM'],
    ['MUL', 'TERM'],
    ['DEF', 'TERM'],
    ['PATH_L', 'TERM'],
    ['PATH_R', 'TERM'],
]
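# How to read a rule: ['EXP', 'A', ['ADD', 'SUB'], 'EXP'] means
# EXP -> A ('ADD' | 'SUB') EXP, and a two-item rule ending in 'TERM' marks its
# left-hand side as a terminal token name that the parser stops recursing on.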
input_var = "abcde = (-e.2*-2.0e-9+2 / 12 * - 1.00)"
def tokenize(input_var, token_pattern):
    token_result = []
    while input_var:
        for i in range(len(token_pattern)):
            token_item = token_pattern[i]
            item_re_pattern = re.compile(r"^(" + token_item[0] + r")")
            item_re_search = item_re_pattern.search(input_var)
            if item_re_search:
                matched_variable = item_re_search.group(0)
            else:
                matched_variable = None
            if matched_variable:
                input_var = input_var[len(matched_variable):]
                # keep the match unless its token name is '' ("ignored")
                if token_item[1] != '':
                    token_result.append([matched_variable, token_item[1]])
                break
            else:
                if i == len(token_pattern) - 1:
                    try:
                        raise Exception(input_var[0])
                    except Exception as ex:
                        print("Error:", ex.args[0],
                              "is not in the token pattern table")
                        raise
    return token_result
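# A small commented-out check in the same spirit as the examples further down;
# the "1+2" input and the expected list are illustrative, not from the gist:
#   tokenize("1+2", token_pattern)
#   -> [['1', 'INT'], ['+', 'ADD'], ['2', 'INT']]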
def detokenize(token_result):
    result_string = ""
    for i in token_result:
        result_string = result_string + i[0]
    return result_string
def select_grammar_rule(grammar_table, situation):
    selected_grammar_rule = []
    for i in grammar_table:
        if i[0] == situation:
            selected_grammar_rule.append(i)
    return selected_grammar_rule
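# Illustrative call (hypothetical input; the result is read off grammar_table):
#   select_grammar_rule(grammar_table, 'D')
#   -> [['D', 'INT'], ['D', 'FLOAT'], ['D', 'NAME']]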
def parsing_iter(grammar_table, tokenized, situation):
    selected_grammar_rule = select_grammar_rule(grammar_table, situation)
    for i, rule in enumerate(selected_grammar_rule):
        # infix operator rule: LHS -> left OP right
        if len(rule) == 4 and len(tokenized) >= 3:
            for j in range(len(tokenized)):
                if tokenized[j][1] == rule[2] or tokenized[j][1] in rule[2]:
                    # split at this operator only if the numbers of "(" and ")"
                    # before it are equal, i.e. it is not inside parentheses
                    token_name_before = [tok[1] for tok in tokenized[:j]]
                    path_l_amount = token_name_before.count("PATH_L")
                    path_r_amount = token_name_before.count("PATH_R")
                    if path_r_amount == path_l_amount:
                        return_tree = [tokenized[j],
                                       parsing_iter(grammar_table, tokenized[0:j], rule[1]),
                                       parsing_iter(grammar_table, tokenized[j+1:], rule[3])
                                       ]
                        return return_tree
                    else:
                        continue
                else:
                    continue
        if len(rule) == 3 and len(tokenized) >= 2:
            # prefix form (Polish notation): the first token matches rule[1]
            if tokenized[0][1] == rule[1]:
                return_tree = [tokenized[0:1],
                               parsing_iter(grammar_table, tokenized[1:], rule[2]),
                               ]
                return return_tree
            # postfix form (reverse Polish notation): the last token matches rule[-1]
            elif tokenized[-1][1] == rule[-1]:
                return_tree = [parsing_iter(grammar_table, tokenized[:-1], rule[1]),
                               tokenized[-1],
                               ]
                return return_tree
            else:
                continue
        if len(rule) == 2:
            if rule[1] == "TERM" and len(tokenized) == 1:
                # terminal: convert numeric literals, keep everything else as-is
                if rule[0] == "FLOAT":
                    return [float(tokenized[0][0]), tokenized[0][1]]
                elif rule[0] == "INT":
                    return [int(tokenized[0][0]), tokenized[0][1]]
                else:
                    return tokenized
            elif tokenized[0][1] == rule[1]:
                return_tree = parsing_iter(grammar_table, tokenized[:], rule[1])
                return return_tree
            elif i == len(selected_grammar_rule) - 1:
                # last alternative: descend into its right-hand side anyway
                return parsing_iter(grammar_table, tokenized, rule[1])
            else:
                continue
def parsing(grammar_table, tokenized):
    return parsing_iter(grammar_table, tokenized, "START")
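# End-to-end sketch, commented out like the examples below; the "1+2" input is
# illustrative, and the expected tree follows from the rules above:
#   parsing(grammar_table, tokenize("1+2", token_pattern))
#   -> [['+', 'ADD'], [1, 'INT'], [2, 'INT']]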
'''
returns:
[['abcde', 'NAME'], ['=', 'DEF'], ['(', 'PATH_L'], ['-', 'SUB'], ['e', 'NAME'],
 ['.2', 'FLOAT'], ['*', 'MUL'], ['-2.0e-9', 'FLOAT'], ['+2', 'INT'], ['/', 'DIV'],
 ['a', 'NAME'], ['+', 'ADD'], ['_12', 'NAME'], ['*', 'MUL'], ['-1.00', 'FLOAT'],
 ['+', 'ADD'], ['df2', 'NAME'], [')', 'PATH_R']]'''
#print(tokenize(input_var, token_pattern))
'''returns:
abcde=(-e.2*-2.0e-9+2/a+_12*-1.00+df2)'''
#print(detokenize(tokenize(input_var, token_pattern)))
'''print token_pattern'''
print("token_pattern")
print("Pattern\tToken")
print("------------------------------")
for i in token_pattern:
    is_first_item = True
    print_line = ""
    for j in i:
        if is_first_item:
            print_line += j
            is_first_item = False
        else:
            print_line += ('\t' + j)
    print(print_line)
'''print grammar_table'''
print("\ngrammar_table")
print("LHS\t->\tRHS")
print("------------------------------")
for i in grammar_table:
    is_first_item = True
    print_line = ""
    for j in i:
        if is_first_item:
            print_line += (j + '\t->')
            is_first_item = False
        else:
            if type(j) is list:
                print_line += ("\t(" + " | ".join(j) + ")")
            else:
                print_line += ('\t' + j)
    print(print_line)
while True:
    raw_string = input("\n\ninput the string to be parsed>> ")
    """
    (1+2)*3/(405) -> [['(', 'PATH_L'], ['1', 'INT'], ['+', 'ADD'],
    ['2', 'INT'], [')', 'PATH_R'], ['*', 'MUL'], ['3', 'INT'], ['/', 'DIV'],
    ['(', 'PATH_L'], ['405', 'INT'], [')', 'PATH_R']]
    """
    tokenized_list = tokenize(raw_string, token_pattern)
    """
    [['(', 'PATH_L'], ['1', 'INT'], ['+', 'ADD'],
    ['2', 'INT'], [')', 'PATH_R'], ['*', 'MUL'], ['3', 'INT'], ['/', 'DIV'],
    ['(', 'PATH_L'], ['405', 'INT'], [')', 'PATH_R']] ->
    [['*', 'MUL'], [[['(', 'PATH_L']], [[['+', 'ADD'], [1, 'INT'], [2, 'INT']],
    [')', 'PATH_R']]], [['/', 'DIV'], [3, 'INT'], [[['(', 'PATH_L']],
    [[405, 'INT'], [')', 'PATH_R']]]]]
    """
    parsed_result = parsing(grammar_table, tokenized_list)
    print("Tokenized List:\n", tokenized_list)
    print("\nParsed Tree:\n", parsed_result)