a script that reads a file and outputs the same file.
'''
Parses a yacc grammar file.
Also removes left recursion from the output.
That's all.
Licensed under the GNU LGPL. This is distributed in the hope that it
will be useful, but WITHOUT ANY WARRANTY, because it probably won't be.
See the GNU Lesser General Public License for more details.
'''
import re
import sys
import itertools
# Named tokens (matched by regex)
T_IDENTIFIER, T_CHAR, T_COMMENT = range(3)
# Char tokens (the matching regex is the value itself)
CHAR_TOKENS = ':|;'

def flatten(lst):
    '''[[a,b],[c,d]] --> [a,b,c,d]'''
    return list(itertools.chain(*lst))

def get_rules_token_list(string):
    '''
    Returns a list/generator like:

        [(token_type, value), ...]

    token_type is either an integer constant like T_IDENTIFIER, or
    a single character string from CHAR_TOKENS
    '''
    yacc_rules_tokens = [
        ["[a-zA-Z_][a-zA-Z0-9_]*", T_IDENTIFIER],
        ["'.'", T_CHAR],
        [r"/\*.*?\*/", T_COMMENT],
    ]

    for x in CHAR_TOKENS:
        yacc_rules_tokens.append([re.escape(x), x])

    rules_regex = re.compile('|'.join(['(%s)' % x[0]
                                       for x in yacc_rules_tokens]))

    for match in rules_regex.finditer(string):
        token_num, value = [x for x in enumerate(match.groups()) if x[1]][0]
        yield yacc_rules_tokens[token_num][1], value
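
# Illustrative example (added here, not part of the original gist): for a
# rules-section snippet such as "stmt: expr ';' ;",
# list(get_rules_token_list(...)) comes out roughly as
#
#     [(T_IDENTIFIER, 'stmt'), (':', ':'), (T_IDENTIFIER, 'expr'),
#      (T_CHAR, "';'"), (';', ';')]
#
# Named tokens carry the integer constants defined above; the single-character
# tokens from CHAR_TOKENS use the character itself as the token type.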

def extract_sections_from_file(filename):
    '''Open the file and divide it into sections by %%\n'''
    input_buffers = [[]]
    input_mode = 0

    for line in open(filename):
        if line == '%%\n':
            input_mode += 1
            input_buffers.append([])
        input_buffers[input_mode].append(line)

    return [''.join(x) for x in input_buffers]
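
# For reference (the standard yacc/bison file layout, noted here rather than
# taken from the gist itself):
#
#     declarations      <-- sections[0]
#     %%
#     rules             <-- sections[1], the part this script tokenizes
#     %%
#     user code         <-- sections[2], optional
#
# Each '%%\n' line opens a new buffer, and the '%%' line itself ends up at the
# top of the section that follows it.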

def test_rebuild_tokenized_rules(token_list, indentation=' '):
    '''
    Test case.

    Takes the output of get_rules_token_list() and rebuilds the yacc definition.
    Diff the output of this against the original file.
    '''
    last_token_type = None
    for token_type, value in token_list:
        if type(token_type) == str and token_type in CHAR_TOKENS:
            sys.stdout.write("\n" + indentation)
        elif last_token_type == ';':
            sys.stdout.write("\n\n")
        else:
            sys.stdout.write(" ")
        sys.stdout.write(value)
        last_token_type = token_type

def test_rebuild_parsed_rules(parsed, indentation=' '):
    '''
    Test case.

    Takes the output of parse_rules() or equivalent format and rebuilds the
    yacc definition represented by this structure.
    Diff the output of this against the original file.
    '''
    for rule_name, components in parsed:
        sys.stdout.write(rule_name + "\n")
        before = ':'
        for component_tokens in components:
            sys.stdout.write(indentation + before + ' ')
            sys.stdout.write(' '.join(component_tokens))
            sys.stdout.write('\n')
            before = '|'
        sys.stdout.write(indentation + ';\n\n')
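
# Illustrative example (assumed input, not from the original gist): calling
# test_rebuild_parsed_rules([['stmt', [['expr', "';'"]]]]) prints
#
#     stmt
#      : expr ';'
#      ;
#
# where the single leading space comes from the default `indentation` argument.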

class ParseError(Exception):
    pass

def parse_rules(token_list, verbose=True):
    '''
    Takes a token list and parses it into a rule list with this format:

        [[rule_name, components], ...]

    Where components is:

        [[token, ...], ...]
    '''
    expect = [T_IDENTIFIER]
    rules = []
    current_rule_name = None
    current_rule_components = []
    current_component_tokens = []

    def debug(*args):
        if verbose:
            print(' '.join([str(x) for x in args]))

    for token_id, value in token_list:
        if token_id == T_COMMENT:
            continue

        if token_id not in expect:
            raise ParseError("Expecting %s got %r" % (expect, token_id))

        if token_id == T_IDENTIFIER and not current_rule_name:
            current_rule_name = value
            debug(" \\ Entering rule", value)
            expect = [':']
        elif token_id in (T_IDENTIFIER, T_CHAR):
            debug(" | Token:", value)
            current_component_tokens.append(value)
            expect = [T_IDENTIFIER, T_CHAR, '|', ';']

        if token_id == ':':
            expect = [T_IDENTIFIER, T_CHAR]

        if token_id in ('|', ';'):
            debug(" < End component", current_component_tokens)
            expect = [T_IDENTIFIER, T_CHAR]
            current_rule_components.append(current_component_tokens)
            current_component_tokens = []

        if token_id == ';':
            debug(" / End rule", current_rule_name)
            expect = [T_IDENTIFIER]
            rules.append([current_rule_name, current_rule_components])
            current_rule_name = None
            current_rule_components = []

    return rules
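
# Illustrative example (assumed grammar, not from the original gist): the
# tokens of
#
#     expr
#         : expr '+' term
#         | term
#         ;
#
# parse into
#
#     [['expr', [['expr', "'+'", 'term'], ['term']]]]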

def remove_left_recursion(parsed):
    '''
    Takes a rule list from parse_rules() and tries to convert
    left-recursive rules into right-recursive ones.
    '''
    new_parsed = []
    for rule_name, components in parsed:
        has_left_recursion = False
        for component_tokens in components:
            if component_tokens[0] == rule_name:
                has_left_recursion = True
                break

        if has_left_recursion:
            # betas = components that aren't left recursive
            betas = [x for x in components if x[0] != rule_name]
            # alphas = the tails of the components that are.
            alphas = [x[1:] for x in components if x[0] == rule_name]

            if alphas == betas:
                # is the content of the tail going to be the same?
                # if so, just make one right-recursive rule
                components = flatten([[x, x + [rule_name]] for x in alphas])
            else:
                # for every other case, add a tail rule
                # a few cases could be simplified but i'm keeping this naive
                # pick a name for the tail that hopefully doesn't conflict
                tail_name = rule_name + "__tail"
                components = flatten([[x, x + [tail_name]] for x in betas])
                tail = flatten([[x, x + [tail_name]] for x in alphas])
                new_parsed.append([tail_name, tail])
                # test this:
                #test_rebuild_parsed_rules([[rule_name, components]])
                #test_rebuild_parsed_rules([[tail_name, tail]])

        # components is either the original or the modified ones.
        new_parsed.append([rule_name, components])

    return new_parsed
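
# Illustrative example (assumed grammar, not from the original gist): a
# left-recursive rule such as
#
#     expr: expr '+' term | term ;
#
# comes out right-recursive, with the recursion moved into a new tail rule:
#
#     expr:       term | term expr__tail ;
#     expr__tail: '+' term | '+' term expr__tail ;
#
# When the alphas and betas happen to be identical (e.g. "list: list item | item"),
# no tail rule is added and the rule simply becomes "list: item | item list".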

def main(filename):
    sections = extract_sections_from_file(filename)
    token_list = get_rules_token_list(sections[1])

    # test that:
    #return test_rebuild_tokenized_rules(token_list)

    parsed = parse_rules(token_list, verbose=False)
    parsed = remove_left_recursion(parsed)
    return test_rebuild_parsed_rules(parsed)


if __name__ == '__main__':
    main(sys.argv[1])
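
# Usage sketch (the gist doesn't show the script's own filename, so
# "parse_yacc.py" below is just a placeholder):
#
#     python parse_yacc.py grammar.y > grammar_rr.y
#
# The rewritten rules section is printed to stdout; diff it against the rules
# section of the input grammar to see what changed.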