A script that reads a yacc grammar file and outputs (almost) the same file, with left recursion removed.
'''
Parses a yacc grammar file.
Also removes left recursion from the output.
That's all.

Licensed under the GNU LGPL. This is distributed in the hope that it
will be useful, but WITHOUT ANY WARRANTY, because it probably won't be.
See the GNU Lesser General Public License for more details.
'''

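# Usage sketch (inferred from main() below; "this_script.py" and "grammar.y"
# are placeholder names for this file and any yacc/bison grammar file):
#
#   python this_script.py grammar.y > grammar_right_recursive.y
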
import re
import sys
import itertools

# Named tokens (matched by regex)
T_IDENTIFIER, T_CHAR, T_COMMENT = range(3)

# Char tokens (the matching regex is the value itself)
CHAR_TOKENS = ':|;'

def flatten(lst):
    '''[[a,b],[c,d]] --> [a,b,c,d]'''
    return list(itertools.chain(*lst))

def get_rules_token_list(string):
    '''
    Returns a generator of:
        [(token_type, value), ...]
    token_type is either an integer constant like T_IDENTIFIER, or
    a single character string from CHAR_TOKENS.
    '''
    yacc_rules_tokens = [
        ["[a-zA-Z_][a-zA-Z0-9_]*", T_IDENTIFIER],
        ["'.'", T_CHAR],
        [r"/\*.*?\*/", T_COMMENT],
    ]
    for x in CHAR_TOKENS:
        yacc_rules_tokens.append([re.escape(x), x])
    rules_regex = re.compile('|'.join(['(%s)' % x[0]
                                       for x in yacc_rules_tokens]))
    for match in rules_regex.finditer(string):
        # Find the first non-empty group; its index maps back to the entry
        # of yacc_rules_tokens that matched, giving us the token type.
        token_num, value = [x for x in enumerate(match.groups()) if x[1]][0]
        yield yacc_rules_tokens[token_num][1], value

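# Illustrative example (hypothetical input, not part of the original gist):
# for the rules text  "expr : expr '+' term | term ;"  the generator above
# yields:
#
#   (T_IDENTIFIER, 'expr'), (':', ':'), (T_IDENTIFIER, 'expr'),
#   (T_CHAR, "'+'"), (T_IDENTIFIER, 'term'), ('|', '|'),
#   (T_IDENTIFIER, 'term'), (';', ';')
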
def extract_sections_from_file(filename):
    '''Open the file and divide it into sections at %%\n lines'''
    input_buffers = [[]]
    input_mode = 0
    with open(filename) as input_file:
        for line in input_file:
            if line == '%%\n':
                input_mode += 1
                input_buffers.append([])
            input_buffers[input_mode].append(line)
    return [''.join(x) for x in input_buffers]

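# For reference, a yacc file has this overall shape, so sections[1] is the
# rules section that the rest of this script consumes (each '%%' line stays
# at the start of the section it opens, where the tokenizer ignores it):
#
#   declarations
#   %%
#   rules
#   %%
#   user code (optional)
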
def test_rebuild_tokenized_rules(token_list, indentation=' '):
    '''
    Test case.
    Takes the output of get_rules_token_list() and rebuilds the yacc
    definition. Diff the output of this against the original file.
    '''
    last_token_type = None
    for token_type, value in token_list:
        if isinstance(token_type, str) and token_type in CHAR_TOKENS:
            sys.stdout.write("\n" + indentation)
        elif last_token_type == ';':
            sys.stdout.write("\n\n")
        else:
            sys.stdout.write(" ")
        sys.stdout.write(value)
        last_token_type = token_type

def test_rebuild_parsed_rules(parsed, indentation=' '):
    '''
    Test case.
    Takes the output of parse_rules() (or an equivalent format) and rebuilds
    the yacc definition represented by that structure.
    Diff the output of this against the original file.
    '''
    for rule_name, components in parsed:
        sys.stdout.write(rule_name + "\n")
        before = ':'
        for component_tokens in components:
            sys.stdout.write(indentation + before + ' ')
            sys.stdout.write(' '.join(component_tokens))
            sys.stdout.write('\n')
            before = '|'
        sys.stdout.write(indentation + ';\n\n')

class ParseError(Exception):
    pass

def parse_rules(token_list, verbose=True):
    '''
    Takes a token list and parses it into a rule list with this format:
        [[rule_name, components], ...]
    Where components is:
        [[token, ...], ...]
    '''
    expect = [T_IDENTIFIER]
    rules = []
    current_rule_name = None
    current_rule_components = []
    current_component_tokens = []

    def debug(*args):
        if verbose:
            print(' '.join([str(x) for x in args]))

    for token_id, value in token_list:
        if token_id == T_COMMENT:
            continue
        if token_id not in expect:
            raise ParseError("Expecting %s got %r" % (expect, token_id))
        if token_id == T_IDENTIFIER and not current_rule_name:
            current_rule_name = value
            debug(" \\ Entering rule", value)
            expect = [':']
        elif token_id in (T_IDENTIFIER, T_CHAR):
            debug(" | Token:", value)
            current_component_tokens.append(value)
            expect = [T_IDENTIFIER, T_CHAR, '|', ';']
        if token_id == ':':
            expect = [T_IDENTIFIER, T_CHAR]
        if token_id in ('|', ';'):
            debug(" < End component", current_component_tokens)
            expect = [T_IDENTIFIER, T_CHAR]
            current_rule_components.append(current_component_tokens)
            current_component_tokens = []
        if token_id == ';':
            debug(" / End rule", current_rule_name)
            expect = [T_IDENTIFIER]
            rules.append([current_rule_name, current_rule_components])
            current_rule_name = None
            current_rule_components = []
    return rules

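# Illustrative example (hypothetical grammar, not part of the original gist):
# parsing the tokens of  "expr : expr '+' term | term ;"  returns
#
#   [['expr', [['expr', "'+'", 'term'], ['term']]]]
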
def remove_left_recursion(parsed):
    '''
    Takes a rule list from parse_rules() and tries to convert
    left-recursive rules into right-recursive ones.
    '''
    new_parsed = []
    for rule_name, components in parsed:
        has_left_recursion = False
        for component_tokens in components:
            if component_tokens[0] == rule_name:
                has_left_recursion = True
                break
        if has_left_recursion:
            # betas = components that aren't left recursive
            betas = [x for x in components if x[0] != rule_name]
            # alphas = the tails of the components that are.
            alphas = [x[1:] for x in components if x[0] == rule_name]
            if alphas == betas:
                # is the content of the tail going to be the same?
                # if so, just make one right-recursive rule
                components = flatten([[x, x + [rule_name]] for x in alphas])
            else:
                # for every other case, add a tail rule
                # a few cases could be simplified but i'm keeping this naive
                # pick a name for the tail that hopefully doesn't conflict
                tail_name = rule_name + "__tail"
                components = flatten([[x, x + [tail_name]] for x in betas])
                tail = flatten([[x, x + [tail_name]] for x in alphas])
                new_parsed.append([tail_name, tail])
                # test this:
                #test_rebuild_parsed_rules([[rule_name, components]])
                #test_rebuild_parsed_rules([[tail_name, tail]])
        # components is either the original or the modified ones.
        new_parsed.append([rule_name, components])
    return new_parsed

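# Illustrative example (hypothetical grammar, not part of the original gist):
# for  expr : expr '+' term | term ;  we get betas = [['term']] and
# alphas = [["'+'", 'term']], so the rewritten grammar is (note the tail
# rule is appended before the rewritten rule):
#
#   expr__tail : '+' term | '+' term expr__tail ;
#   expr       : term | term expr__tail ;
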
def main(filename):
    sections = extract_sections_from_file(filename)
    token_list = get_rules_token_list(sections[1])
    # test that:
    #return test_rebuild_tokenized_rules(token_list)
    parsed = parse_rules(token_list, verbose=False)
    parsed = remove_left_recursion(parsed)
    return test_rebuild_parsed_rules(parsed)

if __name__ == '__main__':
    main(sys.argv[1])