a script that reads a file and outputs the same file.
'''
Parses a yacc grammar file.
Also removes left recursion from the output.
That's all.
Licensed under the GNU LGPL. This is distributed in the hope that it
will be useful, but WITHOUT ANY WARRANTY, because it probably won't be.
See the GNU Lesser General Public License for more details.
'''
import re
import sys
import itertools
# Named tokens (matched by regex)
T_IDENTIFIER, T_CHAR, T_COMMENT = range(3)
# Char tokens (the matching regex is the value itself)
CHAR_TOKENS = ':|;'

def flatten(lst):
    '''[[a,b],[c,d]] --> [a,b,c,d]'''
    return list(itertools.chain(*lst))

def get_rules_token_list(string):
    '''
    Returns a list/generator like:

        [(token_type, value), ...]

    token_type is either an integer constant like T_IDENTIFIER, or
    a single character string from CHAR_TOKENS
    '''
    yacc_rules_tokens = [
        ["[a-zA-Z_][a-zA-Z0-9_]*", T_IDENTIFIER],
        ["'.'", T_CHAR],
        [r"/\*.*?\*/", T_COMMENT],
    ]

    for x in CHAR_TOKENS:
        yacc_rules_tokens.append([re.escape(x), x])

    rules_regex = re.compile('|'.join(['(%s)' % x[0]
                                       for x in yacc_rules_tokens]))

    for match in rules_regex.finditer(string):
        token_num, value = [x for x in enumerate(match.groups()) if x[1]][0]
        yield yacc_rules_tokens[token_num][1], value
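
# Illustrative example (added here, not part of the original gist): for a
# rules-section snippet such as "stmt: expr ';' ;",
# list(get_rules_token_list(...)) comes out roughly as
#
#     [(T_IDENTIFIER, 'stmt'), (':', ':'), (T_IDENTIFIER, 'expr'),
#      (T_CHAR, "';'"), (';', ';')]
#
# Named tokens carry the integer constants defined above; the single-character
# tokens from CHAR_TOKENS use the character itself as the token type.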

def extract_sections_from_file(filename):
    '''Open the file and divide it into sections by %%\n'''
    input_buffers = [[]]
    input_mode = 0

    for line in open(filename):
        if line == '%%\n':
            input_mode += 1
            input_buffers.append([])
        input_buffers[input_mode].append(line)

    return [''.join(x) for x in input_buffers]
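
# For reference (the standard yacc/bison file layout, noted here rather than
# taken from the gist itself):
#
#     declarations      <-- sections[0]
#     %%
#     rules             <-- sections[1], the part this script tokenizes
#     %%
#     user code         <-- sections[2], optional
#
# Each '%%\n' line opens a new buffer, and the '%%' line itself ends up at the
# top of the section that follows it.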

def test_rebuild_tokenized_rules(token_list, indentation=' '):
    '''
    Test case.

    Takes the output of get_rules_token_list() and rebuilds the yacc definition.
    Diff the output of this against the original file.
    '''
    last_token_type = None
    for token_type, value in token_list:
        if type(token_type) == str and token_type in CHAR_TOKENS:
            sys.stdout.write("\n" + indentation)
        elif last_token_type == ';':
            sys.stdout.write("\n\n")
        else:
            sys.stdout.write(" ")
        sys.stdout.write(value)
        last_token_type = token_type

def test_rebuild_parsed_rules(parsed, indentation=' '):
    '''
    Test case.

    Takes the output of parse_rules() or equivalent format and rebuilds the
    yacc definition represented by this structure.
    Diff the output of this against the original file.
    '''
    for rule_name, components in parsed:
        sys.stdout.write(rule_name + "\n")
        before = ':'
        for component_tokens in components:
            sys.stdout.write(indentation + before + ' ')
            sys.stdout.write(' '.join(component_tokens))
            sys.stdout.write('\n')
            before = '|'
        sys.stdout.write(indentation + ';\n\n')
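
# Illustrative example (assumed input, not from the original gist): calling
# test_rebuild_parsed_rules([['stmt', [['expr', "';'"]]]]) prints
#
#     stmt
#      : expr ';'
#      ;
#
# where the single leading space comes from the default `indentation` argument.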

class ParseError(Exception):
    pass

def parse_rules(token_list, verbose=True):
    '''
    Takes a token list and parses it into a rule list with this format:

        [[rule_name, components], ...]

    Where components is:

        [[token, ...], ...]
    '''
    expect = [T_IDENTIFIER]
    rules = []
    current_rule_name = None
    current_rule_components = []
    current_component_tokens = []

    def debug(*args):
        if verbose:
            print(' '.join([str(x) for x in args]))

    for token_id, value in token_list:
        if token_id == T_COMMENT:
            continue

        if token_id not in expect:
            raise ParseError("Expecting %s got %r" % (expect, token_id))

        if token_id == T_IDENTIFIER and not current_rule_name:
            current_rule_name = value
            debug(" \\ Entering rule", value)
            expect = [':']
        elif token_id in (T_IDENTIFIER, T_CHAR):
            debug(" | Token:", value)
            current_component_tokens.append(value)
            expect = [T_IDENTIFIER, T_CHAR, '|', ';']

        if token_id == ':':
            expect = [T_IDENTIFIER, T_CHAR]

        if token_id in ('|', ';'):
            debug(" < End component", current_component_tokens)
            expect = [T_IDENTIFIER, T_CHAR]
            current_rule_components.append(current_component_tokens)
            current_component_tokens = []

        if token_id == ';':
            debug(" / End rule", current_rule_name)
            expect = [T_IDENTIFIER]
            rules.append([current_rule_name, current_rule_components])
            current_rule_name = None
            current_rule_components = []

    return rules
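
# Illustrative example (assumed grammar, not from the original gist): the
# tokens of
#
#     expr
#         : expr '+' term
#         | term
#         ;
#
# parse into
#
#     [['expr', [['expr', "'+'", 'term'], ['term']]]]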

def remove_left_recursion(parsed):
    '''
    Takes a rule list from parse_rules() and tries to convert
    left-recursive rules into right-recursive ones.
    '''
    new_parsed = []
    for rule_name, components in parsed:
        has_left_recursion = False
        for component_tokens in components:
            if component_tokens[0] == rule_name:
                has_left_recursion = True
                break

        if has_left_recursion:
            # betas = components that aren't left recursive
            betas = [x for x in components if x[0] != rule_name]
            # alphas = the tails of the components that are.
            alphas = [x[1:] for x in components if x[0] == rule_name]

            if alphas == betas:
                # is the content of the tail going to be the same?
                # if so, just make one right-recursive rule
                components = flatten([[x, x + [rule_name]] for x in alphas])
            else:
                # for every other case, add a tail rule
                # a few cases could be simplified but i'm keeping this naive
                # pick a name for the tail that hopefully doesn't conflict
                tail_name = rule_name + "__tail"
                components = flatten([[x, x + [tail_name]] for x in betas])
                tail = flatten([[x, x + [tail_name]] for x in alphas])
                new_parsed.append([tail_name, tail])
                # test this:
                #test_rebuild_parsed_rules([[rule_name, components]])
                #test_rebuild_parsed_rules([[tail_name, tail]])

        # components is either the original or the modified ones.
        new_parsed.append([rule_name, components])

    return new_parsed
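
# Illustrative example (assumed grammar, not from the original gist): a
# left-recursive rule such as
#
#     expr: expr '+' term | term ;
#
# comes out right-recursive, with the recursion moved into a new tail rule:
#
#     expr:       term | term expr__tail ;
#     expr__tail: '+' term | '+' term expr__tail ;
#
# When the alphas and betas happen to be identical (e.g. "list: list item | item"),
# no tail rule is added and the rule simply becomes "list: item | item list".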

def main(filename):
    sections = extract_sections_from_file(filename)
    token_list = get_rules_token_list(sections[1])

    # test that:
    #return test_rebuild_tokenized_rules(token_list)

    parsed = parse_rules(token_list, verbose=False)
    parsed = remove_left_recursion(parsed)
    return test_rebuild_parsed_rules(parsed)


if __name__ == '__main__':
    main(sys.argv[1])
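
# Usage sketch (the gist doesn't show the script's own filename, so
# "parse_yacc.py" below is just a placeholder):
#
#     python parse_yacc.py grammar.y > grammar_rr.y
#
# The rewritten rules section is printed to stdout; diff it against the rules
# section of the input grammar to see what changed.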