Skip to content

Instantly share code, notes, and snippets.

@jorendorff
Last active October 8, 2019 17:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Save jorendorff/403e1ea0c3c42dad730b3be8b6dc8cf0 to your computer and use it in GitHub Desktop.
"""generate_js_parser_tables.py - Generate tables from the ES grammar."""
import argparse
import os
import jsparagus.gen
import jsparagus.grammar
from .parse_esgrammar import parse_esgrammar
from .lexer import ECMASCRIPT_FULL_KEYWORDS, ECMASCRIPT_CONDITIONAL_KEYWORDS
# Goal nonterminals: the entry points of the grammar, passed to
# parse_esgrammar() as `goals` (see main() below).
ECMASCRIPT_GOAL_NTS = [
    'Script',
    'Module',
    # 'FormalParameters',
    # 'FunctionBody',
]

# Synthetic terminals, passed to parse_esgrammar() as `synthetic_terminals`:
# each maps a terminal name used in the grammar to the set of concrete token
# names it stands for.  `IdentifierName` covers plain names plus all keywords
# (full and conditional); `Identifier` covers plain names plus only the
# conditional keywords (which may be used as identifiers in some contexts).
ECMASCRIPT_SYNTHETIC_TERMINALS = {
    'IdentifierName': {
        'Name',
        *ECMASCRIPT_FULL_KEYWORDS,
        *ECMASCRIPT_CONDITIONAL_KEYWORDS
    },
    'Identifier': {
        'Name',
        *ECMASCRIPT_CONDITIONAL_KEYWORDS
    }
}
def hack_grammar(g):
    """Return a copy of grammar `g` with most boolean parameters removed.

    We throw away most of the boolean parameters in the grammar, as the
    current parser generator's approach of fully expanding them is a huge
    pain.  Only the parameters in PARAM_WHITELIST survive.
    """
    PARAM_WHITELIST = ['In', 'Default']

    def keep_params(params):
        # Keep only whitelisted parameter names.
        return tuple(p for p in params if p in PARAM_WHITELIST)

    def keep_args(args):
        # Each arg is a (name, value) pair; keep only whitelisted names.
        return tuple(pair for pair in args if pair[0] in PARAM_WHITELIST)

    def strip_element(e):
        """Strip nt arguments."""
        if isinstance(e, jsparagus.grammar.Nt):
            return jsparagus.grammar.Nt(e.name, keep_args(e.args))
        if isinstance(e, jsparagus.grammar.Optional):
            return jsparagus.grammar.Optional(strip_element(e.inner))
        return e

    def strip_condition(c):
        # Drop conditions that test a discarded parameter.
        if c is not None and c[0] in PARAM_WHITELIST:
            return c
        return None

    def strip_production(p):
        """Discard production conditions and nt arguments."""
        new_body = [strip_element(e) for e in p.body]
        return jsparagus.grammar.Production(
            new_body, p.reducer, condition=strip_condition(p.condition))

    new_nonterminals = {}
    for name, nt_def in g.nonterminals.items():
        new_nonterminals[name] = jsparagus.grammar.NtDef(
            list(keep_params(nt_def.params)),
            [strip_production(p) for p in nt_def.rhs_list],
            nt_def.type)
    return g.with_nonterminals(new_nonterminals)
def investigate2(states):
    """Print what lookbehind distinguishes "division context" parser states
    from "regexp context" parser states.

    In ECMAScript, when the lexer sees `/` it must know whether the parser
    expects a division operator or a RegularExpressionLiteral.  This walks
    the LR state graph backwards from both kinds of state and reports, for
    each case, the shortest symbol sequences that decide it.  Debug output
    only; `states` is a jsparagus ParserStates object.
    """
    from collections import defaultdict
    grammar = states.grammar

    # back_edges[dest][symbol] = set of state ids that reach `dest` by
    # shifting terminal `symbol` (action_row) or by a nonterminal goto
    # (ctn_row).
    back_edges = defaultdict(lambda: defaultdict(set))
    for state in states.states:
        for t, action in state.action_row.items():
            if action >= 0:  # shift
                back_edges[action][t].add(state.id)
        for nt, next_state in state.ctn_row.items():
            back_edges[next_state][nt].add(state.id)

    def all_back_edges(ids):
        # Union of the back-edge maps of every state in `ids`.
        result = defaultdict(set)
        for i in ids:
            for symbol, js in back_edges[i].items():
                result[symbol] |= js
        return result

    def report(seq, what):
        print(seq, "=>", what)

    def distinguish(a, b, a0, b0, postfix):
        """See what lookbehind distinguishes two sets of states."""
        # a/b: current frontier state-sets for the two classes;
        # a0/b0: the original "div"/"re" state-sets; postfix: the symbols
        # consumed so far on the way back (rightmost first in output).
        skip_set = set()
        if postfix:
            # If one frontier lies entirely inside one of the original
            # contexts, the context itself is the distinguishing fact;
            # record it in skip_set so we don't re-report it per symbol.
            for name, target_set in [('div', a0), ('re', b0)]:
                if a & target_set and not (b & target_set):
                    yield("<{} context> {}".format(name, grammar.rhs_to_str(postfix)), "div")
                    skip_set.add((name, "div"))
                if b & target_set and not (a & target_set):
                    yield("<{} context> {}".format(name, grammar.rhs_to_str(postfix)), "re")
                    skip_set.add((name, "re"))
        za = all_back_edges(a)
        zb = all_back_edges(b)
        # Symbols reachable from only one side.
        zau = set(za.keys()) - set(zb.keys())
        zbu = set(zb.keys()) - set(za.keys())
        # Collapse the larger unique set into a single "<other>" line.
        # NOTE(review): the second branch tests `len(zbu) > 1`, not
        # `len(zbu) > len(zau)` — looks asymmetric with the first branch;
        # possibly a typo to confirm (harmless for debug output).
        if len(zau) > len(zbu):
            others = 'div'
            for symbol in zau:
                del za[symbol]
        elif len(zbu) > 1:
            others = 're'
            for symbol in zbu:
                del zb[symbol]
        else:
            others = None
        for symbol in sorted(za, key=repr):
            if symbol not in zb:
                pre = za[symbol]
                # Skip symbols already explained by a context line above.
                if pre & a0 and not (pre & b0) and ('div', 'div') in skip_set:
                    pass
                elif pre & b0 and not (pre & a0) and ('re', 'div') in skip_set:
                    pass
                else:
                    yield grammar.rhs_to_str([symbol] + postfix), "div"
        for symbol in sorted(zb, key=repr):
            if symbol not in za:
                pre = zb[symbol]
                if pre & a0 and not (pre & b0) and ('div', 're') in skip_set:
                    pass
                elif pre & b0 and not (pre & a0) and ('re', 're') in skip_set:
                    pass
                else:
                    yield grammar.rhs_to_str([symbol] + postfix), "re"
            else:
                # Symbol reachable from both sides: recurse one more step
                # back to find what distinguishes the predecessors.
                yield from distinguish(za[symbol], zb[symbol], a0, b0, [symbol] + postfix)
        if others is not None:
            yield "<other> " + grammar.rhs_to_str(postfix), others

    # States expecting division (`?` / `/=` shiftable) vs. states expecting
    # a RegularExpressionLiteral.
    div_ss = {s.id for s in states.states if '?' in s.action_row or '/=' in s.action_row}
    re_ss = {s.id for s in states.states if 'RegularExpressionLiteral' in s.action_row}
    # A small overlap is tolerated, then removed from the "div" side.
    assert len(div_ss & re_ss) <= 4
    div_ss -= re_ss
    for i, o in distinguish(div_ss, re_ss, div_ss, re_ss, []):
        report(i, o)
def investigate(states):
    """Grammar-level (not state-graph) analysis of the `/` ambiguity.

    Computes which nonterminals can match the empty string, then start/finish
    sets (which terminals/nonterminals a nonterminal can begin/end with), and
    prints which tokens can appear immediately before `RegularExpressionLiteral`
    and before `/`, and the intersection.  Debug output only.
    """
    from jsparagus import grammar
    import collections
    import walk_graph

    # Figure out which nonterminals can match the empty string.
    empty_nts = {}
    implications = []
    for nt, nt_def in states.grammar.nonterminals.items():
        for prod in nt_def.rhs_list:
            rhs = [e for e in prod.body if grammar.is_concrete_element(e)]
            if len(rhs) == 0:
                # Directly empty production.
                empty_nts[nt] = True
            elif all(isinstance(e, grammar.Nt) for e in rhs):
                # nt is empty if every nonterminal on this rhs is empty.
                implications.append((rhs, nt))
    # Propagate implications to a fixed point.
    done = False
    while not done:
        done = True
        for rhs, nt in implications:
            if nt not in empty_nts and all(empty_nts.get(e) for e in rhs):
                done = False
                empty_nts[nt] = True

    def strip_body(body):
        # Drop non-concrete elements (e.g. lookahead/condition markers).
        return [e for e in body if grammar.is_concrete_element(e)]

    # Produce start sets and (analogous) finish sets
    # *_contains: terminals directly observed; *_includes: nonterminals whose
    # own set must be folded in (resolved by walk_graph.least_sets below —
    # presumably a least-fixed-point over set-inclusion constraints; the
    # module is not visible here, so confirm against walk_graph).
    start_contains = collections.defaultdict(set)
    start_includes = collections.defaultdict(set)
    finish_contains = collections.defaultdict(set)
    finish_includes = collections.defaultdict(set)
    for nt, nt_def in states.grammar.nonterminals.items():
        for prod in nt_def.rhs_list:
            # Skip error-recovery productions.
            if prod.body and isinstance(prod.body[-1], grammar.ErrorSymbol):
                continue
            rhs = strip_body(prod.body)
            # Scan forward: everything up to (and including) the first
            # non-empty element can start this nonterminal.
            for e in rhs:
                if isinstance(e, str):
                    start_contains[nt].add(e)
                    break
                elif isinstance(e, grammar.Nt):
                    start_includes[nt].add(e)
                    if e not in empty_nts:
                        break
                else:
                    break
            # Gather information for finish set
            for e in reversed(rhs):
                if isinstance(e, str):
                    finish_contains[nt].add(e)
                    break
                elif isinstance(e, grammar.Nt):
                    finish_includes[nt].add(e)
                    if e not in empty_nts:
                        break
                else:
                    break
    start = walk_graph.least_sets(start_contains, start_includes)
    finish = walk_graph.least_sets(finish_contains, finish_includes)
    for nt in sorted(finish.keys(), key=repr):
        print(nt, "starts with", sorted(start[nt]))
        print(nt, "ends with", sorted(finish[nt]))

    # from stackoverflow
    from itertools import islice

    def window(seq, n=2):
        "Returns a sliding window (of width n) over data from the iterable"
        " s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ... "
        it = iter(seq)
        result = tuple(islice(it, n))
        if len(result) == n:
            yield result
        for elem in it:
            result = result[1:] + (elem,)
            yield result

    def which_can_appear_before(t):
        # elements that can start with `t`
        targets = {t} | {nt for nt in start if t in start[nt]}
        # elements that occur just before an element that can start with `t`
        preceders = {
            e1
            for nt, nt_def in states.grammar.nonterminals.items()
            for prod in nt_def.rhs_list
            for e1, e2 in window(strip_body(prod.body))
            if e2 in targets
        }

        def expand_element(e):
            # Expand a preceding element down to concrete token names:
            # nonterminals expand to their finish sets (recursively),
            # synthetic terminals to their token sets.
            if e in states.grammar.nonterminals:
                return {e2 for e1 in finish[e] for e2 in expand_element(e1)}
            elif e in ECMASCRIPT_SYNTHETIC_TERMINALS:
                return ECMASCRIPT_SYNTHETIC_TERMINALS[e]
            else:
                return [e]

        return {
            t
            for e in preceders
            for t in expand_element(e)
        }

    print("nonterminals that can begin with RegularExpressionLiteral:")
    for nt in sorted(start.keys(), key=repr):
        if 'RegularExpressionLiteral' in start[nt]:
            print(nt)
    print()
    print("things that can appear before RegularExpressionLiteral:")
    re_prec = which_can_appear_before('RegularExpressionLiteral')
    for e in sorted(re_prec, key=repr):
        print(e)
    print()
    print("things that can appear before `/`:")
    div_prec = which_can_appear_before('/')
    for e in sorted(div_prec, key=repr):
        print(e)
    print()
    print("both:")
    for e in sorted(re_prec & div_prec, key=repr):
        print(e)
    print()
def main():
    """Command-line entry point.

    Parses arguments, loads the grammar (from .esgrammar source or a
    .jsparagus_dump pickle), runs investigate2() on the parser states, and
    writes parser tables to the output file (.py, .rs, or .jsparagus_dump).
    """
    # Read command-line options.
    argparser = argparse.ArgumentParser(
        description='Ponder the ECMAScript grammar.',
        allow_abbrev=False)
    default_filename = os.path.join(os.path.dirname(__file__),
                                    "es-simplified.esgrammar")
    argparser.add_argument(
        'filename', metavar='FILE', nargs='?', default=default_filename,
        help=".esgrammar (or .jsparagus_dump) input file")
    argparser.add_argument(
        '-o', '--output', metavar='FILE', default='/dev/stdout',
        help="output filename for parser tables")
    argparser.add_argument(
        '-v', '--verbose', action='store_true',
        help="print some debug output")
    argparser.add_argument(
        '--progress', action='store_true',
        help="print a dot each time a state is analyzed (thousands of them)")
    args = argparser.parse_args()

    # Check filenames: input extension selects the load path...
    in_filename = args.filename
    from_source = in_filename.endswith('.esgrammar')
    if not from_source and not in_filename.endswith('.jsparagus_dump'):
        raise ValueError("input file extension should be .esgrammar or .jsparagus_dump")

    # ...and output extension selects the code-generation target.
    out_filename = args.output
    for suffix, target in [('.py', 'python'),
                           ('.rs', 'rust'),
                           ('.jsparagus_dump', 'dump')]:
        if out_filename.endswith(suffix):
            break
    else:
        raise ValueError("-o file extension should be .py, .rs, or .jsparagus_dump")

    # Load input and analyze it.
    if from_source:
        with open(in_filename) as f:
            grammar_text = f.read()
        grammar = parse_esgrammar(
            grammar_text,
            filename=args.filename,
            goals=ECMASCRIPT_GOAL_NTS,
            synthetic_terminals=ECMASCRIPT_SYNTHETic_TERMINALS if False else ECMASCRIPT_SYNTHETIC_TERMINALS)
        grammar = hack_grammar(grammar)
        if args.verbose:
            grammar.dump()
        states = jsparagus.gen.generate_parser_states(
            grammar, verbose=args.verbose, progress=args.progress)
    else:
        states = jsparagus.gen.ParserStates.load(in_filename)

    investigate2(states)

    # Generate output.
    try:
        if target == 'dump':
            states.save(out_filename)
        else:
            with open(out_filename, 'w') as f:
                jsparagus.gen.generate_parser(f, states,
                                              target=target,
                                              verbose=args.verbose)
    except Exception:
        # On failure, don't leave a partial output file lying around.
        try:
            os.remove(out_filename)
        except Exception:
            pass
        raise


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment