Skip to content

Instantly share code, notes, and snippets.

@jorendorff
Last active October 8, 2019 17:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Save jorendorff/403e1ea0c3c42dad730b3be8b6dc8cf0 to your computer and use it in GitHub Desktop.
"""generate_js_parser_tables.py - Generate tables from the ES grammar."""
import argparse
import os
import jsparagus.gen
import jsparagus.grammar
from .parse_esgrammar import parse_esgrammar
from .lexer import ECMASCRIPT_FULL_KEYWORDS, ECMASCRIPT_CONDITIONAL_KEYWORDS
# Goal nonterminals: the entry points of the grammar, passed to
# parse_esgrammar() as `goals` (see main() below).
ECMASCRIPT_GOAL_NTS = [
    'Script',
    'Module',
    # 'FormalParameters',
    # 'FunctionBody',
]

# Synthetic terminals, passed to parse_esgrammar() as `synthetic_terminals`:
# each maps a terminal name used in the grammar to the set of concrete token
# names it stands for.  `IdentifierName` covers plain names plus all keywords
# (full and conditional); `Identifier` covers plain names plus only the
# conditional keywords (which may be used as identifiers in some contexts).
ECMASCRIPT_SYNTHETIC_TERMINALS = {
    'IdentifierName': {
        'Name',
        *ECMASCRIPT_FULL_KEYWORDS,
        *ECMASCRIPT_CONDITIONAL_KEYWORDS
    },
    'Identifier': {
        'Name',
        *ECMASCRIPT_CONDITIONAL_KEYWORDS
    }
}
def hack_grammar(g):
    """Return a copy of grammar `g` with most boolean parameters removed.

    We throw away most of the boolean parameters in the grammar, as the
    current parser generator's approach of fully expanding them is a huge
    pain.  Only the parameters in PARAM_WHITELIST survive.
    """
    PARAM_WHITELIST = ['In', 'Default']

    def keep_params(params):
        # Keep only whitelisted parameter names.
        return tuple(p for p in params if p in PARAM_WHITELIST)

    def keep_args(args):
        # Each arg is a (name, value) pair; keep only whitelisted names.
        return tuple(pair for pair in args if pair[0] in PARAM_WHITELIST)

    def strip_element(e):
        """Strip nt arguments."""
        if isinstance(e, jsparagus.grammar.Nt):
            return jsparagus.grammar.Nt(e.name, keep_args(e.args))
        if isinstance(e, jsparagus.grammar.Optional):
            return jsparagus.grammar.Optional(strip_element(e.inner))
        return e

    def strip_condition(c):
        # Drop conditions that test a discarded parameter.
        if c is not None and c[0] in PARAM_WHITELIST:
            return c
        return None

    def strip_production(p):
        """Discard production conditions and nt arguments."""
        new_body = [strip_element(e) for e in p.body]
        return jsparagus.grammar.Production(
            new_body, p.reducer, condition=strip_condition(p.condition))

    new_nonterminals = {}
    for name, nt_def in g.nonterminals.items():
        new_nonterminals[name] = jsparagus.grammar.NtDef(
            list(keep_params(nt_def.params)),
            [strip_production(p) for p in nt_def.rhs_list],
            nt_def.type)
    return g.with_nonterminals(new_nonterminals)
def investigate2(states):
    """Print what lookbehind distinguishes "division context" parser states
    from "regexp context" parser states.

    In ECMAScript, when the lexer sees `/` it must know whether the parser
    expects a division operator or a RegularExpressionLiteral.  This walks
    the LR state graph backwards from both kinds of state and reports, for
    each case, the shortest symbol sequences that decide it.  Debug output
    only; `states` is a jsparagus ParserStates object.
    """
    from collections import defaultdict
    grammar = states.grammar

    # back_edges[dest][symbol] = set of state ids that reach `dest` by
    # shifting terminal `symbol` (action_row) or by a nonterminal goto
    # (ctn_row).
    back_edges = defaultdict(lambda: defaultdict(set))
    for state in states.states:
        for t, action in state.action_row.items():
            if action >= 0:  # shift
                back_edges[action][t].add(state.id)
        for nt, next_state in state.ctn_row.items():
            back_edges[next_state][nt].add(state.id)

    def all_back_edges(ids):
        # Union of the back-edge maps of every state in `ids`.
        result = defaultdict(set)
        for i in ids:
            for symbol, js in back_edges[i].items():
                result[symbol] |= js
        return result

    def report(seq, what):
        print(seq, "=>", what)

    def distinguish(a, b, a0, b0, postfix):
        """See what lookbehind distinguishes two sets of states."""
        # a/b: current frontier state-sets for the two classes;
        # a0/b0: the original "div"/"re" state-sets; postfix: the symbols
        # consumed so far on the way back (rightmost first in output).
        skip_set = set()
        if postfix:
            # If one frontier lies entirely inside one of the original
            # contexts, the context itself is the distinguishing fact;
            # record it in skip_set so we don't re-report it per symbol.
            for name, target_set in [('div', a0), ('re', b0)]:
                if a & target_set and not (b & target_set):
                    yield("<{} context> {}".format(name, grammar.rhs_to_str(postfix)), "div")
                    skip_set.add((name, "div"))
                if b & target_set and not (a & target_set):
                    yield("<{} context> {}".format(name, grammar.rhs_to_str(postfix)), "re")
                    skip_set.add((name, "re"))
        za = all_back_edges(a)
        zb = all_back_edges(b)
        # Symbols reachable from only one side.
        zau = set(za.keys()) - set(zb.keys())
        zbu = set(zb.keys()) - set(za.keys())
        # Collapse the larger unique set into a single "<other>" line.
        # NOTE(review): the second branch tests `len(zbu) > 1`, not
        # `len(zbu) > len(zau)` — looks asymmetric with the first branch;
        # possibly a typo to confirm (harmless for debug output).
        if len(zau) > len(zbu):
            others = 'div'
            for symbol in zau:
                del za[symbol]
        elif len(zbu) > 1:
            others = 're'
            for symbol in zbu:
                del zb[symbol]
        else:
            others = None
        for symbol in sorted(za, key=repr):
            if symbol not in zb:
                pre = za[symbol]
                # Skip symbols already explained by a context line above.
                if pre & a0 and not (pre & b0) and ('div', 'div') in skip_set:
                    pass
                elif pre & b0 and not (pre & a0) and ('re', 'div') in skip_set:
                    pass
                else:
                    yield grammar.rhs_to_str([symbol] + postfix), "div"
        for symbol in sorted(zb, key=repr):
            if symbol not in za:
                pre = zb[symbol]
                if pre & a0 and not (pre & b0) and ('div', 're') in skip_set:
                    pass
                elif pre & b0 and not (pre & a0) and ('re', 're') in skip_set:
                    pass
                else:
                    yield grammar.rhs_to_str([symbol] + postfix), "re"
            else:
                # Symbol reachable from both sides: recurse one more step
                # back to find what distinguishes the predecessors.
                yield from distinguish(za[symbol], zb[symbol], a0, b0, [symbol] + postfix)
        if others is not None:
            yield "<other> " + grammar.rhs_to_str(postfix), others

    # States expecting division (`?` / `/=` shiftable) vs. states expecting
    # a RegularExpressionLiteral.
    div_ss = {s.id for s in states.states if '?' in s.action_row or '/=' in s.action_row}
    re_ss = {s.id for s in states.states if 'RegularExpressionLiteral' in s.action_row}
    # A small overlap is tolerated, then removed from the "div" side.
    assert len(div_ss & re_ss) <= 4
    div_ss -= re_ss
    for i, o in distinguish(div_ss, re_ss, div_ss, re_ss, []):
        report(i, o)
def investigate(states):
    """Grammar-level (not state-graph) analysis of the `/` ambiguity.

    Computes which nonterminals can match the empty string, then start/finish
    sets (which terminals/nonterminals a nonterminal can begin/end with), and
    prints which tokens can appear immediately before `RegularExpressionLiteral`
    and before `/`, and the intersection.  Debug output only.
    """
    from jsparagus import grammar
    import collections
    import walk_graph

    # Figure out which nonterminals can match the empty string.
    empty_nts = {}
    implications = []
    for nt, nt_def in states.grammar.nonterminals.items():
        for prod in nt_def.rhs_list:
            rhs = [e for e in prod.body if grammar.is_concrete_element(e)]
            if len(rhs) == 0:
                # Directly empty production.
                empty_nts[nt] = True
            elif all(isinstance(e, grammar.Nt) for e in rhs):
                # nt is empty if every nonterminal on this rhs is empty.
                implications.append((rhs, nt))
    # Propagate implications to a fixed point.
    done = False
    while not done:
        done = True
        for rhs, nt in implications:
            if nt not in empty_nts and all(empty_nts.get(e) for e in rhs):
                done = False
                empty_nts[nt] = True

    def strip_body(body):
        # Drop non-concrete elements (e.g. lookahead/condition markers).
        return [e for e in body if grammar.is_concrete_element(e)]

    # Produce start sets and (analogous) finish sets
    # *_contains: terminals directly observed; *_includes: nonterminals whose
    # own set must be folded in (resolved by walk_graph.least_sets below —
    # presumably a least-fixed-point over set-inclusion constraints; the
    # module is not visible here, so confirm against walk_graph).
    start_contains = collections.defaultdict(set)
    start_includes = collections.defaultdict(set)
    finish_contains = collections.defaultdict(set)
    finish_includes = collections.defaultdict(set)
    for nt, nt_def in states.grammar.nonterminals.items():
        for prod in nt_def.rhs_list:
            # Skip error-recovery productions.
            if prod.body and isinstance(prod.body[-1], grammar.ErrorSymbol):
                continue
            rhs = strip_body(prod.body)
            # Scan forward: everything up to (and including) the first
            # non-empty element can start this nonterminal.
            for e in rhs:
                if isinstance(e, str):
                    start_contains[nt].add(e)
                    break
                elif isinstance(e, grammar.Nt):
                    start_includes[nt].add(e)
                    if e not in empty_nts:
                        break
                else:
                    break
            # Gather information for finish set
            for e in reversed(rhs):
                if isinstance(e, str):
                    finish_contains[nt].add(e)
                    break
                elif isinstance(e, grammar.Nt):
                    finish_includes[nt].add(e)
                    if e not in empty_nts:
                        break
                else:
                    break
    start = walk_graph.least_sets(start_contains, start_includes)
    finish = walk_graph.least_sets(finish_contains, finish_includes)
    for nt in sorted(finish.keys(), key=repr):
        print(nt, "starts with", sorted(start[nt]))
        print(nt, "ends with", sorted(finish[nt]))

    # from stackoverflow
    from itertools import islice

    def window(seq, n=2):
        "Returns a sliding window (of width n) over data from the iterable"
        " s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ... "
        it = iter(seq)
        result = tuple(islice(it, n))
        if len(result) == n:
            yield result
        for elem in it:
            result = result[1:] + (elem,)
            yield result

    def which_can_appear_before(t):
        # elements that can start with `t`
        targets = {t} | {nt for nt in start if t in start[nt]}
        # elements that occur just before an element that can start with `t`
        preceders = {
            e1
            for nt, nt_def in states.grammar.nonterminals.items()
            for prod in nt_def.rhs_list
            for e1, e2 in window(strip_body(prod.body))
            if e2 in targets
        }

        def expand_element(e):
            # Expand a preceding element down to concrete token names:
            # nonterminals expand to their finish sets (recursively),
            # synthetic terminals to their token sets.
            if e in states.grammar.nonterminals:
                return {e2 for e1 in finish[e] for e2 in expand_element(e1)}
            elif e in ECMASCRIPT_SYNTHETIC_TERMINALS:
                return ECMASCRIPT_SYNTHETIC_TERMINALS[e]
            else:
                return [e]

        return {
            t
            for e in preceders
            for t in expand_element(e)
        }

    print("nonterminals that can begin with RegularExpressionLiteral:")
    for nt in sorted(start.keys(), key=repr):
        if 'RegularExpressionLiteral' in start[nt]:
            print(nt)
    print()
    print("things that can appear before RegularExpressionLiteral:")
    re_prec = which_can_appear_before('RegularExpressionLiteral')
    for e in sorted(re_prec, key=repr):
        print(e)
    print()
    print("things that can appear before `/`:")
    div_prec = which_can_appear_before('/')
    for e in sorted(div_prec, key=repr):
        print(e)
    print()
    print("both:")
    for e in sorted(re_prec & div_prec, key=repr):
        print(e)
    print()
def main():
    """Command-line entry point.

    Parses arguments, loads the grammar (from .esgrammar source or a
    .jsparagus_dump pickle), runs investigate2() on the parser states, and
    writes parser tables to the output file (.py, .rs, or .jsparagus_dump).
    """
    # Read command-line options.
    argparser = argparse.ArgumentParser(
        description='Ponder the ECMAScript grammar.',
        allow_abbrev=False)
    default_filename = os.path.join(os.path.dirname(__file__),
                                    "es-simplified.esgrammar")
    argparser.add_argument(
        'filename', metavar='FILE', nargs='?', default=default_filename,
        help=".esgrammar (or .jsparagus_dump) input file")
    argparser.add_argument(
        '-o', '--output', metavar='FILE', default='/dev/stdout',
        help="output filename for parser tables")
    argparser.add_argument(
        '-v', '--verbose', action='store_true',
        help="print some debug output")
    argparser.add_argument(
        '--progress', action='store_true',
        help="print a dot each time a state is analyzed (thousands of them)")
    args = argparser.parse_args()

    # Check filenames: input extension selects the load path...
    in_filename = args.filename
    from_source = in_filename.endswith('.esgrammar')
    if not from_source and not in_filename.endswith('.jsparagus_dump'):
        raise ValueError("input file extension should be .esgrammar or .jsparagus_dump")

    # ...and output extension selects the code-generation target.
    out_filename = args.output
    for suffix, target in [('.py', 'python'),
                           ('.rs', 'rust'),
                           ('.jsparagus_dump', 'dump')]:
        if out_filename.endswith(suffix):
            break
    else:
        raise ValueError("-o file extension should be .py, .rs, or .jsparagus_dump")

    # Load input and analyze it.
    if from_source:
        with open(in_filename) as f:
            grammar_text = f.read()
        grammar = parse_esgrammar(
            grammar_text,
            filename=args.filename,
            goals=ECMASCRIPT_GOAL_NTS,
            synthetic_terminals=ECMASCRIPT_SYNTHETic_TERMINALS if False else ECMASCRIPT_SYNTHETIC_TERMINALS)
        grammar = hack_grammar(grammar)
        if args.verbose:
            grammar.dump()
        states = jsparagus.gen.generate_parser_states(
            grammar, verbose=args.verbose, progress=args.progress)
    else:
        states = jsparagus.gen.ParserStates.load(in_filename)

    investigate2(states)

    # Generate output.
    try:
        if target == 'dump':
            states.save(out_filename)
        else:
            with open(out_filename, 'w') as f:
                jsparagus.gen.generate_parser(f, states,
                                              target=target,
                                              verbose=args.verbose)
    except Exception:
        # On failure, don't leave a partial output file lying around.
        try:
            os.remove(out_filename)
        except Exception:
            pass
        raise


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment