Last active
August 17, 2022 08:36
-
-
Save vrthra/5119b747ee536f7c5533379247a55760 to your computer and use it in GitHub Desktop.
Earley Parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Earley parser with Leo's right-recursion optimization (debugging gist)."""
import re, string

# Default start symbol every grammar is expected to define.
START_SYMBOL = '<start>'
# Captures a nonterminal token such as '<expr>' inside an expansion string.
RE_NONTERMINAL = re.compile('(<[^<> ]*>)')
# The empty string stands in for the empty (epsilon) expansion.
EPSILON = ''
def fixpoint(f):
    """Decorator: keep applying ``f`` to its argument until the stringified
    result stops changing, then return the stable value."""
    def iterate(value):
        while True:
            before = str(value)
            updated = f(value)
            if str(updated) == before:
                return value
            value = updated
    return iterate
def rules(grammar):
    """Flatten a grammar dict into a list of (nonterminal, expansion) pairs,
    one pair per alternative, preserving dict and alternative order."""
    pairs = []
    for nonterminal, expansions in grammar.items():
        for expansion in expansions:
            pairs.append((nonterminal, expansion))
    return pairs
def terminals(grammar):
    """Collect every token occurring in some expansion that is not itself a
    nonterminal (i.e. not a key of the grammar)."""
    result = set()
    for expansions in grammar.values():
        for expansion in expansions:
            for token in expansion:
                if token not in grammar:
                    result.add(token)
    return result
def nullable_expr(expr, nullables):
    """True iff every token of ``expr`` is already known to be nullable
    (vacuously true for an empty expansion)."""
    for token in expr:
        if token not in nullables:
            return False
    return True
def nullable(grammar):
    """Return the nullable symbols of a canonical grammar: nonterminals that
    can derive the empty string.  Computed as a fixpoint seeded with the
    EPSILON token itself."""
    productions = rules(grammar)

    @fixpoint
    def expand(known):
        # A symbol is nullable once some alternative is entirely nullable.
        for symbol, expansion in productions:
            if nullable_expr(expansion, known):
                known |= {symbol}
        return known

    return expand({EPSILON})
def canonical(grammar, letters=False):
    """Convert a string-based grammar into canonical form: every expansion
    becomes a list of tokens.  Nonterminals stay whole; terminal runs are
    split into single characters when ``letters`` is true, else kept whole."""
    nonterminal_re = re.compile('(<[^<> ]*>)')

    def split(expansion):
        # Expansions may be (string, metadata) tuples; use the string part.
        if isinstance(expansion, tuple):
            expansion = expansion[0]
        return [t for t in re.split(nonterminal_re, expansion) if t]

    def tokenize(word):
        return list(word) if letters else [word]

    def canonical_expr(expression):
        tokens = []
        for word in split(expression):
            if word in grammar:
                tokens.append(word)
            else:
                tokens.extend(tokenize(word))
        return tokens

    return {
        key: [canonical_expr(expansion) for expansion in expansions]
        for key, expansions in grammar.items()
    }
class Column(object):
    """One chart column: the Earley states ending at a given input position,
    plus Leo transitive items keyed by nonterminal name."""

    def __init__(self, index, letter):
        self.index = index
        self.letter = letter      # input symbol consumed to reach this column
        self.states = []          # insertion-ordered states
        self._unique = {}         # state -> state, for deduplication
        self.transitives = {}     # Leo items: nonterminal name -> state

    def add(self, state):
        # Deduplicate: hand back the already-stored equal state if present.
        if state in self._unique:
            return self._unique[state]
        self._unique[state] = state
        self.states.append(state)
        state.e_col = self
        return self._unique[state]

    def add_transitive(self, key, state):
        # Each nonterminal may have at most one transitive item per column.
        assert key not in self.transitives
        self.transitives[key] = state
        self.add(state)

    def __str__(self):
        listing = "\n".join(str(s) for s in self.states)
        return "%s chart[%d]\n%s" % (self.letter, self.index, listing)

    def __repr__(self):
        done = len([s for s in self.states if s.finished()])
        return "%s chart[%d]\n%d states with %d finished" % (
            self.letter, self.index, len(self.states), done)
class Item(object):
    """A dotted grammar production: ``name := expr`` with a dot position."""

    def __init__(self, name, expr, dot):
        self.name, self.expr, self.dot = name, expr, dot

    def finished(self):
        """True when the dot has passed the last token of the expansion."""
        return self.dot >= len(self.expr)

    def advance(self, tag):
        """Return a new Item with the dot moved one token to the right.
        ``tag`` is accepted for interface parity with State.advance but
        unused here.  (Removed a leftover debug ``assert False`` that made
        this method always raise; subclasses override it anyway.)"""
        return Item(self.name, self.expr, self.dot + 1)

    def at_dot(self):
        """The token immediately after the dot, or None when finished."""
        return self.expr[self.dot] if self.dot < len(self.expr) else None
class State(Item):
    """An Item bound to chart columns: ``s_col`` where it was predicted and
    ``e_col`` where it currently ends.  ``tag`` records which operation
    (predict/scan/complete) produced the state, for debugging output."""

    def __init__(self, name, expr, dot, s_col):
        super().__init__(name, expr, dot)
        self.s_col = s_col
        self.e_col = None
        self.tag = ''

    def _t(self):
        # Identity of a state: production, dot position, starting column.
        return (self.name, self.expr, self.dot, self.s_col.index)

    def __hash__(self):
        return hash(self._t())

    def __eq__(self, other):
        return self._t() == other._t()

    def _dotted(self):
        # "lhs:= a b | c d" with '|' marking the dot position.
        parts = [*self.expr[:self.dot], '|', *self.expr[self.dot:]]
        return self.name + ':= ' + ' '.join(str(p) for p in parts)

    def __str__(self):
        start = self.s_col.index if self.s_col else -1
        end = self.e_col.index if self.e_col else -1
        return self._dotted() + "(%d,%d) \t #%s" % (start, end, self.tag)

    def __repr__(self):
        start = self.s_col.index if self.s_col else -1
        end = self.e_col.index if self.e_col else -1
        return self._dotted() + "(%d,%d)" % (start, end)

    def copy(self):
        duplicate = State(self.name, self.expr, self.dot, self.s_col)
        duplicate.e_col = self.e_col
        return duplicate

    def advance(self, tag):
        moved = State(self.name, self.expr, self.dot + 1, self.s_col)
        moved.tag = tag
        return moved
class EarleyParser:
    """Chart parser implementing the classic Earley algorithm."""

    def __init__(self, grammar, **kwargs):
        self._grammar = grammar
        self._start_symbol = kwargs.get('start_symbol', START_SYMBOL)
        self.log = kwargs.get('log') or False
        self.tokens = kwargs.get('tokens') or set()
        # Canonical (character-tokenized) grammar plus its nullable symbols.
        self.cgrammar = canonical(grammar, letters=True)
        self.epsilon = nullable(self.cgrammar)

    def chart_parse(self, words, start):
        """Build and fill the chart for ``words`` from ``start``.
        NOTE(review): tuple(*...) assumes the start symbol has exactly one
        alternative — confirm for new grammars."""
        alt = tuple(*self.cgrammar[start])
        chart = [Column(i, tok) for i, tok in enumerate([None, *words])]
        chart[0].add(State(start, alt, 0, chart[0]))
        return self.fill_chart(chart)

    def predict(self, col, sym, state):
        """Add a fresh state for every alternative of ``sym``; if ``sym`` is
        nullable, also advance past it immediately (Aycock-Horspool fix)."""
        for alt in self.cgrammar[sym]:
            predicted = State(sym, tuple(alt), 0, col)
            predicted.tag = 'predict'
            col.add(predicted)
        if sym in self.epsilon:
            col.add(state.advance('*predict'))

    def scan(self, col, state, letter):
        """Advance ``state`` into ``col`` when its expected terminal matches
        the input letter that column consumes."""
        if letter == col.letter:
            col.add(state.advance('scan'))

    def complete(self, col, state):
        # Indirection point so subclasses can swap the completion strategy.
        return self.earley_complete(col, state)

    def earley_complete(self, col, state):
        """Advance every state in the finished state's start column that was
        waiting on the nonterminal it derives."""
        waiting = [
            s for s in state.s_col.states if s.at_dot() == state.name
        ]
        for parent in waiting:
            col.add(parent.advance('complete'))

    def fill_chart(self, chart):
        """Run predict/scan/complete over each column in order.  Iterating
        ``col.states`` while ``col.add`` appends to it is intentional:
        newly added states are processed in the same pass."""
        for i, col in enumerate(chart):
            for state in col.states:
                if state.finished():
                    self.complete(col, state)
                else:
                    sym = state.at_dot()
                    if sym in self.cgrammar:
                        self.predict(col, sym, state)
                    elif i + 1 < len(chart):
                        # Terminal: try to scan into the next column.
                        self.scan(chart[i + 1], state, sym)
            if self.log:
                print(col, "\n")
                print()
        return chart

    def parse_prefix(self, text):
        """Parse as much of ``text`` as possible.  Return (cursor, states)
        where cursor is the rightmost position reached by any state of the
        start symbol, or (-1, []) when there is none."""
        self.table = self.chart_parse(text, self._start_symbol)
        for col in reversed(self.table):
            matching = [
                s for s in col.states if s.name == self._start_symbol
            ]
            if matching:
                return col.index, matching
        return -1, []
class LeoParser(EarleyParser):
    """Earley parser with Leo's right-recursion optimization: a finished
    state is completed through a single cached transitive item when one
    exists, keeping right-recursive parses linear."""

    def complete(self, col, state):
        return self.leo_complete(col, state)

    def leo_complete(self, col, state):
        """Use the deterministic reduction when available; otherwise fall
        back to ordinary Earley completion."""
        detred = self.deterministic_reduction(state)
        if detred:
            # Fix: this was an unconditional debug print that polluted
            # stdout for every caller; emit it only when logging is on.
            if self.log:
                print("reduce:", state, " to:", detred)
            col.add(detred.copy())
        else:
            self.earley_complete(col, state)

    def constraint1(self, st_A):
        """Find the unique state B in st_A's start column whose dot sits
        directly in front of st_A's nonterminal as the final token
        (Leo's 'B is the only such item' condition); None otherwise."""
        col_s1 = st_A.s_col
        candidates = [
            s for s in col_s1.states
            if s.expr and s.dot == len(s.expr) - 1 and s.at_dot() == st_A.name
        ]
        # Keep a candidate only while it is unique for its end column.
        by_end_col = {}
        for st_B in candidates:
            key = st_B.e_col.index
            if key not in by_end_col:
                by_end_col[key] = st_B
            else:
                by_end_col[key] = None  # duplicate: disqualify this column
        st_Bs = [s for s in by_end_col.values() if s is not None]
        if not st_Bs:
            return None
        st_B_inc, *rest = st_Bs
        # By the proof of lemma 4.9 there can be at most one such state.
        assert rest == []
        return st_B_inc

    def constraint2(self, st_B_inc, e):
        """Find the finished counterpart of ``st_B_inc`` (same production,
        same start column) in column ``e``; None when absent."""
        matches = [
            st for st in e.states
            if st.finished() and st.s_col.index == st_B_inc.s_col.index
            and st.name == st_B_inc.name and st.expr == st_B_inc.expr
        ]
        if not matches:
            return None
        st_B, *rest = matches
        assert rest == []
        return st_B

    def get_top(self, state_A):
        """Walk the deterministic-reduction chain upward, caching the
        topmost item as a transitive entry in each visited column."""
        st_B_inc = self.constraint1(state_A)
        if not st_B_inc:
            return None
        st_B = self.constraint2(st_B_inc, state_A.s_col)
        if not st_B:
            return None
        t_name = st_B.name
        if t_name in st_B_inc.e_col.transitives:
            return st_B_inc.e_col.transitives[t_name].copy()
        top = self.get_top(st_B) or st_B
        st_B_inc.e_col.add_transitive(t_name, top.copy())
        return top.copy()

    def deterministic_reduction(self, state):
        return self.get_top(state)
# Sample grammars exercising left and right recursion.
LR_GRAMMAR = {           # left-recursive: <A> -> <A>a | ''
    '<start>': ['<A>'],
    '<A>': ['<A>a', ''],
}
RR_GRAMMAR2 = {          # right-recursive: <A> -> a<A> | ''
    '<start>': ['<A>'],
    '<A>': ['a<A>', ''],
}
RR_GRAMMAR3 = {          # right recursion over a two-letter unit
    '<start>': ['<A>'],
    '<A>': ['ab<A>', ''],
}
RR_GRAMMAR4 = {          # leading terminal before the recursion
    '<start>': ['c<A>'],
    '<A>': ['ab<A>', ''],
}
RR_GRAMMAR5 = {          # trailing terminal after the recursion
    '<start>': ['<A>c'],
    '<A>': ['ab<A>', ''],
}
# Drive the Leo parser over the right-recursive grammar and report how far
# it recognized the input (logging each chart column along the way).
# Cleanup: the earlier dead reassignments of `mystring` ('aaaa',
# 'abababab', 'cabababab') and the trailing commented-out debugging runs
# were removed -- only the final value was ever used.
mystring = 'aa'
cursor, _ = LeoParser(RR_GRAMMAR2, log=True).parse_prefix(mystring)
print(cursor)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment