Skip to content

Instantly share code, notes, and snippets.

@vrthra
Last active August 17, 2022 08:36
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vrthra/5119b747ee536f7c5533379247a55760 to your computer and use it in GitHub Desktop.
Earley Parser
#!/usr/bin/env python3
import re, string
START_SYMBOL = '<start>'
RE_NONTERMINAL = re.compile('(<[^<> ]*>)')
EPSILON = ''
def fixpoint(f):
    """Decorator: repeatedly apply ``f`` until its value stops changing.

    Stability is detected by comparing string renderings, so mutable
    collections (e.g. sets that ``f`` updates in place) work as values.
    """
    def iterate(value):
        while True:
            before = str(value)
            updated = f(value)
            if str(updated) == before:
                return value
            value = updated
    return iterate
def rules(grammar):
    """Flatten a grammar into an ordered list of (nonterminal, expansion) pairs."""
    productions = []
    for nonterminal, expansions in grammar.items():
        for expansion in expansions:
            productions.append((nonterminal, expansion))
    return productions
def terminals(grammar):
    """Collect every token that appears in some expansion but is not a grammar key."""
    found = set()
    for _, expansion in rules(grammar):
        for token in expansion:
            if token not in grammar:
                found.add(token)
    return found
def nullable_expr(expr, nullables):
    """True iff every token of ``expr`` is already known to be nullable."""
    for token in expr:
        if token not in nullables:
            return False
    return True
def nullable(grammar):
    """The set of symbols that can derive epsilon (plus EPSILON itself)."""
    productions = rules(grammar)

    def propagate(nullables):
        # One round: a nonterminal becomes nullable when some expansion
        # consists entirely of already-nullable tokens.
        for lhs, expansion in productions:
            if nullable_expr(expansion, nullables):
                nullables |= {lhs}
        return nullables

    # Iterate to a fixpoint, seeded with the empty string.
    return fixpoint(propagate)({EPSILON})
def canonical(grammar, letters=False):
    """Convert string expansions into lists of tokens.

    Nonterminals stay as single tokens; terminal runs are either kept
    whole or, when ``letters`` is true, split into single characters.
    """
    def split(expansion):
        # Some grammar notations wrap an expansion in a tuple; unwrap it.
        if isinstance(expansion, tuple):
            expansion = expansion[0]
        # re.split with a capturing group keeps the nonterminal delimiters.
        pieces = re.split(RE_NONTERMINAL, expansion)
        return [piece for piece in pieces if piece]

    def tokenize(word):
        return list(word) if letters else [word]

    def canonical_expr(expression):
        tokens = []
        for word in split(expression):
            if word in grammar:
                tokens.append(word)
            else:
                tokens.extend(tokenize(word))
        return tokens

    converted = {}
    for key, alternatives in grammar.items():
        converted[key] = [canonical_expr(alt) for alt in alternatives]
    return converted
class Column(object):
    """One chart column: the Earley states that end at a given input position."""

    def __init__(self, index, letter):
        self.index = index
        self.letter = letter        # input token consumed to reach this column
        self.states = []            # insertion-ordered worklist of states
        self._unique = {}           # dedup map: state -> canonical state object
        self.transitives = {}       # Leo transitive items, keyed by symbol name

    def add(self, state):
        """Insert ``state`` unless an equal one exists; return the canonical one."""
        known = self._unique.get(state)
        if known is not None:
            return known
        self._unique[state] = state
        self.states.append(state)
        state.e_col = self
        return state

    def add_transitive(self, key, state):
        """Record the Leo transitive item for symbol ``key`` (one per column)."""
        assert key not in self.transitives
        self.transitives[key] = state
        self.add(state)

    def __str__(self):
        listing = "\n".join(str(s) for s in self.states)
        return "%s chart[%d]\n%s" % (self.letter, self.index, listing)

    def __repr__(self):
        done = len([s for s in self.states if s.finished()])
        return "%s chart[%d]\n%d states with %d finished" % (
            self.letter, self.index, len(self.states), done)
class Item(object):
    """A dotted production ``name -> expr`` with parse position ``dot``.

    Base class for chart states; holds no chart-column information itself.
    """

    def __init__(self, name, expr, dot):
        # name: nonterminal being expanded; expr: sequence of tokens;
        # dot: index into expr marking how much has been recognized so far.
        self.name, self.expr, self.dot = name, expr, dot

    def finished(self):
        """True when the dot has passed the last token of the expansion."""
        return self.dot >= len(self.expr)

    def advance(self, tag):
        # Subclasses (State) must override this. The original body was
        # `assert False` followed by a return: under `python -O` the assert
        # is stripped and an untagged Item would be returned silently, so
        # raise explicitly instead.
        raise NotImplementedError('advance() must be overridden by a subclass')

    def at_dot(self):
        """The token immediately after the dot, or None when finished."""
        return self.expr[self.dot] if self.dot < len(self.expr) else None
class State(Item):
    """An Item anchored in the chart: starts at column ``s_col``, ends at ``e_col``."""

    def __init__(self, name, expr, dot, s_col):
        super().__init__(name, expr, dot)
        self.s_col = s_col
        self.e_col = None
        self.tag = ''   # provenance tag, e.g. 'predict' / 'scan' / 'complete'

    def _t(self):
        # Identity for hashing/equality; the end column is deliberately
        # excluded so duplicates merge within a column.
        return (self.name, self.expr, self.dot, self.s_col.index)

    def __hash__(self):
        return hash(self._t())

    def __eq__(self, other):
        return self._t() == other._t()

    @staticmethod
    def _idx(col):
        # Column index, or -1 for a still-unset end column.
        return col.index if col else -1

    def _dotted(self):
        # Expansion with a '|' marking the dot position.
        pieces = [*self.expr[:self.dot], '|', *self.expr[self.dot:]]
        return ' '.join(str(piece) for piece in pieces)

    def __str__(self):
        return self.name + ':= ' + self._dotted() + "(%d,%d) \t #%s" % (
            self._idx(self.s_col), self._idx(self.e_col), self.tag)

    def __repr__(self):
        return self.name + ':= ' + self._dotted() + "(%d,%d)" % (
            self._idx(self.s_col), self._idx(self.e_col))

    def copy(self):
        """A fresh State with the same item and columns (tag not carried over)."""
        duplicate = State(self.name, self.expr, self.dot, self.s_col)
        duplicate.e_col = self.e_col
        return duplicate

    def advance(self, tag):
        """A new State with the dot moved one token right, tagged with ``tag``."""
        moved = State(self.name, self.expr, self.dot + 1, self.s_col)
        moved.tag = tag
        return moved
class EarleyParser:
    """Earley chart parser over a canonicalized grammar.

    Keyword arguments:
      start_symbol: entry nonterminal (default START_SYMBOL)
      log:          when truthy, print each chart column as it is filled
      tokens:       reserved token set (kept for API compatibility)
    """

    def __init__(self, grammar, **kwargs):
        self._grammar = grammar
        self._start_symbol = kwargs.get('start_symbol', START_SYMBOL)
        self.log = kwargs.get('log') or False
        self.tokens = kwargs.get('tokens') or set()
        # letters=True: terminals are split into single characters.
        self.cgrammar = canonical(grammar, letters=True)
        self.epsilon = nullable(self.cgrammar)

    def chart_parse(self, words, start):
        """Build and fill the chart for ``words`` starting at nonterminal ``start``.

        Column 0 is seeded with one state per alternative of ``start``.
        (The original seeding, ``tuple(*self.cgrammar[start])``, raised
        TypeError whenever the start symbol had more than one alternative;
        for a single alternative the behavior is unchanged.)
        """
        chart = [Column(i, tok) for i, tok in enumerate([None, *words])]
        for alt in self.cgrammar[start]:
            chart[0].add(State(start, tuple(alt), 0, chart[0]))
        return self.fill_chart(chart)

    def predict(self, col, sym, state):
        """Add a fresh state for every alternative of nonterminal ``sym``."""
        for alt in self.cgrammar[sym]:
            s = State(sym, tuple(alt), 0, col)
            s.tag = 'predict'
            col.add(s)
        # Aycock/Horspool nullable fix: if sym can derive epsilon,
        # also advance the predicting state past it immediately.
        if sym in self.epsilon:
            col.add(state.advance('*predict'))

    def scan(self, col, state, letter):
        """Advance ``state`` into ``col`` when the input letter matches."""
        if letter == col.letter:
            col.add(state.advance('scan'))

    def complete(self, col, state):
        # Indirection point: LeoParser overrides complete().
        return self.earley_complete(col, state)

    def earley_complete(self, col, state):
        """Advance every parent state that was waiting on the finished ``state``."""
        parent_states = [
            st for st in state.s_col.states if st.at_dot() == state.name
        ]
        for st in parent_states:
            col.add(st.advance('complete'))

    def fill_chart(self, chart):
        """Run predict/scan/complete to a fixpoint over every column.

        NOTE: ``col.states`` grows while being iterated; Python's list
        iterator picks up appended elements, which is exactly the
        worklist behavior the Earley algorithm needs.
        """
        for i, col in enumerate(chart):
            for state in col.states:
                if state.finished():
                    self.complete(col, state)
                else:
                    sym = state.at_dot()
                    if sym in self.cgrammar:
                        self.predict(col, sym, state)
                    else:
                        if i + 1 >= len(chart):
                            continue  # no more input to scan into
                        self.scan(chart[i + 1], state, sym)
            if self.log:
                print(col, "\n")
                print()
        return chart

    def parse_prefix(self, text):
        """Parse ``text``; return (longest index reachable from start, states).

        Returns (-1, []) when not even a prefix of ``text`` matches.
        The filled chart is kept in ``self.table`` for inspection.
        """
        self.table = self.chart_parse(text, self._start_symbol)
        for col in reversed(self.table):
            states = [st for st in col.states if st.name == self._start_symbol]
            if states:
                return col.index, states
        return -1, []
class LeoParser(EarleyParser):
    """Earley parser with Leo's right-recursion optimization.

    Finished states are completed through memoized 'transitive' items when
    a unique parent chain exists (Joop Leo, 1991), which makes parsing of
    right-recursive grammars linear instead of quadratic.
    """

    def complete(self, col, state):
        return self.leo_complete(col, state)

    def leo_complete(self, col, state):
        """Use the deterministic reduction when one exists; else plain Earley."""
        detred = self.deterministic_reduction(state)
        if detred:
            # Gate the trace behind self.log: the original printed this
            # unconditionally, spamming stdout even with log=False.
            if self.log:
                print("reduce:", state, " to:", detred)
            col.add(detred.copy())
        else:
            self.earley_complete(col, state)

    def constraint1(self, st_A):
        """The unique state B with its dot just before A in A's start column.

        Returns None when no such B exists or the candidate is ambiguous.
        """
        col_s1 = st_A.s_col
        matching_st_B = [
            s for s in col_s1.states
            if s.expr and s.dot == len(s.expr) - 1 and s.at_dot() == st_A.name
        ]
        uniq_st_B = {}
        # B must be the only item in I_i with the dot in front of A.
        for st_B in matching_st_B:
            key = st_B.e_col.index
            if key not in uniq_st_B:
                uniq_st_B[key] = st_B
            else:
                uniq_st_B[key] = None  # ambiguous: more than one candidate
        st_Bs = [i for i in uniq_st_B.values() if i is not None]
        if not st_Bs:
            return None
        st_B_inc, *empty = st_Bs
        # by lemma 4.9 proof
        assert empty == []
        return st_B_inc

    def constraint2(self, st_B_inc, e):
        """The finished counterpart of ``st_B_inc`` in column ``e``, or None."""
        st_Bs = [
            st for st in e.states
            if st.finished() and st.s_col.index == st_B_inc.s_col.index
            and st.name == st_B_inc.name and st.expr == st_B_inc.expr
        ]
        if not st_Bs:
            return None
        st_B, *empty = st_Bs
        assert empty == []
        return st_B

    def get_top(self, state_A):
        """Walk the unique parent chain upward, memoizing transitive items."""
        st_B_inc = self.constraint1(state_A)
        if not st_B_inc:
            return None
        st_B = self.constraint2(st_B_inc, state_A.s_col)
        if not st_B:
            return None
        t_name = st_B.name
        # Reuse a memoized transitive item when one is already recorded.
        if t_name in st_B_inc.e_col.transitives:
            return st_B_inc.e_col.transitives[t_name].copy()
        top = self.get_top(st_B) or st_B
        st_B_inc.e_col.add_transitive(t_name, top.copy())
        return top.copy()

    def deterministic_reduction(self, state):
        return self.get_top(state)
# Left-recursive grammar: <A> generates '', 'a', 'aa', ...
LR_GRAMMAR = {
    '<start>': ['<A>'],
    '<A>': ['<A>a', ''],
}
# Right-recursive grammar: <A> generates '', 'a', 'aa', ...
# (the case Leo's optimization speeds up).
RR_GRAMMAR2 = {
    '<start>': ['<A>'],
    '<A>': ['a<A>', ''],
}
# Right-recursive, two-character repetition: '', 'ab', 'abab', ...
RR_GRAMMAR3 = {
    '<start>': ['<A>'],
    '<A>': ['ab<A>', ''],
}
# As RR_GRAMMAR3, but with a leading 'c': 'c', 'cab', 'cabab', ...
RR_GRAMMAR4 = {
    '<start>': ['c<A>'],
    '<A>': ['ab<A>', ''],
}
# As RR_GRAMMAR3, but with a trailing 'c': 'c', 'abc', 'ababc', ...
RR_GRAMMAR5 = {
    '<start>': ['<A>c'],
    '<A>': ['ab<A>', ''],
}
# Demo driver: parse a right-recursive input with the Leo parser and
# report how far the start symbol reaches.  (The original re-assigned
# `mystring` four times before use and carried a tail of commented-out
# experiments; only the final, live configuration is kept.)
mystring = 'aa'
cursor, _ = LeoParser(RR_GRAMMAR2, log=True).parse_prefix(mystring)
print(cursor)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment