Created
November 13, 2015 01:49
-
-
Save cheery/bfb249b1f9a327d150f8 to your computer and use it in GitHub Desktop.
Leo algorithm?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from structures import * | |
# This is described in the marpa thesis. | |
# could someone check out that it's correct? | |
# Joop Leo presented a method for dealing with right | |
# recursion in O(n) time. With his modification, earley | |
# algorithm is O(n) for all LR-regular grammars. | |
# spot potential right recursions and memoize them. | |
# Leo restricts the memoization to where right recursion | |
# is unambiguous. | |
# Potential right recursions are memoized by earley set, | |
# using "transitive items". | |
# traditional leo item is the triple: | |
# (top_dot, transition_symbol, top_origin) | |
# Leo items need to be later expanded for evaluation. | |
def main(): | |
x = Nonterminal('x') | |
y = Nonterminal('y') | |
a = Term('a') | |
accept = x | |
grammar = [ | |
Rule(x, [a, x]), | |
Rule(x, [a]), | |
] | |
input_string = "aaaaaaaaaaaa" | |
table = [] | |
# Initial dotted rule | |
previous = set() | |
current = set() | |
for rule in grammar: | |
if rule.lhs == accept: | |
new_item = (Dot(rule, 0), 0) | |
current.add(new_item) | |
print ' INIT', new_item[0], new_item[1] | |
queue = list(current) | |
for dot, origin in queue: | |
# prediction | |
prediction(current, queue, grammar, dot.postdot(), 0) | |
table.append(current) | |
previous = current | |
current = set() | |
for location, token in enumerate(input_string, 1): | |
# completions proceed in non-deterministic manner, until | |
# everything has been completed. | |
queue = list(current) | |
print 'SET', location | |
# scanning | |
for item in previous: | |
dot, origin = item[:2] | |
postdot = dot.postdot() | |
if match(postdot, token): | |
new_item = (dot.next(), origin) | |
if new_item not in current: | |
print ' SCAN', new_item[0], new_item[1] | |
current.add(new_item) | |
queue.append(new_item) | |
for item in queue: | |
dot, origin = item[:2] | |
print ' ', dot, origin | |
# reduction | |
if dot.is_completed(): | |
# leo reduction | |
for prior_item in table[origin]: | |
if len(prior_item) == 3 and prior_item[2] == dot.rule.lhs: | |
new_item = prior_item[0], prior_item[1], dot.rule.lhs | |
if new_item not in current: | |
print ' LEO REDU', new_item[0], new_item[1], dot.rule.lhs | |
current.add(new_item) | |
queue.append(new_item) | |
break | |
else: | |
for prior_item in table[origin]: | |
if len(prior_item) == 2: | |
before, before_origin = prior_item | |
if before.postdot() == dot.rule.lhs: | |
new_item = (before.next(), before_origin, dot.rule.lhs) | |
if new_item not in current: | |
print ' REDU', new_item[0], new_item[1], dot.rule.lhs | |
current.add(new_item) | |
queue.append(new_item) | |
# prediction | |
prediction(current, queue, grammar, dot.postdot(), location) | |
if dot.rule.lhs == accept and origin == 0: | |
print ' ACCEPT' | |
table.append(current) | |
previous = current | |
current = set() | |
def prediction(current, queue, grammar, symbol, location): | |
if not isinstance(symbol, Nonterminal): | |
return | |
for rule in grammar: | |
if rule.lhs == symbol: | |
new_item = (Dot(rule, 0), location) | |
if new_item not in current: | |
print ' PRED', new_item[0], new_item[1] | |
current.add(new_item) | |
queue.append(new_item) | |
# Mechanism to match for the current input. | |
def match(symbol, token): | |
return isinstance(symbol, Term) and symbol.name == token | |
if __name__=='__main__': | |
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class Rule(object): | |
def __init__(self, lhs, rhs): | |
self.lhs = lhs | |
self.rhs = rhs | |
def __len__(self): | |
return len(self.rhs) | |
def __repr__(self): | |
return "{} -> {}".format( | |
self.lhs, | |
' '.join(map(str, self.rhs))) | |
# Earlier I did not separate terminals from | |
# non-terminals because it was not strictly | |
# necessary. That turned out to confuse | |
# when designing grammars. | |
class Term(object): | |
def __init__(self, name): | |
self.name = name | |
def __repr__(self): | |
return "{!r}".format(self.name) | |
class Nonterminal(object): | |
def __init__(self, name): | |
self.name = name | |
def __repr__(self): | |
return "{!s}".format(self.name) | |
# The dotted rule | |
class Dot(object): | |
def __init__(self, rule, pos): | |
self.rule = rule | |
self.pos = pos | |
assert 0 <= pos <= len(rule) | |
def postdot(self): | |
if self.pos < len(self.rule): | |
return self.rule.rhs[self.pos] | |
return None | |
def next(self): | |
if self.postdot() is not None: | |
return Dot(self.rule, self.pos + 1) | |
return None | |
def penult(self): | |
if self.pos + 1 == len(self.rule): | |
return self.postdot() | |
def is_predicted(self): | |
return self.pos == 0 | |
def is_confirmed(self): | |
return self.pos > 0 | |
def is_completed(self): | |
return self.pos == len(self.rule) | |
def __hash__(self): | |
return hash((self.rule, self.pos)) | |
def __eq__(self, other): | |
return isinstance(other, Dot) and self.rule == other.rule and self.pos == other.pos | |
def __repr__(self): | |
if isinstance(self.rule, Rule): | |
lhs = repr(self.rule.lhs) | |
pre = ' '.join(map(repr, self.rule.rhs[:self.pos])) | |
pos = ' '.join(map(repr, self.rule.rhs[self.pos:])) | |
return "[{} -> {} * {}]".format(lhs, pre, pos) | |
return object.__repr__(self) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment