GLL parsing?
# A GLL parser, I suppose? At least in the way I understood it.

# Convenience construct to show symbols in the printout.
class NonTerm:
    def __init__(self, fn, name=None):
        self.fn = fn
        self.name = name or self.fn.__name__
    def __call__(self, ch, rest):
        return self.fn(ch, rest)
    def __repr__(self):
        return self.name
class Reduce:
    def __init__(self, fn, rest):
        self.fn = fn
        self.rest = rest
        self.closure = []
        self.index = index  # relies on the global `index` set by the input loop below
    def __call__(self, ch, rest):
        print index, "reduce", self.fn
        branch(self.rest, ch)
        for clos in self.closure:
            branch(clos, ch)
    def __repr__(self):
        return "{0.fn}.{0.index}{1}".format(self, ''.join(map(repr, self.rest)))
def branch(state, ch):
    state[0](ch, state[1:])
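# branch() hands the current character to the first symbol of a state,
# together with the remaining symbols as that symbol's continuation.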
def match(what):
    def func(ch, rest):
        if ch == what:
            cont.append(rest)
    func.__name__ = what
    return NonTerm(func)
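# For example, match("a") yields a NonTerm whose function, on seeing the
# character "a", schedules the remainder of the state onto `cont` for the
# next shift; on any other character it simply drops that state.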
def accept(ch, rest):
    if ch is None:
        print index, "accept"
    else:
        print index, "truncated result"
accept = NonTerm(accept, "$")
# Proper invocation is achievable without transforming the grammar much,
# but I believe collapsing the language into this kind of 'expansions',
# a bit similar to those in an LR parser, can eliminate the need for
# memoization with this algorithm.
a = match("a") | |
b = match("b") | |
@NonTerm | |
def A(ch, rest): | |
r = Reduce(A, rest) | |
r.closure.append([a, r]) | |
r.closure.append([a, b, a, r]) | |
cont.append([a, r]) | |
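# Reading the closures off: the expansion above appears to encode the
# left-recursive rules A -> A a and A -> A a b a, with A -> a as the base
# case queued directly on `cont`.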
# A useful implementation would additionally associate a context with every
# invocation, to carry along intermediate results and transmit important
# variables without global state.

# This code can likely be extended to recognize precedence, just as is
# possible with LR parsing.

# In practice, the ambiguity of the parse could be bounded by limiting the
# number of continuations.
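# (One hypothetical way to impose such a bound, not in the original code:
#  after each shift, check `if len(cont) > LIMIT: raise Exception(...)`
#  for some chosen LIMIT.)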
cont = [[a, A, b, accept]]
for index, ch in enumerate(["a", "a", "a", "a", "b", "a", "b", None]):
    print index, 'state', ' '.join(''.join(map(repr, state)) for state in cont)
    cur = cont
    cont = []
    for state in cur:
        branch(state, ch)
    if len(cont) == 0 and ch is not None:
        raise Exception("syntax error")
    print index, 'shift', repr(ch)
# This LL parser functions on immutable data and continuation stacks.
# It comes in two parts. The first part forms a virtual machine used to
# construct and run the second part.

# Grammar productions are allowed to be recursive, so we cannot
# pass the rules to __init__.
class Prod(object):
    def __init__(self, name):
        self.name = name
        self.rules = None
    def __call__(self, state, sp, token, cc):
        # In between every shift, the production is invoked only once.
        if self in state.cache:
            rc = state.cache[self]
        else:
            rc = state.cache[self] = Reduce(self)
            for label, rule in self.rules:
                state.branch((), token, pushcc(rule, (rc, label)))
        # Adding continuations into the reduction allows us to
        # replace left recursion with looping.
        rc.closure.append((sp, cc))
        # This is needed because a reduction may happen before all the
        # continuations are added.
        for red in rc.red:
            state.branch((sp, red), token, cc)
    def __repr__(self):
        return self.name
# The production rules are terminated with this reduction command.
class Reduce(object):
    def __init__(self, key):
        self.key = key
        self.closure = []
        self.red = []
    def __call__(self, state, sp, token, label):
        args = [label] + unroll(sp)
        print "reduce", args
        if self is state.cache.get(self.key):  # Caching reductions is unneeded
            self.red.append(args)              # after the reduction is out of the cache.
        for sp, cc in self.closure:
            sp = sp, args
            state.branch(sp, token, cc)
    def __repr__(self):
        return "R({})".format(self.key)
class State(object):
    def __init__(self, cont):
        self.cache = {}   # Holds the reductions for each shift, so they're
                          # not invoked repeatedly and the parser doesn't jam
                          # when it encounters left recursion.
        self.cont = cont  # Holds the continuations for the current shift,
                          # and, once running, collects those for the next one.
    def push(self, sp, cc):
        self.cont.append((sp, cc))
    def branch(self, sp, token, (fn, cc)):
        return fn(self, sp, token, cc)
    def shift(self, token):
        self.cache.clear()
        cont, self.cont = self.cont, []
        for sp, cc in cont:
            self.branch(sp, token, cc)
        print "shift", token
# Both the continuation and the data are 'immutable' stacks.
def pushcc(rule, cc):
    for a in reversed(rule):
        cc = a, cc
    return cc
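# For example, pushcc([a, b], (accept, 'X')) builds the nested pair
# (a, (b, (accept, 'X'))), so the rule's symbols are consumed left to right.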
# The reductions hold continuations along with the associated data stack,
# so every production rule can just dump the values it collects
# into its stack.
def unroll(sp):
    res = []
    while len(sp) > 0:
        sp, arg = sp
        res.append(arg)
    res.reverse()
    return res
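# For example, unroll(((((), 'x'), 'y'), 'z')) returns ['x', 'y', 'z'].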
def match(what):
    def func(state, sp, token, rest):
        if token == what:
            state.push((sp, token), rest)
    func.__name__ = what
    func.spc = 1  # not referenced elsewhere in this file
    return func
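# On a hit, the matched token is pushed onto the data stack and the tail of
# the continuation is queued for the next shift; on a miss the branch dies.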
# The reductions and 'accept' terminate the current continuation anyway, so
# we can pass in a label instead.
def accept(state, sp, token, label):
    if token is None:
        print "accept", label, unroll(sp)
# This is the second part, the actual grammar we match with.
a = match("a")
b = match("b")
c = match("c")

A = Prod('A')
C = Prod('C')
A.rules = [
    (1, [A, a, b, a]),
    (2, [C]),        # Some blatantly bad rules to test the parser.
    (3, [C, C]),
    (4, [C, A, a]),
    (5, [a]),
]
C.rules = [
    (6, []),
    (7, [C, c]),
    # (8, [C]),      # This rule explodes the parser: adding it creates an
                     # infinite number of reductions, although they do not
                     # extend the language understood by the parser.
]
# The following code just raises an exception on invalid input, but in
# practice the state holds plenty of information, in a convenient format,
# for producing useful error messages (see the sketch after the loop below).
init = ((), pushcc([a, A, b], (accept, 'X')))
state = State([init])
input = ["a", "a", "a", "a", "b", "a", "b", None]
for index, ch in enumerate(input):
    state.shift(ch)
    if len(state.cont) == 0 and ch is not None:
        raise Exception("syntax error")