Skip to content

Instantly share code, notes, and snippets.

@cheery
Last active August 4, 2018 03:22
Show Gist options
  • Save cheery/4b08ec93258e008ef7dd to your computer and use it in GitHub Desktop.
Save cheery/4b08ec93258e008ef7dd to your computer and use it in GitHub Desktop.
GLL parsing?
# a GLL parser, I suppose? At least in a way I understood it.
# Convenience construct to show symbols in printout
class NonTerm:
def __init__(self, fn, name=None):
self.fn = fn
self.name = name or self.fn.__name__
def __call__(self, ch, rest):
return self.fn(ch, rest)
def __repr__(self):
return self.name
class Reduce:
def __init__(self, fn, rest):
self.fn = fn
self.rest = rest
self.closure = []
self.index = index
def __call__(self, ch, rest):
print index, "reduce", self.fn
branch(self.rest, ch)
for clos in self.closure:
branch(clos, ch)
def __repr__(self):
return "{0.fn}.{0.index}{1}".format(self, ''.join(map(repr, self.rest)))
def branch(state, ch):
state[0](ch, state[1:])
def match(what):
def func(ch, rest):
if ch == what:
cont.append(rest)
func.__name__ = what
return NonTerm(func)
def accept(ch, rest):
if ch is None:
print index, "accept"
else:
print index, "truncated result"
accept = NonTerm(accept, "$")
# Proper invocation is achievable without transforming the grammar much,
# but I believe collapsing the language into this kind of 'expansions',
# bit similar to those in LR parser, can eliminate the need for memoization
# with this algorithm.
a = match("a")
b = match("b")
@NonTerm
def A(ch, rest):
r = Reduce(A, rest)
r.closure.append([a, r])
r.closure.append([a, b, a, r])
cont.append([a, r])
# Useful implementation would additionally associate a context to every
# invocation, to luggage intermediate results and transmit important variables
# without global state.
# This code can be likely extended to recognize precedence, just like how
# it's possible for LR parsing.
# The ambiguity of the parsing could be bounded by limiting amount of
# continuations in practice.
cont = [[a, A, b, accept]]
for index, ch in enumerate(["a", "a", "a", "a", "b", "a", "b", None]):
print index, 'state', ' '.join(''.join(map(repr, state)) for state in cont)
cur = cont
cont = []
for state in cur:
branch(state, ch)
if len(cont) == 0 and ch != None:
raise Exception("syntax error")
print index, 'shift', repr(ch)
# This LL parser functions on immutable data and continuation stacks.
# It comes in two parts. The first part forms a virtual machine to
# construct and run the second part.
# Grammar productions are allowed to be recursive, so we cannot
# pass the rules in the __init__.
class Prod(object):
def __init__(self, name):
self.name = name
self.rules = None
def __call__(self, state, sp, token, cc):
# In between every shift, the production is invoked only once.
if self in state.catche:
rc = state.catche[self]
else:
rc = state.catche[self] = Reduce(self)
for label, rule in self.rules:
state.branch((), token, pushcc(rule, (rc, label)))
# Adding continuations into the reduction allows us to
# replace left recursion with looping.
rc.closure.append((sp, cc))
# This is needed because reduction may happen before all the
# continuations are added.
for red in rc.red:
sp = sp, red
state.branch(sp, token, cc)
def __repr__(self):
return self.name
# The production rules are terminated with this reduction command.
class Reduce(object):
def __init__(self, key):
self.key = key
self.closure = []
self.red = []
def __call__(self, state, sp, token, label):
args = [label] + unroll(sp)
print "reduce", args
if self is state.catche.get(self.key): # Caching reductions is unneeded
self.red.append(args) # after the reduction is out of the cache.
for sp, cc in self.closure:
sp = sp, args
state.branch(sp, token, cc)
def __repr__(self):
return "R({})".format(self.key)
class State(object):
def __init__(self, cont):
self.catche = {} # Holds reductions in every shift, so they're
# not invoked repeatedly and the parser doesn't jam
# when it encounters left recursion.
self.cont = cont # Holds the continuations for the current,
# and for next run once running.
def push(self, sp, cc):
self.cont.append((sp, cc))
def branch(self, sp, token, (fn, cc)):
return fn(self, sp, token, cc)
def shift(self, token):
self.catche.clear()
cont, self.cont = self.cont, []
for sp, cc in cont:
self.branch(sp, token, cc)
print "shift", token
# Both continuation and the data are 'immutable' stacks.
def pushcc(rule, cc):
for a in reversed(rule):
cc = a, cc
return cc
# The reductions hold continuations along the associated data stack,
# so every production rule can just dump the values it collects
# into its stack.
def unroll(sp):
res = []
while len(sp) > 0:
sp, arg = sp
res.append(arg)
res.reverse()
return res
def match(what):
def func(state, sp, token, rest):
if token == what:
state.push((sp, token), rest)
func.__name__ = what
func.spc = 1
return func
# The reductions and 'accept' terminate current continuation anyway, so
# we can pass in a label instead.
def accept(state, sp, token, label):
if token is None:
args = []
print "accept", label, unroll(sp)
# This is the second part, actual grammar we match with.
a = match("a")
b = match("b")
c = match("c")
A = Prod('A')
C = Prod('C')
A.rules = [
(1, [A, a, b, a]),
(2, [C]), # Some blatantly bad rules to test the parser.
(3, [C, C]),
(4, [C, A, a]),
(5, [a]),
]
C.rules = [
(6, []),
(7, [C, c]),
# (8, [C]), # this explodes the parser.
# the addition of the rule creates infinite
# amount of reductions, although they do not
# extend the language understood by the parser.
]
# The following code just returns an error on invalid input, but
# practically it has plenty of information in the state
# in nice format for producing useful error messages.
init = ((), pushcc([a, A, b], (accept, 'X')))
state = State([init])
input = ["a", "a", "a", "a", "b", "a", "b", None]
for index, ch in enumerate(input):
state.shift(ch)
if len(state.cont) == 0 and ch != None:
raise Exception("syntax error")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment