GLL parsing?
# A GLL parser, I suppose? At least in the way I understood it.

# Convenience construct to show symbols in the printout.
class NonTerm:
    def __init__(self, fn, name=None):
        self.fn = fn
        self.name = name or self.fn.__name__
    def __call__(self, ch, rest):
        return self.fn(ch, rest)
    def __repr__(self):
        return self.name
class Reduce:
    def __init__(self, fn, rest):
        self.fn = fn
        self.rest = rest
        self.closure = []
        self.index = index  # relies on the global `index` set by the input loop below
    def __call__(self, ch, rest):
        print index, "reduce", self.fn
        branch(self.rest, ch)
        for clos in self.closure:
            branch(clos, ch)
    def __repr__(self):
        return "{0.fn}.{0.index}{1}".format(self, ''.join(map(repr, self.rest)))
def branch(state, ch):
    state[0](ch, state[1:])
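# branch() hands the current character to the first symbol of a state,
# together with the remaining symbols as that symbol's continuation.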
def match(what):
    def func(ch, rest):
        if ch == what:
            cont.append(rest)
    func.__name__ = what
    return NonTerm(func)
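# For example, match("a") yields a NonTerm whose function, on seeing the
# character "a", schedules the remainder of the state onto `cont` for the
# next shift; on any other character it simply drops that state.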
def accept(ch, rest):
    if ch is None:
        print index, "accept"
    else:
        print index, "truncated result"
accept = NonTerm(accept, "$")
# Proper invocation is achievable without transforming the grammar much,
# but I believe collapsing the language into this kind of 'expansions',
# a bit similar to those in an LR parser, can eliminate the need for
# memoization with this algorithm.
a = match("a") | |
b = match("b") | |
@NonTerm | |
def A(ch, rest): | |
r = Reduce(A, rest) | |
r.closure.append([a, r]) | |
r.closure.append([a, b, a, r]) | |
cont.append([a, r]) | |
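# Reading the closures off: the expansion above appears to encode the
# left-recursive rules A -> A a and A -> A a b a, with A -> a as the base
# case queued directly on `cont`.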
# A useful implementation would additionally associate a context with every
# invocation, to carry along intermediate results and transmit important
# variables without global state.

# This code can likely be extended to recognize precedence, just as is
# possible with LR parsing.

# In practice, the ambiguity of the parse could be bounded by limiting the
# number of continuations.
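# (One hypothetical way to impose such a bound, not in the original code:
#  after each shift, check `if len(cont) > LIMIT: raise Exception(...)`
#  for some chosen LIMIT.)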
cont = [[a, A, b, accept]]
for index, ch in enumerate(["a", "a", "a", "a", "b", "a", "b", None]):
    print index, 'state', ' '.join(''.join(map(repr, state)) for state in cont)
    cur = cont
    cont = []
    for state in cur:
        branch(state, ch)
    if len(cont) == 0 and ch is not None:
        raise Exception("syntax error")
    print index, 'shift', repr(ch)
# This LL parser functions on immutable data and continuation stacks.
# It comes in two parts. The first part forms a virtual machine used to
# construct and run the second part.

# Grammar productions are allowed to be recursive, so we cannot
# pass the rules to __init__.
class Prod(object):
    def __init__(self, name):
        self.name = name
        self.rules = None
    def __call__(self, state, sp, token, cc):
        # In between every shift, the production is invoked only once.
        if self in state.cache:
            rc = state.cache[self]
        else:
            rc = state.cache[self] = Reduce(self)
            for label, rule in self.rules:
                state.branch((), token, pushcc(rule, (rc, label)))
        # Adding continuations into the reduction allows us to
        # replace left recursion with looping.
        rc.closure.append((sp, cc))
        # This is needed because a reduction may happen before all the
        # continuations are added.
        for red in rc.red:
            state.branch((sp, red), token, cc)
    def __repr__(self):
        return self.name
# The production rules are terminated with this reduction command.
class Reduce(object):
    def __init__(self, key):
        self.key = key
        self.closure = []
        self.red = []
    def __call__(self, state, sp, token, label):
        args = [label] + unroll(sp)
        print "reduce", args
        if self is state.cache.get(self.key):  # Caching reductions is unneeded
            self.red.append(args)              # after the reduction is out of the cache.
        for sp, cc in self.closure:
            sp = sp, args
            state.branch(sp, token, cc)
    def __repr__(self):
        return "R({})".format(self.key)
class State(object):
    def __init__(self, cont):
        self.cache = {}   # Holds the reductions for each shift, so they're
                          # not invoked repeatedly and the parser doesn't jam
                          # when it encounters left recursion.
        self.cont = cont  # Holds the continuations for the current shift,
                          # and, once running, collects those for the next one.
    def push(self, sp, cc):
        self.cont.append((sp, cc))
    def branch(self, sp, token, (fn, cc)):
        return fn(self, sp, token, cc)
    def shift(self, token):
        self.cache.clear()
        cont, self.cont = self.cont, []
        for sp, cc in cont:
            self.branch(sp, token, cc)
        print "shift", token
# Both the continuation and the data are 'immutable' stacks.
def pushcc(rule, cc):
    for a in reversed(rule):
        cc = a, cc
    return cc
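# For example, pushcc([a, b], (accept, 'X')) builds the nested pair
# (a, (b, (accept, 'X'))), so the rule's symbols are consumed left to right.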
# The reductions hold continuations along with the associated data stack,
# so every production rule can just dump the values it collects
# into its stack.
def unroll(sp):
    res = []
    while len(sp) > 0:
        sp, arg = sp
        res.append(arg)
    res.reverse()
    return res
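# For example, unroll(((((), 'x'), 'y'), 'z')) returns ['x', 'y', 'z'].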
def match(what):
    def func(state, sp, token, rest):
        if token == what:
            state.push((sp, token), rest)
    func.__name__ = what
    func.spc = 1  # not referenced elsewhere in this file
    return func
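# On a hit, the matched token is pushed onto the data stack and the tail of
# the continuation is queued for the next shift; on a miss the branch dies.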
# The reductions and 'accept' terminate the current continuation anyway, so
# we can pass in a label instead.
def accept(state, sp, token, label):
    if token is None:
        print "accept", label, unroll(sp)
# This is the second part, the actual grammar we match with.
a = match("a")
b = match("b")
c = match("c")

A = Prod('A')
C = Prod('C')
A.rules = [
    (1, [A, a, b, a]),
    (2, [C]),        # Some blatantly bad rules to test the parser.
    (3, [C, C]),
    (4, [C, A, a]),
    (5, [a]),
]
C.rules = [
    (6, []),
    (7, [C, c]),
    # (8, [C]),      # This rule explodes the parser: adding it creates an
                     # infinite number of reductions, although they do not
                     # extend the language understood by the parser.
]
# The following code just raises an exception on invalid input, but in
# practice the state holds plenty of information, in a convenient format,
# for producing useful error messages (see the sketch after the loop below).
init = ((), pushcc([a, A, b], (accept, 'X')))
state = State([init])
input = ["a", "a", "a", "a", "b", "a", "b", None]
for index, ch in enumerate(input):
    state.shift(ch)
    if len(state.cont) == 0 and ch is not None:
        raise Exception("syntax error")