Last active
March 20, 2016 05:45
-
-
Save cheery/18de72b55ad08ce78e4d to your computer and use it in GitHub Desktop.
Tried to write a derivatives based parser.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Implementing "parsing with derivatives" | |
# http://matt.might.net/articles/parsing-with-derivatives/ | |
# Matt claims the algorithm terminates with laziness and memoization | |
# Though nullable -function must be computed as a least-fixed-point | |
# The grammars are modelled as a graph of parser combinators. | |
def main(): | |
B = Name('B') | |
B.bind((B+Char('x')) | empty) | |
fixpoint_flow(B) | |
result = derivative(B.value, 'x') | |
print result, is_nullable(result) | |
result = derivative(result, 'x') | |
print result, is_nullable(result) | |
result = derivative(result, 'x') | |
print result, is_nullable(result) | |
result = derivative(result, 'x') | |
print result, is_nullable(result) | |
result = derivative(result, 'x') | |
print result, is_nullable(result) | |
# Output: | |
# D'x'(B)x | $ True | |
# D'x'(D'x'(B))x | $ True | |
# D'x'(D'x'(D'x'(B)))x | $ True | |
# D'x'(D'x'(D'x'(D'x'(B))))x | $ True | |
# D'x'(D'x'(D'x'(D'x'(D'x'(B)))))x | $ True | |
result = derivative(result, 'y') | |
print result, is_nullable(result) | |
result = derivative(result, 'x') | |
print result, is_nullable(result) | |
result = derivative(result, 'x') | |
print result, is_nullable(result) | |
# Output: | |
# 0 False | |
# 0 False | |
# 0 False | |
# Here's yet another try. | |
result = derivative(B.value, 'y') | |
print result, is_nullable(result) | |
result = derivative(result, 'y') | |
print result, is_nullable(result) | |
# Output: | |
# 0 False | |
# 0 False | |
# An another, although simple grammar. | |
E, T = Name('E'), Name('T') | |
E.bind(T | (E + Char('+') + T)) | |
T.bind(Char('x') | (Char('x') + T)) | |
result = derivative(E.value, 'x') | |
print result, is_nullable(result) | |
result = derivative(result, '+') | |
print result, is_nullable(result) | |
result = derivative(result, 'x') | |
print result, is_nullable(result) | |
result = derivative(result, 'x') | |
print result, is_nullable(result) | |
# Output: | |
# D'x'(T) | D'x'(E)+T True | |
# T False | |
# D'x'(T) True | |
# D'x'(D'x'(T)) True | |
class Combinator(object): | |
nullable = None # means for undetermined nullability | |
depends = frozenset() # set of name combinators the combinator's | |
# nullability depends on. A common case is | |
# the result doesn't depend on other combinators. | |
first = frozenset() | |
def __or__(self, other): | |
if self is fail: # avoid producing unneeded combinators when combining them. | |
return other | |
if other is fail: | |
return self | |
return Choice(self, other) | |
def __add__(self, other): | |
if self is fail or other is fail: | |
return fail | |
if self is empty: | |
return other | |
if other is empty: | |
return self | |
return Cat(self, other) | |
def depend(self): # Used to build the 'depends' -set when combining combinators. | |
return self.depends | |
def rewrite(self): # Called when the combinator needs to be 'evaluated' | |
return self # default is that the combinator doesn't reduce further. | |
class Empty(Combinator): # Accepts an empty string. | |
nullable = True | |
def __init__(self): | |
self.first = frozenset([self]) | |
def __repr__(self): | |
return "$" | |
class Fail(Combinator): # Accepts nothing, attempted to be pruned out when combining | |
nullable = False | |
def __repr__(self): | |
return "0" | |
def __or__(self, other): | |
return other | |
class Char(Combinator): # Accepts a character | |
nullable = False | |
def __init__(self, char): | |
self.char = char | |
self.first = frozenset(char) | |
def __repr__(self): | |
return self.char | |
class Name(Combinator): # The 'nonterminal' building block to | |
def __init__(self, name): # construct recursive grammars | |
self.name = name | |
self.value = fail | |
self.depends = None | |
self.flow = set() # When nullability changes, the nullability of names in | |
# this set should change too. | |
self.first = None # The characters that this combinator can start with. | |
def __repr__(self): | |
return self.name | |
def bind(self, value): # bind assigns a value to the name. | |
assert self.nullable is None # to verify nullability is correct, we prefer | |
self.value = value # to think of Name as an immutable construct. | |
self.nullable = None | |
self.depends = self.value.depend() | |
for name in self.depends: # used to calculate nullability | |
name.flow.add(self) | |
def depend(self): | |
return {self} | |
def rewrite(self): # Name should evaluate to its value. | |
return self.value | |
class Cat(Combinator): | |
def __init__(self, lhs, rhs): | |
self.lhs = lhs | |
self.rhs = rhs | |
self.depends = self.lhs.depend() | self.rhs.depend() | |
def __repr__(self): # my notation needs parentheses if Choice appears inside Cat. | |
lhs = "({})".format(repr(self.lhs)) if isinstance(self.lhs, Choice) else repr(self.lhs) | |
rhs = "({})".format(repr(self.rhs)) if isinstance(self.rhs, Choice) else repr(self.rhs) | |
return lhs + rhs | |
@property | |
def first(self): | |
lhs = self.lhs.first | |
if empty in lhs: | |
return lhs | self.rhs.first | |
return lhs | |
# the example doesn't use this function, so bit unsure about the correctness here. | |
def rewrite(self): # bias to left rewrite in Catenation. | |
return self.lhs.rewrite() + self.rhs | |
class Choice(Combinator): | |
def __init__(self, lhs, rhs): | |
self.lhs = lhs | |
self.rhs = rhs | |
self.depends = self.lhs.depend() | self.rhs.depend() | |
def __repr__(self): | |
return "{} | {}".format(self.lhs, self.rhs) | |
@property | |
def first(self): | |
return self.lhs.first | self.rhs.first | |
def rewrite(self): | |
return self.lhs.rewrite() | self.rhs.rewrite() | |
class Deriv(Combinator): # An idea of unevaluated derivative | |
def __init__(self, parser, char): | |
assert isinstance(parser, (Deriv, Name)) | |
self.parser = parser | |
self.char = char | |
self.value = None | |
def __repr__(self): | |
return "D{!r}({})".format(self.char, self.parser) | |
# Deriv is only produced for a symbol, or another deriv. The idea is | |
# such chains are produced by left recursive rules. | |
@property | |
def first(self): | |
return self.parser.first | |
def rewrite(self): | |
if self.value is None: | |
self.value = derivative(self.parser.rewrite(), self.char) | |
return self.value | |
empty = Empty() # Since they're immutable, we need only one of each of these. | |
fail = Fail() | |
# Presents taking a derivative from each combinator. | |
def derivative(parser, char): | |
if isinstance(parser, Char): | |
if char == parser.char: | |
return empty | |
elif isinstance(parser, Cat): | |
lhs = derivative(parser.lhs, char) + parser.rhs | |
rhs = derivative(parser.rhs, char) if is_nullable(parser.lhs) else fail | |
return lhs | rhs | |
elif isinstance(parser, Choice): | |
return derivative(parser.lhs, char) | derivative(parser.rhs, char) | |
elif isinstance(parser, Name): | |
if parser.first is None: | |
fixpoint_flow(parser) | |
if char in parser.first: | |
return deriv_cache(parser, char) | |
elif isinstance(parser, Deriv): | |
if char in parser.first: | |
return deriv_cache(parser, char) | |
return fail | |
# This is here because the author proposes these constructs could be cached. | |
derives = {} | |
def deriv_cache(parser, char): | |
key = (parser, char) | |
if key in derives: | |
return derives[key] | |
derives[key] = deriv = Deriv(parser, char) | |
return deriv | |
def is_nullable(parser): | |
if isinstance(parser, Cat): | |
# Nullability potentially results in a rewrite, if it's obvious the branch | |
# isn't nullable, further evaluations would be unnecessary. | |
if parser.lhs.nullable == False or parser.rhs.nullable == False: | |
return False | |
return is_nullable(parser.lhs) and is_nullable(parser.rhs) | |
elif isinstance(parser, Choice): | |
return is_nullable(parser.lhs) or is_nullable(parser.rhs) | |
elif isinstance(parser, Name) and parser.nullable is None: | |
fixpoint_flow(parser) | |
return parser.nullable | |
elif isinstance(parser, Deriv): | |
return is_nullable(parser.rewrite()) | |
else: | |
return parser.nullable | |
# For Names we calculate a fixed point if it's not been calculated yet: | |
# when something is determined to be nullable, we can consider that figured | |
# out. If it was determined to be nullable, nothing further needs to be done. | |
# If something becomes true, we should check the names depending to it | |
# again. | |
# Alongside we're determining the first -set of the grammar. | |
def fixpoint_flow(parser): | |
group = prepare_fixpoint_flow(set(), parser) | |
while len(group) > 0: | |
name = group.pop() | |
name.nullable = is_nullable(name.value) | |
F = len(name.first) | |
name.first.update(name.value.first) | |
if F < len(name.first): | |
group.update(n for n in name.flow) | |
elif name.nullable: | |
group.update(n for n in name.flow if not n.nullable) | |
# The fixed point flow is prepared by a set containing every combinator | |
# that's not been determined to be nullable. | |
def prepare_fixpoint_flow(group, parser): | |
parser.nullable = False | |
parser.first = set() | |
group.add(parser) | |
for depend in parser.depends: | |
if depend.nullable is None: | |
prepare_fixpoint_flow(group, depend) | |
for flow in parser.flow: | |
if flow.nullable is None: | |
prepare_fixpoint_flow(group, flow) | |
return group | |
if __name__=='__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment