Created
November 6, 2022 15:31
-
-
Save Trebor-Huang/9e733a28d30b39b4994b014ca225bdcd to your computer and use it in GitHub Desktop.
Parsing with derivatives
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from functools import lru_cache | |
class Map: | |
def __init__(self, f, iterable): | |
self.data = (f, iterable) | |
def __iter__(self): | |
for i in self.data[1]: | |
yield self.data[0](i) | |
class Parser: | |
def __init__(self, equations:dict, charP=None, concatP=None): | |
self.charP = charP or (lambda d: ("Char", d)) | |
self.concatP = concatP or (lambda s, t: ("Concat", s, t)) | |
self.equations = {("", name) : self.compaction(equations[name]) for name in equations} | |
def __getitem__(self, name): | |
"""Lazily evaluate derivatives.""" | |
if name in self.equations: | |
return self.equations[name] | |
cs, name = name | |
if cs == "": | |
raise ValueError("Node name not found:", name) | |
comb = self.compaction(self.derive(self[cs[:-1], name], cs[-1])) | |
self.equations[cs, name] = comb | |
return comb | |
@lru_cache(10000) | |
def compaction(self, comb): | |
if not isinstance(comb, tuple): | |
return comb | |
comb = tuple(self.compaction(c) for c in comb) | |
match comb: | |
case ("Nullify", ("Char", _)): | |
return "Reject", | |
case ("Nullify", ("Null", trees)): | |
return "Null", trees | |
case ("Nullify", ("Nullify", comb)): | |
return ("Nullify", comb) | |
case ("Concat", ("Reject",), _) | ("Concat", _, ("Reject",)): | |
return "Reject", | |
case ("Union", ("Reject",), comb) | ("Union", comb, ("Reject",)): | |
return comb | |
case ("Transform", ("Null", trees), f): | |
return "Null", Map(f, trees) | |
case ("Transform", ("Transform", comb, f), g): | |
return "Transform", comb, lambda x: g(f(x)) | |
case _: | |
return comb | |
@lru_cache(10000) | |
def derive(self, comb, c): | |
match comb: | |
case "Reject",: | |
return "Reject", | |
case "Char", d: | |
if c == d: | |
return "Null", (self.charP(d),) | |
return "Reject", | |
case "Null", _: | |
return "Reject", | |
case "Union", s, t: | |
return "Union", self.derive(s, c), self.derive(t, c) | |
case "Concat", s, t: | |
return "Union",\ | |
("Concat", self.derive(s, c), t),\ | |
("Concat", ("Nullify", s), self.derive(t, c)) | |
case "Nullify", s: | |
return "Reject", | |
case "Transform", s, f: | |
return "Transform", self.derive(s, c), f | |
case "Rec", cs, ref: | |
return "Rec", cs + c, ref | |
case _: | |
raise ValueError("Unrecognized grammar", comb) | |
def parse(self, string = '', entry = "Language"): | |
# This produces trees, will be faster if only recognizes | |
self.queue = [(string, entry)] | |
self.production = {} | |
self.notification = {} | |
while self.queue: | |
self.notify = set() | |
cs, ref = self.queue.pop(0) | |
# ("Queue: ", cs, ref) | |
notify = False | |
for tree in self.ennull(self[cs, ref]): | |
if tree in self.production[cs, ref]: | |
continue | |
# ("Produced: ", tree) | |
notify = True | |
self.production[cs, ref].add(tree) | |
if (cs, ref) == (string, entry): | |
yield tree | |
# ("Subscribing: ", self.notify) | |
for u in self.notify: | |
if u not in self.notification: | |
self.notification[u] = set() | |
self.notification[u].add((cs, ref)) | |
if notify: | |
# ("Notifying: ", self.notification[cs, ref]) | |
self.queue.extend(self.notification[cs, ref]) | |
def ennull(self, comb = ("Rec", '', "Language")): | |
match comb: | |
case "Reject",: | |
return | |
case "Char", d: | |
return | |
case "Null", trees: | |
yield from trees | |
case "Union", s, t: | |
yield from self.ennull(s) | |
yield from self.ennull(t) | |
case "Concat", s, t: | |
for left in self.ennull(s): | |
for right in self.ennull(t): | |
yield self.concatP(left, right) | |
case "Nullify", s: | |
yield from self.ennull(s) | |
case "Transform", s, f: | |
for tree in self.ennull(s): | |
yield f(tree) | |
case "Rec", cs, ref: # Least fixpoint | |
self.notify.add((cs, ref)) | |
if (cs, ref) not in self.production: | |
self.production[cs, ref] = set() | |
self.queue.append((cs, ref)) | |
yield from self.production[cs, ref] | |
case _: | |
raise ValueError("Unrecognized grammar", comb) | |
lang = Parser({ # L := L{L}L | L where '{' and '}' are terminals | |
"Language": | |
("Union", | |
("Concat", ("Rec", '', "Language"), | |
("Concat", ("Char", '{'), | |
("Concat", ("Rec", '', "Language"), | |
("Concat", ("Char", '}'), | |
("Rec", '', "Language"))))), | |
("Null", (("Null",),))) | |
}) | |
for tree in lang.parse('{{}{{}{}}}{}'): | |
print(tree) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment