Skip to content

Instantly share code, notes, and snippets.

@deliciouslytyped
Last active March 7, 2023 02:35
Show Gist options
  • Save deliciouslytyped/ec43b8a203ccc7815b3a79a4bbc04920 to your computer and use it in GitHub Desktop.
Save deliciouslytyped/ec43b8a203ccc7815b3a79a4bbc04920 to your computer and use it in GitHub Desktop.
# References:
# file:///C:/Users/Lenovo/Downloads/vaszil_forditok.pdf
# Alfred V. Aho, Monica S. Lam, Ravi Sethi, Jeffrey D. Ullman - Compilers - Principles, Techniques, and Tools-Pearson_Addison Wesley (2006).pdf
from textwrap import dedent
import string
from collections import defaultdict
import z3
from itertools import combinations
from functools import reduce
class Orsum: # todo callable?
@staticmethod
def orsum(iterable, start): # TODO make this a generator somehow?
for i in iterable:
start |= i
return start
@classmethod
def check(cls):
assert cls.orsum([{1, 2, 3}, {3, 4, 5}, {4, 5, 6}], frozenset()) == frozenset([1, 2, 3, 4, 5, 6])
orsum = Orsum.orsum
Orsum.check()
class Fix:
# maxiter is to help with infrec
@staticmethod
def fix(f, y, maxiter=10000): # TODO is this correct
x = None
i = 0
while x != y and i < maxiter:
x, y = y, f(y)
i += 1
if i == maxiter:
raise RecursionError("Iteration limit exceeded, last values: (%s, %s)" % (x, y))
return y
@classmethod
def check(cls):
import math
assert(cls.fix(lambda x: math.sqrt(x), 2) == 1.0)
def assertException(f, e):
try:
f()
except e:
return
else:
raise AssertionError("%s not raised" % e.__name__)
assertException(lambda: cls.fix(lambda x: math.sqrt(x), 2, 10), RecursionError)
Fix.check()
fix = Fix.fix
# TODO define syntax for grammar definitions
# terminals are uncapitalized words or symbols
# nonterminals are capitalized words
# right hand side alternatives must be separated by " | "
# left hand and right hand must be separated by " -> "
# currently it is assumed that the left hand of the first rule is the start symbol.
# the terminal epsilon is handled as a special case for the empty string/terminal
# TODO the following is unimplemented currently:
# if the first two lines dont contain an arrow (otherwise error) then the first line
# is taken to contain the start symbol and the second line the alphabet.
# if an alphabet is defined, this enables assertions about it #TODO
def hazi_to_grammar(grmr):
# add arrows to rules and separate alternatives, also separate adjacent characters with a space (rule right sides)
# so transform something like S A BC D to S -> A | B C | D
grmr = "\n".join(
l.split()[0] + " -> " + " | ".join(
map(lambda x: " ".join(x) if x != "epsilon" else x, l.split()[1:]) #TODO test epsilon
) if i >= 2 else l
for i, l in enumerate(grmr.splitlines()))
# split nonterminals
return grmr
class Cases:
grmr1 = dedent("""
E -> E + E | E * E | num
""").strip()
# a CYK házi
# TODO itt miért van különszedve az egyelemű és kételemű S jobboldal?
grmr2 = hazi_to_grammar(dedent("""
S
a b
S A B E C X Y Z
S YB XA
E YB XA
A a YE XC
B b XE YZ
C AA
X b
Y a
Z BB
""").strip())
leftrec1 = dedent("""
A -> A a | b
""").strip()
class MetaGrammar:
@staticmethod
def is_terminal(s: str): # TODO is this missing anything
return s.lower() == s or s in string.punctuation
@staticmethod
def is_nonterminal(s: str):
return s[0].isupper() and (len(s) == 1 or s[1:].isalnum())
class GrammarT: # TODO or you could probably just inherit from dict?
def __init__(self, s):
self.start = None
self.alphabet = None # only if defined in the second line of the grammar
self.g = self.read(s)
def __repr__(self): # TODO topologically sorted?
ret = "\n".join(f'{k} -> {" | ".join(v)}' for k, v in self.g.items())
return 'GrammarT("""%s""")' % ret
# def __len__(self): # TODO number of rules or rule alternatives?
def __getitem__(self, item):
return self.g.__getitem__(item)
#TODO toposort / ordereddict (how to deal with cycles?)
def read(self, s : str): # TODO handle alphabet and start symbol definition lines
#TODO find start symbol by attempting topological sort?
lines = s.splitlines()
maybe_start = lines[0].split()
if len(maybe_start) == 1: # assume start symbol and alphabet declaration
self.start = maybe_start[0]
self.alphabet = lines[1].split()
lines = lines[2:]
else:
self.start = lines[0].split(" -> ")[0]
grmr = defaultdict(list)
for l in lines:
k, v = l.split(" -> ")
grmr[k].extend(v.split(" | "))
return grmr
def terminals(self):
grmr_items = set(sum([x.split() for x in sum(self.g.values(), [])], []))
return {e for e in grmr_items if MetaGrammar.is_terminal(e)}
def nonterminals(self): # TODO technically shouldnt it be enough to just look at the left hand sides for nonterminals?
return set(self.g.keys()) | orsum((self._nonterminals_of(left) for left in self.g.keys()), frozenset())
#grmr_items = set(self.g.keys()) | set(sum([x.split() for x in sum(self.g.values(), [])], []))
#return {e for e in grmr_items if MetaGrammar.is_nonterminal(e)}
def _nonterminals_of(self, nt):
return orsum(({x for x in alt.split() if MetaGrammar.is_nonterminal(x)} for alt in self.g[nt]), frozenset())
def nonterminals_of(self, iterable): # TODO not sure if this is correct
try:
i = iter(iterable)
if issubclass(iterable.__class__, str):
raise TypeError
except TypeError: # assume one item
return self._nonterminals_of(iterable)
else: # assume set of items
return orsum((self._nonterminals_of(x) for x in iterable), frozenset())
# undecidable?
def is_ambiguous(self):
raise NotImplementedError
# TODO
# for each rule, any is in it's in it's own closure, its leftrec
# TODO check if this is correct
# TODO two definitions of leftrec? 1) any rule is left rec, 2) S is left rec? 3) it is a parent of a leftrec rule?
# Aho 4.3.3 lists \exists rule such that A *=> Aa as the def.
def is_left_rec(self, nt=None):
if not nt:
lrs = frozenset()
for r in self.g.keys():
nts = fix(lambda x: frozenset(x) | self.nonterminals_of(x), self.nonterminals_of(r))
if r in nts: # TODO this is wrong; the condition needs to be that ...A... has a nullable prefix
lrs |= {r}
return lrs
else:
nts = fix(lambda x: frozenset(x) | self.nonterminals_of(x), self.nonterminals_of(nt))
return nt in nts
# if lr is not specified, do the below for every rule (does the grammar have any epsilon rules?)
# TODO/NOTE prefix if set does not make sense with idx=None? because every alternative could have a different prefix of interest
# if lr is specified, return if the rule right side is nullable up to prefix
# i.e. if prefix is the default -1, the whole rule can return the empty string #TODO/NOTE https://stackoverflow.com/questions/11337941/python-negative-zero-slicing
# otherwise prefix is the index inclusive up to which the rule is prefixable with epsilon
#
# idx specifies the index of a specific alternative in the rule #TODO
#
# This is used in LRE.
#TODO tests
def is_nullable(self, lr=None, prefix=None, idx=None): # TODO what do I need to do here again?
assert not idx and prefix # idx implies prefix
if lr:
alts = [self.g[lr][idx]] if idx else self.g[lr]
for alt in alts:
altnullable = True
for sym in alt[:prefix]: # TODO need to iter over alternatives
if MetaGrammar.is_terminal(sym):
altnullable = False #TODO needs to pass all alternatives
break
if self.is_nullable(sym):
continue
return
return False
else:
raise NotImplementedError
# TODO
# TODO look at https://en.wikipedia.org/wiki/Left_recursion#cite_note-Moore2000-2 and its wiki page
# The wiki page suggests that LRE algorithms may change the semantics?
def LRE(self):
# Precondition by aho figure 4.11
# INPUT: Grammar G with no cycles or epsilon productions # TODO this means no epsilon rules right?
assert self.has_cycles() == False and self.is_nullable() == False
# Assert leftrec eliminiation succeeded, Aho 4.3.3 suggests the above procedure doesnt always work? # TODO
assert bool(self.is_left_rec()) == False
return self
# TODO
def left_factor(self):
raise NotImplementedError
### CNF ###
# http://informatikdidaktik.de/InformaticaDidactica/LangeLeiss2009
# https://en.wikipedia.org/wiki/Chomsky_normal_form
def START(self):
return self
def TERM(self):
return self
def BIN(self):
return self
def UNIT(self):
return self
def DEL(self):
return self
### LL grammar ops ###
def FIRST(self, rule):
return self
def FOLLOW(self, rule):
return self
#TODO
class TestGrammarT: # TODO # TODO I should really be using unittest for this shouldnt I?
@classmethod
def check(cls):
cls.test_grmr1()
cls.test_grmr2()
cls.test_leftrec1()
@staticmethod
def test_grmr1():
g = GrammarT(Cases.grmr1)
print(g)
assert g._nonterminals_of("E") == {"E"}
#assert g._nonterminals_of(["E", "E", "E"]) == {"E"} # only accepts one item
assert g.nonterminals_of("E") == {"E"}
assert g.nonterminals_of(["E", "E", "E"]) == {"E"}
#assert g._terminals_of("") #TODO
assert g.nonterminals() == {"E"}
assert g.terminals() == {"*", "+", "num"}
assert g.start == "E"
#assert g. # TODO assert rule set
#TODO I think book said ambiguity is undecidable so we cant test that?
assert bool(g.is_left_rec())
pass # assert g.LRE().is_left_rec() is False
@staticmethod
def test_grmr2(): # TODO
g = GrammarT(Cases.grmr2)
assert g.is_left_rec() == {'Z', 'C', 'A', 'B', 'E'} # TODO check this
assert bool(g.is_left_rec("S")) is False
assert g.nonterminals() - g.is_left_rec() == {"Y", "S", "X"} # TODO check this #TODO probably wrong, previous definition of is_left_rec didnt account for nullable prefixes
print(g)
#rr = g.z3()
#pass
@staticmethod
def test_leftrec1():
g = GrammarT(Cases.leftrec1)
#assert g.LRE() == GrammarT(dedent("""
# A -> b A2
# A2 -> a A2 | epsilon
# """).strip())
print(g)
if __name__ == "__main__":
TestGrammarT.check()
from z3 import *
from textwrap import dedent
from grammar_trf import GrammarT, MetaGrammar
from itertools import combinations
import string
from functools import reduce
_grmr1 = dedent("""
A -> A a | b
""").strip()
_grmr2 = dedent("""
B -> B a | b
""").strip()
grmr1 = GrammarT(_grmr1)
grmr2 = GrammarT(_grmr2)
flip = lambda f: lambda a, b: f(b, a)
foldr = lambda f, l: reduce(flip(f), reversed(l))
def to_ruleset(grmr: GrammarT, rulesetnum):
syms = {x: String(f"r{rulesetnum}_{x}") for x in grmr.nonterminals()}
distinct_constraints = [a != b for a, b in combinations(syms.values(), 2)]
domain = list(string.ascii_uppercase[:len(syms)])
domain_constraints = [foldr(lambda a, b: Or((x == a), b), domain + [False]) for x in syms.values()]
constraints = list()
rules = Array("rules%s" % rulesetnum, StringSort(), SetSort(ArraySort(IntSort(), StringSort())))
for k, v in grmr.g.items():
_alts = EmptySet(ArraySort(IntSort(), StringSort()))
for i, alt in enumerate(v):
_alt = Array(f"r{rulesetnum}_{k}{i}", IntSort(), StringSort())
_alts = SetAdd(_alts, _alt)
constraints.extend([Select(_alt, j) == (syms[e] if MetaGrammar.is_nonterminal(e) else e)
for j, e in enumerate(alt.split())])
constraints.append(IsMember(_alt, Select(rules, syms[k])))
constraints.append(IsSubset(Select(rules, syms[k]), _alts))
return syms, rules, sum([distinct_constraints, domain_constraints, constraints], [])
def ruleset_eq(idom, jdom, rs1: Array, rs2: Array):
assert rs1.sort() == ArraySort(StringSort(), SetSort(ArraySort(IntSort(), StringSort())))
assert rs2.sort() == ArraySort(StringSort(), SetSort(ArraySort(IntSort(), StringSort())))
constr = list()
for i in idom: # forall i there exists j such that the set of alternatives for the rule are the same
expr = lambda a, b: Or(
And(SetUnion(Select(rs1, i), Select(rs2, a)) == Select(rs1, i),
SetUnion(Select(rs1, i), Select(rs2, a)) == Select(rs2, a)),
b)
constr.append(foldr(expr, list(jdom) + [False]))
return constr
def eq(g1, g2):
s1, r1, constr1 = to_ruleset(g1, 1)
s2, r2, constr2 = to_ruleset(g2, 2)
constr3 = ruleset_eq(s1.values(), s2.values(), r1, r2)
s = Solver()
constraints = constr1 + constr2 + constr3
s.add(*constraints)
if s.check() == sat:
return True
else:
return False
if __name__ == "__main__": # TODO refactor
print(eq(grmr1, grmr2))
print(eq(GrammarT("A -> c"), GrammarT("B -> a b")))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment