Created
September 26, 2012 18:33
-
-
Save aprimc/3789707 to your computer and use it in GitHub Desktop.
Packrat parser for Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
from copy import copy | |
class T:
    '''A tag: a grammatical category name plus an optional value and a dict of
    feature properties.  The same class doubles as a rule element, where the
    extra keywords describe how the element behaves inside a Rule:
      comment - label wrapped around the element's value
      head    - element unifies with the rule top
      assign  - non-head assigns these properties to the top; e.g. subject
      agree   - non-head agrees with the top on these properties; e.g. adjective
      consume - non-head marks these properties, consumed in head; e.g. auxiliary
    '''
    def __init__(self, tag, value = None, props = None, comment = None, head = False, assign = None, agree = None, consume = None):
        # tag part
        self.tag = tag
        self.value = value
        self.props = props if props else {}
        # rule part; TODO subclass?
        self.comment = comment
        self.head = head
        self.assign = assign if assign else []
        self.agree = agree if agree else []
        self.consume = consume if consume else []
        # combined list, used when matching rule fragments
        self.assign_agree_consume = self.assign + self.agree + self.consume
        # sanity checks; TODO move to Rule?
        # a head element carries no assign/agree/consume markers
        assert (self.head == True and self.assign_agree_consume == []) or self.head == False
        # assign excludes agree and consume
        assert (self.assign != [] and self.agree == [] and self.consume == []) or self.assign == []
        # a fixed value and a comment are mutually exclusive
        assert self.value is None or self.comment is None
    def __repr__(self):
        # emit only the arguments that differ from their defaults
        parts = [repr(self.tag)]
        if self.value is not None:
            parts.append(repr(self.value))
        if self.props != {}:
            parts.append('props = ' + repr(self.props))
        if self.comment is not None:
            parts.append('comment = ' + repr(self.comment))
        if self.head:
            parts.append('head = True')
        if self.assign != []:
            parts.append('assign = ' + repr(self.assign))
        if self.agree != []:
            parts.append('agree = ' + repr(self.agree))
        if self.consume != []:
            parts.append('consume = ' + repr(self.consume))
        return 'T(' + ', '.join(parts) + ')'
    def __copy__(self):
        # deliberately partial: only the tag part (tag, value, props) is copied,
        # the rule part is dropped
        # TODO consider whether the partial copy breaks anything; not here but elsewhere
        return T(self.tag, value = copy(self.value), props = copy(self.props))
    def match(self, tag):
        # TODO deprecate
        '''return a unified copy of self, or None if tag does not match'''
        candidate = copy(self)
        return candidate if match_tag(candidate, tag) else None
# | |
# | |
def match_props(target, source, but = None):
    '''Unify property dicts: copy each source property into target unless it
    conflicts.  Keys listed in ``but`` are exempt from both the conflict
    check and the copy.  Returns True on success, False on conflict.
    Modifies target in place (possibly partially on failure).'''
    # property values are plain text; accept unicode too under Python 2
    # (the original `[str, unicode]` raised NameError under Python 3)
    text_types = (str,)
    try:
        text_types = (str, unicode)
    except NameError:
        pass  # Python 3: str covers all text
    for key, val in source.items():
        assert type(val) in text_types
        if key in target:
            # conflicting value is fatal unless the key is exempted
            if target[key] != val and (but is None or key not in but):
                return False
        else:
            # copy the property up, unless exempted
            if but is None or key not in but:
                target[key] = val
    return True
def assign_props(target, source, keys):
    '''Copy the listed properties from source into target; a key already
    present in target must agree with source.  Returns True on success,
    False on conflict.  Modifies target in place.'''
    # property values are plain text; accept unicode too under Python 2
    # (the original `[str, unicode]` raised NameError under Python 3)
    text_types = (str,)
    try:
        text_types = (str, unicode)
    except NameError:
        pass  # Python 3: str covers all text
    for key in keys:
        if key in source:
            val = source[key]
            assert type(val) in text_types
            if key in target:
                # existing property must agree
                if target[key] != val:
                    return False
            else:
                target[key] = val
    return True
def match_tag(target, source, but = None):
    '''Unify source into target: tag names must be identical, a None value
    acts as a wildcard on either side, and properties unify via match_props
    (keys in ``but`` exempted).  Returns True/False; modifies target.'''
    # tag names must be identical
    if target.tag != source.tag:
        return False
    # values unify: None is a wildcard
    if target.value is None:
        target.value = source.value
    elif source.value is not None and target.value != source.value:
        return False
    # properties unify
    return match_props(target.props, source.props, but = but)
def match_rule_frag(target_top, target_bottom, rule_top, rule_bottom, ignore_value = False):
    '''Match one bottom element of a rule, bottom-up (parsing) direction.

    Unifies target_bottom against rule_bottom, folds the resulting properties
    into target_top, and appends target_bottom's value to target_top's value
    list.  Returns True on success, False on failure.  Modifies target_top in
    place (possibly partially on failure); target_bottom itself is matched on
    a copy and left untouched.'''
    # match target with rule bottom
    tb = copy(target_bottom)
    if not match_tag(tb, rule_bottom):
        return False
    # match properties for head, excluding those set by rule top
    # (head_ignore is computed in Rule.__init__ and stored on the rule top)
    if rule_bottom.head:
        if not match_props(target_top.props, tb.props, but = rule_top.head_ignore):
            return False
    # match assigned, agreed or marked properties
    if not assign_props(target_top.props, target_bottom.props, rule_bottom.assign_agree_consume):
        return False
    # append value if not set by rule
    if not ignore_value:
        if target_top.value is None:
            target_top.value = []
        if rule_bottom.value is None:
            value = target_bottom.value
            # comment: wrap the value in a [label, value] pair
            if rule_bottom.comment is not None:
                value = [rule_bottom.comment, value]
            target_top.value.append(value)
    # ok
    return True
def unmatch_rule_frag(target_top, target_bottom, rule_top, rule_bottom, ignore_value = False):
    '''Match one bottom element of a rule, top-down (generation) direction.

    Unifies target_bottom against rule_bottom and against target_top's
    properties, then moves the next element of target_top's value list into
    target_bottom.  Returns True on success, False on failure.  Modifies both
    target_top (one value consumed) and target_bottom in place.'''
    # match target with rule bottom
    if not match_tag(target_bottom, rule_bottom):
        return False
    # match properties for head, excluding those set by rule top
    if rule_bottom.head:
        if not match_props(target_bottom.props, target_top.props, but = rule_top.head_ignore):
            return False
    # match assigned, agreed or marked properties
    if not assign_props(target_bottom.props, target_top.props, rule_bottom.assign_agree_consume):
        return False
    # get value from target_top or from rule_bottom
    if not ignore_value:
        if rule_bottom.value is None:
            # nothing left to consume
            if target_top.value == []:
                return False
            # a non-list value means this top was flattened; refuse it here
            if type(target_top.value) is not list:
                return False # flattening causes ambiguity :)
            value = target_top.value[0]
            # uncomment: unwrap the [label, value] pair, checking the label
            if rule_bottom.comment is not None:
                if rule_bottom.comment != value[0]:
                    return False
                assert len(value) == 2
                value = value[1]
            target_bottom.value = value
            # consume the head of the value list
            target_top.value = target_top.value[1:]
        else:
            # value is fixed by the rule itself
            assert target_bottom.value == rule_bottom.value
    # ok
    return True
# | |
# | |
class Rule:
    '''A grammar rule: one top tag rewrites to a sequence of bottom tags.

    match() goes bottom-up (parsing): a list of matched bottom tags yields
    the top tag.  unmatch() goes top-down (generation): a top tag yields the
    list of bottom tags, distributing the top's value among them.'''
    def __init__(self, top, bottom):
        self.top = top
        self.bottom = bottom
        # is_flat: exactly one bottom element contributes a value, so the
        # top's value is that single value rather than a one-element list
        # TODO comment why flattening is necessary
        self.is_flat = False
        value_ct = 0
        for t in self.bottom:
            if t.value is None:
                value_ct += 1
        if value_ct == 1:
            self.is_flat = True
        # head_ignore: properties the head match must not propagate upward
        head_ignore = []
        # ignore properties set in rule top
        for prop in self.top.props:
            if prop not in head_ignore:
                head_ignore.append(prop)
        # ignore marked and explicitly set in head
        for tag in self.bottom:
            for prop in tag.consume:
                if prop not in head_ignore:
                    head_ignore.append(prop)
            if tag.head:
                # a head element must carry no assign/agree/consume markers
                assert tag.agree == [] and tag.assign == [] and tag.consume == []
                for prop in tag.props:
                    if prop not in head_ignore:
                        head_ignore.append(prop)
        # set it into top, where match_rule_frag/unmatch_rule_frag read it
        self.top.head_ignore = head_ignore
    def __repr__(self):
        return 'Rule(%s, %s)' % (repr(self.top), repr(self.bottom))
    def match(self, bottom):
        '''Match a list of bottom tags; return the unified top tag or None.'''
        if len(self.bottom) != len(bottom):
            return None
        result = copy(self.top)
        # match fragments, accumulating properties and values into result
        for rule_bottom, target_bottom in zip(self.bottom, bottom):
            if not match_rule_frag(result, target_bottom, self.top, rule_bottom):
                return None
        # flatten a single-value list to the bare value
        if self.is_flat:
            result.value = result.value[0]
        # comment: wrap the value in a [label, value] pair
        if self.top.comment is not None:
            result.value = [self.top.comment, result.value]
        return result
    def unmatch(self, top):
        '''Expand a top tag; return the list of bottom tags or None.'''
        target_top = copy(top)
        if not match_tag(target_top, self.top):
            return None
        # uncomment: unwrap the [label, value] pair, checking the label
        if self.top.comment is not None:
            if self.top.comment != target_top.value[0]:
                return None
            assert len(target_top.value) == 2
            target_top.value = target_top.value[1]
        # unflatten: restore the one-element value list
        if self.is_flat:
            target_top.value = [target_top.value]
        bottom = []
        # distribute the top's values over the bottom elements in order
        for tag in self.bottom:
            target_bottom = copy(tag)
            if not unmatch_rule_frag(target_top, target_bottom, self.top, tag):
                return None
            bottom.append(target_bottom)
        return bottom
class Grammar:
    '''A set of rules indexed by the tag name of their top.'''
    def __init__(self, rules):
        # ruletab: top tag name -> [Rule, ...] in declaration order
        self.ruletab = {}
        for rule in rules:
            self.ruletab.setdefault(rule.top.tag, []).append(rule)
    def rules(self, tag):
        '''All rules whose top tag name matches the given tag's name.'''
        assert isinstance(tag, T)
        return self.ruletab.get(tag.tag, [])
class Lexicon:
    '''A word list indexed both by surface word (for parsing) and by tag
    value (for generation).'''
    def __init__(self, wordlist = None):
        self.wordtab = {}   # surface word -> [tag, ...]
        self.tagtab = {}    # tag value -> [(tag, word), ...]
        self.terms = {}     # terminal tag name -> entry count
        for word, tag in (wordlist or []):
            # shorthand: a bare tag name string means T(tagname, word)
            if type(tag) is str:
                tag = T(tag, word)
            self.add(word, tag)
    def add(self, word, tag):
        '''Register a (word, tag) pair in all three indexes.'''
        assert isinstance(tag, T)
        self.wordtab.setdefault(word, []).append(tag)
        self.tagtab.setdefault(tag.value, []).append( (tag, word) )
        self.terms[tag.tag] = self.terms.get(tag.tag, 0) + 1
    def matchword(self, word, tag):
        '''First tag of the word that unifies with tag, or None.'''
        for candidate in self.wordtab.get(word, []):
            m = tag.match(candidate)
            if m is not None:
                return m
        return None
    def matchword_all(self, word, tag):
        '''All tags of the word that unify with tag.'''
        matches = []
        for candidate in self.wordtab.get(word, []):
            m = tag.match(candidate)
            if m is not None:
                matches.append(m)
        return matches
    def getwordmatch(self, tag):
        '''(word, unified tag) expressing the given tag, or None.'''
        for candidate, word in self.tagtab.get(tag.value, []):
            m = tag.match(candidate)
            if m is not None:
                return word, m
        return None
    def getword(self, tag):
        '''Word expressing the given tag, or None.'''
        word_match = self.getwordmatch(tag)
        return None if word_match is None else word_match[0]
    def isterm(self, tag):
        '''True if the tag name is a terminal, i.e. has lexicon entries.'''
        assert isinstance(tag, T)
        return tag.tag in self.terms
# top-down parser
# + handles left recursion bottom-up
# + memoizes partial results
# TODO + blocks left circular recursion; should it crash instead?
#
def parse_all(goal, tokens, index, grammar, lexicon, memo = None):
    '''All parses of goal starting at tokens[index]; returns a list of
    (matched tag, next index) pairs.  memo caches results per
    (index, goal tag/value/props) so every subproblem is solved at most
    once -- the packrat part.'''
    assert isinstance(goal, T)
    if index == len(tokens):
        return []
    assert index <= len(tokens)
    # check memo
    if memo is None:
        memo = {}
    # TODO make lighter key; perhaps a hash
    memo_key = (index, goal.tag, goal.value, tuple(goal.props.items()))
    if memo_key in memo:
        return memo[memo_key]
    # avoid circular left recursion: a re-entrant call with the same key
    # sees an empty result instead of recursing forever
    memo[memo_key] = [] # TODO ignore or break?
    # match word
    if lexicon.isterm(goal):
        matches = lexicon.matchword_all(tokens[index], goal)
        r = [(i, index + 1) for i in matches]
        memo[memo_key] = r # store in memo
        return r
    # match rule
    r = []
    flag = False
    for rule in grammar.rules(goal):
        if rule.bottom[0].tag == goal.tag:
            # skip left recursive rule; handled bottom-up below
            assert len(rule.bottom) > 1
            # TODO is order relevant? if so, preserve. but how? :)
            flag = True
            continue
        for match, next in parse_rule(rule, tokens, index, grammar, lexicon, memo):
            r.append( (match, next) )
    if flag:
        # handle left recursive rules bottom-up: grow each non-recursive
        # parse by repeatedly applying the left-recursive rules
        for match, next in r:
            if match.tag == goal.tag:
                for rule in grammar.rules(goal):
                    if rule.bottom[0].tag == goal.tag:
                        # ^ only these apply for LR bottom-up
                        for match2, next2 in parse_rule_first(rule, match, tokens, next, grammar, lexicon, memo):
                            r.append( (match2, next2) ) # NB appended in place; topmost loop will pick it up
    memo[memo_key] = r # store in memo
    return r
def parse_rule(rule, tokens, index, grammar, lexicon, memo):
    '''All (matched top tag, next index) parses of the rule's full bottom
    sequence starting at tokens[index].'''
    candidates = ((rule.match(tags), next_index)
                  for tags, next_index in parse_rule_rest(rule.bottom, tokens, index, grammar, lexicon, memo))
    # keep only the sequences that unify into the rule top
    return [(top, next_index) for top, next_index in candidates if top is not None]
def parse_rule_first(rule, tag, tokens, index, grammar, lexicon, memo):
    '''Like parse_rule, but the first bottom element is already matched as
    tag; only the remaining elements are parsed from tokens[index] on.
    Used for the bottom-up left-recursion step.'''
    results = []
    for rest_tags, next_index in parse_rule_rest(rule.bottom[1:], tokens, index, grammar, lexicon, memo):
        top = rule.match([tag] + rest_tags)
        if top is not None:
            results.append( (top, next_index) )
    return results
def parse_rule_rest(tags, tokens, index, grammar, lexicon, memo):
    '''All ways to parse the sequence of rule-bottom tags starting at
    tokens[index]; returns a list of ([matched tag, ...], next index) pairs.'''
    assert len(tags) > 0
    head, rest = tags[0], tags[1:]
    results = []
    for tag, next_index in parse_all(head, tokens, index, grammar, lexicon, memo):
        if not rest:
            results.append( ([tag], next_index) )
            continue
        # recurse on the remaining elements, continuing where this one ended
        for rest_tags, next_index2 in parse_rule_rest(rest, tokens, next_index, grammar, lexicon, memo):
            results.append( ([tag] + rest_tags, next_index2) )
    return results
# generator
#
def _unparse(tag, grammar, lexicon):
    '''Generate (list of words, matched tag) for the given tag, or
    (None, None) if generation fails.'''
    # TODO rename tag~tags to top~bottom
    # from lexicon; note the type!
    # NOTE(review): the `is str` check keeps composite (list) values out of the
    # lexicon path; under Python 2 a unicode value would also be skipped -- confirm
    if lexicon.isterm(tag) and type(tag.value) is str:
        word_match = lexicon.getwordmatch(tag)
        if word_match:
            word, match = word_match
            return [word], match
        return None, None
    # match rule: try each rule for this tag until one generates
    for rule in grammar.rules(tag):
        # get tags
        tags = rule.unmatch(tag)
        if tags is None:
            continue
        top = copy(tag)
        # struct rows are mutable: [bottom tag, rule element, generated words]
        struct = [ [t, r, None] for t, r in zip(tags, rule.bottom) ]
        # match in order: assign > head > other
        # to percolate lexical properties, which may not be present in the top tag
        flag = True
        for order in [1, 2, 3]:
            for trw in struct:
                bt, br, dummy = trw
                # that orders are exclusive should be guaranteed by T.__init__
                if (order == 1 and br.assign != []
                    or order == 2 and br.head
                    or order == 3 and br.assign == [] and not br.head):
                    flag = False
                    # update bottom from top
                    if not unmatch_rule_frag(top, bt, rule.top, br, ignore_value = True):
                        break
                    # generate fragment
                    ws, t = _unparse(bt, grammar, lexicon)
                    if t is None:
                        break
                    # update top from bottom
                    if not match_rule_frag(top, t, rule.top, br, ignore_value = True):
                        break
                    trw[2] = ws
                    flag = True
            if not flag:
                break
        if not flag:
            # this rule failed; try the next one
            continue
        # collect words from all fragments, left to right
        words = []
        for t, r, ws in struct:
            for w in ws:
                words.append(w)
        return words, top
    return None, None
def unparse(tag, grammar, lexicon):
    '''Generate the list of words expressing tag, or None if generation fails.'''
    words, _matched = _unparse(tag, grammar, lexicon)
    return words
# test grammar and lexicon
#
grammar = Grammar([
    # noun phrases: optional determiner, then an N1
    Rule(T('NP'), [T('DET'), T('N1', head = True)]),
    Rule(T('NP'), [T('N1', head = True)]),
    # N1: any number of adjectives before the noun
    Rule(T('N1'), [T('A'), T('N1', head = True)]),
    # NOTE(review): head=True on this rule's top is unusual -- no other top sets it; confirm intended
    Rule(T('N1', head = True, props = {'pers': '3'}), [T('N')]),
    # postmodifying prepositional phrase (left recursive)
    Rule(T('NP'), [T('NP', head = True), T('PP', comment = 'MOD')]),
    Rule(T('PP'), [T('PREP'), T('NP')]),
    # sentence: the subject assigns num/pers agreement to the verb group
    Rule(T('S'), [
        T('NP', assign = ['num', 'pers'], comment = 'SUBJ'),
        T('V1', head = True)]),
    # past perfect: auxiliary 'have' consumes num/pers; head is a participle
    Rule(T('V1', props = {'tense': 'past perfect'}), [
        T('V', 'have', props = {'tense': 'present'}, consume = ['num', 'pers']),
        T('V1', head = True, props = {'tense': 'part'})]),
    # verb subcategorization: ditransitive, transitive, intransitive
    Rule(T('V1'), [T('V', head = True), T('NP', comment = 'IOBJ'), T('NP', comment = 'OBJ')]),
    Rule(T('V1'), [T('V', head = True), T('NP', comment = 'OBJ')]),
    Rule(T('V1'), [T('V', head = True)]),
    # postmodifying prepositional phrase on the verb group (left recursive)
    Rule(T('V1'), [T('V1', head = True), T('PP', comment = 'MOD')]),
    ])
lexicon = Lexicon([
    # closed-class words: a bare tag name string means T(tagname, word)
    ('the', 'DET'),
    ('a', 'DET'),
    ('cat', 'N'),
    ('bird', 'N'),
    ('tree', 'N'),
    ('in', 'PREP'),
    # verb forms with explicit tense/agreement features
    ('eats', T('V', 'eat', props = {'tense': 'present', 'num': 'sg', 'pers': '3'})),
    ('eat', T('V', 'eat', props = {'tense': 'present'})),
    ('ate', T('V', 'eat', props = {'tense': 'past'})),
    ('eaten', T('V', 'eat', props = {'tense': 'part'})),
    ('has', T('V', 'have', props = {'tense': 'present', 'num': 'sg', 'pers': '3'})),
    ('have', T('V', 'have', props = {'tense': 'present'})),
    ('had', T('V', 'have', props = {'tense': 'past'})),
    ('had', T('V', 'have', props = {'tense': 'part'})),
    ('fat', 'A'),
    ('little', 'A'),
    ])
# demo sentences for the __main__ run; the first two are disabled
sentences = [
    #'the cat eats a bird',
    #'the cat ate a bird',
    'the cat has eaten a bird',
    'the cat ate a bird in the tree',
    'a fat cat ate a little bird',
    ]
# frobnicate
#
# demo driver: parse each sentence, then regenerate it from its parse
# NB Python 2 print statements -- this file targets Python 2
if __name__ == '__main__':
    for s in sentences:
        print '<<', s
        w = s.split()
        # all parses of the token list as a sentence, from position 0
        l = parse_all(T('S'), w, 0, grammar, lexicon)
        for t,r in l:
            # keep only parses that consumed every token
            if r == len(w):
                print '==', t.value#, t.props
                print '>>', ' '.join(unparse(t, grammar, lexicon))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment