@aprimc
Created September 26, 2012 18:33
Packrat parser for Python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from copy import copy
class T:
    def __init__(self, tag, value = None, props = None, comment = None, head = False, assign = None, agree = None, consume = None):
        # tag
        self.tag = tag
        self.value = value
        self.props = props or {}
        # rule; TODO subclass?
        self.comment = comment  # comment value
        self.head = head  # head matches with top
        self.assign = assign or []  # non-head assigns property; e.g. subject
        self.agree = agree or []  # non-head agrees with property; e.g. adjective
        self.consume = consume or []  # non-head is a marker for property; property in head is consumed; e.g. auxiliary
        # build single list
        self.assign_agree_consume = self.assign + self.agree + self.consume
        # some checking; TODO move to rule?
        assert (self.head == True and self.assign_agree_consume == []) or self.head == False
        assert (self.assign != [] and self.agree == [] and self.consume == []) or self.assign == []
        assert self.value is None or self.comment is None

    def __repr__(self):
        return ''.join(['T(',
            repr(self.tag),
            (self.value is not None and ', ' + repr(self.value) or ''),
            (self.props != {} and ', props = ' + repr(self.props) or ''),
            (self.comment is not None and ', comment = ' + repr(self.comment) or ''),
            (self.head and ', head = True' or ''),
            (self.assign != [] and ', assign = ' + repr(self.assign) or ''),
            (self.agree != [] and ', agree = ' + repr(self.agree) or ''),
            (self.consume != [] and ', consume = ' + repr(self.consume) or ''),
            ')'])

    def __copy__(self):
        # TODO consider whether a partial copy below breaks anything; not here but elsewhere
        return T(self.tag, value = copy(self.value), props = copy(self.props))

    def match(self, tag):
        # TODO deprecate
        '''return tag or none'''
        target = copy(self)
        if match_tag(target, tag):
            return target
        return None
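
# Note: a T doubles as a parse-tree node (tag, value, props) and, inside a
# Rule's bottom, as a pattern with a role: `head` shares its properties with
# the rule top, `assign`/`agree` copy the named properties up to the top
# (e.g. the subject NP assigning 'num'/'pers' to S in the test grammar), and
# `consume` marks properties supplied by a non-head element (e.g. the
# auxiliary 'have'), which Rule.head_ignore then excludes from head matching.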
#
#
def match_props(target, source, but = None):
    '''returns True or False, modifies target'''
    for key, val in source.items():
        assert type(val) in [str, unicode]
        if key in target:
            if target[key] != val and (but is None or key not in but):
                return False
        else:
            if but is None or key not in but:
                target[key] = val
    return True

def assign_props(target, source, keys):
    '''returns True or False, modifies target'''
    for key in keys:
        if key in source:
            val = source[key]
            assert type(val) in [str, unicode]
            if key in target:
                if target[key] != val:
                    return False
            else:
                target[key] = val
    return True

def match_tag(target, source, but = None):
    if target.tag != source.tag:
        return False
    if target.value is None:
        target.value = source.value
    elif source.value is None:
        pass
    elif target.value != source.value:
        return False
    if not match_props(target.props, source.props, but = but):
        return False
    return True
def match_rule_frag(target_top, target_bottom, rule_top, rule_bottom, ignore_value = False):
    '''match target_top; return True or False'''
    # match target with rule bottom
    tb = copy(target_bottom)
    if not match_tag(tb, rule_bottom):
        return False
    # match properties for head, excluding those set by rule top
    if rule_bottom.head:
        if not match_props(target_top.props, tb.props, but = rule_top.head_ignore):
            return False
    # match assigned, agreed or marked properties
    if not assign_props(target_top.props, target_bottom.props, rule_bottom.assign_agree_consume):
        return False
    # append value if not set by rule
    if not ignore_value:
        if target_top.value is None:
            target_top.value = []
        if rule_bottom.value is None:
            value = target_bottom.value
            # comment
            if rule_bottom.comment is not None:
                value = [rule_bottom.comment, value]
            target_top.value.append(value)
    # ok
    return True

def unmatch_rule_frag(target_top, target_bottom, rule_top, rule_bottom, ignore_value = False):
    '''match target_bottom; consume value in target_top; return True or False'''
    # match target with rule bottom
    if not match_tag(target_bottom, rule_bottom):
        return False
    # match properties for head, excluding those set by rule top
    if rule_bottom.head:
        if not match_props(target_bottom.props, target_top.props, but = rule_top.head_ignore):
            return False
    # match assigned, agreed or marked properties
    if not assign_props(target_bottom.props, target_top.props, rule_bottom.assign_agree_consume):
        return False
    # get value from target_top or from rule_bottom
    if not ignore_value:
        if rule_bottom.value is None:
            if target_top.value == []:
                return False
            if type(target_top.value) is not list:
                return False  # flattening causes ambiguity :)
            value = target_top.value[0]
            # uncomment
            if rule_bottom.comment is not None:
                if rule_bottom.comment != value[0]:
                    return False
                assert len(value) == 2
                value = value[1]
            target_bottom.value = value
            target_top.value = target_top.value[1:]
        else:
            assert target_bottom.value == rule_bottom.value
    # ok
    return True
#
#
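# A Rule rewrites a sequence of bottom tags into one top tag.  Matching
# collects the bottom values into the top value (optionally labelled with a
# comment such as 'SUBJ' or 'MOD') and merges properties; head_ignore lists
# the properties that need not match through the head because the rule top,
# the head pattern itself, or a consume marker already determines them.
# unmatch() runs the same rule in reverse for generation.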
class Rule:
    def __init__(self, top, bottom):
        self.top = top
        self.bottom = bottom
        # TODO comment why flattening is necessary
        self.is_flat = False
        value_ct = 0
        for t in self.bottom:
            if t.value is None:
                value_ct += 1
        if value_ct == 1:
            self.is_flat = True
        # head_ignore
        head_ignore = []
        # ignore properties set in rule top
        for prop in self.top.props:
            if prop not in head_ignore:
                head_ignore.append(prop)
        # ignore marked and explicitly set in head
        for tag in self.bottom:
            for prop in tag.consume:
                if prop not in head_ignore:
                    head_ignore.append(prop)
            if tag.head:
                assert tag.agree == [] and tag.assign == [] and tag.consume == []
                for prop in tag.props:
                    if prop not in head_ignore:
                        head_ignore.append(prop)
        # set it into top
        self.top.head_ignore = head_ignore

    def __repr__(self):
        return 'Rule(%s, %s)' % (repr(self.top), repr(self.bottom))

    def match(self, bottom):
        '''tag or none'''
        if len(self.bottom) != len(bottom):
            return None
        result = copy(self.top)
        # match fragments
        for rule_bottom, target_bottom in zip(self.bottom, bottom):
            if not match_rule_frag(result, target_bottom, self.top, rule_bottom):
                return None
        # flatten
        if self.is_flat:
            result.value = result.value[0]
        # comment
        if self.top.comment is not None:
            result.value = [self.top.comment, result.value]
        return result

    def unmatch(self, top):
        target_top = copy(top)
        if not match_tag(target_top, self.top):
            return None
        # uncomment
        if self.top.comment is not None:
            if self.top.comment != target_top.value[0]:
                return None
            assert len(target_top.value) == 2
            target_top.value = target_top.value[1]
        # unflatten
        if self.is_flat:
            target_top.value = [target_top.value]
        bottom = []
        for tag in self.bottom:
            target_bottom = copy(tag)
            if not unmatch_rule_frag(target_top, target_bottom, self.top, tag):
                return None
            bottom.append(target_bottom)
        return bottom
class Grammar:
    def __init__(self, rules):
        #self.rules = rules
        self.ruletab = {}
        for rule in rules:
            tag = rule.top.tag
            if tag not in self.ruletab:
                self.ruletab[tag] = [rule]
            else:
                self.ruletab[tag].append(rule)

    def rules(self, tag):
        assert isinstance(tag, T)
        if tag.tag not in self.ruletab:
            return []
        return self.ruletab[tag.tag]
class Lexicon:
    def __init__(self, wordlist = None):
        self.wordtab = {}
        self.tagtab = {}
        self.terms = {}
        if wordlist is not None:
            for word, tag in wordlist:
                if type(tag) is str:
                    tag = T(tag, word)
                self.add(word, tag)

    def add(self, word, tag):
        assert isinstance(tag, T)
        if word not in self.wordtab:
            self.wordtab[word] = [tag]
        else:
            self.wordtab[word].append(tag)
        if tag.value not in self.tagtab:
            self.tagtab[tag.value] = [(tag, word)]
        else:
            self.tagtab[tag.value].append( (tag, word) )
        self.terms[tag.tag] = self.terms.get(tag.tag, 0) + 1

    def matchword(self, word, tag):
        if word not in self.wordtab:
            return None
        for t in self.wordtab[word]:
            m = tag.match(t)
            if m is not None:
                return m
        return None

    def matchword_all(self, word, tag):
        if word not in self.wordtab:
            return []
        r = []
        for t in self.wordtab[word]:
            m = tag.match(t)
            if m is not None:
                r.append(m)
        return r

    def getwordmatch(self, tag):
        if tag.value not in self.tagtab:
            return None
        for t, word in self.tagtab[tag.value]:
            m = tag.match(t)
            if m is not None:
                return word, m
        return None

    def getword(self, tag):
        word_match = self.getwordmatch(tag)
        if word_match is not None:
            return word_match[0]
        return None

    def isterm(self, tag):
        assert isinstance(tag, T)
        return tag.tag in self.terms
# top-down parser
# + handles left recursion bottom-up
# + memoizes partial results
# TODO + blocks left circular recursion; should it crash instead?
#
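# Rough shape of the API (a sketch using the test grammar and lexicon defined
# further below):
#   parse_all(T('S'), 'the cat ate a bird'.split(), 0, grammar, lexicon)
# returns a list of (tag, next_index) pairs; a pair whose next_index equals
# len(tokens) is a complete parse, and its tag.value holds the labelled tree.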
def parse_all(goal, tokens, index, grammar, lexicon, memo = None):
    assert isinstance(goal, T)
    if index == len(tokens):
        return []
    assert index <= len(tokens)
    # check memo
    if memo is None:
        memo = {}
    # TODO make lighter key; perhaps a hash
    memo_key = (index, goal.tag, goal.value, tuple(goal.props.items()))
    if memo_key in memo:
        return memo[memo_key]
    # avoid circular left recursion
    memo[memo_key] = []  # TODO ignore or break?
    # match word
    if lexicon.isterm(goal):
        matches = lexicon.matchword_all(tokens[index], goal)
        r = [(i, index + 1) for i in matches]
        memo[memo_key] = r  # store in memo
        return r
    # match rule
    r = []
    flag = False
    for rule in grammar.rules(goal):
        if rule.bottom[0].tag == goal.tag:
            # skip left recursive rule
            assert len(rule.bottom) > 1
            # TODO is order relevant? if so, preserve. but how? :)
            flag = True
            continue
        for match, next in parse_rule(rule, tokens, index, grammar, lexicon, memo):
            r.append( (match, next) )
    if flag:
        # handle left recursive rules bottom-up
        for match, next in r:
            if match.tag == goal.tag:
                for rule in grammar.rules(goal):
                    if rule.bottom[0].tag == goal.tag:
                        # ^ only these apply for LR bottom-up
                        for match2, next2 in parse_rule_first(rule, match, tokens, next, grammar, lexicon, memo):
                            r.append( (match2, next2) )  # NB appended in place; topmost loop will pick it up
    memo[memo_key] = r  # store in memo
    return r
def parse_rule(rule, tokens, index, grammar, lexicon, memo):
    r = []
    for tags, next in parse_rule_rest(rule.bottom, tokens, index, grammar, lexicon, memo):
        match = rule.match(tags)
        if match is not None:
            r.append( (match, next) )
    return r

def parse_rule_first(rule, tag, tokens, index, grammar, lexicon, memo):
    r = []
    for tags, next in parse_rule_rest(rule.bottom[1:], tokens, index, grammar, lexicon, memo):
        tags0 = [tag] + tags
        match = rule.match(tags0)
        if match is not None:
            r.append( (match, next) )
    return r

def parse_rule_rest(tags, tokens, index, grammar, lexicon, memo):
    assert len(tags) > 0
    r = []
    for tag, next in parse_all(tags[0], tokens, index, grammar, lexicon, memo):
        if len(tags) == 1:
            r.append( ([tag], next) )
        else:
            for tags2, next2 in parse_rule_rest(tags[1:], tokens, next, grammar, lexicon, memo):
                r.append( ([tag] + tags2, next2) )
    return r
# generator
#
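# unparse() is the inverse of parsing: given a top tag (e.g. one produced by
# parse_all), it walks Rule.unmatch and the lexicon to reconstruct a list of
# words, or returns None when no derivation fits.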
def _unparse(tag, grammar, lexicon):
    # TODO rename tag~tags to top~bottom
    # from lexicon; note the type!
    if lexicon.isterm(tag) and type(tag.value) is str:
        word_match = lexicon.getwordmatch(tag)
        if word_match:
            word, match = word_match
            return [word], match
        return None, None
    # match rule
    for rule in grammar.rules(tag):
        # get tags
        tags = rule.unmatch(tag)
        if tags is None:
            continue
        top = copy(tag)
        struct = [ [t, r, None] for t, r in zip(tags, rule.bottom) ]
        # match in order: assign > head > other
        # to percolate lexical properties, which may not be present in the top tag
        flag = True
        for order in [1, 2, 3]:
            for trw in struct:
                bt, br, dummy = trw
                # that orders are exclusive should be guaranteed by T.__init__
                if (order == 1 and br.assign != []
                        or order == 2 and br.head
                        or order == 3 and br.assign == [] and not br.head):
                    flag = False
                    # update bottom from top
                    if not unmatch_rule_frag(top, bt, rule.top, br, ignore_value = True):
                        break
                    # generate fragment
                    ws, t = _unparse(bt, grammar, lexicon)
                    if t is None:
                        break
                    # update top from bottom
                    if not match_rule_frag(top, t, rule.top, br, ignore_value = True):
                        break
                    trw[2] = ws
                    flag = True
            if not flag:
                break
        if not flag:
            continue
        # collect words
        words = []
        for t, r, ws in struct:
            for w in ws:
                words.append(w)
        return words, top
    return None, None

def unparse(tag, grammar, lexicon):
    words, dummy = _unparse(tag, grammar, lexicon)
    return words
# test grammar and lexicon
#
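# The toy grammar exercises the feature machinery: the S rule assigns
# 'num'/'pers' from the subject NP, and the past-perfect V1 rule combines the
# auxiliary 'have' (consume = ['num', 'pers']) with a participle V1 head.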
grammar = Grammar([
    Rule(T('NP'), [T('DET'), T('N1', head = True)]),
    Rule(T('NP'), [T('N1', head = True)]),
    Rule(T('N1'), [T('A'), T('N1', head = True)]),
    Rule(T('N1', head = True, props = {'pers': '3'}), [T('N')]),
    Rule(T('NP'), [T('NP', head = True), T('PP', comment = 'MOD')]),
    Rule(T('PP'), [T('PREP'), T('NP')]),
    Rule(T('S'), [
        T('NP', assign = ['num', 'pers'], comment = 'SUBJ'),
        T('V1', head = True)]),
    Rule(T('V1', props = {'tense': 'past perfect'}), [
        T('V', 'have', props = {'tense': 'present'}, consume = ['num', 'pers']),
        T('V1', head = True, props = {'tense': 'part'})]),
    Rule(T('V1'), [T('V', head = True), T('NP', comment = 'IOBJ'), T('NP', comment = 'OBJ')]),
    Rule(T('V1'), [T('V', head = True), T('NP', comment = 'OBJ')]),
    Rule(T('V1'), [T('V', head = True)]),
    Rule(T('V1'), [T('V1', head = True), T('PP', comment = 'MOD')]),
])
lexicon = Lexicon([
    ('the', 'DET'),
    ('a', 'DET'),
    ('cat', 'N'),
    ('bird', 'N'),
    ('tree', 'N'),
    ('in', 'PREP'),
    ('eats', T('V', 'eat', props = {'tense': 'present', 'num': 'sg', 'pers': '3'})),
    ('eat', T('V', 'eat', props = {'tense': 'present'})),
    ('ate', T('V', 'eat', props = {'tense': 'past'})),
    ('eaten', T('V', 'eat', props = {'tense': 'part'})),
    ('has', T('V', 'have', props = {'tense': 'present', 'num': 'sg', 'pers': '3'})),
    ('have', T('V', 'have', props = {'tense': 'present'})),
    ('had', T('V', 'have', props = {'tense': 'past'})),
    ('had', T('V', 'have', props = {'tense': 'part'})),
    ('fat', 'A'),
    ('little', 'A'),
])

sentences = [
    #'the cat eats a bird',
    #'the cat ate a bird',
    'the cat has eaten a bird',
    'the cat ate a bird in the tree',
    'a fat cat ate a little bird',
]
# frobnicate
#
if __name__ == '__main__':
    print
    for s in sentences:
        print '<<', s
        w = s.split()
        l = parse_all(T('S'), w, 0, grammar, lexicon)
        for t, r in l:
            if r == len(w):
                print '==', t.value  #, t.props
                print '>>', ' '.join(unparse(t, grammar, lexicon))
        print