Skip to content

Instantly share code, notes, and snippets.

@bogsio
Created August 13, 2014 22:47
Show Gist options
  • Save bogsio/01fa7482af2d9b47a13d to your computer and use it in GitHub Desktop.
Save bogsio/01fa7482af2d9b47a13d to your computer and use it in GitHub Desktop.
PCFGViterbiParser
import nltk
from nltk.grammar import WeightedProduction, Nonterminal
from util import corpus2trees, trees2productions
class PCFGViterbiParser(nltk.ViterbiParser):
    """Viterbi parser over a PCFG that tolerates out-of-vocabulary words.

    Before parsing, every token that has no entry in the grammar's lexical
    index is given a low-probability lexical production whose left-hand
    side is the part-of-speech tag assigned by ``nltk.pos_tag``.
    """

    # Probability given to the fallback production created for a word the
    # grammar has no lexical entry for. Subclasses may override it.
    UNKNOWN_WORD_PROB = 0.000001

    def __init__(self, grammar, trace=0):
        """Create a parser for *grammar*; *trace* is forwarded to the base class."""
        super(PCFGViterbiParser, self).__init__(grammar, trace)

    @staticmethod
    def _preprocess(tokens):
        """Replace literal bracket tokens with their placeholder symbols.

        "(" becomes "-LBR-" and ")" becomes "-RBR-". The list is modified
        in place and also returned.
        """
        replacements = {
            "(": "-LBR-",
            ")": "-RBR-",
        }
        # Slice assignment keeps the original in-place mutation contract.
        tokens[:] = [replacements.get(tok, tok) for tok in tokens]
        return tokens

    @classmethod
    def train(cls, content, root):
        """Build a parser from a treebank.

        :param content: the corpus text, or a file-like object whose
            ``read()`` method returns it.
        :param root: name of the start symbol of the induced grammar.
        :return: a new parser instance wrapping the induced PCFG.
        """
        # Duck-type the file-like case instead of testing against
        # ``basestring``, which does not exist on Python 3.
        if hasattr(content, "read"):
            content = content.read()

        trees = corpus2trees(content)
        productions = trees2productions(trees)
        pcfg = nltk.grammar.induce_pcfg(Nonterminal(root), productions)
        return cls(pcfg)

    def parse(self, tokens):
        """Parse *tokens*, first adding fallback productions for unknown words.

        The grammar held by this parser is extended in place, so fallback
        productions added for one sentence remain available afterwards.

        :param tokens: iterable of word tokens; it is copied, not modified.
        :return: the result of ``nltk.ViterbiParser.parse`` on the
            preprocessed tokens.
        """
        tokens = self._preprocess(list(tokens))
        tagged = nltk.pos_tag(tokens)

        grammar = self._grammar
        # The lexical index is only rebuilt after the loop, so remember what
        # has been added to avoid appending the same production once per
        # occurrence of a repeated unknown word.
        added = set()
        for tok, pos in tagged:
            if grammar._lexical_index.get(tok) or (tok, pos) in added:
                continue
            added.add((tok, pos))
            grammar._productions.append(
                WeightedProduction(
                    Nonterminal(pos), [tok], prob=self.UNKNOWN_WORD_PROB
                )
            )

        if added:
            # Make the new productions visible to the parser's lookups.
            grammar._calculate_indexes()

        return super(PCFGViterbiParser, self).parse(tokens)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment