Skip to content

Instantly share code, notes, and snippets.

@drussellmrichie
Created March 13, 2015 18:17
Show Gist options
  • Save drussellmrichie/ee9ebe63610553bd2d27 to your computer and use it in GitHub Desktop.
Save drussellmrichie/ee9ebe63610553bd2d27 to your computer and use it in GitHub Desktop.
Lookup tagger from Chapter 5 of the NLTK book
# Natural Language Toolkit: code_baseline_tagger
# functions from http://www.nltk.org/book/ch05.html
from nltk.corpus import brown
import nltk
def performance(cfd, wordlist):
    """Score a lookup tagger built from ``wordlist`` on the Brown news sentences.

    Args:
        cfd: Conditional frequency distribution indexed by word; ``cfd[word].max()``
            supplies the tag stored for that word (FreqDist.max returns the
            most frequent sample).
        wordlist: Words to include in the lookup model.

    Returns:
        The value of ``UnigramTagger.evaluate`` on
        ``brown.tagged_sents(categories='news')``.
    """
    # One lookup entry per word: the tag chosen by the word's FreqDist.max().
    lookup_table = {token: cfd[token].max() for token in wordlist}
    # Words missing from the lookup table fall back to the default 'NN' tag.
    fallback = nltk.DefaultTagger('NN')
    tagger = nltk.UnigramTagger(model=lookup_table, backoff=fallback)
    gold_sentences = brown.tagged_sents(categories='news')
    return tagger.evaluate(gold_sentences)
def display():
    """Plot lookup-tagger performance against model size on the Brown news text.

    For model sizes 1, 2, 4, ..., 2**14 the function builds a lookup tagger
    from that many of the most frequent news words (via ``performance``) and
    plots the resulting scores, then shows the figure with ``pylab.show()``.
    """
    import pylab

    # Rank words explicitly by count. In NLTK 3 a FreqDist is a Counter
    # subclass, so plain iteration yields insertion order rather than
    # frequency order, and slicing it would not select the top-N words.
    news_freqs = nltk.FreqDist(brown.words(categories='news'))
    words_by_freq = [word for word, _ in news_freqs.most_common()]
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    # Model sizes double at each step: 2**0 through 2**14.
    sizes = 2 ** pylab.arange(15)
    perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
    pylab.plot(sizes, perfs, '-bo')
    pylab.title('Lookup Tagger Performance with Varying Model Size')
    pylab.xlabel('Model Size')
    pylab.ylabel('Performance')  # was misspelled 'Perfdlormance'
    pylab.show()
# Build and show the plot. This call sits at module level, so it runs whenever
# the file is executed or imported.
display()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment