Skip to content

Instantly share code, notes, and snippets.

Created June 24, 2010 19:49
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save anonymous/451880 to your computer and use it in GitHub Desktop.
Save anonymous/451880 to your computer and use it in GitHub Desktop.
import nltk
import random
class Categorizer(object):
    """Train and evaluate a Naive Bayes text classifier from seed articles.

    Each category object is read through three attributes: ``name`` (used as
    the class label), ``seedarticle_set`` (a manager whose
    ``.all().iterator()`` yields seed articles) and, on each article,
    ``seed`` (the raw content handed to :meth:`prep_seed`).
    """

    def __init__(self, categories):
        """Remember the categories to train on.

        :param categories: iterable of category objects; described by
            ``doc_prep`` as a Django QuerySet of categories.
        """
        self.categories = categories
        # Filled in by test_accuracy(); defined up front so that calling
        # document_features() first does not raise AttributeError.
        self.word_features = []

    def prep_seed(self, content):
        """Convert seed content into nltk.Text.

        :param content: raw seed content (markup is stripped first).
        :returns: an ``nltk.Text`` built from the tokenized content.
        """
        # NOTE(review): nltk.clean_html is an NLTK 2.x API; in NLTK 3 it is
        # believed to be a stub that raises NotImplementedError -- confirm the
        # installed NLTK version before upgrading.
        raw = nltk.clean_html(content)
        tokens = nltk.word_tokenize(raw)
        return nltk.Text(tokens)

    def doc_prep(self, categories, output_list):
        """Collect ``(nltk.Text, category name)`` pairs for every seed article.

        :param categories: a Django Queryset of categories (per the original
            docstring); only iteration and the attributes listed on the class
            are required.
        :param output_list: list that is extended in place.
        :returns: the same ``output_list`` object, for convenience.
        """
        for category in categories:
            seeds = category.seedarticle_set.all().iterator()
            output_list.extend(
                [(self.prep_seed(article.seed), category.name)
                 for article in seeds]
            )
        return output_list

    def document_features(self, document):
        """Build the ``contains(<word>)`` feature dict for one document.

        :param document: iterable of string tokens (e.g. an ``nltk.Text``).
        :returns: dict mapping ``'contains(<word>)'`` to a bool for every
            word in ``self.word_features``.
        """
        # The vocabulary is built from lower-cased tokens in test_accuracy(),
        # so the document must be lower-cased too or capitalised occurrences
        # would never match. A set gives constant-time membership tests.
        document_words = set(token.lower() for token in document)
        features = {}
        for word in self.word_features:
            features['contains(%s)' % word] = word in document_words
        return features

    def test_accuracy(self, test_size=500, num_features=2000):
        """Train on the seed articles and report accuracy on a held-out set.

        The documents are shuffled, the first ``test_size`` are held out for
        evaluation and the remainder is used to train a Naive Bayes
        classifier. The accuracy is printed (as before) and also returned.

        :param test_size: number of documents held out for evaluation.
        :param num_features: number of most frequent words used as features.
        :returns: the value of ``nltk.classify.accuracy`` on the held-out set.
        :raises ValueError: if ``test_size`` is smaller than 1, or if there
            are not more than ``test_size`` documents (the training set would
            otherwise be empty).
        """
        if test_size < 1:
            raise ValueError("test_size must be at least 1")

        documents = self.doc_prep(self.categories, [])
        if len(documents) <= test_size:
            raise ValueError(
                "need more than %d documents to hold out %d for testing; "
                "got %d" % (test_size, test_size, len(documents))
            )
        random.shuffle(documents)

        # Count straight from a generator instead of materialising every
        # token of every document in a temporary list.
        all_words = nltk.FreqDist(
            word.lower() for text, _ in documents for word in text.tokens
        )
        # Rank by frequency explicitly rather than slicing keys(): the
        # ordering of keys() depends on the NLTK version, and on Python 3
        # keys() is a view that cannot be sliced.
        ranked = sorted(all_words, key=all_words.get, reverse=True)
        self.word_features = ranked[:num_features]

        featuresets = [
            (self.document_features(text), label) for text, label in documents
        ]
        test_set = featuresets[:test_size]
        train_set = featuresets[test_size:]

        classifier = nltk.NaiveBayesClassifier.train(train_set)
        accuracy = nltk.classify.accuracy(classifier, test_set)
        print(accuracy)
        return accuracy
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment