Skip to content
Create a gist now

Instantly share code, notes, and snippets.

anonymous /

Embed URL


Subversion checkout URL

You can clone with
Download ZIP
import nltk
import random
class Categorizer(object):
    """Train and evaluate a Naive Bayes text classifier over seed articles.

    ``categories`` is an iterable of category objects (a Django queryset in
    the original usage); each one must expose ``seedarticle_set`` whose
    articles carry their raw content in a ``seed`` attribute.
    """

    def __init__(self, categories):
        self.categories = categories
        # Filled in by test_accuracy(); the empty default lets
        # document_features() be called before any vocabulary exists.
        self.word_features = []

    def prep_seed(self, content):
        """Convert seed content into nltk.Text"""
        # NOTE(review): nltk.clean_html appears to be unavailable in NLTK 3.x
        # (it raises NotImplementedError there) -- confirm the installed
        # version before upgrading.
        raw = nltk.clean_html(content)
        tokens = nltk.word_tokenize(raw)
        return nltk.Text(tokens)

    def doc_prep(self, categories, output_list):
        """Append one ``(text, category)`` pair per seed article.

        categories is a Django Queryset of categories.  ``output_list`` is
        extended in place and also returned.
        """
        for category in categories:
            seeds = category.seedarticle_set.all().iterator()
            # Each document is paired with its category so the classifier
            # has a label to learn from.
            output_list.extend(
                (self.prep_seed(article.seed), category) for article in seeds
            )
        return output_list

    def document_features(self, document):
        """Return a ``{'contains(word)': bool}`` dict for ``document``.

        One entry is produced per word in ``self.word_features``.
        """
        # The vocabulary is built from lower-cased tokens, so the document
        # must be lower-cased too or capitalised words would never match.
        document_words = set(word.lower() for word in document)
        return {
            'contains(%s)' % word: (word in document_words)
            for word in self.word_features
        }

    def test_accuracy(self, test_size=500, vocabulary_size=2000):
        """Train on the seed articles and report held-out accuracy.

        The first ``test_size`` documents (after shuffling) form the test
        set and the remainder the training set.  The ``vocabulary_size``
        most frequent lower-cased words become ``self.word_features``.

        Prints the accuracy and also returns it.

        Raises:
            ValueError: if there are not more than ``test_size`` documents,
                which would leave nothing to train on.
        """
        documents = self.doc_prep(self.categories, [])
        if len(documents) <= test_size:
            raise ValueError(
                'need more than %d documents to hold out a test set, got %d'
                % (test_size, len(documents))
            )

        # doc_prep emits documents grouped by category; shuffle so the
        # train/test split is not biased towards the first categories.
        random.shuffle(documents)

        all_words = nltk.FreqDist(
            word.lower() for (document, _) in documents for word in document
        )
        # Rank explicitly rather than relying on keys() ordering, which is
        # only frequency-sorted in old NLTK releases.  Ties break
        # alphabetically to keep the vocabulary deterministic.
        ranked = sorted(
            all_words.items(), key=lambda item: (-item[1], item[0])
        )
        self.word_features = [word for (word, _) in ranked[:vocabulary_size]]

        featuresets = [
            (self.document_features(document), category)
            for (document, category) in documents
        ]
        train_set = featuresets[test_size:]
        test_set = featuresets[:test_size]

        classifier = nltk.NaiveBayesClassifier.train(train_set)
        accuracy = nltk.classify.accuracy(classifier, test_set)
        print(accuracy)
        return accuracy
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.