public
Last active

  • Download Gist
NaiveBayesClassifier.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
import nltk
import random
 
class Categorizer(object):
    """Train and score a Naive Bayes classifier built from category seeds.

    Every category contributes its seed articles; each seed is tokenized
    into an ``nltk.Text`` and labelled with the category's name.
    """

    def __init__(self, categories):
        """Remember the categories whose seed articles will be used.

        ``categories`` is whatever iterable ``doc_prep`` should walk; each
        item must expose ``name`` and ``seedarticle_set``.
        """
        self.categories = categories
        # Filled in by test_accuracy(). Starting empty keeps
        # document_features() callable before a vocabulary has been built.
        self.word_features = []

    def prep_seed(self, content):
        """Convert seed content into nltk.Text"""
        # NOTE(review): clean_html appears to be implemented only in NLTK 2.x
        # (later releases are reported to raise NotImplementedError) --
        # confirm against the installed NLTK version.
        raw = nltk.clean_html(content)
        tokens = nltk.word_tokenize(raw)
        return nltk.Text(tokens)

    def doc_prep(self, categories, output_list):
        """Append a ``(nltk.Text, category name)`` pair for every seed.

        categories is a Django Queryset of categories.  ``output_list`` is
        extended in place and also returned.
        """
        for c in categories:
            # .iterator() streams the seeds instead of caching the queryset.
            seeds = c.seedarticle_set.all().iterator()
            output_list.extend((self.prep_seed(s.seed), c.name) for s in seeds)
        return output_list

    def document_features(self, document):
        """Map ``document`` to ``{'contains(<word>)': bool}`` features.

        One entry is produced per word in ``self.word_features``.  Matching
        is case-insensitive: the vocabulary is built from lower-cased
        tokens, so the document's tokens are lower-cased here as well.
        """
        document_words = set(w.lower() for w in document)
        features = {}
        for word in self.word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    def test_accuracy(self, num_features=2000, test_size=500):
        """Train on the seed documents and report held-out accuracy.

        The documents are shuffled, the first ``test_size`` are held out
        for testing and the remainder are used for training.  The
        ``num_features`` most frequent lower-cased tokens become
        ``self.word_features``.

        Prints the accuracy and also returns it.

        Raises ValueError when ``test_size`` is not positive or when there
        are not more than ``test_size`` documents, because either the test
        set or the training set would then be empty.
        """
        documents = self.doc_prep(self.categories, [])
        if test_size < 1 or len(documents) <= test_size:
            raise ValueError(
                'need more than test_size=%d documents with test_size >= 1 '
                '(got %d documents)' % (test_size, len(documents))
            )
        random.shuffle(documents)

        all_words = nltk.FreqDist(
            w.lower() for (text, _label) in documents for w in text.tokens
        )
        # Rank by count explicitly: the ordering of FreqDist.keys() differs
        # between NLTK releases, and keys() cannot be sliced on Python 3.
        ranked = sorted(all_words, key=all_words.get, reverse=True)
        self.word_features = ranked[:num_features]

        featuresets = [(self.document_features(d), c) for (d, c) in documents]
        test_set = featuresets[:test_size]
        train_set = featuresets[test_size:]
        classifier = nltk.NaiveBayesClassifier.train(train_set)
        accuracy = nltk.classify.accuracy(classifier, test_set)
        # Parenthesised single-argument form behaves the same on 2.x and 3.x.
        print(accuracy)
        return accuracy

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.