# Naive Bayesian classifier
# Source: GitHub gist jamak/1290498 by @jamak, created October 16, 2011 04:10.
import re
from functools import reduce
from operator import mul

from stemming import porter
class NaiveBayes(object):
    """Naive Bayes text classifier driven by per-category word counts.

    Train it with ``train(category, document)`` and query it with
    ``classify(document)`` or ``probabilities(document)``. Documents are
    plain strings; they are tokenized by ``word_count``.

    Attributes:
        words: ``{category: {word: occurrences}}`` learned so far.
        total_words: number of word occurrences seen across all categories.
        categories_documents: ``{category: documents trained}``.
        total_documents: number of documents trained across all categories.
        categories_words: ``{category: word occurrences trained}``.
        vocabulary: set of every distinct word seen during training.
        threshold: minimum normalized share a category needs to be returned
            by ``classify``.
    """

    # At most this many distinct words of a document contribute to its score.
    MAX_SCORED_WORDS = 15

    # provide a list of categories for this
    def __init__(self, categories, threshold=0):
        """Create an untrained classifier.

        Args:
            categories: iterable of category labels that may be trained.
            threshold: share of the total probability mass (0..1) a category
                must exceed for ``classify`` to return it. A falsy value
                (including ``None``) is stored as 0.
        """
        self.words = {}
        self.total_words = 0
        # keeps a dictionary containing documents trained for each category
        self.categories_documents = {}
        self.total_documents = 0
        self.threshold = threshold or 0
        # Same text the original Python 2 print statement produced (the
        # double space comes from its trailing space plus the separator).
        print("Threshold is now  %s" % (threshold,))
        # keeps a dictionary of the number of words in each category
        self.categories_words = {}
        # Distinct words seen in any category; needed for smoothing.
        self.vocabulary = set()
        for cat in categories:
            self.words[cat] = {}
            self.categories_documents[cat] = 0
            self.categories_words[cat] = 0

    # train the document
    def train(self, category, document):
        """Learn the words of ``document`` as an example of ``category``.

        Args:
            category: one of the labels passed to the constructor.
            document: text of the training example.

        Raises:
            KeyError: if ``category`` was not passed to the constructor.
        """
        # Look the category up before touching any counter so an unknown
        # category leaves the classifier unchanged.
        category_words = self.words[category]
        counts = word_count(document)
        for word, count in counts.items():
            category_words[word] = category_words.get(word, 0) + count
        added = sum(counts.values())
        self.vocabulary.update(counts)
        self.total_words += added
        self.categories_words[category] += added
        # The category prior is computed from these document counters.
        self.categories_documents[category] += 1
        self.total_documents += 1

    def probabilities(self, document):
        """Return ``{category: un-normalized probability}`` for ``document``."""
        return {
            category: self.probability(category, document)
            for category in self.words
        }

    def classify(self, document, default='unknown'):
        """Return the most probable category for ``document``.

        The scores of all categories are normalized so they sum to 1. The
        best category is returned only if its share is strictly greater than
        ``self.threshold``; otherwise ``default`` is returned. ``default`` is
        also returned when every score is zero (for example before any
        training).
        """
        scores = self.probabilities(document)
        total = sum(scores.values())
        if not total:
            # No categories, or no evidence at all: nothing to normalize.
            return default
        best = max(scores, key=scores.get)
        if scores[best] / total > self.threshold:
            return best
        return default

    def prettify_probabilities(self, document):
        """Return the normalized scores for ``document`` as readable text.

        The result has one ``"<category>: <share>%"`` line per category,
        ordered from most to least probable, with shares shown to two
        decimals. Shares are reported as 0.00% when every score is zero.
        """
        scores = self.probabilities(document)
        total = sum(scores.values())
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        lines = []
        for category, score in ranked:
            share = score / total if total else 0.0
            lines.append("%s: %.2f%%" % (category, 100 * share))
        return "\n".join(lines)

    def word_probability(self, category, word):
        """Return the smoothed probability of ``word`` within ``category``.

        Uses add-one smoothing, ``(count + 1) / (category words +
        vocabulary size)``, so a word never seen in the category still gets
        a non-zero probability.

        Raises:
            KeyError: if ``category`` is unknown.
        """
        occurrences = self.words[category].get(word, 0)
        denominator = self.categories_words[category] + len(self.vocabulary)
        if not denominator:
            # No word has been learned anywhere, so no word can favour one
            # category over another: use a neutral factor.
            return 1.0
        return (1 + float(occurrences)) / denominator

    def doc_probability(self, category, document):
        """Return the probability of ``document`` given ``category``.

        This is the product of ``word_probability`` over the distinct words
        of the document; repeated words count once. An empty document
        yields 1.

        NOTE: only the last ``MAX_SCORED_WORDS`` words, in the iteration
        order of the ``word_count`` result, are multiplied. The original
        intent was "the 15 most significant" words, but no ranking by
        significance is performed.
        """
        word_probs = [
            self.word_probability(category, word)
            for word in word_count(document)
        ]
        return reduce(mul, word_probs[-self.MAX_SCORED_WORDS:], 1)

    def category_probability(self, category):
        """Return the share of trained documents that belong to ``category``.

        Returns 0.0 while no document has been trained.

        Raises:
            KeyError: if ``category`` is unknown.
        """
        if not self.total_documents:
            return 0.0
        return self.categories_documents[category] / float(self.total_documents)

    def probability(self, category, document):
        """Return the un-normalized probability that ``document`` belongs to ``category``."""
        return (self.doc_probability(category, document)
                * self.category_probability(category))


# Matches every character that is neither a word character nor whitespace.
_PUNCTUATION = re.compile(r'[^\w\s]')


def word_count(document):
    """Return ``{word: occurrences}`` for the words of ``document``.

    Punctuation is removed first (so ``"don't"`` becomes ``"dont"``), then
    the text is split on runs of whitespace. Words are case-sensitive.
    """
    counts = {}
    for word in _PUNCTUATION.sub('', document).split():
        counts[word] = counts.get(word, 0) + 1
    return counts
# (GitHub page footer, not part of the code: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")