Created
October 16, 2011 04:10
-
-
Save jamak/1290498 to your computer and use it in GitHub Desktop.
Naive Bayesian classifier
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
from functools import reduce
from operator import mul

from stemming import porter
class NaiveBayes(object):
    """Naive Bayes text classifier driven by per-category word counts.

    Documents are reduced to bags of words (see ``word_count``).  Training
    accumulates, per category, how often each word was seen and how many
    documents were trained; classification multiplies smoothed word
    likelihoods by the category prior and compares the normalised scores
    against ``threshold``.
    """

    # Only this many distinct words of a document contribute to its score
    # (it was suggested that only the 15 most significant be used).
    MAX_SIGNIFICANT_WORDS = 15

    # Everything that is neither a word character nor whitespace is
    # stripped before the text is split into words.
    _PUNCTUATION = re.compile(r'[^\w\s]')

    def __init__(self, categories, threshold=0):
        """Create an untrained classifier.

        categories -- iterable of category labels this classifier knows.
        threshold  -- minimum normalised score (0..1) a category must
                      exceed for ``classify`` to return it; falsy values
                      (including None) are treated as 0.
        """
        # category -> {word -> number of occurrences seen in training}
        self.words = {}
        self.total_words = 0
        # category -> number of documents trained for that category
        self.categories_documents = {}
        self.total_documents = 0
        self.threshold = threshold or 0
        # Same output as the original ``print "Threshold is now ", threshold``
        # statement, written so it runs on both Python 2 and Python 3.
        print("%s %s" % ("Threshold is now ", threshold))
        # category -> total number of words trained for that category
        self.categories_words = {}
        for cat in categories:
            self.words[cat] = {}
            self.categories_documents[cat] = 0
            self.categories_words[cat] = 0

    def train(self, category, document):
        """Add the word counts of ``document`` to ``category``.

        Raises KeyError if ``category`` was not given to the constructor.
        """
        if category not in self.words:
            raise KeyError("unknown category: %r" % (category,))
        seen = self.words[category]
        for word, count in self.word_count(document).items():
            seen[word] = seen.get(word, 0) + count
            self.total_words += count
            self.categories_words[category] += count
        # Document tallies feed the category prior in category_probability.
        self.categories_documents[category] += 1
        self.total_documents += 1

    def probabilities(self, document):
        """Return a dict mapping each category to its un-normalised score."""
        return dict(
            (category, self.probability(category, document))
            for category in self.words
        )

    def classify(self, document, default='unknown'):
        """Return the best-scoring category for ``document``.

        Scores are normalised so they sum to 1.  The highest-scoring
        category is returned when its share is strictly greater than
        ``self.threshold``; otherwise (or when every score is zero, e.g.
        before any training) ``default`` is returned.
        """
        scores = self.probabilities(document)
        total = sum(scores.values())
        if not total:
            # Nothing to normalise against; also avoids dividing by zero.
            return default
        best = max(scores, key=scores.get)
        if scores[best] / float(total) > self.threshold:
            return best
        return default

    def prettify_probabilities(self, document):
        """Return a dict mapping each category to a percentage string.

        Each value is the category's share of the total score formatted
        with two decimals, e.g. ``"89.63%"``.  When every score is zero
        all categories are reported as ``"0.00%"``.
        """
        scores = self.probabilities(document)
        total = sum(scores.values())
        pretty = {}
        for category, score in scores.items():
            share = score / float(total) if total else 0.0
            pretty[category] = "%.2f%%" % (share * 100)
        return pretty

    def word_probability(self, category, word):
        """Return the smoothed likelihood of ``word`` within ``category``.

        One is added to the observed count so that a word never seen in
        the category does not force the whole product to zero.  A category
        with no trained words yields 0.0 instead of dividing by zero.

        NOTE: the denominator is the category's raw word total, so for
        very small training sets the value can exceed 1.
        """
        category_total = self.categories_words[category]
        if not category_total:
            return 0.0
        occurrences = self.words[category].get(word, 0)
        return (1 + float(occurrences)) / category_total

    def doc_probability(self, category, document):
        """Return the likelihood of ``document`` given ``category``.

        This is the product of ``word_probability`` over the distinct
        words of the document, limited to the last
        ``MAX_SIGNIFICANT_WORDS`` words in ``word_count`` order.  A
        document without words yields 1.0 (the empty product).
        """
        word_probs = [
            self.word_probability(category, word)
            for word in self.word_count(document)
        ]
        return reduce(mul, word_probs[-self.MAX_SIGNIFICANT_WORDS:], 1.0)

    def category_probability(self, category):
        """Return the share of trained documents that belong to ``category``.

        Returns 0.0 before any document has been trained.
        """
        if not self.total_documents:
            return 0.0
        return self.categories_documents[category] / float(self.total_documents)

    def probability(self, category, document):
        """Return the un-normalised probability that ``document`` belongs to ``category``."""
        return (self.doc_probability(category, document)
                * self.category_probability(category))

    def word_count(self, document):
        """Return a dict mapping each word of ``document`` to its count.

        Punctuation is removed first, then the text is split on
        whitespace.  Words are kept exactly as written: no lower-casing
        and no stemming is applied.
        """
        counts = {}
        for word in self._PUNCTUATION.sub('', document).split():
            counts[word] = counts.get(word, 0) + 1
        return counts
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment