Last active
December 11, 2015 21:08
-
-
Save metal3d/4659704 to your computer and use it in GitHub Desktop.
Bayesian class that trains on categorized data and computes Bayesian probabilities.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- encoding: utf-8 -*- | |
""" Simple Bayesian calculation | |
After training datas by categories, you can use Bayes.bayes method to compute | |
bayesian calculation to find probality for a content | |
matches some categories | |
Example: | |
>>> b = Bayes() | |
>>> b.train('A', ("white " * 30) + ("black " * 10)) | |
>>> b.train('B', ("white " * 20) + ("black " * 20)) | |
A has 30 white balls, and 10 black balls | |
B has 20 white balls, and 20 black balls | |
To know the chance that a white ball came from each category:
>>> print b.bayes('white') | |
{'A': 60.0, 'B': 40.0} | |
And to know the best chance to get white ball: | |
>>> print b.get_best_match("white") | |
A | |
You can use it to get nice probabilities. Don't forget to use unicode...
>>> b = Bayes() | |
>>> b.train("french",u"Bonjour je m'appelle Patrice et je suis ingénieur en informatique.") | |
>>> b.train("french",u"Il était une fois, un ogre nommé black monster est venu dans sharewood") | |
>>> b.train("english","Hi, my name is Patrice and I'm development engeneer") | |
>>> b.train("english","One upon a time, an ogre namd black monster came in sharewood") | |
Now, try to know the lang of a sentence: | |
>>> b.bayes(u"Je me pense que ce texte est en français") | |
{'french': 33.333333333333336, 'english': 0.0} | |
That means that there is 33% chance that the sentence is in french. | |
That's a logical result because some words are found in both english | |
and french texts (black, sharewood... ogre...) | |
To get the detected language, use get_best_match:
>>> b.get_best_match(u"Je me pense que ce texte est en français") | |
'french' | |
To be precise, we append some English words to the French test sentence:
>>> b.bayes(u"Je me pense que ce texte est en français, même si je mets le mot black ou ogre") | |
{'french': 27.536231884057973, 'english': 5.797101449275362} | |
Even if we insert some English words, the probability
that the sentence is in French (27%) is still much higher than for English (5%)
""" | |
# Module metadata. The author field follows the "Name <email>" convention,
# so the e-mail address must be enclosed in a balanced pair of angle brackets.
__author__ = "Patrice FERLET <metal3d@gmail.com>"
__license__ = "BSD"
from string import maketrans, translate, punctuation | |
class Bayes(object):
    """Word-frequency Bayesian classifier over named categories.

    Training texts are stored per category in ``datas`` as
    ``{category: [[word, ...], ...]}`` (one inner list per trained text).
    All text is handled as unicode internally; byte strings are decoded
    as UTF-8 on the way in, so accented words compare case-insensitively
    and non-ASCII byte strings no longer fail on an implicit ASCII decode.
    """

    def __init__(self):
        # Unicode translation table mapping every ASCII punctuation
        # character to a space, so punctuation acts as a word separator.
        self.__T = dict((ord(char), u' ') for char in punctuation)
        self.datas = {}

    @staticmethod
    def _to_text(value):
        """Return ``value`` as unicode text, decoding byte strings as UTF-8."""
        if isinstance(value, bytes):
            return value.decode("utf-8")
        return value

    def _tokenize(self, content):
        """Split ``content`` into words, treating punctuation as whitespace."""
        return self._to_text(content).translate(self.__T).split()

    def train(self, cat, content):
        """Record a content to a category.

        ``content`` may be unicode text or a UTF-8 encoded byte string.
        Raises UnicodeDecodeError if a byte string is not valid UTF-8.
        """
        self.datas.setdefault(cat, []).append(self._tokenize(content))

    def proba(self, word, words):
        """Get the probability (in percent) of ``word`` in ``words``.

        ``words`` is a list of word lists. The comparison is
        case-insensitive. Returns 0.0 when ``words`` holds no word at all
        (instead of dividing by zero).
        """
        needle = self._to_text(word).lower()
        hits = 0
        total = 0
        for wordlist in words:
            hits += sum(1 for token in wordlist if token.lower() == needle)
            total += len(wordlist)
        if not total:
            return 0.0
        return float(hits) / float(total) * 100

    def _get_category_quota(self, cat):
        """Get given category quota in percent.

        The quota is the share of trained texts that belong to ``cat``.
        Raises KeyError if ``cat`` was never trained.
        """
        records = self.datas[cat]
        total = sum(len(texts) for texts in self.datas.values())
        if not total:
            return 0.0
        return (float(len(records)) / total) * 100

    def bayes(self, content):
        """Get bayes calculation for a content.

        Returns ``{category: score}`` where the score is the average, over
        the words of ``content``, of the per-word result of
        ``one_word_bayes``. Returns an empty dict when ``content`` has no
        word or when nothing has been trained.
        """
        words = self._tokenize(content)
        if not words:
            return {}
        scores = dict((cat, []) for cat in self.datas)
        for word in words:
            per_word = self.one_word_bayes(word)
            for cat in scores:
                scores[cat].append(per_word[cat])
        return dict(
            (cat, sum(values) / len(values)) for cat, values in scores.items()
        )

    def get_best_match(self, content):
        """Fetch the best match for given content.

        Returns the category with the highest score, the first one
        encountered winning ties (including the case where every score is
        zero). Returns None when there is nothing to compare: no trained
        category, or no word in ``content``.
        """
        res = self.bayes(content)
        if not res:
            return None
        # max() keeps the first maximal key, like a strict "<" scan would.
        return max(res, key=res.get)

    def one_word_bayes(self, m):
        """Fetch bayesian calculation for one word.

        For each category the numerator is the probability of hitting the
        category times the probability of finding ``m`` in it; it is
        divided by the sum of that product over all categories and
        expressed in percent. Every score is 0.0 when ``m`` is unknown to
        all categories.
        """
        weights = []
        den = 0
        for cat in self.datas:
            weight = self._get_category_quota(cat) * self.proba(m, self.datas[cat])
            weights.append((cat, weight))
            # The denominator is the same for every category: compute it once.
            den += weight
        ret = {}
        for cat, weight in weights:
            ret[cat] = (weight / den) * 100 if den > 0 else 0.0
        return ret
if __name__ == "__main__":
    # Urn demo: two urns with different white/black ball mixes; show, for a
    # white ball, the score of each urn.
    b = Bayes()
    for label, whites, blacks in (('A', 30, 10), ('B', 20, 20)):
        b.train(label, ("white " * whites) + ("black " * blacks))
    print(b.bayes('white'))

    # Recipe demo: train on a few ingredient lists, then classify two dishes.
    b = Bayes()
    recipes = (
        (u"dessert", u"tarte au citron meringuée sucre farine levure citron"),
        (u"entree", u"crevette sel citron salade de crevettes laitue"),
        (u"dessert", u"pommes sucre vanillé beurre"),
        (u"entree", u"choux rouge vinaigre pomme"),
        (u"entree", u"salade de laitue chèvre et miel"),
    )
    for label, ingredients in recipes:
        b.train(label, ingredients)

    for cook in ("Salade de thon au vinaigre de vin", "Tarte aux pommes maison"):
        print("Best match for %s :: %s" % (cook, b.get_best_match(cook)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment