Skip to content

Instantly share code, notes, and snippets.

@metal3d
Last active December 11, 2015 21:08
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save metal3d/4659704 to your computer and use it in GitHub Desktop.
Save metal3d/4659704 to your computer and use it in GitHub Desktop.
Bayesian class that train datas and compute bayesian calculation.
# -*- encoding: utf-8 -*-
""" Simple Bayesian calculation
After training data by categories, you can use the Bayes.bayes method to
compute a Bayesian calculation that gives the probability that a content
matches each category
Example:
>>> b = Bayes()
>>> b.train('A', ("white " * 30) + ("black " * 10))
>>> b.train('B', ("white " * 20) + ("black " * 20))
A has 30 white balls, and 10 black balls
B has 20 white balls, and 20 black balls
To know the chance to get white ball in each categories:
>>> print b.bayes('white')
{'A': 60.0, 'B': 40.0}
And to know the best chance to get white ball:
>>> print b.get_best_match("white")
A
You can use it to get nice probabilities. Don't forget to use unicode...
>>> b = Bayes()
>>> b.train("french",u"Bonjour je m'appelle Patrice et je suis ingénieur en informatique.")
>>> b.train("french",u"Il était une fois, un ogre nommé black monster est venu dans sharewood")
>>> b.train("english","Hi, my name is Patrice and I'm development engeneer")
>>> b.train("english","One upon a time, an ogre namd black monster came in sharewood")
Now, try to know the lang of a sentence:
>>> b.bayes(u"Je me pense que ce texte est en français")
{'french': 33.333333333333336, 'english': 0.0}
That means that there is 33% chance that the sentence is in french.
That's a logical result because some words are found in both english
and french texts (black, sharewood... ogre...)
To get the detected language, use get_best_match:
>>> b.get_best_match(u"Je me pense que ce texte est en français")
'french'
To be more precise, we append some English words to the French test sentence:
>>> b.bayes(u"Je me pense que ce texte est en français, même si je mets le mot black ou ogre")
{'french': 27.536231884057973, 'english': 5.797101449275362}
Even if we insert some English words, the probability
that the sentence is in French stays high (27%)
"""
__author__ = "Patrice FERLET <metal3d@gmail.com"
__license__ = "BSD"
from string import maketrans, translate, punctuation
class Bayes(object):
    """Simple Bayesian text classifier working on whole words.

    Contents are recorded per category with :meth:`train`; :meth:`bayes`
    then scores a new content against every trained category and
    :meth:`get_best_match` returns the best scoring category.

    All text is handled as unicode internally: byte strings are decoded as
    UTF-8 once on input, so case-insensitive matching also works for
    accented letters.
    """

    def __init__(self):
        # Translation table mapping every ASCII punctuation character to a
        # space, so punctuation acts as a word separator
        # ("m'appelle" -> "m", "appelle").
        self.__T = dict.fromkeys(map(ord, punctuation), u" ")
        # category name -> list of trained documents, each one a list of words
        self.datas = {}

    def _tokenize(self, content):
        """Split a content into words, treating punctuation as whitespace."""
        if isinstance(content, bytes):
            # Decode once at the boundary; everything after this is text.
            content = content.decode("utf-8")
        return content.translate(self.__T).split()

    def train(self, cat, content):
        """Record a content to a category.

        Each call adds one document (a list of words) to category ``cat``,
        creating the category on first use.
        """
        self.datas.setdefault(cat, []).append(self._tokenize(content))

    def proba(self, word, words):
        """Get the probability (in percent) of word in words.

        ``words`` is a list of word lists. The comparison is
        case-insensitive. Returns 0.0 when ``words`` holds no word at all
        instead of dividing by zero.
        """
        needle = word.lower()
        found = 0
        total = 0
        for wordlist in words:
            found += sum(1 for w in wordlist if w.lower() == needle)
            total += len(wordlist)
        if not total:
            return 0.0
        return float(found) / float(total) * 100

    def _get_category_quota(self, cat):
        """Get given category quota in percent.

        The quota is the share of trained documents that belong to ``cat``.
        Raises KeyError if ``cat`` was never trained.
        """
        counter = sum(len(docs) for docs in self.datas.values())
        return (float(len(self.datas[cat])) / counter) * 100

    def bayes(self, content):
        """Get bayes calculation for a content.

        Returns a dict mapping each trained category to the average, over
        the words of ``content``, of the per-word score computed by
        :meth:`one_word_bayes`. The dict is empty when ``content`` has no
        word or when nothing has been trained.
        """
        per_category = {}
        for word in self._tokenize(content):
            for cat, score in self.one_word_bayes(word).items():
                per_category.setdefault(cat, []).append(score)
        return dict(
            (cat, sum(scores) / len(scores))
            for cat, scores in per_category.items()
        )

    def get_best_match(self, content):
        """Fetch the best match for given content.

        Returns the category with the highest score, or None when there is
        nothing to compare (no trained category, or ``content`` without any
        word). On equal scores the first category in iteration order wins.
        """
        scores = self.bayes(content)
        if not scores:
            return None
        return max(scores, key=scores.get)

    def one_word_bayes(self, m):
        """Fetch bayesian calculation for one word.

        For every category the score is, in percent,

            quota(cat) * proba(m | cat) / sum_c(quota(c) * proba(m | c))

        and 0.0 for every category when the word appears in none of them.
        """
        # quota * probability for each category; these products are both the
        # numerators and the terms of the common denominator, so they are
        # computed once instead of once per category pair.
        weighted = {}
        for cat in self.datas:
            weighted[cat] = (
                self._get_category_quota(cat) * self.proba(m, self.datas[cat])
            )
        den = sum(weighted.values())

        ret = {}
        for cat, num in weighted.items():
            ret[cat] = (num / den) * 100 if den > 0 else 0.0
        return ret
if __name__ == "__main__":
    # Demo 1: two urns. A holds 30 white / 10 black balls, B holds 20 white /
    # 20 black balls; print the chance that a white ball came from each urn.
    b = Bayes()
    b.train('A', ("white " * 30) + ("black " * 10))
    b.train('B', ("white " * 20) + ("black " * 20))
    # Single-argument parenthesised print: same output on Python 2, and
    # valid syntax on Python 3.
    print(b.bayes('white'))

    # Demo 2: classify dish names as starter ("entree") or dessert from a
    # few ingredient lists.
    b = Bayes()
    b.train(u"dessert", u"tarte au citron meringuée sucre farine levure citron")
    b.train(u"entree", u"crevette sel citron salade de crevettes laitue")
    b.train(u"dessert", u"pommes sucre vanillé beurre")
    b.train(u"entree", u"choux rouge vinaigre pomme")
    b.train(u"entree", u"salade de laitue chèvre et miel")
    for cook in ("Salade de thon au vinaigre de vin", "Tarte aux pommes maison"):
        print("Best match for %s :: %s" % (cook, b.get_best_match(cook)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment