metal3d/bayesian.py

## bayesian.py
# -*- encoding: utf-8 -*-
""" Simple Bayesian calculation

After training data by category, you can use the Bayes.bayes method to
compute the Bayesian probability that a content matches each
category.

Example:

>>> b = Bayes()
>>> b.train('A', ("white " * 30) + ("black " * 10))
>>> b.train('B', ("white " * 20) + ("black " * 20))

A has 30 white balls, and 10 black balls
B has 20 white balls, and 20 black balls

To know the chance of getting a white ball from each category:

>>> print b.bayes('white')
{'A': 60.0, 'B': 40.0}

And to know the best chance to get white ball:

>>> print b.get_best_match("white")
A

You can use it to get nice probabilities. Don't forget to use unicode...


>>> b = Bayes()
>>> b.train("french",u"Bonjour je m'appelle Patrice et je suis ingénieur en informatique.")
>>> b.train("french",u"Il était une fois, un ogre nommé black monster est venu dans sharewood")
>>> b.train("english","Hi, my name is Patrice and I'm development engeneer")
>>> b.train("english","One upon a time, an ogre namd black monster came in sharewood")

Now, try to know the lang of a sentence:

>>> b.bayes(u"Je me pense que ce texte est en français")
{'french': 33.333333333333336, 'english': 0.0}

That means that there is 33% chance that the sentence is in french.
That's a logical result because some words are found in both english
and french texts (black, sharewood... ogre...)

To get the detected language, use get_best_match:

>>> b.get_best_match(u"Je me pense que ce texte est en français")
'french'

To be more precise, we add some English words to the French test sentence:

>>> b.bayes(u"Je me pense que ce texte est en français, même si je mets le mot black ou ogre")
{'french': 27.536231884057973, 'english': 5.797101449275362}


Even if we insert some English words, the probability
that the sentence is French remains high (27%)

"""
__author__ = "Patrice FERLET <metal3d@gmail.com"
__license__ = "BSD"

from string import maketrans, translate, punctuation

class Bayes(object):
    """Naive word-frequency Bayesian classifier.

    Training texts are stored per category in ``self.datas`` as
    ``{category: [[token, ...], ...]}`` (one token list per trained text).
    Every score returned by the public methods is a percentage (0-100).
    """

    def __init__(self):
        # Translation table keyed by code point: every ASCII punctuation
        # character is replaced by a space before splitting into words.
        self.__T = dict((ord(char), u" ") for char in punctuation)
        self.datas = {}

    @staticmethod
    def _to_text(value):
        """Return ``value`` as text, decoding byte strings as UTF-8."""
        if isinstance(value, bytes):
            return value.decode("utf-8")
        return value

    def _tokenize(self, content):
        """Split ``content`` into words, treating punctuation as blanks.

        Working on text instead of UTF-8 bytes lets ``lower()`` fold
        accented letters, and lets byte strings holding non-ASCII
        characters be accepted instead of failing on an implicit ASCII
        decode.
        """
        return self._to_text(content).translate(self.__T).split()

    def train(self, cat, content):
        """Record ``content`` as one more training text of category ``cat``."""
        self.datas.setdefault(cat, []).append(self._tokenize(content))

    def proba(self, word, words):
        """Return the frequency of ``word`` in ``words``, in percent.

        ``words`` is a list of token lists; the comparison is
        case-insensitive.  When ``words`` holds no token at all (for
        example a category trained only with empty or punctuation-only
        texts) the result is ``0.0`` rather than a ZeroDivisionError.
        """
        needle = self._to_text(word).lower()
        found = 0
        total = 0
        for wordlist in words:
            found += sum(1 for token in wordlist
                         if self._to_text(token).lower() == needle)
            total += len(wordlist)

        if not total:
            return 0.0
        return float(found) / float(total) * 100

    def _get_category_quota(self, cat):
        """Return the share of training texts that belong to ``cat``, in percent.

        Raises KeyError when ``cat`` has never been trained.
        """
        in_category = len(self.datas[cat])
        overall = sum(len(texts) for texts in self.datas.values())
        return (float(in_category) / overall) * 100

    def bayes(self, content):
        """Return ``{category: score}`` for ``content``.

        The score of a category is the mean, over every word of
        ``content``, of the per-word result of ``one_word_bayes``.  An
        empty dict is returned when ``content`` contains no word or when
        nothing has been trained yet.
        """
        per_category = {}
        for word in self._tokenize(content):
            for cat, score in self.one_word_bayes(word).items():
                per_category.setdefault(cat, []).append(score)

        return dict((cat, sum(scores) / float(len(scores)))
                    for cat, scores in per_category.items())

    def get_best_match(self, content):
        """Return the category with the highest score for ``content``.

        Ties go to the first category in iteration order.  Returns
        ``None`` when there is nothing to compare (no training data or no
        word in ``content``) instead of failing on an unbound local.
        """
        scores = self.bayes(content)
        if not scores:
            return None
        return max(scores, key=scores.get)

    def one_word_bayes(self, m):
        """Return ``{category: P(category | m)}`` in percent for one word.

        For each category the numerator is (share of the category) *
        (frequency of ``m`` in the category).  The denominator is the sum
        of those numerators over all categories; it does not depend on the
        category, so it is computed once.  When the word is unknown
        everywhere the denominator is 0 and every category scores 0.
        """
        weighted = {}
        for cat, texts in self.datas.items():
            weighted[cat] = self._get_category_quota(cat) * self.proba(m, texts)

        den = sum(weighted.values())

        ret = {}
        for cat, num in weighted.items():
            ret[cat] = (num / den) * 100 if den > 0 else 0.0
        return ret

if __name__ == "__main__":
    # Demo 1: the "urn" example -- two categories holding white and black
    # balls; print the score of each category for the word "white".
    # print(...) with a single argument prints the same text on Python 2
    # and is also valid on Python 3.
    b = Bayes()
    b.train('A', ("white " * 30) + ("black " * 10))
    b.train('B', ("white " * 20) + ("black " * 20))
    print(b.bayes('white'))

    # Demo 2: classify recipe names as starter ("entree") or dessert from
    # a handful of ingredient lists.
    b = Bayes()
    b.train(u"dessert", u"tarte au citron meringuée sucre farine levure citron")
    b.train(u"entree", u"crevette sel citron salade de crevettes laitue")
    b.train(u"dessert", u"pommes sucre vanillé beurre")
    b.train(u"entree", u"choux rouge vinaigre pomme")
    b.train(u"entree", u"salade de laitue chèvre et miel")

    for cook in ("Salade de thon au vinaigre de vin", "Tarte aux pommes maison"):
        print("Best match for %s :: %s" % (cook, b.get_best_match(cook)))
	# -- encoding: utf-8 --
	""" Simple Bayesian calculation

	After training data by category, you can use the Bayes.bayes method to
	compute the Bayesian probability that a content matches each
	category.

	Example:

	>>> b = Bayes()
	>>> b.train('A', ("white " * 30) + ("black " * 10))
	>>> b.train('B', ("white " * 20) + ("black " * 20))

	A has 30 white balls, and 10 black balls
	B has 20 white balls, and 20 black balls

	To know the chance of getting a white ball from each category:

	>>> print b.bayes('white')
	{'A': 60.0, 'B': 40.0}

	And to know the best chance to get white ball:

	>>> print b.get_best_match("white")
	A

	You can use it to get nice probabilities. Don't forget to use unicode...


	>>> b = Bayes()
	>>> b.train("french",u"Bonjour je m'appelle Patrice et je suis ingénieur en informatique.")
	>>> b.train("french",u"Il était une fois, un ogre nommé black monster est venu dans sharewood")
	>>> b.train("english","Hi, my name is Patrice and I'm development engeneer")
	>>> b.train("english","One upon a time, an ogre namd black monster came in sharewood")

	Now, try to know the lang of a sentence:

	>>> b.bayes(u"Je me pense que ce texte est en français")
	{'french': 33.333333333333336, 'english': 0.0}

	That means that there is 33% chance that the sentence is in french.
	That's a logical result because some words are found in both english
	and french texts (black, sharewood... ogre...)

	To get the detected language, use get_best_match:

	>>> b.get_best_match(u"Je me pense que ce texte est en français")
	'french'

	To be more precise, we add some English words to the French test sentence:

	>>> b.bayes(u"Je me pense que ce texte est en français, même si je mets le mot black ou ogre")
	{'french': 27.536231884057973, 'english': 5.797101449275362}


	Even if we insert some English words, the probability
	that the sentence is French remains high (27%)

	"""
	__author__ = "Patrice FERLET <metal3d@gmail.com"
	__license__ = "BSD"

	from string import maketrans, translate, punctuation

	class Bayes(object):
	    """Naive word-frequency Bayesian classifier.

	    Training texts are stored per category in ``self.datas`` as
	    ``{category: [[token, ...], ...]}`` (one token list per trained text).
	    Every score returned by the public methods is a percentage (0-100).
	    """

	    def __init__(self):
	        # Translation table keyed by code point: every ASCII punctuation
	        # character is replaced by a space before splitting into words.
	        self.__T = dict((ord(char), u" ") for char in punctuation)
	        self.datas = {}

	    @staticmethod
	    def _to_text(value):
	        """Return ``value`` as text, decoding byte strings as UTF-8."""
	        if isinstance(value, bytes):
	            return value.decode("utf-8")
	        return value

	    def _tokenize(self, content):
	        """Split ``content`` into words, treating punctuation as blanks.

	        Working on text instead of UTF-8 bytes lets ``lower()`` fold
	        accented letters, and lets byte strings holding non-ASCII
	        characters be accepted instead of failing on an implicit ASCII
	        decode.
	        """
	        return self._to_text(content).translate(self.__T).split()

	    def train(self, cat, content):
	        """Record ``content`` as one more training text of category ``cat``."""
	        self.datas.setdefault(cat, []).append(self._tokenize(content))

	    def proba(self, word, words):
	        """Return the frequency of ``word`` in ``words``, in percent.

	        ``words`` is a list of token lists; the comparison is
	        case-insensitive.  When ``words`` holds no token at all (for
	        example a category trained only with empty or punctuation-only
	        texts) the result is ``0.0`` rather than a ZeroDivisionError.
	        """
	        needle = self._to_text(word).lower()
	        found = 0
	        total = 0
	        for wordlist in words:
	            found += sum(1 for token in wordlist
	                         if self._to_text(token).lower() == needle)
	            total += len(wordlist)

	        if not total:
	            return 0.0
	        return float(found) / float(total) * 100

	    def _get_category_quota(self, cat):
	        """Return the share of training texts that belong to ``cat``, in percent.

	        Raises KeyError when ``cat`` has never been trained.
	        """
	        in_category = len(self.datas[cat])
	        overall = sum(len(texts) for texts in self.datas.values())
	        return (float(in_category) / overall) * 100

	    def bayes(self, content):
	        """Return ``{category: score}`` for ``content``.

	        The score of a category is the mean, over every word of
	        ``content``, of the per-word result of ``one_word_bayes``.  An
	        empty dict is returned when ``content`` contains no word or when
	        nothing has been trained yet.
	        """
	        per_category = {}
	        for word in self._tokenize(content):
	            for cat, score in self.one_word_bayes(word).items():
	                per_category.setdefault(cat, []).append(score)

	        return dict((cat, sum(scores) / float(len(scores)))
	                    for cat, scores in per_category.items())

	    def get_best_match(self, content):
	        """Return the category with the highest score for ``content``.

	        Ties go to the first category in iteration order.  Returns
	        ``None`` when there is nothing to compare (no training data or no
	        word in ``content``) instead of failing on an unbound local.
	        """
	        scores = self.bayes(content)
	        if not scores:
	            return None
	        return max(scores, key=scores.get)

	    def one_word_bayes(self, m):
	        """Return ``{category: P(category | m)}`` in percent for one word.

	        For each category the numerator is (share of the category) *
	        (frequency of ``m`` in the category).  The denominator is the sum
	        of those numerators over all categories; it does not depend on the
	        category, so it is computed once.  When the word is unknown
	        everywhere the denominator is 0 and every category scores 0.
	        """
	        weighted = {}
	        for cat, texts in self.datas.items():
	            weighted[cat] = self._get_category_quota(cat) * self.proba(m, texts)

	        den = sum(weighted.values())

	        ret = {}
	        for cat, num in weighted.items():
	            ret[cat] = (num / den) * 100 if den > 0 else 0.0
	        return ret

	if __name__ == "__main__":
	    # Demo 1: the "urn" example -- two categories holding white and black
	    # balls; print the score of each category for the word "white".
	    # print(...) with a single argument prints the same text on Python 2
	    # and is also valid on Python 3.
	    b = Bayes()
	    b.train('A', ("white " * 30) + ("black " * 10))
	    b.train('B', ("white " * 20) + ("black " * 20))
	    print(b.bayes('white'))

	    # Demo 2: classify recipe names as starter ("entree") or dessert from
	    # a handful of ingredient lists.
	    b = Bayes()
	    b.train(u"dessert", u"tarte au citron meringuée sucre farine levure citron")
	    b.train(u"entree", u"crevette sel citron salade de crevettes laitue")
	    b.train(u"dessert", u"pommes sucre vanillé beurre")
	    b.train(u"entree", u"choux rouge vinaigre pomme")
	    b.train(u"entree", u"salade de laitue chèvre et miel")

	    for cook in ("Salade de thon au vinaigre de vin", "Tarte aux pommes maison"):
	        print("Best match for %s :: %s" % (cook, b.get_best_match(cook)))