# YoshihitoAso/naivebayes.py

## naivebayes.py
#coding:utf-8
import math
import sys
from collections import defaultdict

class NaiveBayes:
    """Multinomial naive Bayes classifier with add-one (Laplace) smoothing.

    A document is a sequence of ``"word:count"`` tokens.  A bare ``"word"``
    token (no ``:count`` suffix) is accepted and counted as one occurrence.
    A training row is ``[category, token, token, ...]``.
    """

    def __init__(self):
        # Category labels seen by the most recent train() call.
        self.categories = set()
        # Distinct words seen across all categories.
        self.vocabularies = set()
        # wordcount[cat][word] -> occurrences of word in documents of cat.
        self.wordcount = {}
        # catcount[cat] -> number of training documents labelled cat.
        self.catcount = {}
        # denominator[cat] -> total word occurrences in cat + vocabulary size.
        self.denominator = {}

    @staticmethod
    def _parse_token(token):
        """Split a ``"word:count"`` token into ``(word, int(count))``.

        The split happens at the last ``":"``.  A token without any ``":"``
        is treated as a single occurrence of that word.

        Raises:
            ValueError: if the text after the last ``":"`` is not an integer.
        """
        word, sep, count = token.rpartition(":")
        if not sep:
            return token, 1
        return word, int(count)

    def train(self, data):
        """Fit the model on ``data``, replacing any previously learned state.

        Args:
            data: iterable of rows ``[category, token, ...]``.  It is
                consumed in a single pass, so iterators are fine.
        """
        # Start from a clean slate so that stale categories or vocabulary
        # from an earlier call cannot distort priors or smoothing.
        self.categories.clear()
        self.vocabularies.clear()
        self.wordcount.clear()
        self.catcount.clear()
        self.denominator.clear()

        # Count documents per category and word occurrences per category.
        for row in data:
            cat, doc = row[0], row[1:]
            if cat not in self.categories:
                self.categories.add(cat)
                self.wordcount[cat] = defaultdict(int)
                self.catcount[cat] = 0
            self.catcount[cat] += 1
            for token in doc:
                word, count = self._parse_token(token)
                self.vocabularies.add(word)
                self.wordcount[cat][word] += count

        # Pre-compute the smoothing denominator of P(word|cat); it needs the
        # final vocabulary size, hence the separate loop.
        vocab_size = len(self.vocabularies)
        for cat in self.categories:
            self.denominator[cat] = sum(self.wordcount[cat].values()) + vocab_size

    def classify(self, doc):
        """Return the category maximising log P(cat|doc).

        Returns ``None`` when the model has no categories (not trained).
        Ties go to the first category in ``catcount`` iteration order.
        """
        if not self.catcount:
            return None
        return max(self.catcount, key=lambda cat: self.score(doc, cat))

    def wordProb(self, word, cat):
        """Return the smoothed estimate of P(word|cat).

        Raises:
            KeyError: if ``cat`` was not seen during training.
        """
        # .get() keeps a lookup of an unseen word from inserting a zero
        # entry into the per-category defaultdict.
        seen = self.wordcount[cat].get(word, 0)
        return float(seen + 1) / float(self.denominator[cat])

    def score(self, doc, cat):
        """Return log P(cat) + sum over tokens of count * log P(word|cat).

        Raises:
            KeyError: if ``cat`` was not seen during training.
        """
        total = sum(self.catcount.values())
        score = math.log(float(self.catcount[cat]) / total)
        for token in doc:
            word, count = self._parse_token(token)
            # Non-positive counts contribute nothing.
            if count > 0:
                score += count * math.log(self.wordProb(word, cat))
        return score

    def __str__(self):
        total = sum(self.catcount.values())
        return "documents: %d, vocabularies: %d, categories: %d" % (
            total, len(self.vocabularies), len(self.categories))


if __name__ == "__main__":

    # Training data: each row is [category, "word:count", ...].
    data = [["yes", "Chinese:2", "Beijing:1"],
            ["yes", "Chinese:2", "Shanghai:1"],
            ["yes", "Chinese:1", "Macao:1"],
            ["no", "Tokyo:1", "Japan:1", "Chinese:1"]]

    # Train and show the smoothed word likelihoods.
    # print(...) with one pre-formatted argument behaves the same on
    # Python 2 and Python 3.
    nb = NaiveBayes()
    nb.train(data)
    print(nb)
    for cat in ("yes", "no"):
        for word in ("Chinese", "Tokyo", "Japan"):
            print("P(%s|%s) =  %s" % (word, cat, nb.wordProb(word, cat)))

    # Test document "Chinese Chinese Chinese Tokyo Japan".  Tokens must use
    # the same "word:count" form as the training rows; bare words make
    # score() fail while unpacking the split.
    test = ["Chinese:3", "Tokyo:1", "Japan:1"]
    print("log P(yes|test) = %s" % nb.score(test, "yes"))
    print("log P(no|test) = %s" % nb.score(test, "no"))
    print(nb.classify(test))
	#coding:utf-8
	import math
	import sys
	from collections import defaultdict

	class NaiveBayes:
	    """Multinomial naive Bayes classifier with add-one (Laplace) smoothing.

	    A document is a sequence of ``"word:count"`` tokens.  A bare ``"word"``
	    token (no ``:count`` suffix) is accepted and counted as one occurrence.
	    A training row is ``[category, token, token, ...]``.
	    """

	    def __init__(self):
	        # Category labels seen by the most recent train() call.
	        self.categories = set()
	        # Distinct words seen across all categories.
	        self.vocabularies = set()
	        # wordcount[cat][word] -> occurrences of word in documents of cat.
	        self.wordcount = {}
	        # catcount[cat] -> number of training documents labelled cat.
	        self.catcount = {}
	        # denominator[cat] -> total word occurrences in cat + vocabulary size.
	        self.denominator = {}

	    @staticmethod
	    def _parse_token(token):
	        """Split a ``"word:count"`` token into ``(word, int(count))``.

	        The split happens at the last ``":"``.  A token without any ``":"``
	        is treated as a single occurrence of that word.

	        Raises:
	            ValueError: if the text after the last ``":"`` is not an integer.
	        """
	        word, sep, count = token.rpartition(":")
	        if not sep:
	            return token, 1
	        return word, int(count)

	    def train(self, data):
	        """Fit the model on ``data``, replacing any previously learned state.

	        Args:
	            data: iterable of rows ``[category, token, ...]``.  It is
	                consumed in a single pass, so iterators are fine.
	        """
	        # Start from a clean slate so that stale categories or vocabulary
	        # from an earlier call cannot distort priors or smoothing.
	        self.categories.clear()
	        self.vocabularies.clear()
	        self.wordcount.clear()
	        self.catcount.clear()
	        self.denominator.clear()

	        # Count documents per category and word occurrences per category.
	        for row in data:
	            cat, doc = row[0], row[1:]
	            if cat not in self.categories:
	                self.categories.add(cat)
	                self.wordcount[cat] = defaultdict(int)
	                self.catcount[cat] = 0
	            self.catcount[cat] += 1
	            for token in doc:
	                word, count = self._parse_token(token)
	                self.vocabularies.add(word)
	                self.wordcount[cat][word] += count

	        # Pre-compute the smoothing denominator of P(word|cat); it needs the
	        # final vocabulary size, hence the separate loop.
	        vocab_size = len(self.vocabularies)
	        for cat in self.categories:
	            self.denominator[cat] = sum(self.wordcount[cat].values()) + vocab_size

	    def classify(self, doc):
	        """Return the category maximising log P(cat|doc).

	        Returns ``None`` when the model has no categories (not trained).
	        Ties go to the first category in ``catcount`` iteration order.
	        """
	        if not self.catcount:
	            return None
	        return max(self.catcount, key=lambda cat: self.score(doc, cat))

	    def wordProb(self, word, cat):
	        """Return the smoothed estimate of P(word|cat).

	        Raises:
	            KeyError: if ``cat`` was not seen during training.
	        """
	        # .get() keeps a lookup of an unseen word from inserting a zero
	        # entry into the per-category defaultdict.
	        seen = self.wordcount[cat].get(word, 0)
	        return float(seen + 1) / float(self.denominator[cat])

	    def score(self, doc, cat):
	        """Return log P(cat) + sum over tokens of count * log P(word|cat).

	        Raises:
	            KeyError: if ``cat`` was not seen during training.
	        """
	        total = sum(self.catcount.values())
	        score = math.log(float(self.catcount[cat]) / total)
	        for token in doc:
	            word, count = self._parse_token(token)
	            # Non-positive counts contribute nothing.
	            if count > 0:
	                score += count * math.log(self.wordProb(word, cat))
	        return score

	    def __str__(self):
	        total = sum(self.catcount.values())
	        return "documents: %d, vocabularies: %d, categories: %d" % (
	            total, len(self.vocabularies), len(self.categories))


	if __name__ == "__main__":

	    # Training data: each row is [category, "word:count", ...].
	    data = [["yes", "Chinese:2", "Beijing:1"],
	            ["yes", "Chinese:2", "Shanghai:1"],
	            ["yes", "Chinese:1", "Macao:1"],
	            ["no", "Tokyo:1", "Japan:1", "Chinese:1"]]

	    # Train and show the smoothed word likelihoods.
	    # print(...) with one pre-formatted argument behaves the same on
	    # Python 2 and Python 3.
	    nb = NaiveBayes()
	    nb.train(data)
	    print(nb)
	    for cat in ("yes", "no"):
	        for word in ("Chinese", "Tokyo", "Japan"):
	            print("P(%s|%s) =  %s" % (word, cat, nb.wordProb(word, cat)))

	    # Test document "Chinese Chinese Chinese Tokyo Japan".  Tokens must use
	    # the same "word:count" form as the training rows; bare words make
	    # score() fail while unpacking the split.
	    test = ["Chinese:3", "Tokyo:1", "Japan:1"]
	    print("log P(yes|test) = %s" % nb.score(test, "yes"))
	    print("log P(no|test) = %s" % nb.score(test, "no"))
	    print(nb.classify(test))