@YoshihitoAso
Created February 26, 2014 07:46
[Python] A sample naive Bayes classifier
#coding:utf-8
import math
import sys
from collections import defaultdict
class NaiveBayes:
    def __init__(self):
        self.categories = set()     # set of category labels
        self.vocabularies = set()   # set of all words seen during training
        self.wordcount = {}         # wordcount[cat][word] = occurrences of word in cat
        self.catcount = {}          # catcount[cat] = number of training documents in cat
        self.denominator = {}       # denominator[cat] = smoothing normalizer for cat
    def train(self, data):
        # init
        for d in data:
            cat = d[0]
            self.categories.add(cat)
        for cat in self.categories:
            self.wordcount[cat] = defaultdict(int)
            self.catcount[cat] = 0
        # count category and word occurrences
        for d in data:
            cat, doc = d[0], d[1:]
            self.catcount[cat] += 1
            for wc in doc:
                word, count = wc.split(":")
                count = int(count)
                self.vocabularies.add(word)
                self.wordcount[cat][word] += count
        # calc denominator
        for cat in self.categories:
            self.denominator[cat] = sum(self.wordcount[cat].values()) + len(self.vocabularies)
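        # denominator[cat] = total word occurrences in cat + |vocabulary|,
        # i.e. the normalizer used by the add-one (Laplace) smoothing in wordProb().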
    def classify(self, doc):
        # max log(P(cat|doc))
        best = None
        best_score = -sys.maxint
        for cat in self.catcount.keys():
            p = self.score(doc, cat)
            if p > best_score:
                best_score = p
                best = cat
        return best
    def wordProb(self, word, cat):
        # calc P(word|cat) with add-one (Laplace) smoothing
        return float(self.wordcount[cat][word] + 1) / float(self.denominator[cat])
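    # With the training data in __main__: category "yes" holds 8 word
    # occurrences and the vocabulary has 6 distinct words, so
    # wordProb("Chinese", "yes") = (5 + 1) / (8 + 6) = 3/7 and
    # wordProb("Tokyo", "yes") = (0 + 1) / (8 + 6) = 1/14.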
    def score(self, doc, cat):
        total = sum(self.catcount.values())
        score = math.log(float(self.catcount[cat]) / total)
        for wc in doc:
            # accept both "word:count" tokens and plain words (count 1)
            if ":" in wc:
                word, count = wc.split(":")
                count = int(count)
            else:
                word, count = wc, 1
            for i in range(count):
                score += math.log(self.wordProb(word, cat))
        return score
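    # score() compares categories in log space: multiplying many small word
    # probabilities would underflow, so their logarithms are summed instead;
    # the argmax in classify() is unchanged because log is monotonic.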
    def __str__(self):
        total = sum(self.catcount.values())
        return "documents: %d, vocabularies: %d, categories: %d" % (total, len(self.vocabularies), len(self.categories))
if __name__ == "__main__":
    # training data
    data = [["yes", "Chinese:2", "Beijing:1"],
            ["yes", "Chinese:2", "Shanghai:1"],
            ["yes", "Chinese:1", "Macao:1"],
            ["no", "Tokyo:1", "Japan:1", "Chinese:1"]]
    # train
    nb = NaiveBayes()
    nb.train(data)
    print nb
    print "P(Chinese|yes) = ", nb.wordProb("Chinese", "yes")
    print "P(Tokyo|yes) = ", nb.wordProb("Tokyo", "yes")
    print "P(Japan|yes) = ", nb.wordProb("Japan", "yes")
    print "P(Chinese|no) = ", nb.wordProb("Chinese", "no")
    print "P(Tokyo|no) = ", nb.wordProb("Tokyo", "no")
    print "P(Japan|no) = ", nb.wordProb("Japan", "no")
    # test
    test = ["Chinese", "Chinese", "Chinese", "Tokyo", "Japan"]
    print "log P(yes|test) =", nb.score(test, "yes")
    print "log P(no|test) =", nb.score(test, "no")
    print nb.classify(test)
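
For reference, the smoothed probabilities and log scores printed above can be reproduced by hand from the training counts. The short sketch below is not part of the original gist; it restates those counts as literals and redoes the smoothing and log-score arithmetic, so its output should agree with what the script prints.

import math

# Counts taken from the training data: category "yes" has 8 word occurrences
# (Chinese:5, Beijing:1, Shanghai:1, Macao:1), category "no" has 3, and the
# vocabulary holds 6 distinct words. Priors: P(yes) = 3/4, P(no) = 1/4.
V = 6
p_chinese_yes = (5 + 1.0) / (8 + V)   # 6/14
p_tokyo_yes   = (0 + 1.0) / (8 + V)   # 1/14
p_japan_yes   = (0 + 1.0) / (8 + V)   # 1/14
p_word_no     = (1 + 1.0) / (3 + V)   # 2/9 for Chinese, Tokyo and Japan alike

# Test document: Chinese x3, Tokyo, Japan
log_p_yes = math.log(3.0 / 4) + 3 * math.log(p_chinese_yes) \
            + math.log(p_tokyo_yes) + math.log(p_japan_yes)
log_p_no  = math.log(1.0 / 4) + 5 * math.log(p_word_no)

print "log P(yes|test) by hand =", log_p_yes   # about -8.108
print "log P(no|test)  by hand =", log_p_no    # about -8.907, so "yes" wins

Because score only compares categories, the constant log P(doc) term is dropped, so these values are unnormalized log posteriors rather than probabilities.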