Skip to content

Instantly share code, notes, and snippets.

@ikegami-yukino
Last active August 29, 2015 13:57
Show Gist options
  • Save ikegami-yukino/9920280 to your computer and use it in GitHub Desktop.
Standard Naive Bayes and Complement Naive Bayes using madoka
#-*- coding: utf-8 -*-
import numpy as np
from collections import Counter, defaultdict
import madoka
NUM_DOCS_INDEX = '[[NUM_DOCS]]'
ALL_WORD_INDEX = '[[ALL]]'
class TFIDF(object):
    """TF / TF-IDF weighting backed by a madoka count-min sketch.

    Document frequencies are stored in the sketch under each word, and the
    total number of documents under the sentinel key NUM_DOCS_INDEX.
    """

    def __init__(self, filename=None):
        """
        Params:
            <str> filename : optional path of a previously saved DF sketch
        """
        self.df = madoka.Sketch()
        if filename:
            self.df.load(filename)

    def save(self, filename):
        """Persist the DF sketch to *filename*."""
        self.df.save(filename)

    def add(self, doc):
        """Update the DF table with one document (an iterable of words)."""
        self.df.inc(NUM_DOCS_INDEX, len(NUM_DOCS_INDEX))
        # set() so each distinct word is counted at most once per document
        for word in set(doc):
            self.df.inc(word, len(word))

    def get_df_as_log(self, word):
        """Return log(DF(word)). NOTE(review): -inf for unseen words."""
        return np.log(self.df.get(word, len(word)))

    def tfidf(self, doc, mode='tfidf', update_table=True):
        """Return {word: weight} for *doc*.

        mode 'tf' yields raw term frequency; anything else yields tf*idf.
        Bug fixed: num_docs was only computed when mode == 'tfidf', so any
        other non-'tf' mode raised NameError in the idf branch below.
        """
        mode = mode.lower()
        if update_table:
            self.add(doc)
        if mode != 'tf':
            num_docs = self.get_df_as_log(NUM_DOCS_INDEX)
        num_words = float(len(doc))
        result = {}
        for word in set(doc):
            tf = doc.count(word) / num_words
            if mode == 'tf':
                result[word] = tf
            else:
                # smoothed log-idf: log(N) - log(df) + 1
                idf = num_docs - self.get_df_as_log(word) + 1
                result[word] = tf * idf
        return result
class MadokaNaiveBayes(object):
    """Multinomial Naive Bayes whose word counts live in a madoka sketch."""

    def __init__(self, alpha=1.0):
        """
        Params:
            <float> alpha : additive (Laplace) smoothing hyperparameter
        """
        self.alpha = alpha
        self.voca = 0.0                             # vocabulary size |V| used in smoothing
        self.word_counter = madoka.CroquisDouble()  # '<cat>/<word>' -> weighted count
        self.cat_counter = Counter()                # category -> number of documents

    def _count_word(self, word, val, cat):
        """Add *val* to the (cat, word) count; grow |V| for unseen words.

        Bug fixed: the original incremented self.voca when the word had
        ALREADY been seen, so the vocabulary size stayed 0 until a repeat
        and under-counted |V| in the smoothing denominator of _calc_theta.
        """
        seen = any(self.word_counter.get('%s/%s' % (cate, word))
                   for cate in self.cat_counter.keys())
        if not seen:
            self.voca += 1
        self.word_counter.add('%s/%s' % (cat, word), val)
        # per-category total word mass, stored under the ALL sentinel
        self.word_counter.add('%s/%s' % (cat, ALL_WORD_INDEX), val)

    def train(self, doc, cat):
        """Train on one document given as {word: weight} (e.g. tf-idf)."""
        for (word, val) in doc.items():
            self._count_word(word, val, cat)
        self.cat_counter[cat] += 1

    def _pr_category(self, cat):
        """Prior probability of a category: Pr(c)."""
        # builtin sum: works on both Py2 lists and Py3 dict views
        # (np.sum over a Py3 dict_values object misbehaves)
        return float(self.cat_counter[cat]) / sum(self.cat_counter.values())

    def _freq_word_incategory(self, word, cat):
        """Frequency of a word in a category."""
        idx = '%s/%s' % (cat, word)
        return self.word_counter.get(idx)

    def _calc_theta(self, word, cat):
        """θ = Pr(w|c) with additive smoothing."""
        idx = '%s/%s' % (cat, ALL_WORD_INDEX)
        prob = (self._freq_word_incategory(word, cat) + self.alpha) / \
            (self.word_counter.get(idx) + self.voca)
        return prob

    def _calc_prob(self, doc, cat):
        """log(Pr(c)) + sum(tfidf * log(Pr(w|c)))"""
        log_prob = np.log(self._pr_category(cat))
        for (word, val) in doc.items():
            log_prob += val * np.log(self._calc_theta(word, cat))
        return log_prob

    def classify(self, doc):
        """Return a Counter mapping every known category to its log-score."""
        result = Counter()
        for cat in self.cat_counter.keys():
            result[cat] = self._calc_prob(doc, cat)
        return result

    def log_to_prob(self, data):
        """Normalize scores in-place so they sum to 1 (mutates and returns *data*).

        NOTE(review): inputs are log-scores (negative), so this is a simple
        ratio of logs, not a softmax — kept as the original intended.
        """
        total = sum(data.values())
        for k, v in data.items():
            data[k] = v / total
        return data
class MadokaCNaiveBayes(MadokaNaiveBayes):
    """Complement Naive Bayes: scores each class against all *other* classes."""

    def _calc_prob(self, doc, cat, theta):
        """log(Pr(c)) - sum(tfidf * log(sum of θ over the complement classes))."""
        log_prior = np.log(self._pr_category(cat))
        complement = 0
        for word, weight in doc.items():
            # mass of Pr(w|c') over every category except *cat*
            others = sum(p for c, p in theta[word].items() if c != cat)
            complement += weight * np.log(others)
        return log_prior - complement

    def classify(self, doc):
        """Return a Counter mapping every known category to its log-score."""
        # Precompute θ for every (word, category) pair once.
        theta = defaultdict(dict)
        for word in doc:
            for cat in self.cat_counter:
                theta[word][cat] = self._calc_theta(word, cat)
        scores = Counter()
        for cat in self.cat_counter:
            scores[cat] = self._calc_prob(doc, cat, theta)
        return scores
# --- Demo: train on a tiny corpus, then classify a few queries. -------------
mb = MadokaCNaiveBayes()
t = TFIDF()
texts = (
    ('CS', ('Python', 'プログラミング言語')),
    ('CS', ('Ruby', 'プログラミング言語')),
    ('CS', ('Python', '自然言語処理')),
    ('BB', ('イチロー', 'ホームラン')),
    ('BB', ('打者', '盗塁', 'ホームラン')),
    ('FOOD', ('ラーメン', 'カレー', 'スパゲッティ')),
    ('FOOD', ('ラーメン', '二郎', 'もやし')),
)

# Seed the document-frequency table.
# (Bug fixed: the original assigned t.add's None return back to doc.)
for _, doc in texts:
    t.add(doc)

# Train on tf-idf weighted documents. t.tfidf re-adds each document to the
# DF table; doubling both N and df cancels inside the log-idf difference.
for cat, doc in texts:
    mb.train(t.tfidf(doc), cat)

# Classify with update_table=False so query terms do not mutate the DF
# statistics learned from the corpus (bug in the original: every classify
# call silently re-trained the DF table on the query).
print(mb.classify(t.tfidf(('Python', 'Ruby'), update_table=False)))
print(mb.classify(t.tfidf(('イチロー', '打者'), update_table=False)))
print(mb.classify(t.tfidf(('ラーメン', 'もやし'), update_table=False)))

for query in (('Python', 'Ruby'),
              ('イチロー', '打者'),
              ('ラーメン', 'もやし', '二郎'),
              ('ラーメン', 'Perl')):
    r = mb.classify(t.tfidf(query, update_table=False))
    for k, v in mb.log_to_prob(r).items():
        print('%s %s' % (k, v))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment