Skip to content

Instantly share code, notes, and snippets.

@ikegami-yukino
Last active August 29, 2015 13:57
Show Gist options
  • Save ikegami-yukino/9920280 to your computer and use it in GitHub Desktop.
Standard Naive Bayes and Complement Naive Bayes using madoka
#-*- coding: utf-8 -*-
import numpy as np
from collections import Counter, defaultdict
import madoka
NUM_DOCS_INDEX = '[[NUM_DOCS]]'
ALL_WORD_INDEX = '[[ALL]]'
class TFIDF(object):
    """TF / TF-IDF weighting backed by a madoka count-min sketch.

    Document frequencies are stored in the sketch under each word, and the
    total number of documents under the sentinel key NUM_DOCS_INDEX.
    """

    def __init__(self, filename=None):
        """
        Params:
            <str> filename : optional path of a previously saved DF sketch
        """
        self.df = madoka.Sketch()
        if filename:
            self.df.load(filename)

    def save(self, filename):
        """Persist the DF sketch to *filename*."""
        self.df.save(filename)

    def add(self, doc):
        """Update the DF table with one document (an iterable of words)."""
        self.df.inc(NUM_DOCS_INDEX, len(NUM_DOCS_INDEX))
        # set() so each distinct word is counted at most once per document
        for word in set(doc):
            self.df.inc(word, len(word))

    def get_df_as_log(self, word):
        """Return log(DF(word)). NOTE(review): -inf for unseen words."""
        return np.log(self.df.get(word, len(word)))

    def tfidf(self, doc, mode='tfidf', update_table=True):
        """Return {word: weight} for *doc*.

        mode 'tf' yields raw term frequency; anything else yields tf*idf.
        Bug fixed: num_docs was only computed when mode == 'tfidf', so any
        other non-'tf' mode raised NameError in the idf branch below.
        """
        mode = mode.lower()
        if update_table:
            self.add(doc)
        if mode != 'tf':
            num_docs = self.get_df_as_log(NUM_DOCS_INDEX)
        num_words = float(len(doc))
        result = {}
        for word in set(doc):
            tf = doc.count(word) / num_words
            if mode == 'tf':
                result[word] = tf
            else:
                # smoothed log-idf: log(N) - log(df) + 1
                idf = num_docs - self.get_df_as_log(word) + 1
                result[word] = tf * idf
        return result
class MadokaNaiveBayes(object):
    """Multinomial Naive Bayes whose word counts live in a madoka sketch."""

    def __init__(self, alpha=1.0):
        """
        Params:
            <float> alpha : additive (Laplace) smoothing hyperparameter
        """
        self.alpha = alpha
        self.voca = 0.0                             # vocabulary size |V| used in smoothing
        self.word_counter = madoka.CroquisDouble()  # '<cat>/<word>' -> weighted count
        self.cat_counter = Counter()                # category -> number of documents

    def _count_word(self, word, val, cat):
        """Add *val* to the (cat, word) count; grow |V| for unseen words.

        Bug fixed: the original incremented self.voca when the word had
        ALREADY been seen, so the vocabulary size stayed 0 until a repeat
        and under-counted |V| in the smoothing denominator of _calc_theta.
        """
        seen = any(self.word_counter.get('%s/%s' % (cate, word))
                   for cate in self.cat_counter.keys())
        if not seen:
            self.voca += 1
        self.word_counter.add('%s/%s' % (cat, word), val)
        # per-category total word mass, stored under the ALL sentinel
        self.word_counter.add('%s/%s' % (cat, ALL_WORD_INDEX), val)

    def train(self, doc, cat):
        """Train on one document given as {word: weight} (e.g. tf-idf)."""
        for (word, val) in doc.items():
            self._count_word(word, val, cat)
        self.cat_counter[cat] += 1

    def _pr_category(self, cat):
        """Prior probability of a category: Pr(c)."""
        # builtin sum: works on both Py2 lists and Py3 dict views
        # (np.sum over a Py3 dict_values object misbehaves)
        return float(self.cat_counter[cat]) / sum(self.cat_counter.values())

    def _freq_word_incategory(self, word, cat):
        """Frequency of a word in a category."""
        idx = '%s/%s' % (cat, word)
        return self.word_counter.get(idx)

    def _calc_theta(self, word, cat):
        """θ = Pr(w|c) with additive smoothing."""
        idx = '%s/%s' % (cat, ALL_WORD_INDEX)
        prob = (self._freq_word_incategory(word, cat) + self.alpha) / \
            (self.word_counter.get(idx) + self.voca)
        return prob

    def _calc_prob(self, doc, cat):
        """log(Pr(c)) + sum(tfidf * log(Pr(w|c)))"""
        log_prob = np.log(self._pr_category(cat))
        for (word, val) in doc.items():
            log_prob += val * np.log(self._calc_theta(word, cat))
        return log_prob

    def classify(self, doc):
        """Return a Counter mapping every known category to its log-score."""
        result = Counter()
        for cat in self.cat_counter.keys():
            result[cat] = self._calc_prob(doc, cat)
        return result

    def log_to_prob(self, data):
        """Normalize scores in-place so they sum to 1 (mutates and returns *data*).

        NOTE(review): inputs are log-scores (negative), so this is a simple
        ratio of logs, not a softmax — kept as the original intended.
        """
        total = sum(data.values())
        for k, v in data.items():
            data[k] = v / total
        return data
class MadokaCNaiveBayes(MadokaNaiveBayes):
    """Complement Naive Bayes: scores each class against all *other* classes."""

    def _calc_prob(self, doc, cat, theta):
        """log(Pr(c)) - sum(tfidf * log(sum of θ over the complement classes))."""
        log_prior = np.log(self._pr_category(cat))
        complement = 0
        for word, weight in doc.items():
            # mass of Pr(w|c') over every category except *cat*
            others = sum(p for c, p in theta[word].items() if c != cat)
            complement += weight * np.log(others)
        return log_prior - complement

    def classify(self, doc):
        """Return a Counter mapping every known category to its log-score."""
        # Precompute θ for every (word, category) pair once.
        theta = defaultdict(dict)
        for word in doc:
            for cat in self.cat_counter:
                theta[word][cat] = self._calc_theta(word, cat)
        scores = Counter()
        for cat in self.cat_counter:
            scores[cat] = self._calc_prob(doc, cat, theta)
        return scores
# --- Demo: train on a tiny corpus, then classify a few queries. -------------
mb = MadokaCNaiveBayes()
t = TFIDF()
texts = (
    ('CS', ('Python', 'プログラミング言語')),
    ('CS', ('Ruby', 'プログラミング言語')),
    ('CS', ('Python', '自然言語処理')),
    ('BB', ('イチロー', 'ホームラン')),
    ('BB', ('打者', '盗塁', 'ホームラン')),
    ('FOOD', ('ラーメン', 'カレー', 'スパゲッティ')),
    ('FOOD', ('ラーメン', '二郎', 'もやし')),
)

# Seed the document-frequency table.
# (Bug fixed: the original assigned t.add's None return back to doc.)
for _, doc in texts:
    t.add(doc)

# Train on tf-idf weighted documents. t.tfidf re-adds each document to the
# DF table; doubling both N and df cancels inside the log-idf difference.
for cat, doc in texts:
    mb.train(t.tfidf(doc), cat)

# Classify with update_table=False so query terms do not mutate the DF
# statistics learned from the corpus (bug in the original: every classify
# call silently re-trained the DF table on the query).
print(mb.classify(t.tfidf(('Python', 'Ruby'), update_table=False)))
print(mb.classify(t.tfidf(('イチロー', '打者'), update_table=False)))
print(mb.classify(t.tfidf(('ラーメン', 'もやし'), update_table=False)))

for query in (('Python', 'Ruby'),
              ('イチロー', '打者'),
              ('ラーメン', 'もやし', '二郎'),
              ('ラーメン', 'Perl')):
    r = mb.classify(t.tfidf(query, update_table=False))
    for k, v in mb.log_to_prob(r).items():
        print('%s %s' % (k, v))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment