# hiropppe/nb.py

## nb.py
# -*- coding: utf-8 -*-

from abc import ABCMeta, abstractmethod

import math
import sys
from collections import defaultdict

class NB(ABCMeta('_NBBase', (object,), {})):
    """Abstract interface implemented by the naive Bayes classifiers.

    The base class is created by calling ``ABCMeta`` directly so that the
    ``@abstractmethod`` markers below are actually enforced, on Python 2 and
    Python 3 alike.  With a plain ``class NB:`` the decorators have no
    effect and an incomplete subclass could be instantiated silently.
    """

    @abstractmethod
    def fit(self, x, y):
        """Accumulate training statistics.

        x -- sequence of documents, each an iterable of word tokens.
        y -- sequence of class labels; y[i] is the label of x[i].
        """

    @abstractmethod
    def predict(self, d):
        """Return the class label predicted for document d.

        d -- iterable of word tokens.
        """

class MultinomialNB(NB):
    """Naive Bayes classifier using the multinomial (word-count) event model.

    State collected by fit:
      wset    -- set of every word seen in training (the vocabulary).
      wfreq_c -- wfreq_c[c][w]: occurrences of word w in documents of class c.
      dfreq_c -- dfreq_c[c]: number of training documents of class c.
    """

    def __init__(self):
        self.wset = set()
        self.wfreq_c = defaultdict(lambda: defaultdict(int))
        self.dfreq_c = defaultdict(int)

    def fit(self, x, y):
        """Add the labelled documents to the frequency tables.

        x -- sequence of documents, each an iterable of word tokens.
        y -- sequence of class labels; y[i] is the label of x[i].

        Raises ValueError when x and y differ in length.  Calling fit again
        adds to the existing counts rather than resetting them.
        """
        if len(x) != len(y):
            raise ValueError(
                'x and y must have the same length ({} != {})'.format(
                    len(x), len(y)))
        for doc, label in zip(x, y):
            for w in doc:
                self.wset.add(w)
                self.wfreq_c[label][w] += 1
            self.dfreq_c[label] += 1

    def predict(self, d):
        """Return the class with the highest log score for document d.

        The per-class scores are printed before returning.  Words of d that
        were never seen in training are ignored.  Raises IndexError when no
        class has been fitted yet.
        """
        cscore = defaultdict(float)
        for c in self.dfreq_c:
            cscore[c] = self._log_likelihood(d, c)
        print(cscore)
        return sorted(cscore.items(), key=lambda item: item[1], reverse=True)[0][0]

    def freq_stats(self):
        """Print the vocabulary and the collected frequency tables."""
        print('wset : {}'.format(self.wset))
        print('dn(c):')
        for c, n in self.dfreq_c.items():
            print('  dn(c={}) => {}'.format(c, n))
        print('wn(c, w):')
        for c, word_counts in self.wfreq_c.items():
            for w, n in word_counts.items():
                print('  wn(c={}, w={}) => {}'.format(c, w, n))

    def _log_likelihood(self, d, c):
        """Return log p(c) + sum over words of n(w, d) * log q(w | c).

        n(w, d) is the number of times w occurs in d.  The multinomial model
        weights each word by its count, not merely by its presence, and the
        sum is accumulated in log space to avoid underflow from powers.
        """
        # Count only in-vocabulary words; unseen words carry no statistics.
        occurrences = defaultdict(int)
        for w in d:
            if w in self.wset:
                occurrences[w] += 1

        score = math.log(self._p_c(c))
        for w, n in occurrences.items():
            score += n * math.log(self._q_wc(w, c))
        return score

    def _q_wc(self, w, c):
        """Return the Laplace-smoothed probability of word w given class c."""
        # .get() keeps lookups from inserting zero entries into the tables.
        word_counts = self.wfreq_c.get(c, {})
        total = sum(word_counts.values())
        return (word_counts.get(w, 0) + 1.0) / (total + len(self.wset))

    def _p_c(self, c):
        """Return the Laplace-smoothed prior (N_c + 1) / (N + |C|)."""
        # The numerator must be parenthesised; without it the division binds
        # first and the result is N_c + 1 / (...), which is not a probability.
        n_docs = sum(self.dfreq_c.values())
        return (self.dfreq_c.get(c, 0) + 1.0) / (n_docs + len(self.dfreq_c))

class BernoulliNB(NB):
    """Naive Bayes classifier using the Bernoulli (word-presence) event model.

    State collected by fit:
      wset     -- set of every word seen in training (the vocabulary).
      dfreq_c  -- dfreq_c[c]: number of training documents of class c.
      dfreq_wc -- dfreq_wc[w][c]: number of class-c documents containing w.
    """

    def __init__(self):
        self.wset = set()
        self.dfreq_c = defaultdict(int)
        self.dfreq_wc = defaultdict(lambda: defaultdict(int))

    def fit(self, x, y):
        """Add the labelled documents to the frequency tables.

        x -- sequence of documents, each an iterable of hashable word tokens.
        y -- sequence of class labels; y[i] is the label of x[i].

        Raises ValueError when x and y differ in length.  Calling fit again
        adds to the existing counts rather than resetting them.
        """
        if len(x) != len(y):
            raise ValueError(
                'x and y must have the same length ({} != {})'.format(
                    len(x), len(y)))
        for doc, label in zip(x, y):
            # set(): each word counts once per document, however often it
            # is repeated inside that document.
            for w in set(doc):
                self.wset.add(w)
                self.dfreq_wc[w][label] += 1
            self.dfreq_c[label] += 1

    def predict(self, d):
        """Return the class with the highest log score for document d.

        The per-class scores are printed before returning.  Raises
        IndexError when no class has been fitted yet.
        """
        cscore = defaultdict(float)
        for c in self.dfreq_c:
            cscore[c] = self._log_likelihood(d, c)
        print(cscore)
        return sorted(cscore.items(), key=lambda item: item[1], reverse=True)[0][0]

    def freq_stats(self):
        """Print the vocabulary and the collected frequency tables."""
        print('wset : {}'.format(self.wset))
        print('dn(c):')
        for c, n in self.dfreq_c.items():
            print('  dn(c={}) => {}'.format(c, n))
        print('dn(w, c):')
        for w, class_counts in self.dfreq_wc.items():
            for c, n in class_counts.items():
                print('  dn(w={}, c={}) => {}'.format(w, c, n))

    def _log_likelihood(self, d, c):
        """Return log p(c) plus the log Bernoulli term of every vocabulary word."""
        # One set conversion gives O(1) membership tests for each word.
        present = set(d)
        score = math.log(self._p_c(c))
        for w in self.wset:
            score += math.log(self._score(w, present, c))
        return score

    def _score(self, w, d, c):
        """Return p(w|c) when w occurs in d, otherwise 1 - p(w|c)."""
        p_wc = self._p_wc(w, c)
        delta = self._delta(w, d)
        return math.pow(p_wc, delta) * math.pow((1.0 - p_wc), (1.0 - delta))

    def _delta(self, w, d):
        """Return 1 when word w occurs in d, otherwise 0."""
        return 1 if w in d else 0

    def _p_wc(self, w, c):
        """Return the smoothed probability that a class-c document contains w."""
        # .get() keeps lookups from inserting zero entries into the tables.
        n_wc = self.dfreq_wc.get(w, {}).get(c, 0)
        return (n_wc + 1.0) / (self.dfreq_c.get(c, 0) + 2.0)

    def _p_c(self, c):
        """Return the Laplace-smoothed prior (N_c + 1) / (N + |C|)."""
        # Two corrections: the numerator is parenthesised (the division used
        # to bind first), and the smoothing term is the number of classes,
        # len(dfreq_c), not the number of words, len(dfreq_wc).
        n_docs = sum(self.dfreq_c.values())
        return (self.dfreq_c.get(c, 0) + 1.0) / (n_docs + len(self.dfreq_c))

def main():
    """Train both classifiers on a small labelled corpus and print the results.

    For each model this prints the collected frequency tables, the per-class
    scores emitted by predict, and the predicted label of one query document.
    """
    x = [
        ['good', 'bad', 'good', 'good'],
        ['exciting', 'exciting'],
        ['good', 'good', 'exciting', 'boring'],
        ['bad', 'boring', 'boring', 'boring'],
        ['bad', 'good', 'bad'],
        ['bad', 'bad', 'boring', 'exciting'],
    ]
    y = ['P', 'P', 'P', 'N', 'N', 'N']

    # 'fine' does not occur in the training data.
    query = ['bad', 'bad', 'boring', 'boring', 'fine']

    # print(...) with a single argument behaves the same on Python 2 and 3.
    for name, model_cls in (('BernoulliNB', BernoulliNB),
                            ('MultinomialNB', MultinomialNB)):
        nb = model_cls()
        nb.fit(x, y)
        nb.freq_stats()
        c = nb.predict(query)
        print('{} => {}'.format(name, c))
	# -*- coding: utf-8 -*-

	from abc import ABCMeta, abstractmethod

	import math
	import sys
	from collections import defaultdict

	class NB(ABCMeta('_NBBase', (object,), {})):
	    """Abstract interface implemented by the naive Bayes classifiers.

	    The base class is created by calling ``ABCMeta`` directly so that the
	    ``@abstractmethod`` markers below are actually enforced, on Python 2
	    and Python 3 alike.  With a plain ``class NB:`` the decorators have no
	    effect and an incomplete subclass could be instantiated silently.
	    """

	    @abstractmethod
	    def fit(self, x, y):
	        """Accumulate training statistics.

	        x -- sequence of documents, each an iterable of word tokens.
	        y -- sequence of class labels; y[i] is the label of x[i].
	        """

	    @abstractmethod
	    def predict(self, d):
	        """Return the class label predicted for document d.

	        d -- iterable of word tokens.
	        """

	class MultinomialNB(NB):
	    """Naive Bayes classifier using the multinomial (word-count) event model.

	    State collected by fit:
	      wset    -- set of every word seen in training (the vocabulary).
	      wfreq_c -- wfreq_c[c][w]: occurrences of word w in documents of class c.
	      dfreq_c -- dfreq_c[c]: number of training documents of class c.
	    """

	    def __init__(self):
	        self.wset = set()
	        self.wfreq_c = defaultdict(lambda: defaultdict(int))
	        self.dfreq_c = defaultdict(int)

	    def fit(self, x, y):
	        """Add the labelled documents to the frequency tables.

	        x -- sequence of documents, each an iterable of word tokens.
	        y -- sequence of class labels; y[i] is the label of x[i].

	        Raises ValueError when x and y differ in length.  Calling fit again
	        adds to the existing counts rather than resetting them.
	        """
	        if len(x) != len(y):
	            raise ValueError(
	                'x and y must have the same length ({} != {})'.format(
	                    len(x), len(y)))
	        for doc, label in zip(x, y):
	            for w in doc:
	                self.wset.add(w)
	                self.wfreq_c[label][w] += 1
	            self.dfreq_c[label] += 1

	    def predict(self, d):
	        """Return the class with the highest log score for document d.

	        The per-class scores are printed before returning.  Words of d that
	        were never seen in training are ignored.  Raises IndexError when no
	        class has been fitted yet.
	        """
	        cscore = defaultdict(float)
	        for c in self.dfreq_c:
	            cscore[c] = self._log_likelihood(d, c)
	        print(cscore)
	        return sorted(cscore.items(), key=lambda item: item[1], reverse=True)[0][0]

	    def freq_stats(self):
	        """Print the vocabulary and the collected frequency tables."""
	        print('wset : {}'.format(self.wset))
	        print('dn(c):')
	        for c, n in self.dfreq_c.items():
	            print(' dn(c={}) => {}'.format(c, n))
	        print('wn(c, w):')
	        for c, word_counts in self.wfreq_c.items():
	            for w, n in word_counts.items():
	                print(' wn(c={}, w={}) => {}'.format(c, w, n))

	    def _log_likelihood(self, d, c):
	        """Return log p(c) + sum over words of n(w, d) * log q(w | c).

	        n(w, d) is the number of times w occurs in d.  The multinomial model
	        weights each word by its count, not merely by its presence, and the
	        sum is accumulated in log space to avoid underflow from powers.
	        """
	        # Count only in-vocabulary words; unseen words carry no statistics.
	        occurrences = defaultdict(int)
	        for w in d:
	            if w in self.wset:
	                occurrences[w] += 1

	        score = math.log(self._p_c(c))
	        for w, n in occurrences.items():
	            score += n * math.log(self._q_wc(w, c))
	        return score

	    def _q_wc(self, w, c):
	        """Return the Laplace-smoothed probability of word w given class c."""
	        # .get() keeps lookups from inserting zero entries into the tables.
	        word_counts = self.wfreq_c.get(c, {})
	        total = sum(word_counts.values())
	        return (word_counts.get(w, 0) + 1.0) / (total + len(self.wset))

	    def _p_c(self, c):
	        """Return the Laplace-smoothed prior (N_c + 1) / (N + |C|)."""
	        # The numerator must be parenthesised; without it the division binds
	        # first and the result is N_c + 1 / (...), which is not a probability.
	        n_docs = sum(self.dfreq_c.values())
	        return (self.dfreq_c.get(c, 0) + 1.0) / (n_docs + len(self.dfreq_c))

	class BernoulliNB(NB):
	    """Naive Bayes classifier using the Bernoulli (word-presence) event model.

	    State collected by fit:
	      wset     -- set of every word seen in training (the vocabulary).
	      dfreq_c  -- dfreq_c[c]: number of training documents of class c.
	      dfreq_wc -- dfreq_wc[w][c]: number of class-c documents containing w.
	    """

	    def __init__(self):
	        self.wset = set()
	        self.dfreq_c = defaultdict(int)
	        self.dfreq_wc = defaultdict(lambda: defaultdict(int))

	    def fit(self, x, y):
	        """Add the labelled documents to the frequency tables.

	        x -- sequence of documents, each an iterable of hashable word tokens.
	        y -- sequence of class labels; y[i] is the label of x[i].

	        Raises ValueError when x and y differ in length.  Calling fit again
	        adds to the existing counts rather than resetting them.
	        """
	        if len(x) != len(y):
	            raise ValueError(
	                'x and y must have the same length ({} != {})'.format(
	                    len(x), len(y)))
	        for doc, label in zip(x, y):
	            # set(): each word counts once per document, however often it
	            # is repeated inside that document.
	            for w in set(doc):
	                self.wset.add(w)
	                self.dfreq_wc[w][label] += 1
	            self.dfreq_c[label] += 1

	    def predict(self, d):
	        """Return the class with the highest log score for document d.

	        The per-class scores are printed before returning.  Raises
	        IndexError when no class has been fitted yet.
	        """
	        cscore = defaultdict(float)
	        for c in self.dfreq_c:
	            cscore[c] = self._log_likelihood(d, c)
	        print(cscore)
	        return sorted(cscore.items(), key=lambda item: item[1], reverse=True)[0][0]

	    def freq_stats(self):
	        """Print the vocabulary and the collected frequency tables."""
	        print('wset : {}'.format(self.wset))
	        print('dn(c):')
	        for c, n in self.dfreq_c.items():
	            print(' dn(c={}) => {}'.format(c, n))
	        print('dn(w, c):')
	        for w, class_counts in self.dfreq_wc.items():
	            for c, n in class_counts.items():
	                print(' dn(w={}, c={}) => {}'.format(w, c, n))

	    def _log_likelihood(self, d, c):
	        """Return log p(c) plus the log Bernoulli term of every vocabulary word."""
	        # One set conversion gives O(1) membership tests for each word.
	        present = set(d)
	        score = math.log(self._p_c(c))
	        for w in self.wset:
	            score += math.log(self._score(w, present, c))
	        return score

	    def _score(self, w, d, c):
	        """Return p(w|c) when w occurs in d, otherwise 1 - p(w|c)."""
	        p_wc = self._p_wc(w, c)
	        delta = self._delta(w, d)
	        return math.pow(p_wc, delta) * math.pow((1.0 - p_wc), (1.0 - delta))

	    def _delta(self, w, d):
	        """Return 1 when word w occurs in d, otherwise 0."""
	        return 1 if w in d else 0

	    def _p_wc(self, w, c):
	        """Return the smoothed probability that a class-c document contains w."""
	        # .get() keeps lookups from inserting zero entries into the tables.
	        n_wc = self.dfreq_wc.get(w, {}).get(c, 0)
	        return (n_wc + 1.0) / (self.dfreq_c.get(c, 0) + 2.0)

	    def _p_c(self, c):
	        """Return the Laplace-smoothed prior (N_c + 1) / (N + |C|)."""
	        # Two corrections: the numerator is parenthesised (the division used
	        # to bind first), and the smoothing term is the number of classes,
	        # len(dfreq_c), not the number of words, len(dfreq_wc).
	        n_docs = sum(self.dfreq_c.values())
	        return (self.dfreq_c.get(c, 0) + 1.0) / (n_docs + len(self.dfreq_c))

	def main():
	    """Train both classifiers on a small labelled corpus and print the results.

	    For each model this prints the collected frequency tables, the per-class
	    scores emitted by predict, and the predicted label of one query document.
	    """
	    x = [
	        ['good', 'bad', 'good', 'good'],
	        ['exciting', 'exciting'],
	        ['good', 'good', 'exciting', 'boring'],
	        ['bad', 'boring', 'boring', 'boring'],
	        ['bad', 'good', 'bad'],
	        ['bad', 'bad', 'boring', 'exciting'],
	    ]
	    y = ['P', 'P', 'P', 'N', 'N', 'N']

	    # 'fine' does not occur in the training data.
	    query = ['bad', 'bad', 'boring', 'boring', 'fine']

	    # print(...) with a single argument behaves the same on Python 2 and 3.
	    for name, model_cls in (('BernoulliNB', BernoulliNB),
	                            ('MultinomialNB', MultinomialNB)):
	        nb = model_cls()
	        nb.fit(x, y)
	        nb.freq_stats()
	        c = nb.predict(query)
	        print('{} => {}'.format(name, c))