Skip to content

Instantly share code, notes, and snippets.

@hiropppe
Last active December 16, 2015 03:52
Show Gist options
  • Save hiropppe/6b39869b0d9fca11f1be to your computer and use it in GitHub Desktop.
Save hiropppe/6b39869b0d9fca11f1be to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
from abc import ABCMeta, abstractmethod
import math
import sys
from collections import defaultdict
class NB(ABCMeta('_NBBase', (object,), {})):
    """Abstract interface for the naive Bayes classifiers.

    The base class is produced by calling ``ABCMeta`` directly so that the
    ``@abstractmethod`` markers below are enforced with a spelling that is
    valid on both Python 2 and Python 3.  Without an ``ABCMeta`` metaclass
    the decorators have no effect and an incomplete subclass could be
    instantiated.

    ``NB`` itself, and any subclass that does not override both ``fit`` and
    ``predict``, raises ``TypeError`` on instantiation.
    """

    @abstractmethod
    def fit(self, x, y):
        """Train the classifier.

        x -- sequence of documents (each a sequence of words).
        y -- sequence of class labels; y[i] is the label of x[i].
        """

    @abstractmethod
    def predict(self, d):
        """Return the predicted class label for document ``d``."""
class MultinomialNB(NB):
    """Naive Bayes classifier driven by per-class word occurrence counts.

    ``fit`` accumulates how often each word occurs in the documents of each
    class.  ``predict`` scores every known class with

        log P(c) + sum over vocabulary words w present in d of log q(w|c)

    and returns the highest-scoring class.  Both P(c) and q(w|c) use
    add-one smoothing.  A vocabulary word contributes once when it is
    present in ``d``, however many times it occurs there; words of ``d``
    that are not in the vocabulary are ignored.
    """

    def __init__(self):
        # Vocabulary: every distinct word seen by fit().
        self.wset = set()
        # wfreq_c[c][w]: occurrences of word w in training documents of class c.
        self.wfreq_c = defaultdict(lambda: defaultdict(int))
        # dfreq_c[c]: number of training documents labelled c.
        self.dfreq_c = defaultdict(int)

    def fit(self, x, y):
        """Accumulate counts from documents ``x`` and their labels ``y``.

        x -- sequence of documents, each an iterable of hashable words.
        y -- sequence of labels indexed like ``x``; an ``IndexError`` is
             raised if it is shorter than ``x``.

        Calling fit() again adds to the existing counts.
        """
        for i, doc in enumerate(x):
            label = y[i]
            for w in doc:
                self.wset.add(w)
                self.wfreq_c[label][w] += 1
            self.dfreq_c[label] += 1

    def predict(self, d):
        """Print the per-class log scores and return the best class for ``d``.

        Raises ``IndexError`` if no class has been seen by fit().
        """
        cscore = defaultdict(float)
        for c in self.dfreq_c:
            cscore[c] = self._log_likelihood(d, c)
        print(cscore)
        ranked = sorted(cscore.items(), key=lambda kv: kv[1], reverse=True)
        return ranked[0][0]

    def freq_stats(self):
        """Print the vocabulary and the raw counts collected by fit()."""
        print('wset : {}'.format(self.wset))
        print('dn(c):')
        for c in self.dfreq_c:
            print(' dn(c={}) => {}'.format(c, self.dfreq_c[c]))
        print('wn(c, w):')
        for c in self.wfreq_c:
            for w in self.wfreq_c[c]:
                print(' wn(c={}, w={}) => {}'.format(c, w, self.wfreq_c[c][w]))

    def _log_likelihood(self, d, c):
        """Return log P(c) plus the log word scores of ``d`` under class ``c``."""
        score = math.log(self._p_c(c))
        for w in self.wset:
            score += math.log(self._score(w, d, c))
        return score

    def _score(self, w, d, c):
        """Return q(w|c) if ``w`` is in ``d``, else 1.0 (a neutral factor)."""
        return math.pow(self._q_wc(w, c), self._delta(w, d))

    def _delta(self, w, d):
        """Return 1 if word ``w`` occurs in document ``d``, otherwise 0."""
        return 1 if w in d else 0

    def _q_wc(self, w, c):
        """Return the add-one smoothed probability of word ``w`` in class ``c``."""
        # .get() instead of indexing: indexing the defaultdicts would insert
        # zero-count entries as a side effect of scoring, so predict() would
        # silently alter what freq_stats() reports.
        counts = self.wfreq_c.get(c, {})
        return (counts.get(w, 0) + 1.0) / (sum(counts.values()) + len(self.wset))

    def _p_c(self, c):
        """Return the add-one smoothed prior of class ``c``.

        (documents in c + 1) / (total documents + number of classes)
        """
        # The numerator must be parenthesised; without the parentheses the
        # division binds first and the result is n_c + 1/(N + K), which is
        # not a probability.
        total_docs = sum(self.dfreq_c.values())
        return (self.dfreq_c.get(c, 0) + 1.0) / (total_docs + len(self.dfreq_c))
class BernoulliNB(NB):
    """Naive Bayes classifier driven by per-class word presence/absence.

    ``fit`` counts, for every word and class, the number of training
    documents of that class containing the word.  ``predict`` scores every
    known class with

        log P(c) + sum over vocabulary words w of
            log p(w|c)        if w is in d
            log (1 - p(w|c))  otherwise

    and returns the highest-scoring class.  P(c) and p(w|c) use add-one
    smoothing.  Words of ``d`` that are not in the vocabulary are ignored.
    """

    def __init__(self):
        # Vocabulary: every distinct word seen by fit().
        self.wset = set()
        # dfreq_c[c]: number of training documents labelled c.
        self.dfreq_c = defaultdict(int)
        # dfreq_wc[w][c]: number of class-c training documents containing w.
        self.dfreq_wc = defaultdict(lambda: defaultdict(int))

    def fit(self, x, y):
        """Accumulate counts from documents ``x`` and their labels ``y``.

        x -- sequence of documents, each an iterable of hashable words.
        y -- sequence of labels indexed like ``x``; an ``IndexError`` is
             raised if it is shorter than ``x``.

        Calling fit() again adds to the existing counts.
        """
        for i, doc in enumerate(x):
            label = y[i]
            # set(): a word counts once per document however often it repeats.
            for w in set(doc):
                self.wset.add(w)
                self.dfreq_wc[w][label] += 1
            self.dfreq_c[label] += 1

    def predict(self, d):
        """Print the per-class log scores and return the best class for ``d``.

        Raises ``IndexError`` if no class has been seen by fit().
        """
        cscore = defaultdict(float)
        for c in self.dfreq_c:
            cscore[c] = self._log_likelihood(d, c)
        print(cscore)
        ranked = sorted(cscore.items(), key=lambda kv: kv[1], reverse=True)
        return ranked[0][0]

    def freq_stats(self):
        """Print the vocabulary and the raw counts collected by fit()."""
        print('wset : {}'.format(self.wset))
        print('dn(c):')
        for c in self.dfreq_c:
            print(' dn(c={}) => {}'.format(c, self.dfreq_c[c]))
        print('dn(w, c):')
        for w in self.dfreq_wc:
            for c in self.dfreq_wc[w]:
                print(' dn(w={}, c={}) => {}'.format(w, c, self.dfreq_wc[w][c]))

    def _log_likelihood(self, d, c):
        """Return log P(c) plus the log word scores of ``d`` under class ``c``."""
        score = math.log(self._p_c(c))
        for w in self.wset:
            score += math.log(self._score(w, d, c))
        return score

    def _score(self, w, d, c):
        """Return p(w|c) if ``w`` is in ``d``, else 1 - p(w|c)."""
        p_wc = self._p_wc(w, c)
        delta = self._delta(w, d)
        return math.pow(p_wc, delta) * math.pow((1.0 - p_wc), (1.0 - delta))

    def _delta(self, w, d):
        """Return 1 if word ``w`` occurs in document ``d``, otherwise 0."""
        return 1 if w in d else 0

    def _p_wc(self, w, c):
        """Return the add-one smoothed probability that a class-c doc contains ``w``.

        (class-c documents containing w + 1) / (class-c documents + 2)
        """
        # .get() instead of indexing: indexing the defaultdicts would insert
        # zero-count entries as a side effect of scoring, so predict() would
        # silently alter what freq_stats() reports.
        containing = self.dfreq_wc.get(w, {}).get(c, 0)
        return (containing + 1.0) / (self.dfreq_c.get(c, 0) + 2.0)

    def _p_c(self, c):
        """Return the add-one smoothed prior of class ``c``.

        (documents in c + 1) / (total documents + number of classes)
        """
        # Two fixes relative to the naive spelling: the numerator must be
        # parenthesised (otherwise the division binds first and the result
        # is not a probability), and the smoothing term is the number of
        # classes, len(dfreq_c), not the vocabulary size, len(dfreq_wc).
        total_docs = sum(self.dfreq_c.values())
        return (self.dfreq_c.get(c, 0) + 1.0) / (total_docs + len(self.dfreq_c))
def main():
    """Fit both classifiers on a small 'P'/'N' labelled corpus and print results.

    For each model, in order BernoulliNB then MultinomialNB: train on the
    corpus, dump the collected counts, classify one query document and
    print the predicted label.
    """
    docs = [
        ['good', 'bad', 'good', 'good'],
        ['exciting', 'exciting'],
        ['good', 'good', 'exciting', 'boring'],
        ['bad', 'boring', 'boring', 'boring'],
        ['bad', 'good', 'bad'],
        ['bad', 'bad', 'boring', 'exciting'],
    ]
    labels = ['P', 'P', 'P', 'N', 'N', 'N']
    # 'fine' never appears in the training corpus.
    query = ['bad', 'bad', 'boring', 'boring', 'fine']

    runs = (
        (BernoulliNB, 'BernoulliNB => {}'),
        (MultinomialNB, 'MultinomialNB => {}'),
    )
    for model_cls, report in runs:
        model = model_cls()
        model.fit(docs, labels)
        model.freq_stats()
        predicted = model.predict(list(query))
        print(report.format(predicted))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment