@syhw
Created April 24, 2014 08:30
Very old implementation of mine of "Gibbs Sampling for the Uninitiated" (Philip Resnik, Eric Hardisty)
# -*- coding: utf-8 -*-
import os, re, random, math
from collections import Counter
""" Naive Bayes with Gibbs sampling, so it can deal with unlabeled data """
# as from "Gibbs Sampling for the Uninitiated" Philip Resnik, Eric Hardisty
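# The script expects two directories of plain-text documents, one per class,
# at the hard-coded paths used below:
#   reverend_thomas/test_rss/bad/
#   reverend_thomas/test_rss/good/
# The directory gives each document its gold label, but the sampler only uses
# those labels to set the initial class proportion (prior_good); afterwards it
# reassigns labels on its own, treating the documents as unlabeled.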
def Dirichlet(v):
    """ takes a vector of counts v and returns a Multinomial ~ Dirichlet(v) """
    # normalized independent Gamma(v_i, 1) draws are a sample from Dirichlet(v);
    # the tiny constant keeps the shape parameter strictly positive for zero counts
    y = []
    for count in v:
        y.append(random.gammavariate(count + 0.000001, 1))
    s = sum(y)
    return [e / s for e in y]
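
# Quick sanity check one can add here: a draw from Dirichlet(v) is a probability
# vector with one entry per count in v, non-negative and summing to 1.
_probs = Dirichlet([3, 1, 6])
assert len(_probs) == 3 and abs(sum(_probs) - 1.0) < 1e-9 and min(_probs) >= 0.0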
points = re.compile('[,.?!:;]{1}')  # simplest possible tokenizer: strip punctuation, split on whitespace
docs = {}
prior_good = 0.5
n_bad = 0
n_good = 0
for r, d, fl in os.walk('reverend_thomas/test_rss/bad/'):
    for fname in fl:
        print r + fname
        txt = filter(lambda x: x != '',
                     points.sub('', open(r + fname).read()).split())
        docs[fname] = {'text': txt, 'label': 'bad'}
        n_bad += 1
for r, d, fl in os.walk('reverend_thomas/test_rss/good/'):
    for fname in fl:
        print r + fname
        txt = filter(lambda x: x != '',
                     points.sub('', open(r + fname).read()).split())
        docs[fname] = {'text': txt, 'label': 'good'}
        n_good += 1
prior_good = 1.0 * n_good / (n_good + n_bad)
words_c = {'bad': {}, 'good': {}}
label_doc = {}
#labels = {'bad': [], 'good': []}
# *************** init ***************
# assign a random initial label to each document (Bernoulli(prior_good)) and
# count word occurrences per label
for docname, doc in docs.iteritems():
    label = 'bad'
    if random.random() < prior_good:
        label = 'good'
    #labels[label].append(docname)
    label_doc[docname] = label
    for w in doc['text']:
        words_c[label][w] = words_c[label].get(w, 0) + 1
# draw an initial theta for each label from the Dirichlet over these counts
vocab = set(words_c['bad'].keys() + words_c['good'].keys())
t_bad = [words_c['bad'].get(w, 0) for w in vocab]
t_good = [words_c['good'].get(w, 0) for w in vocab]
theta_bad = Dirichlet(t_bad)
theta_good = Dirichlet(t_good)
theta = {'bad': dict(zip(vocab, theta_bad)),
         'good': dict(zip(vocab, theta_good))}
# *************** Gibbs sampling ***************
for run_number in range(420):
    for docname, doc in docs.iteritems():
        lab = label_doc[docname]
        c_w = Counter(doc['text'])
        # log-likelihood of this document's words under each label's θ; the
        # document's counts are removed from its current label along the way
        running_prod = {'bad': 0.0, 'good': 0.0}
        for w, count in c_w.iteritems():
            words_c[lab][w] -= count
            # θ values can underflow to 0.0 for words unseen under a label,
            # so floor them before taking the log
            running_prod['bad'] += count * math.log(max(theta['bad'][w], 1e-300))
            running_prod['good'] += count * math.log(max(theta['good'][w], 1e-300))
        # exponentiate after subtracting the max so both values don't underflow to 0.0
        m = max(running_prod.values())
        running_prod = {'bad': math.exp(running_prod['bad'] - m),
                        'good': math.exp(running_prod['good'] - m)}
        label_doc[docname] = 'unlabeled'  # leave doc j out of the label counts (the L^(-j) part)
        # P(L_j=x|L^(-j),θ_0,θ_1,μ) = (N_x+γ_{π,x}-1)/(N+γ_{π,1}+γ_{π,0}-1)
        #                             * \prod_{i=1}^V θ_{x,i}^{W_{j,i}}
        # which the code approximates as N_x/N * running_prod[x]
        c_b_g = Counter(label_doc.values())
        total = len(label_doc.values())
        val_bad = 1.0 * c_b_g['bad'] / total * running_prod['bad']
        val_good = 1.0 * c_b_g['good'] / total * running_prod['good']
        new_lab = 'bad'
        if random.random() < val_good / (val_bad + val_good):
            new_lab = 'good'
        label_doc[docname] = new_lab
        for w, count in c_w.iteritems():
            words_c[new_lab][w] = words_c[new_lab].get(w, 0) + count
    # resample θ_bad and θ_good from the Dirichlet over the updated counts
    t_bad = [words_c['bad'].get(w, 0) for w in vocab]
    t_good = [words_c['good'].get(w, 0) for w in vocab]
    theta_bad = Dirichlet(t_bad)
    theta_good = Dirichlet(t_good)
    theta = {'bad': dict(zip(vocab, theta_bad)),
             'good': dict(zip(vocab, theta_good))}
    #print theta
print label_doc
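
# A rough way to eyeball the result, using the gold labels that the directory
# structure provides: count how often the sampled label matches. The two label
# names are interchangeable to the sampler, so the complement is reported too.
agree = sum(1 for d in docs if label_doc[d] == docs[d]['label'])
accuracy = 1.0 * agree / len(docs)
print 'agreement with directory labels: %.3f (or %.3f with labels swapped)' % (accuracy, 1.0 - accuracy)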