Skip to content

Instantly share code, notes, and snippets.

@alextp
Created August 22, 2010 20:52
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alextp/544268 to your computer and use it in GitHub Desktop.
Save alextp/544268 to your computer and use it in GitHub Desktop.
import numpy as np
import random
import math
from scipy.special import gamma,gammaln
from scipy import weave
import sys
import collections
from libbayes import discrete, gamma_pdf, slice_sample, exp_pdf
from libbayes import single_collapsed_likelihood as scl
from toputils import get_words
from . import sumbasic
class ContentSampler(object):
    """Sentence-level topic sampler with topic-to-topic transition counts.

    Every sentence of every document is assigned a single topic in
    ``1 .. Nt-1``.  Index 0 is reserved as the document start/end state and
    is never assigned to a sentence.  The sampler maintains three count
    arrays that always reflect the current assignment:

    * ``t[a, b]``      -- number of transitions from topic ``a`` to topic
                          ``b`` between consecutive sentences (0 = start/end)
    * ``topics[k, w]`` -- number of occurrences of word id ``w`` in
                          sentences assigned to topic ``k``
    * ``ntopics[k]``   -- total number of words assigned to topic ``k``

    ``alpha`` smooths the per-topic word counts and ``beta`` smooths the
    transition counts; both are resampled at the start of every iteration.
    """

    def __init__(self, classes, nt):
        """Load the corpus and draw a random initial topic assignment.

        classes -- iterable of classes; each class is an iterable of
                   ``(product, docs)`` pairs (see ``load_docs``).
        nt      -- number of topic states, including the reserved state 0.
        """
        self.all_words = []      # word id -> word
        self.reverse_map = {}    # word -> word id
        self.doc_map = {}        # document text -> document index
        self.documents = []      # per document: list of sentences of word ids
        self.Ndocuments = 0
        self.Nwords = 0
        self.alpha = 0.01
        self.beta = 6.
        self.Nt = nt
        # load_docs takes only the class list.
        self.load_docs(classes)
        self.Nwords = len(self.all_words)
        self.Ndocuments = len(self.documents)
        # initialize() builds self.assignments from scratch.
        self.initialize()

    def load_doc(self, document):
        """Tokenise one document and append it to the corpus.

        document -- sequence whose first element is the document text; the
                    whole object is stored in ``self.docs``.

        The text is split on "." into sentences (empty pieces are skipped).
        Each sentence becomes a list of word ids; a lower-cased word that has
        not been seen before is given the next free id.
        """
        doc = document[0]
        sentences = []
        for sent in doc.split("."):
            if not sent:
                continue
            ids = []
            sentences.append(ids)
            for w in get_words(sent, stop=False):
                w = w.lower()
                if w not in self.reverse_map:
                    self.reverse_map[w] = self.Nwords
                    self.all_words.append(w)
                    self.Nwords += 1
                ids.append(self.reverse_map[w])
        self.doc_map[doc] = len(self.documents)
        self.docs.append(document)
        self.documents.append(sentences)

    def load_docs(self, classes):
        """Load every document of every product of every class.

        classes -- iterable of classes, each an iterable of ``(prod, docs)``
                   pairs, where ``docs`` is an iterable of documents accepted
                   by ``load_doc``.

        Side effects: resets ``self.prod`` (product index of each document),
        ``self.docs`` and ``self.Nproducts``.  ``self.goodness_threshold``
        ends up as the index of the first document of the last class.
        """
        self.Nproducts = 0
        self.prod = []
        self.docs = []
        # Defined even when `classes` is empty.
        self.goodness_threshold = len(self.documents)
        for c in classes:
            self.goodness_threshold = len(self.documents)
            for prod, docs in c:
                for doc in docs:
                    self.prod.append(self.Nproducts)
                    self.load_doc(doc)
                self.Nproducts += 1

    def initialize(self):
        """Draw a uniform random topic for every sentence and build counts.

        Reads ``Nt``, ``Nwords``, ``Ndocuments`` and ``documents``; replaces
        ``t``, ``topics``, ``ntopics``, ``msample`` and ``assignments``, and
        converts every sentence in ``documents`` to a numpy array.
        """
        self.t = np.zeros((self.Nt, self.Nt))
        self.topics = np.zeros((self.Nt, self.Nwords))
        self.ntopics = np.zeros(self.Nt)
        # Real lists (not lazy map objects): they are indexed and len()-ed.
        self.documents = [[np.array(sentence) for sentence in doc]
                          for doc in self.documents]
        self.msample = np.zeros((self.Ndocuments, self.Nt))
        assignments = []
        for d in range(self.Ndocuments):
            doc_topics = []
            prev = 0  # every document starts in the reserved state 0
            for sentence in self.documents[d]:
                y = random.randint(1, self.Nt - 1)
                doc_topics.append(y)
                for w in sentence:
                    self.ntopics[y] += 1
                    self.topics[y, w] += 1
                self.t[prev, y] += 1
                prev = y
            # Close the chain with a transition back to state 0.  Using
            # `prev` (not the loop variable) keeps this correct for a
            # document that has no sentences at all.
            self.t[prev, 0] += 1
            assignments.append(np.array(doc_topics))
        self.assignments = assignments

    def c_add(self, dist, norm, wordset, args=None):
        """Return the probability of drawing ``wordset`` from ``dist``.

        dist    -- 1-d count array for one topic (one entry per word id).
        norm    -- total count of that topic before smoothing.
        wordset -- array of word ids making up the sentence.
        args    -- names of the local variables handed to the inline C code;
                   defaults to the six names the C snippet uses.

        The words are drawn one after another: each contributes
        ``(dist[w] + alpha) / norm`` where ``norm`` starts at
        ``norm + len(dist) * alpha`` and grows by one per word, and
        ``dist[w]`` is incremented after each draw.  ``dist`` is restored to
        its original contents before returning.
        """
        if args is None:
            args = ["Nws", "alpha", "w", "dist", "norm", "p"]
        # weave.inline looks the names in `args` up in this frame's locals,
        # so the local variable names below must not be changed.
        p = 1.
        dn = len(dist) * self.alpha
        norm += float(dn)
        Nws = len(wordset)
        w = wordset
        alpha = self.alpha
        p = weave.inline("""
        for (int i = 0; i < Nws; ++i) {
            p *= ((double)(dist(w(i))+alpha))/((double)norm);
            norm = (double)norm + 1.;
            dist(w(i)) = dist(w(i))+1.;
        }
        for (int i = 0; i < Nws; ++i) {
            dist(w(i)) = dist(w(i))-1.;
        }
        return_value = p;
        """, arg_names=args, type_converters=weave.converters.blitz)
        return p

    def resample_sentence(self, d, i, s, pt):
        """Resample the topic assignment of sentence ``i`` of document ``d``.

        d  -- document index.
        i  -- sentence index inside the document.
        s  -- the sentence itself (array of word ids).
        pt -- scratch array of length ``Nt``; overwritten with the
              normalised sampling distribution.

        The sentence is first removed from all counts, then a new topic is
        drawn with weight (transition from previous topic) x (transition to
        next topic) x (word probability from ``c_add``), and finally the
        sentence is added back under the new topic.
        """
        doc_topics = self.assignments[d]
        y = doc_topics[i]
        # Neighbouring topics; the reserved state 0 stands in at both ends.
        ym1 = 0 if i == 0 else doc_topics[i - 1]
        yp1 = 0 if i == len(doc_topics) - 1 else doc_topics[i + 1]

        # Remove the sentence from the transition and word counts.
        self.t[ym1, y] -= 1
        assert self.t[ym1, y] >= 0
        self.t[y, yp1] -= 1
        assert self.t[y, yp1] >= 0  # check the cell that was just decremented
        for w in s:
            self.topics[y, w] -= 1
            self.ntopics[y] -= 1
            assert self.topics[y, w] >= 0, "%s %s" % (y, w)
            assert self.ntopics[y] >= 0

        # Unnormalised weight of every candidate topic.
        pt.fill(1)
        pt *= (self.t[ym1] + self.beta)
        pt *= (self.t.T[yp1] + self.beta)
        for j in range(1, self.Nt):
            pt[j] *= self.c_add(self.topics[j], self.ntopics[j], s)
        pt[0] = 0.  # state 0 is never assigned to a sentence
        # NOTE(review): for very long sentences the product of word
        # probabilities may underflow and make this sum zero -- confirm.
        pt /= np.sum(pt)
        nt = discrete(pt)

        # Add the sentence back under its new topic.
        doc_topics[i] = nt
        for w in s:
            self.topics[nt, w] += 1
            self.ntopics[nt] += 1
        self.t[ym1, nt] += 1
        self.t[nt, yp1] += 1

    def proportions(self, d):
        """Return the beta-smoothed topic proportions of document ``d``.

        The result is a length-``Nt`` array summing to one: sentence counts
        per topic plus ``beta``, normalised.
        """
        p = np.zeros(self.Nt) + self.beta
        for a in self.assignments[d]:
            p[a] += 1
        return p / np.sum(p)

    def resample_beta(self):
        """Slice-sample a new value for ``beta`` given the transition counts.

        ``partial_lik`` stores each candidate in ``self.beta`` before
        evaluating it.  The constants 2 and 10 are passed straight to
        libbayes' ``single_collapsed_likelihood``; presumably they are prior
        parameters -- verify against libbayes.
        """
        def partial_lik(t0):
            self.beta = t0
            return scl(self.beta, 2, 10, self.t)
        self.beta = slice_sample(partial_lik, self.beta)

    def resample_alpha(self):
        """Slice-sample a new value for ``alpha`` given the word counts.

        Same scheme as ``resample_beta`` but evaluated on ``self.topics``.
        """
        def partial_lik(t0):
            self.alpha = t0
            return scl(self.alpha, 2, 10, self.topics)
        self.alpha = slice_sample(partial_lik, self.alpha)

    def iterate(self):
        """Run one full sweep.

        Resamples and prints ``beta`` and ``alpha``, resamples every sentence
        of every document, and adds each document's current topic
        proportions to ``self.msample``.
        """
        self.resample_beta()
        print(self.beta)
        self.resample_alpha()
        print(self.alpha)
        pt = np.zeros(self.Nt)  # scratch buffer reused for every sentence
        for document in range(self.Ndocuments):
            for i, sentence in enumerate(self.documents[document]):
                self.resample_sentence(document, i, sentence, pt)
            self.msample[document] += self.proportions(document)

    def run(self, its):
        """The sampler itself: perform ``its`` sweeps of ``iterate``."""
        print("iterating..")
        for _ in range(its):
            self.iterate()
def print_topic(model, t, n):
    """Print the ``n`` highest-weighted words of topic vector ``t``.

    model -- object exposing ``all_words`` (word id -> word).
    t     -- numeric array with one weight per word id.
    n     -- number of words to print, in decreasing order of weight.

    Each word is written on its own line, prefixed by two spaces.
    """
    order = np.argsort(-t)  # negate so the largest weights come first
    for w in order[:n]:
        # Same bytes as the Python 2 statement `print " ", word`, but valid
        # on Python 2 and 3 alike.
        print("  %s" % (model.all_words[w],))
def print_keyw_topic(model, t, n):
    """Print the ``n`` highest-weighted words of a sparse topic.

    model -- object exposing ``all_words`` (word id -> word).
    t     -- mapping from word id to weight; ids that are absent count as 0.
    n     -- number of words to print.

    The mapping is expanded to a dense vector and handed to ``print_topic``.
    """
    dense = np.zeros(len(model.all_words))
    for word_id, weight in t.items():
        dense[word_id] = weight
    print_topic(model, dense, n)
def top_keyw_topic(model, t, n):
    """Return the ``n`` highest-weighted words of a sparse topic as a list.

    model -- object exposing ``all_words`` (word id -> word).
    t     -- mapping from word id to weight; ids that are absent count as 0.
    n     -- number of words to return, in decreasing order of weight.
    """
    dense = np.zeros(len(model.all_words))
    for word_id, weight in t.items():
        dense[word_id] = weight
    order = np.argsort(-dense)  # negate so the largest weights come first
    return [model.all_words[i] for i in order[:n]]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment