import re
import sys

import numpy as np


def symdirichlet(alpha, n):
    "Sample from a symmetric Dirichlet(alpha) over n dimensions."
    v = np.zeros(n) + alpha
    return np.random.dirichlet(v)
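
# For example (added note, not part of the original gist): symdirichlet(1., 4)
# returns a length-4 nonnegative vector summing to 1, i.e. a uniform draw from
# the 4-dimensional probability simplex.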

wre = re.compile(r"\w+")

# `stoplist` is assumed to be an empty set here; the original references it
# (when stop=False) without defining it.
stoplist = set()


def get_words(text, stop=True):
    "A simple tokenizer"
    l = 0
    while l < len(text):
        s = wre.search(text, l)
        if s is None:
            break
        w = text[s.start():s.end()].lower()
        if stop or w not in stoplist:
            yield w
        l = s.end()
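
# Example (added sketch, not from the original gist):
#   list(get_words("Hello, EM-LDA world")) -> ["hello", "em", "lda", "world"]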


class EmLda(object):
    def __init__(self, docs, nt):
        self.Nt = nt
        self.docs = []
        self.all_words = []
        self.reverse_map = {}
        self.Nd = 0
        for d in docs:
            doc = []
            self.docs.append(doc)
            self.Nd += 1
            for w in get_words(d):
                if len(w) < 5:
                    continue
                if w not in self.reverse_map:
                    self.reverse_map[w] = len(self.all_words)
                    self.all_words.append(w)
                doc.append(self.reverse_map[w])
        self.V = len(self.all_words)
        # ctopics[t, w]: expected count of word w in topic t;
        # doccounts[d, t]: expected count of topic t in document d
        self.ctopics = np.zeros((self.Nt, self.V))
        self.doccounts = np.zeros((self.Nd, self.Nt))
        self.ptopics = np.zeros((self.Nt, self.V))
        self.docprobs = np.zeros((self.Nd, self.Nt))
        self.beta = 1.
        self.alpha = 1.
        # initialize the expected counts with random Dirichlet draws
        for d in xrange(self.Nd):
            for i, w in enumerate(self.docs[d]):
                zw = symdirichlet(self.beta, self.Nt)
                self.ctopics.T[w] += zw
                self.doccounts[d] += zw
        self.m()

    def e(self):
        # E-step: each token's soft topic assignment is proportional to
        # p(w | topic) * p(topic | doc); accumulate the expected counts.
        self.ctopics.fill(0)
        self.doccounts.fill(0)
        for d in xrange(self.Nd):
            for i, w in enumerate(self.docs[d]):
                zw = self.ptopics.T[w].copy()
                zw *= self.docprobs[d]
                zw /= np.sum(zw)
                self.ctopics.T[w] += zw
                self.doccounts[d] += zw

    def m(self):
        # M-step: smooth the expected counts and renormalize into probabilities
        self.ptopics.fill(0)
        self.docprobs.fill(0)
        self.ptopics += self.ctopics + self.alpha
        self.ptopics /= np.sum(self.ptopics, axis=1).reshape((-1, 1))
        self.docprobs += self.doccounts + self.beta
        self.docprobs /= np.sum(self.docprobs, axis=1).reshape((-1, 1))

    def iterate(self):
        self.e()
        self.m()

    def run(self, n):
        for i in xrange(n):
            print "iter", i
            sys.stdout.flush()
            self.iterate()
        for w in xrange(min(100, self.V)):
            print "word", self.all_words[w],
            print "topics", self.ptopics.T[w] / np.sum(self.ptopics.T[w])
        for i in xrange(self.Nt):
            print
            print
            print "Topic", i
            print
            print_topic(self, self.ptopics[i], 40)
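
# Usage sketch (hypothetical corpus and settings, not from the original gist):
#   docs = [open(f).read() for f in corpus_files]
#   model = EmLda(docs, 10)   # 10 topics
#   model.run(30)             # 30 EM iterations, then print diagnostics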


class PrLda(EmLda):
    def __init__(self, *args):
        super(PrLda, self).__init__(*args)
        self.Nw = sum(len(d) for d in self.docs)
        # l1-ball radius for the per-word lambda penalties
        self.sigma = 15.

    def e(self):
        self.do_lambda()
        self.do_z()

    def do_lambda(self):
        # collect, per word type, the posterior topic vector of each token
        self.ptheta = [[] for i in xrange(self.V)]
        self.ctopics.fill(0)
        self.doccounts.fill(0)
        for d in xrange(self.Nd):
            for i, w in enumerate(self.docs[d]):
                zw = self.ptopics.T[w].copy()
                zw *= self.docprobs[d]
                zw /= np.sum(zw)
                self.ptheta[w].append(zw)
        self.ptheta = map(np.array, self.ptheta)
        # fit one penalty vector lambda per word type
        self.lbda = []
        for w in xrange(self.V):
            self.lbda.append(self.optimize_lambda(self.ptheta[w]))

    def optimize_lambda(self, ptheta, steps=50, lrate=1.):
        # Minimize sum_t s_t * exp(-lambda_t), where s is the summed token
        # posterior for this word, over the nonnegative l1 ball of radius sigma.
        s = np.sum(ptheta, axis=0)
        lbda = np.ones(ptheta.shape[1])
        lbda /= np.sum(lbda)
        lbda *= self.sigma
        prevobj = np.inf
        n = 0
        while True:
            obj = np.sum(s * np.exp(-lbda))
            if n > 5 and prevobj - obj < 0.01 * obj:
                break
            n += 1
            prevobj = obj
            # gradient step (the objective's gradient is -s * exp(-lambda))
            lbda += lrate * s * np.exp(-lbda)
            # truncate at zero
            lbda *= lbda > 0
            # project onto the l1 ball of radius sigma
            if np.sum(lbda) > self.sigma:
                ll = -np.sort(-lbda)
                css = np.cumsum(ll)
                j = np.arange(len(ll)) + 1.
                rho = np.nonzero(ll - (css - self.sigma) / j > 0)[0][-1]
                theta = (css[rho] - self.sigma) / (rho + 1.)
                lbda -= theta
                lbda *= lbda > 0
        return lbda
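
    # Worked example of the projection step (added illustration, values are
    # hypothetical): with sigma = 1., the truncated iterate [0.8, 0.6, 0.1]
    # gives ll = [0.8, 0.6, 0.1], css = [0.8, 1.4, 1.5]; the condition
    # ll - (css - 1.) / j > 0 holds at the first two positions only, so
    # rho = 1, theta = (1.4 - 1.) / 2 = 0.2, and the projection returns
    # [0.6, 0.4, 0.], which sums to sigma exactly.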

    def do_z(self):
        # reassign each token its lambda-penalized, renormalized topic posterior
        indices = [0 for i in xrange(self.V)]
        for d in xrange(self.Nd):
            for i, w in enumerate(self.docs[d]):
                zw = self.ptheta[w][indices[w]].copy()
                zw *= np.exp(-self.lbda[w])
                zw /= np.sum(zw)
                indices[w] += 1
                self.ctopics.T[w] += zw
                self.doccounts[d] += zw


def print_topic(model, t, n):
    s = np.argsort(-t)
    for w in s[:n]:
        print " ", model.all_words[w]