Skip to content

Instantly share code, notes, and snippets.

@i-Hun
Created December 16, 2014 05:02
Show Gist options
  • Save i-Hun/4a82c3e00dae59e1a290 to your computer and use it in GitHub Desktop.
Save i-Hun/4a82c3e00dae59e1a290 to your computer and use it in GitHub Desktop.
from __future__ import division
from gensim import corpora, models, similarities, matutils
import numpy as np
import scipy.stats as stats
from scipy.sparse import linalg as splinalg
from scipy.sparse import *
import matplotlib.pyplot as plt
import h5py
from pymongo import MongoClient
# Handle to the "thesis" database, obtained from a MongoClient created with
# default arguments (connection target is whatever PyMongo defaults to).
db = MongoClient().thesis
# Collection iterated by MyCorpus below; the "content" field of each stored
# document is what gets passed to dictionary.doc2bow.
raw_tokens = db.raw_tokens
from analysis import LDA, get_corpus, get_dictionary
def sym_kl(p, q):
    """Return the symmetric KL divergence ``KL(p || q) + KL(q || p)``.

    Both directions are computed with ``scipy.stats.entropy`` and the two
    results are added up with ``np.sum``.
    """
    forward = stats.entropy(p, q)
    backward = stats.entropy(q, p)
    return np.sum([forward, backward])
# Dictionary whose doc2bow() is used by MyCorpus to convert each document's
# "content" field into a bag-of-words vector. "solo" is an identifier
# interpreted by the project helper get_dictionary (meaning not visible here).
dictionary = get_dictionary("solo")
class MyCorpus(object):
    """Streaming bag-of-words corpus backed by the ``raw_tokens`` collection.

    Every call to ``__iter__`` issues a new query, so the corpus can be
    iterated repeatedly and documents are converted one at a time.
    """

    def __iter__(self):
        """Yield ``dictionary.doc2bow(doc["content"])`` for each stored document."""
        # Filter and projection are passed positionally on purpose: the
        # projection keyword was named ``fields`` in PyMongo 2.x and was
        # renamed to ``projection`` in 3.0, but its position never changed.
        for doc in raw_tokens.find({}, {"content": 1}):
            yield dictionary.doc2bow(doc["content"])
my_corpus = MyCorpus()
# Per-document length: the sum of the counts in each bag-of-words vector.
# Building this array iterates the whole corpus once. arun() reads it as the
# weight vector applied to the document-topic matrix.
l = np.array([sum(cnt for _, cnt in doc) for doc in my_corpus])
def arun(corpus, dictionary, min_topics=20, max_topics=50, step=10):
    """Compute a symmetric-KL score for a range of LDA topic counts.

    For every topic count ``k`` in ``range(min_topics, max_topics, step)`` a
    model is obtained from the project helper ``LDA`` and ``sym_kl`` is
    evaluated between:

    * the singular values of the model's ``expElogbeta`` matrix, and
    * the document-topic matrix weighted by the document lengths ``l``,
      smoothed by ``0.0001`` and divided by the norm of ``l``.

    Args:
        corpus: corpus handed to ``LDA`` for training/loading the model.
        dictionary: dictionary handed to ``LDA``.
        min_topics: first topic count that is tried.
        max_topics: exclusive upper bound on the topic count.
        step: increment between successive topic counts.

    Returns:
        A list with one divergence value per topic count, in the order tried.

    NOTE(review): the document-topic matrix is built from the module-level
    ``my_corpus`` and weighted by the module-level ``l`` rather than from the
    ``corpus`` argument; the two agree only when ``corpus`` is ``my_corpus``.
    """
    kl = []
    for num_topics in range(min_topics, max_topics, step):
        lda = LDA(dictionary, corpus, num_topics, str(num_topics) + "topics")

        # Topic-word matrix. All singular values are needed: the previous
        # svds(m1, k=1) call returned a single value, so the divergence was
        # taken between vectors of different lengths. compute_uv=False avoids
        # materialising the (potentially very large) singular-vector matrices.
        # Presumably m1 has one row per topic, giving one singular value per
        # topic when the vocabulary is larger than the topic count -- confirm
        # against the object returned by LDA.
        m1 = lda.expElogbeta
        cm1 = np.linalg.svd(m1, compute_uv=False)

        # Document-topic matrix
        lda_topics = lda[my_corpus]
        m2 = matutils.corpus2dense(lda_topics, lda.num_topics).transpose()
        cm2 = l.dot(m2)
        # Small offset keeps zero entries from producing infinite divergences.
        cm2 = cm2 + 0.0001
        cm2 = cm2 / np.linalg.norm(l)

        # cm2 is kept as a 1-D array (no sparse/dense matrix round-trip) so it
        # lines up element-wise with cm1 inside sym_kl.
        print("%d %d" % (len(cm1), len(cm2)))
        kl.append(sym_kl(cm1, cm2))
    return kl
# Topic counts to evaluate; max_topics is exclusive, matching the range()
# used inside arun().
min_topics, max_topics, step = 20, 50, 10
kl = arun(my_corpus, dictionary, min_topics=min_topics,
          max_topics=max_topics, step=step)
# Plot kl divergence against number of topics. The x values are the actual
# topic counts so the axis label is accurate (plotting kl alone would use the
# list indices 0, 1, 2, ... instead).
topic_counts = list(range(min_topics, max_topics, step))
plt.plot(topic_counts, kl)
plt.ylabel('Symmetric KL Divergence')
plt.xlabel('Number of Topics')
plt.savefig('kldiv.png', bbox_inches='tight')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment