Last active January 11, 2023 09:27
Calculate Kullback-Leibler Divergence of Given Corpus
import numpy
import sys
import scipy.stats as stats
import matplotlib.pyplot as plotter
from gensim import corpora, models, similarities, matutils
# Defines dictionary from the specified corpus.
# The file is opened in text mode -- the same mode Corpus.__iter__ uses -- so
# both passes over the corpus tokenise every line identically, and the handle
# is closed as soon as the dictionary has been built.
with open('corpus_train.txt', 'r') as corpus_file:
    dictionary = corpora.Dictionary(
        line.lower().split() for line in corpus_file
    )

# Holds ids of tokens whose document frequency is exactly one.
# `items()` works on both Python 2 and Python 3 (`iteritems()` is 2-only).
unique_ids = [
    token_id for token_id, frequency in dictionary.dfs.items() if frequency == 1
]

# Filters out tokens which appear in only one document.
dictionary.filter_tokens(unique_ids)

# Filters out extreme tokens and keeps at most keep_n tokens.
# NOTE(review): gensim documents `no_above` as a fraction of the corpus size
# (0.0-1.0), not a document count, so `no_above=5` presumably disables the
# upper bound entirely -- confirm the intended value. The default `no_below`
# still applies because it is not overridden here.
dictionary.filter_extremes(no_above=5, keep_n=100000)

# Compactifies: reassigns token ids so that no gaps remain after filtering.
dictionary.compactify()
class Corpus(object):
    """Streams 'corpus_train.txt' as bag-of-words documents.

    The file is re-read on every iteration, so the corpus can be traversed
    repeatedly without keeping all documents in memory.
    """

    def __iter__(self):
        """Yield one bag-of-words document per line of the corpus file.

        Each line is lower-cased, split on whitespace and converted with the
        module-level ``dictionary`` through ``doc2bow``.
        """
        # The context manager closes the file when iteration finishes or the
        # generator is closed, instead of leaking one handle per pass.
        with open('corpus_train.txt', 'r') as corpus_file:
            for line in corpus_file:
                yield dictionary.doc2bow(line.lower().split())
# Instantiates the corpus.
my_corpus = Corpus()

# Generates the corpus length vector: one entry per document, holding the sum
# of the frequencies in that document's bag-of-words.
corpus_length_vector = numpy.array(
    [sum(frequency for _, frequency in document) for document in my_corpus]
)
def symmetric_kl_divergence(p, q):
    """Calculate the symmetric Kullback-Leibler divergence of p and q.

    The result is ``KL(p || q) + KL(q || p)``. Each directed term is computed
    by ``scipy.stats.entropy``, which normalises its inputs to sum to one, so
    unnormalised weight vectors are accepted.

    Args:
        p: Array-like of non-negative weights.
        q: Array-like of non-negative weights, same length as ``p``.

    Returns:
        The sum of the two directed divergences.
    """
    return numpy.sum([stats.entropy(p, q), stats.entropy(q, p)])
def arun_metric(corpus, dictionary, min_topics=1, max_topics=1, iteration=1):
    """Calculate the Arun et al. metric over a range of topic counts.

    For every topic count in ``range(min_topics, max_topics, iteration)`` an
    LDA model is trained on ``corpus`` and the symmetric KL divergence is
    taken between the singular values of the model's topic-word matrix and
    the document-length-weighted topic distribution of the corpus.

    NOTE(review): several statements of this function were truncated in the
    source this was recovered from (the LdaModel arguments, the transpose,
    the dot product and the append). They are reconstructed here from the
    surviving variable names and comments -- confirm against the reference
    formulation before relying on the numbers.

    Args:
        corpus: Re-iterable bag-of-words corpus; it is traversed several
            times (document lengths, training, inference).
        dictionary: Token id to token mapping, passed to LDA as ``id2word``.
        min_topics: First topic count to evaluate (inclusive).
        max_topics: End of the topic-count range (exclusive).
        iteration: Step between consecutive topic counts.

    Returns:
        A list with one divergence per evaluated topic count. It is empty
        when the range is empty, which is the case for the default arguments.
    """
    # Document lengths depend only on the corpus, so they are computed once
    # from the `corpus` argument rather than read from a module-level global.
    document_lengths = numpy.array(
        [sum(frequency for _, frequency in document) for document in corpus]
    )
    document_length_norm = numpy.linalg.norm(document_lengths)

    result = []
    for num_topics in range(min_topics, max_topics, iteration):
        # Instantiates LDA.
        lda = models.ldamodel.LdaModel(
            corpus=corpus, id2word=dictionary, num_topics=num_topics
        )
        # Raw LDA matrix as exposed by the trained model.
        matrix = lda.expElogbeta
        # Only the singular values are needed; skipping U and V avoids
        # materialising factor matrices that were never used.
        topic_word_vector = numpy.linalg.svd(matrix, compute_uv=False)
        # Gets the LDA topic distribution of every document.
        lda_topics = lda[corpus]
        # Densifies with `lda.num_topics` rows, then transposes.
        # Assumed to give one row per document -- TODO confirm.
        document_topic_matrix = matutils.corpus2dense(
            lda_topics, lda.num_topics
        ).transpose()
        # Weights each document's topic distribution by its length.
        document_topic_vector = document_lengths.dot(document_topic_matrix)
        # Small offset keeps every entry strictly positive for the KL terms.
        document_topic_vector = document_topic_vector + 0.0001
        document_topic_vector = document_topic_vector / document_length_norm
        result.append(
            symmetric_kl_divergence(topic_word_vector, document_topic_vector)
        )
    return result
def main(argv=None):
    """Compute the metric for a range of topic counts and save the plot.

    The figure is written to 'kl_topics.png' in the working directory.

    Args:
        argv: Command-line arguments; currently unused.
    """
    # Calculates symmetric KL divergence.
    kl_divergence = arun_metric(my_corpus, dictionary, max_topics=200)

    # arun_metric starts at its default min_topics=1 with step 1, so the
    # n-th result belongs to n topics; plot against that, not a 0-based index.
    topic_counts = list(range(1, len(kl_divergence) + 1))

    # Plots KL divergence against number of topics.
    plotter.plot(topic_counts, kl_divergence)
    plotter.ylabel('Symmetric KL Divergence')
    plotter.xlabel('Number of Topics')
    plotter.savefig('kl_topics.png', bbox_inches='tight')
# Runs the analysis only when executed as a script, not when imported.
if __name__ == '__main__':
    main(sys.argv)
Copy link

KhaledTo commented Jun 1, 2017

Hi Shingo, thank you for this script. I think there is a mistake on line 79: maybe it should be document_topic_vector instead of document_topic_matrix?

