@ognis1205
Last active January 11, 2023 09:27
Calculate Kullback-Leibler Divergence of Given Corpus
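
The script below implements the topic-number selection measure of Arun et al. (2010): for each candidate number of topics it trains an LDA model and computes the symmetric Kullback-Leibler divergence between the singular values of the topic-word matrix and the length-weighted document-topic distribution; the minimum of the resulting curve suggests a natural number of topics. It expects a corpus_train.txt in the working directory, with one whitespace-tokenized document per line.
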
import sys
import numpy
import scipy.stats as stats
import matplotlib.pyplot as plotter
from gensim import corpora, models, matutils
# Builds a dictionary from the specified corpus.
dictionary = corpora.Dictionary(
    line.lower().split() for line in open('corpus_train.txt', 'r')
)
# Holds token ids which appear in only one document.
unique_ids = [
    token_id for token_id, frequency in dictionary.dfs.items() if frequency == 1
]
# Filters out tokens which appear only once.
dictionary.filter_tokens(unique_ids)
# Filters out tokens which appear in more than no_above documents (a fraction
# of the corpus, not an absolute count), and keeps only the first keep_n tokens.
dictionary.filter_extremes(no_above=0.5, keep_n=100000)
# Removes the gaps in token ids left behind by the filtering.
dictionary.compactify()
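# For example (hypothetical counts): a token that occurs in exactly one
# document is dropped via unique_ids above, while a stopword-like token
# appearing in more than half of all documents is dropped by filter_extremes.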
class Corpus(object):
    """ Represents the training corpus as a stream of bag-of-words vectors.
    """
    def __iter__(self):
        """ Iterates over the specified corpus, yielding each document as a
            bag-of-words vector.
        """
        for line in open('corpus_train.txt', 'r'):
            yield dictionary.doc2bow(line.lower().split())
# Instantiates the corpus.
my_corpus = Corpus()
# Generates the vector of document lengths (total token count per document).
corpus_length_vector = numpy.array(
    [sum(frequency for _, frequency in document) for document in my_corpus]
)
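# E.g. a document whose bag-of-words vector is [(0, 2), (1, 1)] (hypothetical
# token ids) contributes 2 + 1 = 3 to corpus_length_vector.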
def symmetric_kl_divergence(p, q):
    """ Calculates the symmetric Kullback-Leibler divergence between p and q.
    """
    return numpy.sum([stats.entropy(p, q), stats.entropy(q, p)])
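# A worked example with hand-picked distributions: for p = [0.5, 0.5] and
# q = [0.9, 0.1],
#     stats.entropy(p, q) = 0.5*ln(0.5/0.9) + 0.5*ln(0.5/0.1) ~= 0.511
#     stats.entropy(q, p) = 0.9*ln(0.9/0.5) + 0.1*ln(0.1/0.5) ~= 0.368
# so symmetric_kl_divergence(p, q) ~= 0.879. Note that stats.entropy
# normalizes its inputs to sum to one, so unnormalized vectors are accepted.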
def arun_metric(corpus, dictionary, min_topics=1, max_topics=1, iteration=1):
    """ Calculates the Arun et al. metric for each candidate number of topics.
    """
    result = []
    for i in range(min_topics, max_topics, iteration):
        # Instantiates LDA.
        lda = models.ldamodel.LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=i
        )
        # Calculates the raw LDA topic-word matrix.
        matrix = lda.expElogbeta
        # Calculates the SVD of the LDA matrix; the singular values serve as
        # the topic-word distribution vector.
        U, document_word_vector, V = numpy.linalg.svd(matrix)
        # Gets the LDA topic distribution of each document.
        lda_topics = lda[corpus]
        # Calculates the document-topic matrix.
        term_document_matrix = matutils.corpus2dense(
            lda_topics, lda.num_topics
        ).transpose()
        # Weights topics by document length, smooths to avoid zeros, and
        # normalizes by the norm of the document-length vector.
        document_topic_vector = corpus_length_vector.dot(term_document_matrix)
        document_topic_vector = document_topic_vector + 0.0001
        document_topic_norm = numpy.linalg.norm(corpus_length_vector)
        document_topic_vector = document_topic_vector / document_topic_norm
        result.append(symmetric_kl_divergence(
            document_word_vector,
            document_topic_vector
        ))
    return result
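# Per Arun et al., the topic count at which this divergence reaches its
# minimum is a candidate for the natural number of topics. Example call
# (assuming the corpus and dictionary built above) scoring 10, 20, ..., 90
# topics:
#     divergences = arun_metric(my_corpus, dictionary, min_topics=10,
#                               max_topics=100, iteration=10)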
def main(argv=None):
    # Calculates the symmetric KL divergence for 1 to 199 topics.
    kl_divergence = arun_metric(my_corpus, dictionary, max_topics=200)
    # Plots KL divergence against the number of topics.
    plotter.plot(kl_divergence)
    plotter.ylabel('Symmetric KL Divergence')
    plotter.xlabel('Number of Topics')
    plotter.savefig('kl_topics.png', bbox_inches='tight')

if __name__ == '__main__':
    sys.exit(main())
@KhaledTo

KhaledTo commented Jun 1, 2017

Hi Shingo, thank you for this script. I think there is a mistake at line 79; maybe it should be document_topic_vector instead of document_topic_matrix?
