Skip to content

Instantly share code, notes, and snippets.

@jbencina
Last active April 24, 2017 17:30
Show Gist options
  • Save jbencina/7f156fb4fc7504ef4e4343615d0c1d8b to your computer and use it in GitHub Desktop.
Save jbencina/7f156fb4fc7504ef4e4343615d0c1d8b to your computer and use it in GitHub Desktop.
Scikit Topic Coherence
def topic_coherence(lda_model, corpus, num_words, sort_topics=True, avg_per_word=False):
    """Score each topic of a fitted topic model by document co-occurrence.

    For every topic the ``num_words`` most heavily weighted terms are taken,
    ordered from highest to lowest weight.  For every pair of those terms in
    which ``l`` is ranked above ``m``, the score accumulates::

        log((docs containing both m and l + 1) / (docs containing l))

    Args:
        lda_model: Object exposing ``components_``, an iterable with one row
            of term weights per topic (column index == term index in
            ``corpus``).
        corpus: Sparse document-term matrix (rows are documents, columns are
            terms) providing ``tocsc()``.
        num_words: Number of top terms per topic to include.
        sort_topics: When true, return the topics ordered from the highest
            to the lowest score; otherwise keep topic-index order.
        avg_per_word: When true, divide each topic's score by the number of
            terms used so that runs with different ``num_words`` can be
            compared.

    Returns:
        A list of ``(topic_index, coherence)`` tuples.

    Note:
        A pair whose higher-ranked term ``l`` occurs in no document of
        ``corpus`` has an undefined ratio and is skipped rather than raising
        ``ZeroDivisionError``.
    """
    # Top terms per topic, most heavily weighted first.
    top_terms = [
        np.argsort(weights)[::-1][:num_words].tolist()
        for weights in lda_model.components_
    ]

    # CSC layout makes the per-term column slices below cheap.
    doc_term = corpus.tocsc()

    # Every term's document list is needed once per pair it takes part in,
    # so look each one up a single time instead of inside the pair loop.
    docs_with = {}
    for terms in top_terms:
        for term in terms:
            if term not in docs_with:
                docs_with[term] = doc_term[:, term].nonzero()[0]

    coherence_scores = []
    for topic, terms in enumerate(top_terms):
        coherence = 0.0
        # Pair every term m with each term l ranked above it.
        for rank in range(1, len(terms)):
            m_docs = docs_with[terms[rank]]
            for l_term in terms[:rank]:
                l_docs = docs_with[l_term]
                doc_freq = len(l_docs)
                if doc_freq == 0:
                    # Term l never occurs in this corpus: ratio undefined.
                    continue
                # +1 smoothing keeps the log finite when m and l never co-occur.
                co_occurrences = len(np.intersect1d(m_docs, l_docs)) + 1
                coherence += np.log(co_occurrences / doc_freq)

        # Average per word so different num_words settings are comparable;
        # an empty term list has nothing to average over.
        if avg_per_word and terms:
            coherence = coherence / len(terms)
        coherence_scores.append((topic, coherence))

    if sort_topics:
        return sorted(coherence_scores, key=lambda scored: scored[1], reverse=True)
    return coherence_scores
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment