# jbencina/scikit_topic_cohesion.py

## scikit_topic_cohesion.py
def topic_coherence(lda_model, corpus, num_words, sort_topics=True, avg_per_word=False):
    """Score every topic of a fitted topic model with a UMass-style coherence.

    For each topic, the ``num_words`` highest-weighted term indices are read
    from ``lda_model.components_``.  For every pair made of a higher-ranked
    term ``l`` and a lower-ranked term ``m`` the score accumulates
    ``log((D(m, l) + 1) / D(l))``, where ``D(l)`` is the number of documents
    containing ``l`` and ``D(m, l)`` the number containing both terms.

    Args:
        lda_model: Object exposing ``components_``; it is iterated row by row
            (one row per topic) and each row must support ``argsort()``.
        corpus: Document-term matrix exposing ``tocsc()``.  Rows are treated
            as documents and columns as terms, indexed the same way as the
            columns of ``components_``.
        num_words: Number of top terms to use per topic.
        sort_topics: If true, return the scores sorted from the most to the
            least coherent topic; otherwise keep topic order.
        avg_per_word: If true, divide each topic's score by the number of
            terms used so that runs with different ``num_words`` can be
            compared.

    Returns:
        A list of ``(topic_index, coherence)`` tuples.

    Notes:
        A pair whose higher-ranked term occurs in no document contributes
        ``0`` instead of raising ``ZeroDivisionError``.
    """
    # Function-scope import: the file shows no module-level numpy import.
    import numpy as np

    # Top ``num_words`` term indices of each topic, highest weight first.
    top_terms = {
        topic_id: list(weights.argsort()[:-num_words - 1:-1])
        for topic_id, weights in enumerate(lda_model.components_)
    }

    # CSC layout makes the per-term column slices below cheap.
    doc_term = corpus.tocsc()

    # Row indices of the documents containing each term, computed once per
    # term and shared across pairs and topics.
    docs_by_term = {}

    def docs_with(term):
        if term not in docs_by_term:
            docs_by_term[term] = doc_term[:, term].nonzero()[0]
        return docs_by_term[term]

    coherence_scores = []
    for topic_id, terms in top_terms.items():
        coherence = 0.0

        # Pair every term with all terms ranked above it.
        for rank, m_term in enumerate(terms[1:], start=1):
            m_docs = docs_with(m_term)

            for l_term in terms[:rank]:
                l_docs = docs_with(l_term)

                doc_freq = len(l_docs)
                if doc_freq == 0:
                    # Term absent from the corpus: the ratio is undefined,
                    # so the pair adds nothing to the score.
                    continue

                co_doc_freq = len(np.intersect1d(m_docs, l_docs))
                # ``+ 1.0`` smooths the count (avoids log(0)) and forces
                # true division regardless of interpreter version.
                coherence += np.log((co_doc_freq + 1.0) / doc_freq)

        # Average per word so different ``num_words`` are comparable; skipped
        # when no terms were selected to avoid dividing by zero.
        if avg_per_word and terms:
            coherence = coherence / len(terms)
        coherence_scores.append((topic_id, coherence))

    if sort_topics:
        return sorted(coherence_scores, key=lambda item: item[1], reverse=True)
    return coherence_scores