# do-me/Topic Modeling with Scikit Learn.py

## Topic Modeling with Scikit Learn.py
# Adapted for Python 3 and scikit-learn version 0.23
# based on https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np

def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    """Print the strongest terms and the strongest documents of every topic.

    For each row of ``H`` this prints a ``Topic <index>`` header, then a single
    space-separated line holding the ``no_top_words`` entries of
    ``feature_names`` with the largest values in that row (largest first),
    then the ``no_top_documents`` entries of ``documents`` with the largest
    values in the matching column of ``W`` (largest first), one per line.

    Args:
        H: 2-D array iterated row by row; each row must offer ``.argsort()``
            (in this script: ``model.components_``).
        W: 2-D array indexed as ``W[:, topic_idx]``
            (in this script: ``model.transform(...)``).
        feature_names: Sequence mapping a column index of ``H`` to its term.
        documents: Sequence mapping a row index of ``W`` to its text.
        no_top_words: Number of terms to print per topic.
        no_top_documents: Number of documents to print per topic.

    Returns:
        None. Everything is written to standard output with ``print``.
    """
    for topic_no, weights in enumerate(H):
        print(f"Topic {topic_no}")

        # argsort is ascending, so reverse it before taking the leading slice.
        ranked_terms = weights.argsort()[::-1][:no_top_words]
        print(" ".join(feature_names[term] for term in ranked_terms))

        ranked_docs = np.argsort(W[:, topic_no])[::-1][:no_top_documents]
        for position in ranked_docs:
            print(documents[position])

# Single line documents from http://web.eecs.utk.edu/~berry/order/node4.html#SECTION00022000000000000000
documents = [
    "Human machine interface for Lab ABC computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user-perceived response time to error measurement",
    "The generation of random, binary, unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV: Widths of trees and quasi-ordering",
    "Graph minors: A survey",
]


def _feature_names(vectorizer):
    """Return the fitted vectorizer's vocabulary terms in column order.

    scikit-learn 1.0 introduced ``get_feature_names_out()`` and 1.2 removed
    ``get_feature_names()``. The new accessor is tried first and the old one
    is only looked up when the new one is missing, so the script runs on
    both sides of that change.
    """
    accessor = getattr(vectorizer, "get_feature_names_out", None)
    if accessor is None:
        accessor = vectorizer.get_feature_names
    return accessor()


# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = _feature_names(tfidf_vectorizer)

# LDA can only use raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = _feature_names(tf_vectorizer)

no_topics = 2

# Run NMF
# The single ``alpha`` regularization argument was deprecated in scikit-learn
# 1.0 and removed in 1.2 in favour of ``alpha_W`` / ``alpha_H``. Per the NMF
# objective documented for those parameters, the new penalties are multiplied
# by n_features (for W) and n_samples (for H), so the value is divided by
# those factors to keep the penalty the old ``alpha`` applied.
nmf_alpha = 0.1
if "alpha_W" in NMF().get_params():
    n_samples, n_features = tfidf.shape
    nmf_regularization = {
        "alpha_W": nmf_alpha / n_features,
        "alpha_H": nmf_alpha / n_samples,
    }
else:
    nmf_regularization = {"alpha": nmf_alpha}
nmf_model = NMF(
    n_components=no_topics,
    random_state=1,
    l1_ratio=.5,
    init='nndsvd',
    **nmf_regularization
).fit(tfidf)
nmf_W = nmf_model.transform(tfidf)  # document-topic weights
nmf_H = nmf_model.components_       # topic-term weights

# Run LDA
lda_model = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0
).fit(tf)
lda_W = lda_model.transform(tf)  # document-topic weights
lda_H = lda_model.components_    # topic-term weights

# Print the top terms and top documents per topic, first for NMF, then for LDA.
no_top_words = 4
no_top_documents = 4
display_topics(nmf_H, nmf_W, tfidf_feature_names, documents, no_top_words, no_top_documents)
display_topics(lda_H, lda_W, tf_feature_names, documents, no_top_words, no_top_documents)
	# Adapted for Python 3 and scikit-learn version 0.23
	# based on https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730

	from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
	from sklearn.decomposition import NMF, LatentDirichletAllocation
	import numpy as np

	def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
		"""Print each topic's top terms followed by its top documents.

		For every row of ``H`` a ``Topic <index>`` header is printed, then one
		space-separated line with the ``no_top_words`` entries of
		``feature_names`` that have the largest values in that row (largest
		first), then the ``no_top_documents`` entries of ``documents`` with
		the largest values in column ``topic_idx`` of ``W`` (largest first),
		one per line.

		Args:
			H: 2-D array iterated row by row; each row must offer
				``.argsort()`` (in this script: ``model.components_``).
			W: 2-D array indexed as ``W[:, topic_idx]``
				(in this script: ``model.transform(...)``).
			feature_names: Sequence mapping a column index of ``H`` to a term.
			documents: Sequence mapping a row index of ``W`` to its text.
			no_top_words: Number of terms to print per topic.
			no_top_documents: Number of documents to print per topic.

		Returns:
			None. Output is written to standard output with ``print``.
		"""
		for topic_idx, topic in enumerate(H):
			print("Topic {}".format(topic_idx))
			# argsort is ascending; walk it backwards to get the largest first.
			top_word_indices = topic.argsort()[:-no_top_words - 1:-1]
			print(" ".join([feature_names[i] for i in top_word_indices]))
			top_doc_indices = np.argsort(W[:, topic_idx])[::-1][0:no_top_documents]
			for doc_index in top_doc_indices:
				print(documents[doc_index])

	# Single line documents from http://web.eecs.utk.edu/~berry/order/node4.html#SECTION00022000000000000000
	documents = [
		"Human machine interface for Lab ABC computer applications",
		"A survey of user opinion of computer system response time",
		"The EPS user interface management system",
		"System and human system engineering testing of EPS",
		"Relation of user-perceived response time to error measurement",
		"The generation of random, binary, unordered trees",
		"The intersection graph of paths in trees",
		"Graph minors IV: Widths of trees and quasi-ordering",
		"Graph minors: A survey",
	]


	def _feature_names(vectorizer):
		"""Return the fitted vectorizer's vocabulary terms in column order.

		scikit-learn 1.0 introduced ``get_feature_names_out()`` and 1.2
		removed ``get_feature_names()``. The new accessor is tried first and
		the old one is only looked up when the new one is missing, so the
		script runs on both sides of that change.
		"""
		accessor = getattr(vectorizer, "get_feature_names_out", None)
		if accessor is None:
			accessor = vectorizer.get_feature_names
		return accessor()


	# NMF is able to use tf-idf
	tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
	tfidf = tfidf_vectorizer.fit_transform(documents)
	tfidf_feature_names = _feature_names(tfidf_vectorizer)

	# LDA can only use raw term counts because it is a probabilistic graphical model
	tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
	tf = tf_vectorizer.fit_transform(documents)
	tf_feature_names = _feature_names(tf_vectorizer)

	no_topics = 2

	# Run NMF
	# The single ``alpha`` regularization argument was deprecated in
	# scikit-learn 1.0 and removed in 1.2 in favour of ``alpha_W`` / ``alpha_H``.
	# Per the NMF objective documented for those parameters, the new penalties
	# are multiplied by n_features (for W) and n_samples (for H), so the value
	# is divided by those factors to keep the penalty the old ``alpha`` applied.
	nmf_alpha = 0.1
	if "alpha_W" in NMF().get_params():
		n_samples, n_features = tfidf.shape
		nmf_regularization = {
			"alpha_W": nmf_alpha / n_features,
			"alpha_H": nmf_alpha / n_samples,
		}
	else:
		nmf_regularization = {"alpha": nmf_alpha}
	nmf_model = NMF(
		n_components=no_topics,
		random_state=1,
		l1_ratio=.5,
		init='nndsvd',
		**nmf_regularization
	).fit(tfidf)
	nmf_W = nmf_model.transform(tfidf)  # document-topic weights
	nmf_H = nmf_model.components_       # topic-term weights

	# Run LDA
	lda_model = LatentDirichletAllocation(
		n_components=no_topics,
		max_iter=5,
		learning_method='online',
		learning_offset=50.,
		random_state=0
	).fit(tf)
	lda_W = lda_model.transform(tf)  # document-topic weights
	lda_H = lda_model.components_    # topic-term weights

	# Print the top terms and top documents per topic, first for NMF, then for LDA.
	no_top_words = 4
	no_top_documents = 4
	display_topics(nmf_H, nmf_W, tfidf_feature_names, documents, no_top_words, no_top_documents)
	display_topics(lda_H, lda_W, tf_feature_names, documents, no_top_words, no_top_documents)