@chyikwei · Created September 17, 2017 23:50

import nltk
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
def print_top_words(model, feature_names, n_top_words):
    # Print the n_top_words highest-weighted words for each topic,
    # with the (unnormalized) topic-word weight in parentheses.
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i] + " (" + str(round(topic[i], 2)) + ")"
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
# One document per book in NLTK's Gutenberg corpus
# (requires the corpus data, e.g. via nltk.download('gutenberg')).
data_samples = [nltk.corpus.gutenberg.raw(f_id)
                for f_id in nltk.corpus.gutenberg.fileids()]
# Bag-of-words counts over the 2,000 most frequent terms; terms appearing in
# fewer than 2 documents or in more than 95% of them are dropped, along with
# English stop words.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                stop_words='english',
                                max_features=2000)
tf = tf_vectorizer.fit_transform(data_samples)
# Batch variational LDA with 30 topics; the perplexity bound is evaluated
# every 10 iterations (evaluate_every=10) to check convergence.
lda = LatentDirichletAllocation(n_components=30,
                                learning_method='batch',
                                n_jobs=-1,  # use all CPUs
                                verbose=1,
                                evaluate_every=10,
                                max_iter=100,
                                random_state=1)
doc_distr = lda.fit_transform(tf)  # per-document topic distributions
# Note: newer scikit-learn versions replace get_feature_names()
# with get_feature_names_out().
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, 5)
# For each document, print the indices of topics with non-negligible weight.
for d in doc_distr:
    print(np.where(d > 0.001)[0])
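
# A rough follow-up sketch (not part of the original gist): pair each Gutenberg
# file id with its dominant topics, so the rows of doc_distr are easier to read.
# Rows of doc_distr follow the order of gutenberg.fileids(); the 0.1 weight
# cutoff is an arbitrary choice for illustration.
for f_id, d in zip(nltk.corpus.gutenberg.fileids(), doc_distr):
    top = [t for t in np.argsort(d)[::-1] if d[t] > 0.1]
    print("%s -> %s" % (f_id, ", ".join("topic %d (%.2f)" % (t, d[t]) for t in top)))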