@chyikwei · Created September 17, 2017 23:50

import nltk
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
def print_top_words(model, feature_names, n_top_words):
    # Print the n_top_words highest-weighted words for each topic,
    # with the (unnormalized) topic-word weight in parentheses.
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i] + " (" + str(round(topic[i], 2)) + ")"
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
# One document per book in NLTK's Gutenberg corpus
# (requires the corpus data, e.g. via nltk.download('gutenberg')).
data_samples = [nltk.corpus.gutenberg.raw(f_id)
                for f_id in nltk.corpus.gutenberg.fileids()]
# Bag-of-words counts over the 2,000 most frequent terms; terms appearing in
# fewer than 2 documents or in more than 95% of them are dropped, along with
# English stop words.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                stop_words='english',
                                max_features=2000)
tf = tf_vectorizer.fit_transform(data_samples)
# Batch variational LDA with 30 topics; the perplexity bound is evaluated
# every 10 iterations (evaluate_every=10) to check convergence.
lda = LatentDirichletAllocation(n_components=30,
                                learning_method='batch',
                                n_jobs=-1,  # use all CPUs
                                verbose=1,
                                evaluate_every=10,
                                max_iter=100,
                                random_state=1)
doc_distr = lda.fit_transform(tf)  # per-document topic distributions
# Note: newer scikit-learn versions replace get_feature_names()
# with get_feature_names_out().
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, 5)
# For each document, print the indices of topics with non-negligible weight.
for d in doc_distr:
    print(np.where(d > 0.001)[0])
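
# A rough follow-up sketch (not part of the original gist): pair each Gutenberg
# file id with its dominant topics, so the rows of doc_distr are easier to read.
# Rows of doc_distr follow the order of gutenberg.fileids(); the 0.1 weight
# cutoff is an arbitrary choice for illustration.
for f_id, d in zip(nltk.corpus.gutenberg.fileids(), doc_distr):
    top = [t for t in np.argsort(d)[::-1] if d[t] > 0.1]
    print("%s -> %s" % (f_id, ", ".join("topic %d (%.2f)" % (t, d[t]) for t in top)))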