# rloredo/LDA_sklearn.py

## LDA_sklearn.py
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
# Pass-through tokenizer for CountVectorizer
def do_nothing(x):
    """Return ``x`` unchanged.

    Used as the ``tokenizer`` callable of ``CountVectorizer`` so that each
    document is handed to the vectorizer exactly as it was supplied. Kept as
    a named module-level function rather than an inline lambda.
    """
    return x

# Build the document-term count matrix.
# NOTE(review): `docs` is not defined in this snippet; presumably a pandas
# Series whose items are already-tokenized documents (the tokenizer is a
# pass-through) -- confirm against the caller.
#   * lowercase=False / tokenizer=do_nothing: use the tokens exactly as given.
#   * max_df (float): drop terms that appear in more than that proportion
#     of the documents (0.95 == 95%).
#   * min_df (int): drop terms that appear in fewer than that many documents.
#   * ngram_range=(1, 3): add bigrams and trigrams as extra features.
cv = CountVectorizer(
    lowercase=False,
    tokenizer=do_nothing,
    max_features=None,
    ngram_range=(1, 3),
    max_df=0.95,
    min_df=15,
)
cv_matrix = cv.fit_transform(docs.values)

# Fit LDA with 2 components; random_state is pinned so reruns are comparable.
lda = LatentDirichletAllocation(
    n_components=2,
    max_iter=20,
    doc_topic_prior=0.5,
    topic_word_prior=0.2,
    learning_decay=0.6,
    n_jobs=-1,
    random_state=42,
)
trans_matrix = lda.fit_transform(cv_matrix)

# Visualize and analyze results
import pyLDAvis

try:
    # pyLDAvis 3.4 renamed its scikit-learn adapter module to `lda_model`.
    import pyLDAvis.lda_model as pyldavis_sklearn
except ImportError:
    # Older pyLDAvis releases still expose the adapter as `pyLDAvis.sklearn`.
    import pyLDAvis.sklearn as pyldavis_sklearn

panel = pyldavis_sklearn.prepare(
    lda, cv_matrix, cv, mds='tsne', sort_topics=False, n_jobs=-1
)
word_info = panel.topic_info

# Save the interactive panel as a standalone html file
pyLDAvis.save_html(panel, 'panel.html')

# Print the top 30 keywords of every topic (the 'Default' category holds the
# corpus-wide term ranking, not a topic, so it is skipped).
for topic in word_info.loc[word_info.Category != 'Default'].Category.unique():
    print(topic)
    topic_terms = word_info.loc[word_info.Category == topic]
    print(topic_terms.sort_values('logprob', ascending=False).Term.values[:30])
    print()