Skip to content

Instantly share code, notes, and snippets.

@rloredo
Created September 26, 2021 11:54
Show Gist options
  • Save rloredo/5fc2df802acef6abcbe2ec8256e17bce to your computer and use it in GitHub Desktop.
Save rloredo/5fc2df802acef6abcbe2ec8256e17bce to your computer and use it in GitHub Desktop.
LDA topic modelling with sklearn and visualization with pyLDAvis
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
# Identity callable handed to CountVectorizer as its tokenizer
def do_nothing(x):
    """Return ``x`` unchanged.

    Passed as ``tokenizer`` to ``CountVectorizer`` so the vectorizer uses
    each document exactly as it is given instead of splitting it itself.
    NOTE(review): this presumably means every document is already a
    sequence of tokens -- confirm against where ``docs`` is built.
    """
    return x
# Create the document-term count matrix.
# max_df: a float is a proportion of documents -- terms appearing in more
#         than 95% of the documents are dropped.
# min_df: an int is an absolute count -- terms appearing in fewer than 15
#         documents are dropped.
# ngram_range: (1, 3) adds bigrams and trigrams as extra features on top of
#         the single tokens.
cv = CountVectorizer(
    lowercase=False,       # the vectorizer must not call .lower() on the documents
    tokenizer=do_nothing,  # documents are used as-is (see do_nothing)
    token_pattern=None,    # unused with a custom tokenizer; None silences the warning
    max_features=None,
    ngram_range=(1, 3),
    max_df=0.95,
    min_df=15,
)
cv_matrix = cv.fit_transform(docs.values)
# Fit a two-topic LDA model on the count matrix
lda = LatentDirichletAllocation(
    n_components=2,
    max_iter=20,
    doc_topic_prior=0.5,
    topic_word_prior=0.2,
    learning_decay=0.6,
    n_jobs=-1,        # use every available core
    random_state=42,  # fixed seed so repeated runs give the same topics
)
# Fit the model and keep the transformed (document x topic) output
trans_matrix = lda.fit_transform(cv_matrix)
# Visualize and analyze results
import pyLDAvis

# pyLDAvis 3.4.0 renamed the ``pyLDAvis.sklearn`` module to
# ``pyLDAvis.lda_model``; try the new name first and fall back to the old
# one so the script works with either release line.
try:
    import pyLDAvis.lda_model as pyldavis_sklearn
except ImportError:
    import pyLDAvis.sklearn as pyldavis_sklearn

# Build the interactive panel from the fitted model, the count matrix and
# the vectorizer. sort_topics=False asks pyLDAvis not to reorder the topics.
panel = pyldavis_sklearn.prepare(
    lda,
    cv_matrix,
    cv,
    mds='tsne',
    sort_topics=False,
    n_jobs=-1,
)
# Per-topic term table used below to list the top keywords
word_info = panel.topic_info
# To save panel in html
pyLDAvis.save_html(panel, 'panel.html')
# Print the top 30 keywords of every topic.
# Rows in the 'Default' category are skipped so only the remaining
# categories are listed.
for topic in word_info.loc[word_info.Category != 'Default'].Category.unique():
    topic_terms = word_info.loc[word_info.Category == topic]
    # Highest 'logprob' first, then keep the first 30 terms
    top_terms = topic_terms.sort_values('logprob', ascending=False).Term.values[:30]
    print(topic)
    print(top_terms)
    print()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment