Created
September 26, 2021 11:54
-
-
Save rloredo/5fc2df802acef6abcbe2ec8256e17bce to your computer and use it in GitHub Desktop.
LDA topic modelling with sklearn and visualization with pyLDAvis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
# Identity function handed to CountVectorizer as its tokenizer.
def do_nothing(x):
    """Return *x* unchanged.

    Used as the ``tokenizer`` callable of ``CountVectorizer`` so the
    vectorizer does not split the documents itself.
    NOTE(review): this presumably means each document is already a sequence
    of tokens -- confirm where ``docs`` is built.
    """
    return x
# Create the document-term count matrix.
# - max_df (float): drop terms that appear in more than that fraction of documents.
# - min_df (int): drop terms that appear in fewer than that many documents.
# - ngram_range: also build n-grams (here 1- to 3-grams) and use them as extra features.
# lowercase=False together with tokenizer=do_nothing leaves every document as-is.
# NOTE(review): `docs` is defined outside this snippet; `.values` suggests a
# pandas Series whose items are already token lists -- confirm.
cv = CountVectorizer(
    lowercase=False,
    tokenizer=do_nothing,
    max_features=None,
    ngram_range=(1, 3),
    max_df=0.95,
    min_df=15,
)
cv_matrix = cv.fit_transform(docs.values)

# Fit LDA with 2 components; random_state is fixed so runs are repeatable.
lda = LatentDirichletAllocation(
    n_components=2,
    max_iter=20,
    doc_topic_prior=0.5,
    topic_word_prior=0.2,
    learning_decay=0.6,
    n_jobs=-1,
    random_state=42,
)
trans_matrix = lda.fit_transform(cv_matrix)

# Visualize and analyze results.
import pyLDAvis

# pyLDAvis 3.4 renamed the `sklearn` submodule to `lda_model`; try the name
# this script was written against first and fall back to the new one.
try:
    import pyLDAvis.sklearn as pyldavis_sklearn
except ImportError:
    import pyLDAvis.lda_model as pyldavis_sklearn

panel = pyldavis_sklearn.prepare(lda, cv_matrix, cv, mds='tsne', sort_topics=False, n_jobs=-1)
word_info = panel.topic_info

# Save the panel as a standalone HTML file.
pyLDAvis.save_html(panel, 'panel.html')

# Print the top 30 keywords of every category except 'Default',
# ordered by descending 'logprob'.
topic_rows = word_info.loc[word_info.Category != 'Default']
for topic in topic_rows.Category.unique():
    print(topic)
    ranked = word_info.loc[word_info.Category.isin([topic])].sort_values('logprob', ascending=False)
    print(ranked.Term.values[:30])
    print()  # blank line between topics
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment