Last active
November 27, 2021 14:46
-
-
Save avriiil/af64ea79cc676a05d35a04560c5d58cb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Train a gensim LDA topic model on a corpus of tweets, save it to disk and
# print the words that make up each topic.

# importing libraries
import pandas as pd
import numpy as np
import gensim
from gensim import corpora, models

# cast tweets to numpy array
# NOTE(review): `df` is not defined in this snippet; it must already exist and
# expose a `tweet_text` column. Presumably each entry is a list of string
# tokens (gensim's Dictionary expects tokenized documents) -- TODO confirm.
docs = df.tweet_text.to_numpy()

# create dictionary of all words in all documents
dictionary = gensim.corpora.Dictionary(docs)

# filter extreme cases out of dictionary: drop tokens appearing in fewer than
# 15 documents or in more than 50% of documents, then keep at most the
# 100000 most frequent remaining tokens
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# create bag-of-words corpus (one list of (token_id, count) pairs per document)
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

# create LDA model using preferred hyperparameters
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=5,
    id2word=dictionary,
    passes=4,
    workers=2,
    random_state=21,  # fixed seed so repeated runs are comparable
)

# Save LDA model to disk
# NOTE(review): placeholder -- set this to a writable file path before
# running; saving to an empty path is expected to fail.
path_to_model = ""
lda_model.save(path_to_model)

# for each topic, print words occurring in that topic
# (passing -1 asks for all topics)
for idx, topic in lda_model.print_topics(-1):
    print(f'Topic: {idx} \nWords: {topic}')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment