@Dref360
Created March 22, 2022 00:48
import gensim
import nltk
import pandas as pd
from gensim import corpora, models
from nltk.stem import SnowballStemmer, WordNetLemmatizer

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
"""
Taken from https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
"""
# Conversational filler words to drop on top of gensim's stopword list.
FILTERED_TOKENS = {'okay', 'like', 'know', 'yeah', 'think', 'thing'}

# Build the stemmer and lemmatizer once instead of on every call.
STEMMER = SnowballStemmer('english', ignore_stopwords=True)
LEMMATIZER = WordNetLemmatizer()

def lemmatize_stemming(text):
    # Lemmatize as a verb first, then stem the result.
    return STEMMER.stem(LEMMATIZER.lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        # Keep tokens that are not stopwords, not too short, and not fillers.
        if (token not in gensim.parsing.preprocessing.STOPWORDS
                and len(token) > 3 and token not in FILTERED_TOKENS):
            result.append(lemmatize_stemming(token))
    return result
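
# A quick illustration of the pipeline above. The output shown is indicative;
# exact stems depend on the installed gensim/NLTK versions and wordlists:
#   preprocess("Machines are learning quickly")
#   -> ['machin', 'learn', 'quick']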

def find_topics(df: pd.DataFrame, text_column: str):
    """Find topics for a dataframe.

    :note: Just prints the topics for now.
    :param df: Dataframe with the data.
    :param text_column: Name of the column holding the text segments.
    :return: Nothing
    """
    processed_docs = df[text_column].map(preprocess)
    # Map tokens to ids, dropping rare and overly common tokens.
    dictionary = corpora.Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=15, no_above=0.2, keep_n=100)
    # Bag-of-words counts, reweighted by TF-IDF before fitting LDA.
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]
    lda_model_tfidf = models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
    for idx, topic in lda_model_tfidf.print_topics(-1):
        print('Topic: {} Word: {}'.format(idx, topic))
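
# --- Usage sketch (not part of the original gist) ---
# A minimal, hypothetical example of calling find_topics. The CSV path and the
# "segment" column name are assumptions for illustration. Note that
# filter_extremes(no_below=15) needs a reasonably large corpus to keep tokens.
if __name__ == "__main__":
    df = pd.read_csv("transcripts.csv")  # assumed file with a "segment" column
    find_topics(df, "segment")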