Skip to content

Instantly share code, notes, and snippets.

Created December 22, 2021 08:25
Show Gist options
  • Save pranavraikote/07a2933ac2e899ada0f11cb067e9e807 to your computer and use it in GitHub Desktop.
Save pranavraikote/07a2933ac2e899ada0f11cb067e9e807 to your computer and use it in GitHub Desktop.
NLP Tutorials - Part 27: Topic Modelling
!pip install pyLDAvis
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import os
import re
import nltk
import string
import pyLDAvis
import numpy as np
import pandas as pd
import seaborn as sns
import pyLDAvis.sklearn
import plotly.express as px
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from pyLDAvis import sklearn as sklearn_lda
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
# Load the article dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='medium_data.csv')
def preprocess(df):
    """Add a cleaned ``title_process`` column derived from ``df['title']``.

    Each title is cast to ``str``, lower-cased, tokenised with
    ``nltk.word_tokenize``, filtered against NLTK's English stop-word list and
    re-joined with single spaces.  The joined text then has ``@``-prefixed
    tokens, URLs, digit runs and punctuation removed with regular expressions.

    Args:
        df: DataFrame with a ``title`` column.  It is modified in place.

    Returns:
        The same DataFrame object, now carrying the ``title_process`` column.
    """
    # A set gives O(1) membership tests for the per-token stop-word filter.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    def _strip_stop_words(title):
        # Lower-case first so the comparison matches the lower-case stop words.
        tokens = nltk.word_tokenize(title.lower())
        return " ".join(token for token in tokens if token not in stop_words)

    titles = df['title'].astype(str).apply(_strip_stop_words)

    # Raw strings avoid invalid-escape warnings, and regex=True is mandatory:
    # since pandas 2.0 ``Series.str.replace`` treats the pattern as a literal
    # string by default, which would silently leave the text uncleaned.
    cleanup_patterns = (
        r'@[^\s]+',               # @-prefixed tokens
        r'https?:\/\/.*[\r\n]*',  # URLs through the end of the line
        r'\d+',                   # digit runs
        r'[^\w\s]',               # any remaining punctuation
    )
    for pattern in cleanup_patterns:
        titles = titles.str.replace(pattern, '', regex=True)

    df['title_process'] = titles
    return df
# Clean the titles, then vectorise the *cleaned* column so the stop-word, URL,
# digit and punctuation removal done by preprocess() actually reaches the model.
df_data_science = preprocess(df)
tf_idf = TfidfVectorizer()
doc_term_matrix = tf_idf.fit_transform(df_data_science["title_process"].values)

# LDA requires us to specify the number of topics, so that is a hyperparameter to tweak.
number_topics = 3
# Number of top-weighted words to list for each topic.
number_words = 25

LDA = LatentDirichletAllocation(n_components=number_topics, n_jobs=-1)
# The model must be fitted before use: ``components_`` (read when printing the
# topics) only exists on a fitted estimator.
LDA.fit(doc_term_matrix)
def print_topics(model, count_vectorizer, n_top_words):
    """Print the highest-weighted words of every topic in ``model``.

    For each row of ``model.components_`` a blank line and a ``Topic #<i>:``
    header are printed, followed by one line with the top words separated by
    spaces, highest weight first.

    Args:
        model: Fitted topic model exposing ``components_``, one weight array
            per topic, indexed by vocabulary position.
        count_vectorizer: Fitted vectorizer whose feature names map vocabulary
            positions to words.
        n_top_words: Number of words to print per topic.
    """
    # ``get_feature_names`` was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only for older vectorizers that lack it.
    get_names = getattr(count_vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = count_vectorizer.get_feature_names
    words = get_names()

    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        top_indices = topic.argsort()[::-1][:n_top_words]
        print(f"\nTopic #{topic_idx}:")
        print(" ".join(words[i] for i in top_indices))
# List the top `number_words` words of each topic.
print_topics(model=LDA, count_vectorizer=tf_idf, n_top_words=number_words)

# Build the pyLDAvis data structure for display in the Jupyter notebook.
LDAvis = sklearn_lda.prepare(LDA, doc_term_matrix, tf_idf)
# NOTE(review): the original snippet ends with a comment about running the
# visualization (mentioning `mds` as the function used to visualize the
# "distance" between topics), but the statement that renders `LDAvis` is not
# present in this file -- TODO confirm and restore it.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment