@pranavraikote
Created December 22, 2021 08:25
NLP Tutorials - Part 27: Topic Modelling
!pip install pyLDAvis
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import os
import re
import string
import pyLDAvis
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from pyLDAvis import sklearn as sklearn_lda  # in pyLDAvis >= 3.4 this module was renamed to pyLDAvis.lda_model
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")
df = pd.read_csv('medium_data.csv')
df.head(10)
def preprocess(df):
    stopwords = nltk.corpus.stopwords.words('english')
    # Lowercase, tokenize, and drop English stopwords from the titles
    df['title_process'] = df['title'].astype(str)
    df['title_process'] = df['title_process'].apply(lambda x: x.lower())
    df['title_process'] = df['title_process'].apply(lambda x: nltk.word_tokenize(x))
    df['title_process'] = df['title_process'].apply(lambda x: [item for item in x if item not in stopwords])
    df['title_process'] = df['title_process'].apply(lambda x: " ".join(x))
    # Strip @mentions, URLs, digits, and punctuation (regex=True is required for pattern matching in pandas >= 2.0)
    df['title_process'] = df['title_process'].str.replace(r'@[^\s]+', '', regex=True)
    df['title_process'] = df['title_process'].str.replace(r'https?:\/\/.*[\r\n]*', '', regex=True)
    df['title_process'] = df['title_process'].str.replace(r'\d+', '', regex=True)
    df['title_process'] = df['title_process'].str.replace(r'[^\w\s]', '', regex=True)
    return df
df_data_science = preprocess(df)
df_data_science
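# WordCloud is imported above but otherwise unused; a quick sanity check of the
# cleaned titles might look like this (a sketch, not a cell from the original gist):
all_titles = " ".join(df_data_science['title_process'].astype(str))
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_titles)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()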
tf_idf = TfidfVectorizer()
doc_term_matrix = tf_idf.fit_transform(df_data_science["title_process"].values)  # vectorize the preprocessed titles, not the raw ones
print(doc_term_matrix)
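# Note: scikit-learn's LDA is defined over raw term counts; TF-IDF input runs fine
# but is not what the generative model assumes. A count-based document-term matrix
# would be built like this instead (sketch):
from sklearn.feature_extraction.text import CountVectorizer
count_vec = CountVectorizer()
count_matrix = count_vec.fit_transform(df_data_science["title_process"].values)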
# LDA requires the number of topics to be specified up front, so it is a hyperparameter to tweak.
number_topics = 3
number_words = 25
LDA = LatentDirichletAllocation(n_components=number_topics, n_jobs=-1, random_state=42)  # random_state added here so runs are reproducible
LDA.fit(doc_term_matrix)
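# number_topics was picked by hand above. One hedged way to compare candidates is
# perplexity (lower is better) — evaluated on the training matrix here for brevity;
# this loop is a sketch, not part of the original gist:
for k in (2, 3, 5, 10):
    candidate = LatentDirichletAllocation(n_components=k, n_jobs=-1, random_state=42)
    candidate.fit(doc_term_matrix)
    print(f"n_components={k}: perplexity={candidate.perplexity(doc_term_matrix):.1f}")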
def print_topics(model, vectorizer, n_top_words):
    # get_feature_names_out() replaces get_feature_names() in scikit-learn >= 1.0
    words = vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        # components_[k] holds the word weights for topic k; take the n_top_words largest
        print(" ".join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
print("Topics:")
print_topics(LDA, tf_idf, number_words)
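# Beyond per-topic word lists, transform() gives each document's topic mixture;
# tagging each title with its dominant topic could look like this (the column
# name 'dominant_topic' is an assumption, not from the original gist):
doc_topics = LDA.transform(doc_term_matrix)  # shape: (n_docs, number_topics)
df_data_science['dominant_topic'] = doc_topics.argmax(axis=1)
df_data_science[['title', 'dominant_topic']].head()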
# Prepare to display the result in the Jupyter notebook
pyLDAvis.enable_notebook()
LDAvis = sklearn_lda.prepare(LDA, doc_term_matrix, tf_idf)
# Run the visualization (the inter-topic distance map is laid out with multidimensional scaling by default)
LDAvis
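# Outside a notebook, the same visualization can be written to a standalone HTML
# file (the output filename below is arbitrary):
pyLDAvis.save_html(LDAvis, 'lda_topics.html')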