# Created December 22, 2021 08:25
# NLP Tutorials - Part 27: Topic Modelling
!pip install pyLDAvis
# Fetch the NLTK resources used further down: the English stop-word list
# (nltk.corpus.stopwords) and the 'punkt' tokenizer data (nltk.word_tokenize).
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import os
import re
import nltk
import string
import pyLDAvis
import numpy as np
import pandas as pd
import seaborn as sns
import pyLDAvis.sklearn
import plotly.express as px
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from pyLDAvis import sklearn as sklearn_lda
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
# Load the dataset from the working directory. preprocess() below reads its
# 'title' column. NOTE(review): presumably one row per Medium article — confirm
# against the CSV.
df = pd.read_csv('medium_data.csv')
def preprocess(df):
    """Add a cleaned ``title_process`` column derived from ``df['title']``.

    Steps, in order: cast to ``str``, lower-case, tokenize with
    ``nltk.word_tokenize``, drop English stop words, re-join the tokens with
    single spaces, then strip @mentions, URLs, digits and punctuation.

    Args:
        df: DataFrame with a ``title`` column. It is modified in place.

    Returns:
        The same DataFrame object, now carrying ``title_process``.

    Note:
        Needs the NLTK stop-word corpus and tokenizer data to be available.
    """
    # A set gives O(1) membership tests in the per-token filter below.
    stopwords = set(nltk.corpus.stopwords.words('english'))

    def _clean_tokens(text):
        # Lower-case, tokenize, drop stop words, and glue back together.
        tokens = nltk.word_tokenize(text.lower())
        return " ".join(tok for tok in tokens if tok not in stopwords)

    titles = df['title'].astype(str).apply(_clean_tokens)

    # regex=True must be explicit: since pandas 2.0 Series.str.replace treats
    # the pattern as a literal string by default, which would silently turn
    # every one of these clean-up steps into a no-op.
    noise_patterns = (
        r'@[^\s]+',               # @mentions
        r'https?:\/\/.*[\r\n]*',  # URLs through to the end of the line
        r'\d+',                   # digit runs
        r'[^\w\s]',               # punctuation / symbols
    )
    for pattern in noise_patterns:
        titles = titles.str.replace(pattern, '', regex=True)

    df['title_process'] = titles
    return df
# Clean the titles; preprocess() adds a 'title_process' column to df in place.
df_data_science = preprocess(df)

# Build the TF-IDF document-term matrix from the *cleaned* titles. The raw
# 'title' column would bypass the preprocessing above and can still contain
# NaN, which TfidfVectorizer rejects.
tf_idf = TfidfVectorizer()
doc_term_matrix = tf_idf.fit_transform(df_data_science["title_process"].values)

# LDA requires us to specify the number of topics. So that will be hyperparameter to tweak.
number_topics = 3
number_words = 25
LDA = LatentDirichletAllocation(n_components = number_topics, n_jobs = -1)
# Fit before use: components_ (read by print_topics and pyLDAvis) only exists
# on a fitted model.
LDA.fit(doc_term_matrix)
def print_topics(model, count_vectorizer, n_top_words):
    """Print the highest-weighted vocabulary terms of every topic.

    Args:
        model: Fitted topic model exposing ``components_``, iterated as one
            weight vector per topic.
        count_vectorizer: Fitted vectorizer that supplies the vocabulary via
            ``get_feature_names_out()`` (or the legacy ``get_feature_names()``).
        n_top_words: Number of terms to print per topic.

    Returns:
        None. Output goes to stdout: a ``Topic #i:`` header followed by the
        terms in descending weight order, space-separated.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only for older releases.
    if hasattr(count_vectorizer, "get_feature_names_out"):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()

    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        # argsort is ascending, so walk the last n_top_words indices backwards.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print(" ".join(words[i] for i in top_indices))
# Print the top `number_words` terms of each topic.
# NOTE(review): print_topics reads LDA.components_, which scikit-learn only
# sets on a fitted model — confirm LDA.fit(...) has run before this line.
print_topics(LDA, tf_idf, number_words)
#Prepare to display result in the Jupyter notebook
# NOTE(review): newer pyLDAvis releases appear to have renamed pyLDAvis.sklearn
# to pyLDAvis.lda_model — confirm the installed version still provides the
# module behind `sklearn_lda`.
LDAvis = sklearn_lda.prepare(LDA, doc_term_matrix, tf_idf)
#Run the visualization [mds is a function to use for visualizing the "distance" between topics]
