Created
December 22, 2021 08:25
-
-
Save pranavraikote/07a2933ac2e899ada0f11cb067e9e807 to your computer and use it in GitHub Desktop.
NLP Tutorials - Part 27: Topic Modelling
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
!pip install pyLDAvis | |
import nltk | |
nltk.download('stopwords') | |
nltk.download('punkt') | |
import os | |
import re | |
import nltk | |
import string | |
import pyLDAvis | |
import numpy as np | |
import pandas as pd | |
import seaborn as sns | |
import pyLDAvis.sklearn | |
import plotly.express as px | |
import matplotlib.pyplot as plt | |
from wordcloud import WordCloud | |
from pyLDAvis import sklearn as sklearn_lda | |
from sklearn.decomposition import LatentDirichletAllocation | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
import warnings | |
warnings.filterwarnings("ignore") | |
# Load the dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='medium_data.csv')
# Peek at the first ten rows (shown when this is the last expression of a notebook cell).
df.head(n=10)
def preprocess(df):
    """Add a cleaned ``title_process`` column derived from ``df['title']``.

    Each title is cast to ``str``, lower-cased, stripped of @mentions and
    URLs, tokenized with NLTK, filtered against the English stop-word list,
    re-joined with single spaces, and finally stripped of digits and
    punctuation.

    Args:
        df: DataFrame with a ``title`` column. It is modified in place.

    Returns:
        The same DataFrame object, now carrying ``title_process``.

    Note:
        Requires the NLTK ``stopwords`` corpus and the tokenizer data used by
        ``nltk.word_tokenize``. Missing titles become the literal string
        ``"nan"`` because of ``astype(str)``.
    """
    # A set makes the per-token membership test O(1) instead of a list scan.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    titles = df['title'].astype(str).str.lower()

    # Mentions and URLs are removed *before* tokenizing: word_tokenize splits
    # "@user" and "https://..." into several tokens, after which these
    # patterns can no longer match. regex=True is explicit because
    # pandas >= 2.0 treats the pattern as a literal string by default.
    titles = titles.str.replace(r'@[^\s]+', '', regex=True)
    # Note: ".*" drops everything from the URL to the end of the line.
    titles = titles.str.replace(r'https?://.*[\r\n]*', '', regex=True)

    # Tokenize, drop stop words, and glue the remaining tokens back together.
    titles = titles.apply(
        lambda text: " ".join(
            token for token in nltk.word_tokenize(text) if token not in stop_words
        )
    )

    # Digits and punctuation are stripped last, on the re-joined text.
    titles = titles.str.replace(r'\d+', '', regex=True)
    titles = titles.str.replace(r'[^\w\s]', '', regex=True)

    df['title_process'] = titles
    return df
# Clean the titles; preprocess() adds the `title_process` column to the frame.
df_data_science = preprocess(df)
df_data_science

# Fit TF-IDF on the cleaned `title_process` column so that the preprocessing
# actually takes effect (the raw `title` column still contains stop words,
# digits and punctuation).
tf_idf = TfidfVectorizer()
doc_term_matrix = tf_idf.fit_transform(df_data_science["title_process"].values)
print(doc_term_matrix)

# LDA requires the number of topics up front, so it is a hyperparameter to tune.
number_topics = 3
# Number of top words to print per topic.
number_words = 25

# No random_state is passed, so the learned topics can differ between runs.
LDA = LatentDirichletAllocation(n_components=number_topics, n_jobs=-1)
LDA.fit(doc_term_matrix)
def print_topics(model, count_vectorizer, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Args:
        model: Fitted topic model exposing ``components_``, one row of
            per-word weights for each topic.
        count_vectorizer: Fitted vectorizer whose feature names line up with
            the columns of ``model.components_``.
        n_top_words: How many words to print per topic, heaviest first.
    """
    # scikit-learn 1.2 removed get_feature_names(); prefer the replacement
    # and fall back to the legacy method for older vectorizers.
    if hasattr(count_vectorizer, "get_feature_names_out"):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()

    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so walk backwards from the end to take the
        # n_top_words largest weights in descending order.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print("\nTopic #%d:" % topic_idx)
        print(" ".join(words[i] for i in top_indices))
# Show the top words of each learned topic.
print("Topics:")
print_topics(LDA, tf_idf, number_words)

# Switch pyLDAvis to notebook mode so the visualization renders inline.
pyLDAvis.enable_notebook()

# Build the interactive view from the fitted model, the document-term matrix
# and the vectorizer. No `mds` argument is passed, so pyLDAvis uses its
# default method for laying out the distance between topics.
LDAvis = sklearn_lda.prepare(LDA, doc_term_matrix, tf_idf)

# Leaving the object as the final expression makes the notebook display it.
LDAvis
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment