pranavraikote/text_LDA.py

## text_LDA.py
!pip install pyLDAvis

import nltk
nltk.download('stopwords')
nltk.download('punkt')

import os
import re
import nltk
import string
import pyLDAvis
import numpy as np
import pandas as pd
import seaborn as sns
import pyLDAvis.sklearn
import plotly.express as px
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from pyLDAvis import sklearn as sklearn_lda
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")

# Load the article dataset; the preprocessing below reads its 'title' column.
df = pd.read_csv('medium_data.csv')
# Notebook-style preview of the first 10 rows; the result is simply discarded
# when this file is run as a plain script.
df.head(10)

def preprocess(df):
    """Add a cleaned ``title_process`` column derived from ``title``.

    Each title is cast to ``str``, lower-cased, stripped of ``@mentions`` and
    URLs, tokenized with NLTK, filtered against the English stop-word list,
    re-joined with single spaces, and finally stripped of digits and
    punctuation.

    Args:
        df: DataFrame with a ``title`` column. It is modified in place: the
            ``title_process`` column is added (or overwritten).

    Returns:
        The same DataFrame object that was passed in.
    """
    # A set gives O(1) membership tests in the per-token filter below.
    stopwords = set(nltk.corpus.stopwords.words('english'))

    titles = df['title'].astype(str).str.lower()

    # Mentions and URLs have to be removed *before* tokenization: NLTK's
    # tokenizer separates '@' and ':' from the neighbouring text, after which
    # these patterns can no longer match.
    # regex=True is spelled out because pandas >= 2.0 treats the pattern as a
    # literal string by default, which silently turns these calls into no-ops.
    titles = titles.str.replace(r'@[^\s]+', '', regex=True)
    titles = titles.str.replace(r'https?:\/\/.*[\r\n]*', '', regex=True)

    # Tokenize, drop stop words and glue the remaining tokens back together.
    titles = titles.apply(
        lambda text: " ".join(
            token for token in nltk.word_tokenize(text) if token not in stopwords
        )
    )

    # Digits and punctuation are removed last, as in the original pipeline.
    titles = titles.str.replace(r'\d+', '', regex=True)
    titles = titles.str.replace(r'[^\w\s]', '', regex=True)

    df['title_process'] = titles
    return df

# Clean the titles. preprocess() adds a 'title_process' column and returns the
# same DataFrame it was given.
df_data_science = preprocess(df)
df_data_science

# Vectorize the *cleaned* titles. Fitting on the raw 'title' column would throw
# away the preprocessing above (stop words, digits, URLs would all stay in the
# vocabulary), and missing titles (NaN) are not valid documents for the
# vectorizer, whereas 'title_process' is always a string.
tf_idf = TfidfVectorizer()
doc_term_matrix = tf_idf.fit_transform(df_data_science["title_process"].values)
print(doc_term_matrix)

# LDA requires us to specify the number of topics, so that is a hyperparameter
# to tweak. number_words is how many top terms are printed per topic later on.
number_topics = 3
number_words = 25
LDA = LatentDirichletAllocation(n_components = number_topics, n_jobs = -1)
LDA.fit(doc_term_matrix)

def print_topics(model, count_vectorizer, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Fitted topic model exposing ``components_``, one row of term
            weights per topic.
        count_vectorizer: Fitted vectorizer whose feature names line up with
            the columns of ``model.components_``.
        n_top_words: Number of terms to print for each topic.
    """
    # scikit-learn 1.2 removed get_feature_names() in favour of
    # get_feature_names_out(); prefer the new name and fall back to the old
    # one so that older installations keep working.
    get_names = getattr(count_vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = count_vectorizer.get_feature_names
    words = get_names()

    for topic_idx, topic in enumerate(model.components_):
        # argsort() is ascending, so reverse it and keep the first n entries
        # to get the indices of the largest weights in descending order.
        top_indices = topic.argsort()[::-1][:n_top_words]
        print("\nTopic #%d:" % topic_idx)
        print(" ".join(words[i] for i in top_indices))

# Print the top `number_words` terms of each fitted topic.
print("Topics:")
print_topics(LDA, tf_idf, number_words)

# Prepare to display the result in the Jupyter notebook.
pyLDAvis.enable_notebook()
# Build the visualization data from the fitted model, the document-term matrix
# and the vectorizer. No `mds` argument (the function used for the "distance"
# between topics) is passed here, so the library's default applies.
LDAvis = sklearn_lda.prepare(LDA, doc_term_matrix, tf_idf)

# Bare expression: presumably renders the visualization when it is the last
# statement of a notebook cell; it has no visible effect in a plain script.
LDAvis
	!pip install pyLDAvis

	import nltk
	nltk.download('stopwords')
	nltk.download('punkt')

	import os
	import re
	import nltk
	import string
	import pyLDAvis
	import numpy as np
	import pandas as pd
	import seaborn as sns
	import pyLDAvis.sklearn
	import plotly.express as px
	import matplotlib.pyplot as plt
	from wordcloud import WordCloud
	from pyLDAvis import sklearn as sklearn_lda
	from sklearn.decomposition import LatentDirichletAllocation
	from sklearn.feature_extraction.text import TfidfVectorizer

	import warnings
	warnings.filterwarnings("ignore")

	# Load the article dataset; the preprocessing below reads its 'title' column.
	df = pd.read_csv('medium_data.csv')
	# Notebook-style preview of the first 10 rows; the result is simply discarded
	# when this file is run as a plain script.
	df.head(10)

	def preprocess(df):
		"""Add a cleaned ``title_process`` column derived from ``title``.

		Each title is cast to ``str``, lower-cased, stripped of ``@mentions``
		and URLs, tokenized with NLTK, filtered against the English stop-word
		list, re-joined with single spaces, and finally stripped of digits and
		punctuation.

		Args:
			df: DataFrame with a ``title`` column. It is modified in place:
				the ``title_process`` column is added (or overwritten).

		Returns:
			The same DataFrame object that was passed in.
		"""
		# A set gives O(1) membership tests in the per-token filter below.
		stopwords = set(nltk.corpus.stopwords.words('english'))

		titles = df['title'].astype(str).str.lower()

		# Mentions and URLs have to be removed *before* tokenization: NLTK's
		# tokenizer separates '@' and ':' from the neighbouring text, after
		# which these patterns can no longer match.
		# regex=True is spelled out because pandas >= 2.0 treats the pattern
		# as a literal string by default, which makes these calls no-ops.
		titles = titles.str.replace(r'@[^\s]+', '', regex=True)
		# The '*' quantifiers are required: without them the pattern only
		# matches a scheme followed by exactly one character and a newline.
		titles = titles.str.replace(r'https?:\/\/.*[\r\n]*', '', regex=True)

		# Tokenize, drop stop words and glue the remaining tokens back together.
		titles = titles.apply(
			lambda text: " ".join(
				token for token in nltk.word_tokenize(text) if token not in stopwords
			)
		)

		# Digits and punctuation are removed last.
		titles = titles.str.replace(r'\d+', '', regex=True)
		titles = titles.str.replace(r'[^\w\s]', '', regex=True)

		df['title_process'] = titles
		return df

	# Clean the titles. preprocess() is expected to add a 'title_process' column
	# and return the DataFrame.
	df_data_science = preprocess(df)
	df_data_science

	# Vectorize the *cleaned* titles. Fitting on the raw 'title' column would
	# throw away the preprocessing above, and missing titles (NaN) are not valid
	# documents for the vectorizer, whereas 'title_process' is always a string.
	tf_idf = TfidfVectorizer()
	doc_term_matrix = tf_idf.fit_transform(df_data_science["title_process"].values)
	print(doc_term_matrix)

	# LDA requires us to specify the number of topics, so that is a
	# hyperparameter to tweak. number_words is how many top terms are printed
	# per topic later on.
	number_topics = 3
	number_words = 25
	LDA = LatentDirichletAllocation(n_components = number_topics, n_jobs = -1)
	LDA.fit(doc_term_matrix)

	def print_topics(model, count_vectorizer, n_top_words):
		"""Print the highest-weighted terms of every topic in a fitted model.

		Args:
			model: Fitted topic model exposing ``components_``, one row of
				term weights per topic.
			count_vectorizer: Fitted vectorizer whose feature names line up
				with the columns of ``model.components_``.
			n_top_words: Number of terms to print for each topic.
		"""
		# scikit-learn 1.2 removed get_feature_names() in favour of
		# get_feature_names_out(); prefer the new name and fall back to the
		# old one so that older installations keep working.
		get_names = getattr(count_vectorizer, "get_feature_names_out", None)
		if get_names is None:
			get_names = count_vectorizer.get_feature_names
		words = get_names()

		for topic_idx, topic in enumerate(model.components_):
			# argsort() is ascending, so reverse it and keep the first n
			# entries to get the largest weights in descending order.
			top_indices = topic.argsort()[::-1][:n_top_words]
			print("\nTopic #%d:" % topic_idx)
			print(" ".join(words[i] for i in top_indices))

	# Print the top `number_words` terms of each fitted topic.
	print("Topics:")
	print_topics(LDA, tf_idf, number_words)

	# Prepare to display the result in the Jupyter notebook.
	pyLDAvis.enable_notebook()
	# Build the visualization data from the fitted model, the document-term
	# matrix and the vectorizer. No `mds` argument (the function used for the
	# "distance" between topics) is passed here, so the library's default applies.
	LDAvis = sklearn_lda.prepare(LDA, doc_term_matrix, tf_idf)

	# Bare expression: presumably renders the visualization when it is the last
	# statement of a notebook cell; it has no visible effect in a plain script.
	LDAvis