# Generate similarity scores between subreddits using LDA topic modeling
import gensim
from gensim.corpora import Dictionary
from gensim.models import ldamodel
from gensim.matutils import hellinger
from gensim.matutils import kullback_leibler
import pandas as pd
import praw
import nltk
from pprint import pprint
# Authenticated Reddit client used for every subreddit query in this script.
# Replace each 'enter_here' placeholder with your own API credentials (the
# PRAW quick start guide explains how to obtain them). Do not commit real
# credentials to version control.
reddit = praw.Reddit(
    user_agent='Comment Extraction (by /u/USERNAME)',
    client_id='enter_here',
    client_secret='enter_here',
    username='enter_here',
    password='enter_here',
)
# Load the training corpus: one list of whitespace-separated tokens per line.
# Any large plain-text corpus can be used; 'TheStory.txt' is the one used here.
# The context manager guarantees the file handle is closed after reading.
with open('TheStory.txt', 'r') as story_file:
    res = [sub.split() for sub in story_file]
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# NOTE: this rebinds `stopwords` from the NLTK corpus reader imported above to
# a plain list of English stop words; later steps of the script use the list.
stopwords = nltk.corpus.stopwords.words('english')
newStopWords = ['like','For','&','also','To','said','took','*','This','It','de','The','He','She','They','I','2','Robert','A','.',',','In','one','two','-','de']
# Merge the corpus-specific noise tokens into the stop-word list; previously
# `newStopWords` was defined but never applied.
stopwords.extend(newStopWords)
# Set copy for O(1) membership tests while filtering.
_story_stop_set = set(stopwords)
# Keep one filtered token list per corpus line: gensim's Dictionary expects a
# list of documents, each of which is itself a list of tokens.
story_filtered_sentence = [
    [w for w in line_tokens if w not in _story_stop_set]
    for line_tokens in res
]
def _hot_title_tokens(subreddit, stop_words):
    """Save a subreddit's post titles to disk and return their filtered tokens.

    The titles are written one per line to '<subreddit>_hot.txt' (named after
    the string form of ``subreddit``; any existing file is overwritten), read
    back, split on whitespace, and stripped of stop words.

    Args:
        subreddit: Subreddit object obtained from ``reddit.subreddit(...)``.
        stop_words: Iterable of tokens to drop.

    Returns:
        A single flat list of the remaining tokens across all titles, which
        is the shape ``doc2bow`` expects for one document.
    """
    hot_path = '%s_hot.txt' % subreddit
    # NOTE(review): the right-hand side of `posts =` was missing in the
    # original; the "hot" listing is assumed from the output file name.
    # Confirm the intended listing and limit.
    posts = subreddit.hot()
    # Explicit UTF-8 so titles with non-ASCII characters do not fail on
    # platforms whose default encoding cannot represent them.
    with open(hot_path, 'w', encoding='utf-8') as file:
        file.writelines(post.title + '\n' for post in posts)
    with open(hot_path, 'r', encoding='utf-8') as subreddit_file:
        res = [sub.split() for sub in subreddit_file]
    stop_set = set(stop_words)  # O(1) membership tests
    return [w for line_tokens in res for w in line_tokens if w not in stop_set]


# Repeat the tokenise-and-filter process for subreddit #1
subreddit = reddit.subreddit('apple')
subreddit1_filtered_sentence = _hot_title_tokens(subreddit, stopwords)
# Repeat the tokenise-and-filter process for subreddit #2
subreddit = reddit.subreddit('microsoft')
subreddit2_filtered_sentence = _hot_title_tokens(subreddit, stopwords)
import logging

import numpy as np
from gensim.models import ldamodel

# Surface gensim's progress messages while the dictionary and model are built.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Train an LDA topic model on the story corpus; each subreddit is then
# expressed as a distribution over the learned topics and the two
# distributions are compared.
texts = story_filtered_sentence
dictionary = Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))
model = ldamodel.LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=5,
    minimum_probability=1e-8,
)
# model.show_topics()
i = 0

# Bag-of-words vectors for both subreddits, built with the model's vocabulary.
s_0 = subreddit1_filtered_sentence
e_0 = subreddit2_filtered_sentence
s_0_bow, e_0_bow = (model.id2word.doc2bow(tokens) for tokens in (s_0, e_0))

# Topic distributions; subreddit #1 is inferred first, then subreddit #2.
s_0_lda_bow = model[s_0_bow]
e_0_lda_bow = model[e_0_bow]

# x is the percent similarity: the Hellinger distance lies in [0, 1], so a
# value closer to 100 means the two subreddits have more similar topic mixes.
distance = hellinger(e_0_lda_bow, s_0_lda_bow)
x = 100 - distance * 100
