Generate similarity scores between subreddits using LDA topic modeling
import gensim
from gensim.corpora import Dictionary
from gensim.models import ldamodel
from gensim.matutils import hellinger
from gensim.matutils import kullback_leibler
import pandas as pd
import praw
import nltk
from pprint import pprint
# Enter your own client_id, client_secret, username and password, or follow this quick start guide: https://github.com/reddit-archive/reddit/wiki/OAuth2-Quick-Start-Example#first-steps
reddit = praw.Reddit(
    user_agent='Comment Extraction (by /u/USERNAME)',
    client_id='enter_here',
    client_secret='enter_here',
    username='enter_here',
    password='enter_here',
)
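# Optional sanity check (a sketch; assumes the credentials above are valid):
# print(reddit.user.me())  # prints the authenticated username on success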
# Convert large .txt corpus into list
# You can train your model on a selected corpus, or use the one I use below: https://github.com/joshstrupp/subreddit-sentiment-analysis-generator/blob/master/TheStory.txt
with open('TheStory.txt', 'r') as story_file:
    res = [sub.split() for sub in story_file]
# NLTK's English stopword list, extended with corpus-specific noise tokens
# (run nltk.download('stopwords') once if the list is not yet installed)
stopwords = nltk.corpus.stopwords.words('english')
newStopWords = ['like', 'For', '&', 'also', 'To', 'said', 'took', '*', 'This', 'It',
                'de', 'The', 'He', 'She', 'They', 'I', '2', 'Robert', 'A', '.', ',',
                'In', 'one', 'two', '-']
stopwords.extend(newStopWords)
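# Tokens are never lowercased in this script, which is why capitalized words
# like 'The' appear in newStopWords. A sketch of the alternative (not applied below):
# res = [[w.lower() for w in sub] for sub in res]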
# Drop stopwords, keeping one token list per line of the corpus
story_filtered_sentence = []
for sub in res:
    sublist = [w for w in sub if w not in stopwords]
    story_filtered_sentence.append(sublist)
# Apply the same cleaning to subreddit #1: write the 50 hottest post titles to a file
subreddit = reddit.subreddit('apple')
posts = subreddit.hot(limit=50)
with open('%s_hot.txt' % subreddit, 'w') as file:
    for post in posts:
        file.write(post.title + '\n')
with open('%s_hot.txt' % subreddit, 'r') as subreddit_file:
    res = [sub.split() for sub in subreddit_file]
subreddit1_filtered_sentence = []
for sub in res:
    for w in sub:
        if w not in stopwords:
            subreddit1_filtered_sentence.append(w)
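# Note: unlike story_filtered_sentence (one token list per corpus line), each
# subreddit's tokens are collected into a single flat list, so the whole
# subreddit is later scored as one document.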
# Repeat the process for subreddit #2
subreddit = reddit.subreddit('microsoft')
posts = subreddit.hot(limit=50)
with open('%s_hot.txt' % subreddit, 'w') as file:
    for post in posts:
        file.write(post.title + '\n')
with open('%s_hot.txt' % subreddit, 'r') as subreddit_file:
    res = [sub.split() for sub in subreddit_file]
subreddit2_filtered_sentence = []
for sub in res:
    for w in sub:
        if w not in stopwords:
            subreddit2_filtered_sentence.append(w)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Train an LDA model on the story corpus to learn topics, then infer each
# subreddit's topic distribution and compare the two for a similarity score
texts = story_filtered_sentence
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
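# Each document is now a sparse bag-of-words: a list of (token_id, count) pairs.
# For example, dictionary.doc2bow(['apple', 'apple']) yields [(id_of_apple, 2)]
# when 'apple' is already in the dictionary (unknown tokens are dropped).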
import numpy as np
np.random.seed(1)  # fix the seed so the learned topics are reproducible
# minimum_probability=1e-8 keeps every topic in the inferred vectors, so the
# distance functions below compare full distributions
model = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=5, minimum_probability=1e-8)
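# Optional: inspect the learned topics (a sketch using the pprint import above):
# pprint(model.show_topics())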
# Infer a topic distribution for each subreddit's token list
s_0 = subreddit1_filtered_sentence
s_0_bow = model.id2word.doc2bow(s_0)
s_0_lda_bow = model[s_0_bow]
e_0 = subreddit2_filtered_sentence
e_0_bow = model.id2word.doc2bow(e_0)
e_0_lda_bow = model[e_0_bow]
# Hellinger distance ranges from 0 to 1, so x reads as a percent similarity:
# the closer to 100, the more similar the two subreddits' topic mixes
x = 100 - (hellinger(e_0_lda_bow, s_0_lda_bow) * 100)
print(x)
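# The kullback_leibler import above goes unused; as a sketch, it offers an
# alternative, asymmetric divergence over the same two topic distributions
# (0 means identical; larger values mean more divergent):
kl = kullback_leibler(s_0_lda_bow, e_0_lda_bow)
print(kl)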