Created
July 22, 2020 18:45
-
-
Save joshstrupp/e80343b23127ae6b0b83e6149ee9aafb to your computer and use it in GitHub Desktop.
Generate similarity scores between subreddits using LDA topic modeling
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gensim | |
from gensim.corpora import Dictionary | |
from gensim.models import ldamodel | |
from gensim.matutils import hellinger | |
from gensim.matutils import kullback_leibler | |
import pandas as pd | |
import praw | |
import nltk | |
from pprint import pprint | |
# Build the Reddit API client. Replace every 'enter_here' placeholder with your own
# client_id, client_secret, username and password, or follow this quick start guide:
# https://github.com/reddit-archive/reddit/wiki/OAuth2-Quick-Start-Example#first-steps
reddit = praw.Reddit(
    user_agent='Comment Extraction (by /u/USERNAME)',
    client_id='enter_here',
    client_secret="enter_here",
    username='enter_here',
    password='enter_here',
)
# Convert the large .txt training corpus into a list of token lists, one per line.
# You can train the model on a corpus of your choice, or use the one used here:
# https://github.com/joshstrupp/subreddit-sentiment-analysis-generator/blob/master/TheStory.txt
# The context manager closes the file as soon as every line has been tokenised.
with open('TheStory.txt', 'r') as story_file:
    res = [sub.split() for sub in story_file]

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# NOTE: from here on `stopwords` is a plain list of words (it rebinds the name of
# the module imported above). It stays a list because the later subreddit
# filtering steps in this script test membership against it.
stopwords = nltk.corpus.stopwords.words('english')
# Extra tokens to discard on top of NLTK's English list. Matching is
# case-sensitive and tokens are never lower-cased, hence the capitalised entries.
newStopWords = [
    'like', 'For', '&', 'also', 'To', 'said', 'took', '*', 'This', 'It', 'de',
    'The', 'He', 'She', 'They', 'I', '2', 'Robert', 'A', '.', ',', 'In', 'one',
    'two', '-',
]
stopwords.extend(newStopWords)

# A set gives O(1) membership tests for the per-token filter below.
_stopword_set = set(stopwords)

# One filtered token list per corpus line; lines that lose every token are kept
# as empty lists, exactly as the original loop did.
story_filtered_sentence = [
    [w for w in line if w not in _stopword_set]
    for line in res
]
def _hot_title_tokens(reddit_client, subreddit_name, stop_words, limit=50):
    """Return the stopword-filtered tokens of a subreddit's hot post titles.

    The titles are also written, one per line, to a file in the current
    working directory whose name is produced by formatting the subreddit
    object into '%s_hot.txt'. Any previous contents are overwritten.

    Args:
        reddit_client: Object exposing ``subreddit(name)``; in this script the
            praw client.
        subreddit_name: Name of the subreddit to query.
        stop_words: Iterable of tokens to discard (case-sensitive match).
        limit: Maximum number of hot posts to request.

    Returns:
        A flat list of whitespace-separated title tokens that are not in
        ``stop_words``, in the order they appear in the titles.
    """
    subreddit = reddit_client.subreddit(subreddit_name)
    path = '%s_hot.txt' % subreddit

    # A single write-mode open both creates and truncates the file, so the
    # extra append-mode open the script used to do is unnecessary.
    with open(path, 'w') as out_file:
        for post in subreddit.hot(limit=limit):
            out_file.write(post.title + '\n')

    # Tokenise from the file that was just written, as the script always did,
    # and close the handle once the lines have been read.
    with open(path, 'r') as in_file:
        lines = [line.split() for line in in_file]

    blocked = set(stop_words)  # O(1) membership tests
    return [w for line in lines for w in line if w not in blocked]


# Repeat process for subreddit #1
subreddit1_filtered_sentence = _hot_title_tokens(reddit, 'apple', stopwords)

# Repeat process for subreddit #2
subreddit2_filtered_sentence = _hot_title_tokens(reddit, 'microsoft', stopwords)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Train an LDA topic model on the story corpus, then express each subreddit's
# filtered post-title tokens as a topic distribution and compare the two.
texts = story_filtered_sentence
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

import numpy as np
np.random.seed(1)
from gensim.models import ldamodel
# random_state pins the model's own random generator so repeated runs learn the
# same topics without depending solely on the global NumPy seed above.
# minimum_probability is set very low so that low-probability topics are not
# filtered out of the distributions compared below.
model = ldamodel.LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=5,
    minimum_probability=1e-8,
    random_state=1,
)

# Topic distribution of subreddit #1. Tokens unknown to the training
# dictionary are ignored by doc2bow.
s_0 = subreddit1_filtered_sentence
s_0_bow = model.id2word.doc2bow(s_0)
s_0_lda_bow = model[s_0_bow]

# Topic distribution of subreddit #2.
e_0 = subreddit2_filtered_sentence
e_0_bow = model.id2word.doc2bow(e_0)
e_0_lda_bow = model[e_0_bow]

# Hellinger distance between two probability distributions lies in [0, 1], so
# x is a percent similarity: the closer to 100, the more similar the subreddits.
x = 100 - (hellinger(e_0_lda_bow, s_0_lda_bow)*100)
print(x)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment