Generate similarity scores between subreddits using LDA topic modeling
import gensim
from gensim.corpora import Dictionary
from gensim.models import ldamodel
from gensim.matutils import hellinger
from gensim.matutils import kullback_leibler
import pandas as pd
import praw
import nltk
from pprint import pprint
# Enter your own client_id, client_secret, username and password, or follow this quick start guide: https://github.com/reddit-archive/reddit/wiki/OAuth2-Quick-Start-Example#first-steps
reddit = praw.Reddit(
    user_agent='Comment Extraction (by /u/USERNAME)',
    client_id='enter_here',
    client_secret='enter_here',
    username='enter_here',
    password='enter_here',
)
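# Optional sanity check (a sketch; assumes the credentials above are valid):
# print(reddit.user.me())  # prints the authenticated username on success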
# Convert large .txt corpus into list
# You can train your model on a selected corpus, or use the one I use below: https://github.com/joshstrupp/subreddit-sentiment-analysis-generator/blob/master/TheStory.txt
with open('TheStory.txt', 'r') as story_file:
    res = [sub.split() for sub in story_file]
# NLTK's English stopword list, extended with corpus-specific noise tokens
# (run nltk.download('stopwords') once if the list is not yet installed)
stopwords = nltk.corpus.stopwords.words('english')
newStopWords = ['like', 'For', '&', 'also', 'To', 'said', 'took', '*', 'This', 'It',
                'de', 'The', 'He', 'She', 'They', 'I', '2', 'Robert', 'A', '.', ',',
                'In', 'one', 'two', '-']
stopwords.extend(newStopWords)
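# Tokens are never lowercased in this script, which is why capitalized words
# like 'The' appear in newStopWords. A sketch of the alternative (not applied below):
# res = [[w.lower() for w in sub] for sub in res]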
# Drop stopwords, keeping one token list per line of the corpus
story_filtered_sentence = []
for sub in res:
    sublist = [w for w in sub if w not in stopwords]
    story_filtered_sentence.append(sublist)
# Apply the same cleaning to subreddit #1: write the 50 hottest post titles to a file
subreddit = reddit.subreddit('apple')
posts = subreddit.hot(limit=50)
with open('%s_hot.txt' % subreddit, 'w') as file:
    for post in posts:
        file.write(post.title + '\n')
with open('%s_hot.txt' % subreddit, 'r') as subreddit_file:
    res = [sub.split() for sub in subreddit_file]
subreddit1_filtered_sentence = []
for sub in res:
    for w in sub:
        if w not in stopwords:
            subreddit1_filtered_sentence.append(w)
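# Note: unlike story_filtered_sentence (one token list per corpus line), each
# subreddit's tokens are collected into a single flat list, so the whole
# subreddit is later scored as one document.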
# Repeat the process for subreddit #2
subreddit = reddit.subreddit('microsoft')
posts = subreddit.hot(limit=50)
with open('%s_hot.txt' % subreddit, 'w') as file:
    for post in posts:
        file.write(post.title + '\n')
with open('%s_hot.txt' % subreddit, 'r') as subreddit_file:
    res = [sub.split() for sub in subreddit_file]
subreddit2_filtered_sentence = []
for sub in res:
    for w in sub:
        if w not in stopwords:
            subreddit2_filtered_sentence.append(w)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Train an LDA model on the story corpus to learn topics, then infer each
# subreddit's topic distribution and compare the two for a similarity score
texts = story_filtered_sentence
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
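# Each document is now a sparse bag-of-words: a list of (token_id, count) pairs.
# For example, dictionary.doc2bow(['apple', 'apple']) yields [(id_of_apple, 2)]
# when 'apple' is already in the dictionary (unknown tokens are dropped).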
import numpy as np
np.random.seed(1)  # fix the seed so the learned topics are reproducible
# minimum_probability=1e-8 keeps every topic in the inferred vectors, so the
# distance functions below compare full distributions
model = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=5, minimum_probability=1e-8)
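# Optional: inspect the learned topics (a sketch using the pprint import above):
# pprint(model.show_topics())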
# Infer a topic distribution for each subreddit's token list
s_0 = subreddit1_filtered_sentence
s_0_bow = model.id2word.doc2bow(s_0)
s_0_lda_bow = model[s_0_bow]
e_0 = subreddit2_filtered_sentence
e_0_bow = model.id2word.doc2bow(e_0)
e_0_lda_bow = model[e_0_bow]
# Hellinger distance ranges from 0 to 1, so x reads as a percent similarity:
# the closer to 100, the more similar the two subreddits' topic mixes
x = 100 - (hellinger(e_0_lda_bow, s_0_lda_bow) * 100)
print(x)
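# The kullback_leibler import above goes unused; as a sketch, it offers an
# alternative, asymmetric divergence over the same two topic distributions
# (0 means identical; larger values mean more divergent):
kl = kullback_leibler(s_0_lda_bow, e_0_lda_bow)
print(kl)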