Created
January 10, 2014 23:16
-
-
Save DanBrink91/8364562 to your computer and use it in GitHub Desktop.
Calculate which subreddit a title (sentence) should be posted to, based on previously posted titles.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import math | |
from collections import Counter | |
# Example sentence to classify.
sentence = "I love this game"

# reddits.json maps subreddit name -> list of previously posted titles.
# Use a context manager so the file handle is closed after loading
# (the original left it open for the life of the process).
with open('reddits.json') as json_data:
    data = json.load(json_data)

# English stop words excluded from the unigram model (duplicate 'three'
# removed; membership is set-based so lookups in the filter are O(1)).
stop_words = set([
    'a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against',
    'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always',
    'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another',
    'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are',
    'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become',
    'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being',
    'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both',
    'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'computer',
    'con', 'could', 'couldnt', 'cry', 'de', 'describe', 'detail', 'did', 'do',
    'done', 'down', 'due', 'during', 'each', 'eg', 'eight', 'either',
    'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever',
    'every', 'everyone', 'everything', 'everywhere', 'except', 'few',
    'fifteen', 'fifty', 'fill', 'find', 'fire', 'first', 'five', 'for',
    'former', 'formerly', 'forty', 'found', 'four', 'from', 'front', 'full',
    'further', 'get', 'give', 'go', 'had', 'has', 'hasnt', 'have', 'he',
    'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon',
    'hers', 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred',
    'i', 'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'it',
    'its', 'itself', 'keep', 'last', 'latter', 'latterly', 'least', 'less',
    'ltd', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine',
    'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'my',
    'myself', 'name', 'namely', 'neither', 'never', 'nevertheless', 'next',
    'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'now',
    'nowhere', 'of', 'off', 'often', 'on', 'once', 'one', 'only', 'onto',
    'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out',
    'over', 'own', 'part', 'per', 'perhaps', 'please', 'put', 'rather', 're',
    's', 'same', 'see', 'seem', 'seemed', 'seeming', 'seems', 'serious',
    'several', 'she', 'should', 'show', 'side', 'since', 'sincere', 'six',
    'sixty', 'so', 'some', 'somehow', 'someone', 'something', 'sometime',
    'sometimes', 'somewhere', 'still', 'such', 'system', 'take', 'ten',
    'than', 'that', 'the', 'their', 'them', 'themselves', 'then', 'thence',
    'there', 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon',
    'these', 'they', 'thick', 'thin', 'third', 'this', 'those', 'though',
    'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together',
    'too', 'top', 'toward', 'towards', 'twelve', 'twenty', 'two', 'un',
    'under', 'until', 'up', 'upon', 'us', 'very', 'via', 'was', 'we', 'well',
    'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where',
    'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever',
    'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole',
    'whom', 'whose', 'why', 'will', 'with', 'within', 'without', 'would',
    'yet', 'you', 'your', 'yours', 'yourself', 'yourselves'])
# Build per-subreddit word statistics:
#   word_banks[sub]['single'] - Counter of unigram counts over all of the
#       sub's titles, lowercased, with stop words removed
#   word_banks[sub]['double'] - for each word, the list of words observed
#       immediately after it in a title (stop words deliberately kept here,
#       matching the original: bigrams are built from the raw token stream)
word_banks = {}
for sub, titles in data.items():
    tokenized = [title.lower().split(' ') for title in titles]
    # Comprehension replaces filter(lambda ...); iterating per-title token
    # lists also avoids the spurious '' token the old ' '.join() produced
    # for a subreddit with no titles.
    word_banks[sub] = {
        'single': Counter(word
                          for words in tokenized
                          for word in words
                          if word not in stop_words),
        'double': {},
    }
    double = word_banks[sub]['double']
    for words in tokenized:
        # zip over adjacent pairs instead of indexing with range(len()-1);
        # setdefault replaces the manual membership check.
        for first, follower in zip(words, words[1:]):
            double.setdefault(first, []).append(follower)
# Convert raw counts into probabilities for each subreddit:
#   prob[sub]['single'] - Laplace (add-one) smoothed unigram probabilities
#   prob[sub]['double'][w1][w2] - P(w2 follows w1), maximum likelihood
prob = {}
for sub, words in word_banks.items():
    n = sum(words['single'].values())   # total counted words in this sub
    vocab = len(words['single'])        # unique words (smoothing denominator)
    single = {word: float(word_count + 1) / (n + vocab)
              for word, word_count in words['single'].items()}
    double = {}
    for word, word_list in words['double'].items():
        # Counter gives each follower's count in one O(n) pass; the old
        # nested comprehension re-scanned word_list for every element (O(n^2)).
        total = float(len(word_list))
        double[word] = {second_word: count / total
                        for second_word, count in Counter(word_list).items()}
    prob[sub] = {'single': single, 'double': double}
# Score sentence | |
# Normally you multiply the probablities but the numbers are too low here so we log and sum them | |
def find_subreddit(sentence):
    """Return the name of the subreddit whose title model best matches
    *sentence*, or "" if no subreddit's model matches any word.

    Each subreddit is scored by summing the log of the smoothed unigram
    probability of every known word in the sentence, plus the log bigram
    probability of every known adjacent word pair.  Unknown words and
    pairs are skipped.

    NOTE(review): the winner is the sub with the *lowest* (most negative)
    total, which rewards matching more words but inverts textbook naive
    Bayes (argmax of summed log probs) -- confirm this ranking is intended.
    """
    tokens = sentence.lower().split()
    last = len(tokens) - 1
    best_name = ""
    best_score = 0
    for sub, model in prob.items():
        singles = model['single']
        doubles = model['double']
        score = 0.0
        pair_score = 0.0
        for i, word in enumerate(tokens):
            if word in singles:
                score += math.log(singles[word])
            # Bigram contribution: never evaluated for the final token.
            if i < last and word in doubles and tokens[i + 1] in doubles[word]:
                pair_score += math.log(doubles[word][tokens[i + 1]])
        # Comment out the next line to score without the bigram look-ahead.
        score += pair_score
        if score < best_score:
            best_name = sub
            best_score = score
    return best_name
print find_subreddit(sentence) | |
''' | |
# testing report here | |
total = 0 | |
correct = 0 | |
for sub, titles in data.items(): | |
for title in titles: | |
total += 1 | |
if sub==find_subreddit(title): | |
correct += 1 | |
print str(correct) + "/"+str(total) | |
print float(correct)/total | |
''' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment