Get word frequencies from the comments of n subreddit pages. Most of the variables you may want to change are in main(). It prints a reddit-markdown table of the 200 most common words to the console and writes a raw dump of all words to a file. Rate limited to under 30 requests/minute. If anyone wants to play with this, or take it on, it needs a user …
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Author: github.com/Sourceless
from urllib2 import urlopen, HTTPError, URLError
from socket import timeout
from time import sleep
from collections import Counter
import json
def get_ids_json(redditlisting):
    # Accepts a reddit/subreddit listing and gets the id of each post,
    # returning them in a list.
    ids = []  # I'm annoyed by the lack of apostrophe too
    for item in redditlisting[u'data'][u'children']:
        ids.append(item[u'data'][u'id'])
    return ids
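# For reference, the listing JSON walked above has roughly this shape -- a
# sketch of only the fields this script touches, not the full reddit response:
#
#   {u'data': {u'children': [{u'data': {u'id': u'...', ...}}, ...],
#              u'after': u'...'}}
#
# so get_ids_json just collects data.children[i].data.id for each post.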
def get_comments_and_children(commentjson):
    comments = []
    #print "Parent Comment:"
    for comment in commentjson[:-1]:
        #print comment[u'data'][u'replies'][u'data'][u'children']
        #print comment[u'data'][u'id']
        if comment[u'data'][u'body'] != u'':
            comments += [comment[u'data'][u'body']]
        try:
            comments += get_comments_and_children(comment[u'data'][u'replies'][u'data'][u'children'])
        except TypeError, e:
            #print "No more children for", comment[u'data'][u'link_id'], "by", comment[u'data'][u'author']
            continue
    return comments
def get_comments(subreddit, post_id):
    # Accepts a self post/comments page and returns the text of each
    # comment in list form.
    successful = False
    while not successful:
        try:
            fh = urlopen("http://www.reddit.com/r/" + subreddit + "/comments/" + post_id + "/.json", timeout=1)
            fd = json.loads(fh.read())
            successful = True
        except (HTTPError, URLError, timeout), e:
            print e, "- retrying"
        else:
            fh.close()
        sleep(2)  # Keep to reddit's 30 requests/min rule
    comments = [fd[0][u'data'][u'children'][0][u'data'][u'selftext']]  # add the selftext onto the comments list
    # gives an empty string for non-self posts
    comments += get_comments_and_children(fd[1][u'data'][u'children'])
    return comments
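# For reference, the /comments/<post_id>/.json response used above is a
# two-element list (again, only the parts this script reads):
#   fd[0] -- a listing whose single child is the submission itself, hence
#            fd[0][u'data'][u'children'][0][u'data'][u'selftext']
#   fd[1] -- a listing of the top-level comments, which
#            get_comments_and_children() then walks recursively.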
## Don't need! Same as just reading the last comment's id
##def get_next_page_id(redditlisting):
##    # Finds the id of the next page and returns it
##    return redditlisting[u'data'][u'after']
def json_load_reddit(subreddit, after_comment_id=""):  # Should merge with get_comments
    successful = False
    while not successful:
        try:
            fh = urlopen("http://www.reddit.com/r/" + subreddit + "/.json?after=" + after_comment_id, timeout=1)  # need to make a proper user agent
            fd = json.loads(fh.read())
            successful = True
        except (HTTPError, URLError, timeout), e:
            print e, "- retrying"
        else:
            fh.close()
        sleep(2)  # In keeping with reddit's rules
    return fd
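# A minimal sketch of the "proper user agent" TODO above: wrap the URL in a
# urllib2.Request so reddit sees a descriptive User-Agent instead of the
# default one. This helper (and its header string) is an assumption, not part
# of the original script; adjust the string to identify your own bot/account.
def urlopen_with_agent(url, timeout_s=1):
    from urllib2 import Request  # local import; only this sketch needs it
    req = Request(url, headers={'User-Agent': 'subreddit-word-freq/0.1 (sketch)'})
    return urlopen(req, timeout=timeout_s)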
def main():
    subreddit = "intp"  # CHANGE THIS... potential other mbti types that want to use it... or anyone else...
    numpages = 10  # Multiplies the running time!
    ids = []
    last_comment_id = ""
    for _ in xrange(numpages):
        page = json_load_reddit(subreddit, last_comment_id)  # Get the next page of the listing
        ids += get_ids_json(page)
        last_comment_id = ids[-1]
    comments = []
    for thread in ids:
        print "Scraping thread:", thread
        comments += get_comments(subreddit, thread)
    print "\nFetched", len(comments), "comments in", len(ids), "threads."
    cnt = Counter()
    fh = open(unicode('raw_dump.txt'), 'w')
    for comment in comments:
        for char in comment:
            try:
                fh.write(char)
            except (UnicodeDecodeError, UnicodeEncodeError), e:
                print e  # this was really pissing me off... so losing performance ftw
        fh.write('\n')
    fh.close()
    for comment in comments:
        _comment = filter(lambda c: c.isalpha() or c in (' ', '\t', '\n', '/', ':'), comment)  # Get rid of non-alphabetic characters, except for spaces and linkymathings (will regex out later, cba to now)
        for word in _comment.split(' '):
            cnt[word.lower()] += 1  # Count stuff
    del cnt['']  # Get rid of empties, of course
    longest = max(map(len, cnt.keys()))  # currently unused
    highest = max(cnt.values())  # currently unused
    print "word|times\n---:|:---"
    for word, value in cnt.most_common(200):  # can totally change the number here... but things got boring after a while
        print word + "|" + str(value)


if __name__ == '__main__':
    main()
Also unicode problems. Btw, usual running time for 1 page is ~1 minute, in 26 requests. There's probably a much better way to do this.
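A minimal sketch of one way around the unicode write errors (assuming a UTF-8 dump is acceptable): open raw_dump.txt with codecs.open and write whole comments, instead of main()'s character-by-character loop with per-character exception handling. json.loads already returns unicode strings, so they only need encoding once on write. The helper name below is illustrative, not part of the original script.

    import codecs

    def dump_comments(comments, path='raw_dump.txt'):
        # Encode everything as UTF-8 in one place instead of catching
        # UnicodeEncodeError per character.
        fh = codecs.open(path, 'w', encoding='utf-8')
        for comment in comments:
            fh.write(comment)
            fh.write(u'\n')
        fh.close()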