@Sourceless
Last active December 11, 2015 10:38
Get word frequencies from the comments of n subreddit pages. Most of the variables you may want to change are in main(). It prints a reddit-markdown table of the 200 most common words to the console and writes a raw dump of all the comment text to raw_dump.txt. Rate limited to under 30 requests/minute. If anyone wants to play with this, or take it on, it needs a user …
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Author: github.com/Sourceless
from urllib2 import urlopen, HTTPError, URLError
from socket import timeout
from time import sleep
from collections import Counter
import json


def get_ids_json(redditlisting):
    # Accepts a reddit/subreddit listing and gets the id of each post,
    # returning them in a list.
    ids = []  # I'm annoyed by the lack of apostrophe too
    for item in redditlisting[u'data'][u'children']:
        ids.append(item[u'data'][u'id'])
    return ids
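
# For reference, a listing page's JSON has roughly this shape (a minimal sketch
# showing only the fields this script touches; values are placeholders):
#     {u'data': {u'after': u't3_xxxxx',
#                u'children': [{u'data': {u'id': u'xxxxx'}}, ...]}}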


def get_comments_and_children(commentjson):
    # Recursively collects the body of each comment and of its replies.
    comments = []
    #print "Parent Comment:"
    for comment in commentjson[:-1]:  # the last child is skipped; it is often a 'more' stub with no body
        #print comment[u'data'][u'replies'][u'data'][u'children']
        #print comment[u'data'][u'id']
        if comment[u'data'][u'body'] != u'':
            comments += [comment[u'data'][u'body']]
        try:
            comments += get_comments_and_children(comment[u'data'][u'replies'][u'data'][u'children'])
        except TypeError, e:
            # 'replies' is an empty string when a comment has no children
            #print "No more children for", comment[u'data'][u'link_id'], "by", comment[u'data'][u'author']
            continue
    return comments


def get_comments(subreddit, post_id):
    # Accepts a self post/comments page and returns the text of each
    # comment in list form.
    successful = False
    while not successful:
        try:
            fh = urlopen("http://www.reddit.com/r/" + subreddit + "/comments/" + post_id + "/.json", timeout=1)
            fd = json.loads(fh.read())
            successful = True
        except (HTTPError, URLError, timeout), e:
            print e, "- retrying"
        else:
            fh.close()
        sleep(2)  # Keep to reddit's 30 requests/min rule
    comments = [fd[0][u'data'][u'children'][0][u'data'][u'selftext']]  # add the selftext onto the comments list;
                                                                       # gives an empty string for non-self posts
    comments += get_comments_and_children(fd[1][u'data'][u'children'])
    return comments
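
# For reference, the comments-page JSON fetched above is a two-element list
# (a rough sketch of only the parts accessed here):
#     fd[0] -> listing for the post itself (fd[0]['data']['children'][0]['data']['selftext'])
#     fd[1] -> listing of top-level comments (fd[1]['data']['children'])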


## Don't need! Same as just reading the last comment's id
##def get_next_page_id(redditlisting):
##    # Finds the id of the next page and returns it
##    return redditlisting[u'data'][u'after']


def json_load_reddit(subreddit, after_comment_id=""):  # Should merge with get_comments
    successful = False
    while not successful:
        try:
            fh = urlopen("http://www.reddit.com/r/" + subreddit + "/.json?after=" + after_comment_id, timeout=1)  # need to make a proper user agent
            fd = json.loads(fh.read())
            successful = True
        except (HTTPError, URLError, timeout), e:
            print e, "- retrying"
        else:
            fh.close()
        sleep(2)  # In keeping with reddit's rules
    return fd


def main():
    subreddit = "intp"  # CHANGE THIS... potential other mbti types that want to use it... or anyone else...
    numpages = 10  # Multiplies the running time!

    ids = []
    last_comment_id = ""
    for _ in xrange(numpages):
        page = json_load_reddit(subreddit, last_comment_id)  # Fetch the next listing page
        ids += get_ids_json(page)
        last_comment_id = "t3_" + ids[-1]  # reddit's 'after' parameter expects a fullname ("t3_" + post id)

    comments = []
    for thread in ids:
        print "Scraping thread:", thread
        comments += get_comments(subreddit, thread)
    print "\nFetched", len(comments), "comments in", len(ids), "threads."

    cnt = Counter()
    fh = open('raw_dump.txt', 'w')
    for comment in comments:
        for char in comment:
            try:
                fh.write(char)
            except (UnicodeDecodeError, UnicodeEncodeError), e:
                print e  # this was really pissing me off... so losing performance ftw
        fh.write('\n')
    fh.close()

    for comment in comments:
        _comment = filter(lambda c: c.isalpha() or c in (' ', '\t', '\n', '/', ':'), comment)  # Get rid of non-alphabetic characters, except for spaces and linkymathings (will regex out later, cba to now)
        for word in _comment.split(' '):
            cnt[word.lower()] += 1  # Count stuff
    del cnt['']  # Get rid of empties, of course

    longest = max(map(len, cnt.keys()))  # currently unused
    highest = max(cnt.values())          # currently unused

    print "word|times\n---:|:---"
    for word, value in cnt.most_common(200):  # can totally change the number here... but things got boring after a while
        print word + "|" + str(value)


if __name__ == '__main__':
    main()
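
The "need to make a proper user agent" TODO in json_load_reddit could be handled with urllib2.Request; a minimal sketch, not part of the gist, with the user-agent string only as an example:

from urllib2 import Request, urlopen

# Identify the script to reddit explicitly instead of relying on urllib2's default user agent.
req = Request("http://www.reddit.com/r/intp/.json",
              headers={'User-Agent': 'subreddit-word-frequencies script (contact: your-reddit-username)'})
fh = urlopen(req, timeout=1)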
@Sourceless
Author

Also unicode problems. Btw, usual running time for 1 page is ~1 minute, in 26 requests. There's probably a much better way to do this.
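
One way around the unicode errors when writing raw_dump.txt (a sketch against the comments list built in main(), not something the gist currently does) is to open the file with an explicit encoding and write whole comments at once:

import codecs

# Encode everything as UTF-8 on write instead of catching encode errors per character.
fh = codecs.open('raw_dump.txt', 'w', encoding='utf-8')
for comment in comments:
    fh.write(comment + u'\n')
fh.close()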
