Get word frequencies from the comments of n subreddit pages. Most of the variables you may want to change are in main(). It prints a reddit-markdown table of the 200 most common words to the console and writes a raw dump of all words to a file. Rate limited to under 30 requests/minute. If anyone wants to play with this, or take it on, it needs a user …
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Author: github.com/Sourceless
from urllib2 import urlopen, HTTPError, URLError
from socket import timeout
from time import sleep
from collections import Counter
import json
def get_ids_json(redditlisting):
    # Accepts a reddit/subreddit listing and gets the id of each post,
    # returning them in a list.
    ids = []  # I'm annoyed by the lack of apostrophe too
    for item in redditlisting[u'data'][u'children']:
        ids.append(item[u'data'][u'id'])
    return ids
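# For reference, the listing JSON walked above has roughly this shape -- a
# sketch of only the fields this script touches, not the full reddit response:
#
#   {u'data': {u'children': [{u'data': {u'id': u'...', ...}}, ...],
#              u'after': u'...'}}
#
# so get_ids_json just collects data.children[i].data.id for each post.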
def get_comments_and_children(commentjson):
    comments = []
    #print "Parent Comment:"
    for comment in commentjson[:-1]:
        #print comment[u'data'][u'replies'][u'data'][u'children']
        #print comment[u'data'][u'id']
        if comment[u'data'][u'body'] != u'':
            comments += [comment[u'data'][u'body']]
        try:
            comments += get_comments_and_children(comment[u'data'][u'replies'][u'data'][u'children'])
        except TypeError, e:
            #print "No more children for", comment[u'data'][u'link_id'], "by", comment[u'data'][u'author']
            continue
    return comments
def get_comments(subreddit, post_id):
    # Accepts a self post/comments page and returns the text of each
    # comment in list form.
    successful = False
    while not successful:
        try:
            fh = urlopen("http://www.reddit.com/r/" + subreddit + "/comments/" + post_id + "/.json", timeout=1)
            fd = json.loads(fh.read())
            successful = True
        except (HTTPError, URLError, timeout), e:
            print e, "- retrying"
        else:
            fh.close()
        sleep(2)  # Keep to reddit's 30 requests/min rule
    comments = [fd[0][u'data'][u'children'][0][u'data'][u'selftext']]  # add the selftext onto the comments list
    # gives an empty string for non-self posts
    comments += get_comments_and_children(fd[1][u'data'][u'children'])
    return comments
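# For reference, the /comments/<post_id>/.json response used above is a
# two-element list (again, only the parts this script reads):
#   fd[0] -- a listing whose single child is the submission itself, hence
#            fd[0][u'data'][u'children'][0][u'data'][u'selftext']
#   fd[1] -- a listing of the top-level comments, which
#            get_comments_and_children() then walks recursively.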
## Don't need! Same as just reading the last comment's id
##def get_next_page_id(redditlisting):
##    # Finds the id of the next page and returns it
##    return redditlisting[u'data'][u'after']
def json_load_reddit(subreddit, after_comment_id=""):  # Should merge with get_comments
    successful = False
    while not successful:
        try:
            fh = urlopen("http://www.reddit.com/r/" + subreddit + "/.json?after=" + after_comment_id, timeout=1)  # need to make a proper user agent
            fd = json.loads(fh.read())
            successful = True
        except (HTTPError, URLError, timeout), e:
            print e, "- retrying"
        else:
            fh.close()
        sleep(2)  # In keeping with reddit's rules
    return fd
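# A minimal sketch of the "proper user agent" TODO above: wrap the URL in a
# urllib2.Request so reddit sees a descriptive User-Agent instead of the
# default one. This helper (and its header string) is an assumption, not part
# of the original script; adjust the string to identify your own bot/account.
def urlopen_with_agent(url, timeout_s=1):
    from urllib2 import Request  # local import; only this sketch needs it
    req = Request(url, headers={'User-Agent': 'subreddit-word-freq/0.1 (sketch)'})
    return urlopen(req, timeout=timeout_s)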
def main():
    subreddit = "intp"  # CHANGE THIS... potential other mbti types that want to use it... or anyone else...
    numpages = 10  # Multiplies the running time!
    ids = []
    last_comment_id = ""
    for _ in xrange(numpages):
        page = json_load_reddit(subreddit, last_comment_id)  # Get the next page of the listing
        ids += get_ids_json(page)
        last_comment_id = ids[-1]
    comments = []
    for thread in ids:
        print "Scraping thread:", thread
        comments += get_comments(subreddit, thread)
    print "\nFetched", len(comments), "comments in", len(ids), "threads."
    cnt = Counter()
    fh = open(unicode('raw_dump.txt'), 'w')
    for comment in comments:
        for char in comment:
            try:
                fh.write(char)
            except (UnicodeDecodeError, UnicodeEncodeError), e:
                print e  # this was really pissing me off... so losing performance ftw
        fh.write('\n')
    fh.close()
    for comment in comments:
        _comment = filter(lambda c: c.isalpha() or c in (' ', '\t', '\n', '/', ':'), comment)  # Get rid of non-alphabetic characters, except for spaces and linkymathings (will regex out later, cba to now)
        for word in _comment.split(' '):
            cnt[word.lower()] += 1  # Count stuff
    del cnt['']  # Get rid of empties, of course
    longest = max(map(len, cnt.keys()))  # currently unused
    highest = max(cnt.values())  # currently unused
    print "word|times\n---:|:---"
    for word, value in cnt.most_common(200):  # can totally change the number here... but things got boring after a while
        print word + "|" + str(value)


if __name__ == '__main__':
    main()
Also unicode problems. Btw, usual running time for 1 page is ~1 minute, in 26 requests. There's probably a much better way to do this.
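A minimal sketch of one way around the unicode write errors (assuming a UTF-8 dump is acceptable): open raw_dump.txt with codecs.open and write whole comments, instead of main()'s character-by-character loop with per-character exception handling. json.loads already returns unicode strings, so they only need encoding once on write. The helper name below is illustrative, not part of the original script.

    import codecs

    def dump_comments(comments, path='raw_dump.txt'):
        # Encode everything as UTF-8 in one place instead of catching
        # UnicodeEncodeError per character.
        fh = codecs.open(path, 'w', encoding='utf-8')
        for comment in comments:
            fh.write(comment)
            fh.write(u'\n')
        fh.close()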