Created
February 2, 2014 09:04
-
-
Save nikolak/8765020 to your computer and use it in GitHub Desktop.
Build a dictionary mapping each subreddit to the upvotes and downvotes earned there; works for the last ~1000 comments.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
import json | |
import time | |
import urllib | |
import httplib2 | |
from urllib import urlencode | |
from urlparse import urlparse, urlunparse, parse_qs | |
from optparse import OptionParser | |
from xml.sax.saxutils import escape as escape_html | |
from xml.sax.saxutils import unescape as unescape_html | |
# Tuning knobs -- keep these conservative so we don't hammer reddit's API.
fetch_size = 100      # items per request; reddit caps listings at 100 anyway
sleep_time = 1        # seconds to pause between successive requests
request_limit = None  # maximum number of requests to issue (None = unlimited)
debug = False         # when True, each fetched URL is logged to stderr

# Shared HTTP client, used by login() and by authenticated page fetches.
http = httplib2.Http()
def login(username, password): | |
url = 'http://www.reddit.com/api/login/%s' % username | |
body = {'user': username, | |
'passwd': password} | |
headers = {'Content-type': 'application/x-www-form-urlencoded'} | |
try: | |
response, content = http.request(url, 'POST', headers=headers, body=urlencode(body)) | |
except Exception, e: | |
print "Could not login" | |
print e | |
sys.exit(1) | |
return response['set-cookie'] | |
def get_links(sourceurl, login_cookie = '', requests = 0):
    '''
    Given a reddit JSON URL, yield the JSON Link API objects,
    following 'after' pagination links.

    sourceurl    -- reddit listing URL; its path must end in '/' or '.json'
    login_cookie -- optional session cookie from login(); when non-empty
                    the request is made with it
    requests     -- number of requests already issued (counted against the
                    module-level request_limit)

    Sleeps sleep_time seconds between page fetches and stops once reddit
    reports no further 'after' page or request_limit requests were made.

    Iterative rewrite of the original recursive version: recursion grew
    the stack by one frame per page; a loop does not. The unauthenticated
    urlopen handle is now also closed explicitly instead of leaked.
    '''
    # Rip apart the URL, make sure it has .json at the end, and set
    # the page-size limit.
    scheme, host, path, params, query, fragment = urlparse(sourceurl)
    parsed_params = parse_qs(query) if query else {}
    parsed_params['limit'] = [fetch_size]
    fragment = None  # erase the fragment, we don't use it
    assert path.endswith('.json') or path.endswith('/')
    if path.endswith('/'):
        path = path + '.json'
    after = None  # fullname of the last item of the previous page
    while True:
        page_params = parsed_params.copy()
        if after is not None:
            page_params['after'] = [after]
        url = urlunparse((scheme, host, path, params,
                          urlencode(page_params, doseq = True), fragment))
        if debug:
            sys.stderr.write('fetching %s\n' % url)
        if login_cookie:
            response, text = http.request(url, 'GET',
                                          headers={'Cookie': login_cookie})
        else:
            handle = urllib.urlopen(url)
            try:
                text = handle.read()
            finally:
                handle.close()
        parsed = json.loads(text)
        # there may be multiple listings, like on a comments-page, but we
        # can only export from pages with one listing
        assert parsed['kind'] == 'Listing'
        listing = parsed['data']
        for child in listing.get('children', []):
            yield child
        requests += 1
        after = listing.get('after', None)
        # Stop when reddit has no more pages or the request budget is
        # spent (same count as the original's `requests < request_limit - 1`
        # pre-recursion check).
        if not after or (request_limit is not None
                         and requests >= request_limit):
            return
        # please don't hurt reddit
        time.sleep(sleep_time)
def main(sourceurl, username = None, password = None): | |
''' | |
Given a reddit JSON url, yield unicode strings that represent the | |
exported HTML | |
''' | |
cookie = None | |
karma={} | |
# subreddit str: { ups: int, | |
# downs: int} | |
if username and password: | |
cookie = login(username, password) | |
for link in get_links(sourceurl, cookie): | |
data = link['data'] | |
if link['kind'] == 't3': | |
# links | |
pass | |
elif link['kind'] == 't1': | |
# comments | |
subreddit = escape_html(data['subreddit']) | |
ups = int(data['ups']) | |
downs = int(data['downs']) | |
try: | |
karma[subreddit]['ups']+=ups | |
karma[subreddit]['downs']+=downs | |
except: | |
karma[subreddit]={'ups':ups, | |
'downs':downs} | |
else: | |
raise TypeError("I don't know how to decode %r" % link) | |
for k,v in karma.items(): | |
print "/r/{} Upvotes: {}, Downvotes: {}, Karma: {}".format(k, | |
v['ups'],v['downs'],v['ups']-v['downs']) | |
if __name__ == '__main__':
    # NOTE(review): credentials are hard-coded; prefer reading them from
    # the environment or prompting, so the password never lands in the file.
    username = 'wub_wub'
    password = 'your_password'
    sourceurl = "http://www.reddit.com/user/wub_wub/"
    main(sourceurl, username, password)
    # (removed: a dead `debug = False` that ran after main() finished and
    # therefore had no effect)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment