@nikolak
Created February 2, 2014 09:04
Build a dictionary with subreddits as keys and the upvotes and downvotes earned in each as values; works for the last ~1000 comments (reddit's listing limit).
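The end result is a dictionary keyed by subreddit name. A minimal sketch of its shape (the subreddit names and counts below are purely illustrative, not real data):

karma = {
    'AskReddit': {'ups': 42, 'downs': 5},
    'Python': {'ups': 17, 'downs': 2},
}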
#!/usr/bin/env python
import sys
import json
import time
import urllib
import httplib2
from urllib import urlencode
from urlparse import urlparse, urlunparse, parse_qs
from optparse import OptionParser
from xml.sax.saxutils import escape as escape_html
from xml.sax.saxutils import unescape as unescape_html
# please don't hurt reddit
fetch_size = 100 # the higher the better, but reddit ignores +100
sleep_time = 1 # in seconds. how long to sleep between
# requests. higher is better
request_limit = None # how many requests to make to reddit before
# stopping (set to None to disable)
debug = False
http = httplib2.Http()
def login(username, password):
    url = 'http://www.reddit.com/api/login/%s' % username
    body = {'user': username,
            'passwd': password}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    try:
        response, content = http.request(url, 'POST', headers=headers, body=urlencode(body))
    except Exception, e:
        print "Could not login"
        print e
        sys.exit(1)
    return response['set-cookie']
def get_links(sourceurl, login_cookie = '', requests = 0):
    '''
    Given a reddit JSON URL, yield the JSON Link API objects,
    following 'after' links
    '''
    # rip apart the URL, make sure it has .json at the end, and set
    # the limit
    scheme, host, path, params, query, fragment = urlparse(sourceurl)
    parsed_params = parse_qs(query) if query else {}
    parsed_params['limit'] = [fetch_size]
    fragment = None # erase the fragment, we don't use it
    assert path.endswith('.json') or path.endswith('/')
    if path.endswith('/'):
        path = path + '.json'
    new_urltuple = (scheme, host, path, params,
                    urlencode(parsed_params, doseq = True), fragment)
    composed_sourceurl = urlunparse(new_urltuple)
    if debug:
        sys.stderr.write('fetching %s\n' % composed_sourceurl)
    if login_cookie:
        headers = {'Cookie': login_cookie}
        response, text = http.request(composed_sourceurl, 'GET', headers=headers)
    else:
        text = urllib.urlopen(composed_sourceurl).read()
    parsed = json.loads(text)
    # there may be multiple listings, like on a comments-page, but we
    # can only export from pages with one listing
    assert parsed['kind'] == 'Listing'
    listing = parsed['data']
    for child in listing.get('children', []):
        yield child
    if (listing.get('after', None)
            and (request_limit is None
                 or requests < request_limit - 1)):
        after_parsed_params = parsed_params.copy()
        after_parsed_params['after'] = [listing['after']]
        after_urltuple = (scheme, host, path, params,
                          urlencode(after_parsed_params, doseq = True),
                          fragment)
        after_sourceurl = urlunparse(after_urltuple)
        time.sleep(sleep_time)
        # yes, this is recursive, but if you're making enough requests
        # to blow out your stack, you're probably hurting reddit
        for link in get_links(after_sourceurl, login_cookie, requests+1):
            yield link
def main(sourceurl, username = None, password = None):
    '''
    Given a reddit JSON URL, tally upvotes and downvotes per subreddit
    for the user's comments and print the totals
    '''
    cookie = None
    karma = {}
    # karma maps subreddit (str) to {'ups': int, 'downs': int}
    if username and password:
        cookie = login(username, password)
    for link in get_links(sourceurl, cookie):
        data = link['data']
        if link['kind'] == 't3':
            # links (submissions) -- ignored, only comment karma is counted
            pass
        elif link['kind'] == 't1':
            # comments
            subreddit = escape_html(data['subreddit'])
            ups = int(data['ups'])
            downs = int(data['downs'])
            try:
                karma[subreddit]['ups'] += ups
                karma[subreddit]['downs'] += downs
            except KeyError:
                karma[subreddit] = {'ups': ups,
                                    'downs': downs}
        else:
            raise TypeError("I don't know how to decode %r" % link)
    for k, v in karma.items():
        print "/r/{} Upvotes: {}, Downvotes: {}, Karma: {}".format(k,
            v['ups'], v['downs'], v['ups'] - v['downs'])
if __name__ == '__main__':
    username = 'wub_wub'
    password = 'your_password'
    sourceurl = "http://www.reddit.com/user/wub_wub/"
    main(sourceurl, username, password)
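A quick usage sketch: save the script (the filename below is just an example), fill in your own username, password, and profile URL in the __main__ block, and run it. It prints one line per subreddit; the numbers shown here are illustrative only:

$ python reddit_karma.py
/r/AskReddit Upvotes: 42, Downvotes: 5, Karma: 37
/r/Python Upvotes: 17, Downvotes: 2, Karma: 15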