Created
February 2, 2014 09:04
-
-
Save nikolak/8765020 to your computer and use it in GitHub Desktop.
Build a dictionary mapping each subreddit to the upvotes and downvotes earned there; works for the last ~1000 comments.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
import json | |
import time | |
import urllib | |
import httplib2 | |
from urllib import urlencode | |
from urlparse import urlparse, urlunparse, parse_qs | |
from optparse import OptionParser | |
from xml.sax.saxutils import escape as escape_html | |
from xml.sax.saxutils import unescape as unescape_html | |
# Tuning knobs -- keep these conservative so we don't hammer reddit's API.
fetch_size = 100      # items per request; reddit caps listings at 100 anyway
sleep_time = 1        # seconds to pause between successive requests
request_limit = None  # maximum number of requests to issue (None = unlimited)
debug = False         # when True, each fetched URL is logged to stderr

# Shared HTTP client, used by login() and by authenticated page fetches.
http = httplib2.Http()
def login(username, password): | |
url = 'http://www.reddit.com/api/login/%s' % username | |
body = {'user': username, | |
'passwd': password} | |
headers = {'Content-type': 'application/x-www-form-urlencoded'} | |
try: | |
response, content = http.request(url, 'POST', headers=headers, body=urlencode(body)) | |
except Exception, e: | |
print "Could not login" | |
print e | |
sys.exit(1) | |
return response['set-cookie'] | |
def get_links(sourceurl, login_cookie = '', requests = 0):
    '''
    Given a reddit JSON URL, yield the JSON Link API objects,
    following 'after' pagination links.

    sourceurl    -- reddit listing URL; its path must end in '/' or '.json'
    login_cookie -- optional session cookie from login(); when non-empty
                    the request is made with it
    requests     -- number of requests already issued (counted against the
                    module-level request_limit)

    Sleeps sleep_time seconds between page fetches and stops once reddit
    reports no further 'after' page or request_limit requests were made.

    Iterative rewrite of the original recursive version: recursion grew
    the stack by one frame per page; a loop does not. The unauthenticated
    urlopen handle is now also closed explicitly instead of leaked.
    '''
    # Rip apart the URL, make sure it has .json at the end, and set
    # the page-size limit.
    scheme, host, path, params, query, fragment = urlparse(sourceurl)
    parsed_params = parse_qs(query) if query else {}
    parsed_params['limit'] = [fetch_size]
    fragment = None  # erase the fragment, we don't use it
    assert path.endswith('.json') or path.endswith('/')
    if path.endswith('/'):
        path = path + '.json'
    after = None  # fullname of the last item of the previous page
    while True:
        page_params = parsed_params.copy()
        if after is not None:
            page_params['after'] = [after]
        url = urlunparse((scheme, host, path, params,
                          urlencode(page_params, doseq = True), fragment))
        if debug:
            sys.stderr.write('fetching %s\n' % url)
        if login_cookie:
            response, text = http.request(url, 'GET',
                                          headers={'Cookie': login_cookie})
        else:
            handle = urllib.urlopen(url)
            try:
                text = handle.read()
            finally:
                handle.close()
        parsed = json.loads(text)
        # there may be multiple listings, like on a comments-page, but we
        # can only export from pages with one listing
        assert parsed['kind'] == 'Listing'
        listing = parsed['data']
        for child in listing.get('children', []):
            yield child
        requests += 1
        after = listing.get('after', None)
        # Stop when reddit has no more pages or the request budget is
        # spent (same count as the original's `requests < request_limit - 1`
        # pre-recursion check).
        if not after or (request_limit is not None
                         and requests >= request_limit):
            return
        # please don't hurt reddit
        time.sleep(sleep_time)
def main(sourceurl, username = None, password = None): | |
''' | |
Given a reddit JSON url, yield unicode strings that represent the | |
exported HTML | |
''' | |
cookie = None | |
karma={} | |
# subreddit str: { ups: int, | |
# downs: int} | |
if username and password: | |
cookie = login(username, password) | |
for link in get_links(sourceurl, cookie): | |
data = link['data'] | |
if link['kind'] == 't3': | |
# links | |
pass | |
elif link['kind'] == 't1': | |
# comments | |
subreddit = escape_html(data['subreddit']) | |
ups = int(data['ups']) | |
downs = int(data['downs']) | |
try: | |
karma[subreddit]['ups']+=ups | |
karma[subreddit]['downs']+=downs | |
except: | |
karma[subreddit]={'ups':ups, | |
'downs':downs} | |
else: | |
raise TypeError("I don't know how to decode %r" % link) | |
for k,v in karma.items(): | |
print "/r/{} Upvotes: {}, Downvotes: {}, Karma: {}".format(k, | |
v['ups'],v['downs'],v['ups']-v['downs']) | |
if __name__ == '__main__':
    # NOTE(review): credentials are hard-coded; prefer reading them from
    # the environment or prompting, so the password never lands in the file.
    username = 'wub_wub'
    password = 'your_password'
    sourceurl = "http://www.reddit.com/user/wub_wub/"
    main(sourceurl, username, password)
    # (removed: a dead `debug = False` that ran after main() finished and
    # therefore had no effect)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment