@pcote
Created March 11, 2012 21:57
Grabs the top 200 submission entries from the My Little Pony subreddit on reddit.com and throws them into a MySQL database where I can analyze the data.
# top_200_pony_entries.py
import urllib
import time
import json
import pickle
import MySQLdb

def store_to_database(submission_list):
    conn = MySQLdb.connect(user='', db='', passwd='', host='')
    csr = conn.cursor()
    for submission in submission_list:
        sql_query = "insert into submissions values( %s, %s, %s, %s, %s )"
        try:
            permalink = submission['permalink']
            title = submission['title'].encode('cp1252')
            url = submission['url']
            author = submission['author'].encode('cp1252')
            num_comments = int(submission['num_comments'])
            csr.execute(sql_query, (permalink, title, url, author, num_comments))
        except UnicodeEncodeError:
            # skip any entry whose title or author won't encode as cp1252
            print("unicode emergency: please hold")
    conn.commit()  # commit the batch; required for transactional table engines
    csr.close()
    conn.close()

def grab_100_entries(after=None):
    base_url = "http://www.reddit.com/r/mylittlepony.json?limit=100"
    # url assembly: page past the previous batch when an 'after' id is given
    offset_count_arg = ""
    if after is not None:
        offset_count_arg = "&after=%s" % after
    full_url = base_url + offset_count_arg
    # get the data and tidy it up
    sock = urllib.urlopen(full_url)
    raw_data = sock.read()
    sock.close()
    string_data = raw_data.decode("cp1252")
    json_data = json.loads(string_data)
    return json_data

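# For reference, the listing that comes back has roughly the shape sketched
# below. These are only the fields this script actually touches, paraphrased
# from reddit's listing responses rather than a full schema:
#
#   { "data": { "after": "t3_xxxxxx",
#               "children": [ { "data": { "permalink": ...,
#                                         "title": ...,
#                                         "url": ...,
#                                         "author": ...,
#                                         "num_comments": ... } },
#                             ... ] } }
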
def store_data(json_coll):
    # clean it up into a nice simplified list
    submissions = json_coll['data']['children']
    submission_list = [sub['data'] for sub in submissions]
    # now we can dump it to file
    pickle_file = open("pony_entries", "wb")
    pickle.dump(submission_list, pickle_file)
    pickle_file.close()

def clear_old_stuff(csr):
    # helper for wiping the table between runs (not called in the flow below)
    csr.execute("delete from submissions")

# first batch of 100
json_coll = grab_100_entries()
store_data(json_coll)

# note: no need to re-grab entries when I can just reload a previous fetch from the pickled file.
pickle_file = open("pony_entries", "rb")
submission_list = pickle.load(pickle_file)
pickle_file.close()
store_to_database(submission_list)

# second batch of 100, picking up where the first one left off
after_arg = json_coll['data']['after']
time.sleep(2)  # pause between requests to stay polite to reddit's servers
json_coll = grab_100_entries(after=after_arg)
store_data(json_coll)

# note: no need to re-grab entries when I can just reload a previous fetch from the pickled file.
pickle_file = open("pony_entries", "rb")
submission_list = pickle.load(pickle_file)
pickle_file.close()
store_to_database(submission_list)
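
The script assumes a submissions table already exists; the gist itself doesn't include the DDL. As a minimal sketch, a table matching the five-value insert above could be created like this (the column names and sizes are my guesses from the fields being stored, not something taken from the gist):

# create_submissions_table.py -- hypothetical setup helper, not part of the gist
import MySQLdb

conn = MySQLdb.connect(user='', db='', passwd='', host='')
csr = conn.cursor()
csr.execute("""
    create table if not exists submissions (
        permalink    varchar(255),
        title        varchar(300),
        url          varchar(255),
        author       varchar(64),
        num_comments int
    )
""")
conn.commit()
csr.close()
conn.close()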