Grabs the top 200 submission entries from the My Little Pony subreddit at reddit.com and throws them into a MySQL database where I can analyze the data.
# top_200_pony_entries.py
import urllib
import time
import json
import pickle

import MySQLdb
def store_to_database(submission_list):
    # connection credentials were left blank in the original gist
    conn = MySQLdb.connect(user='', db='', passwd='', host='')
    csr = conn.cursor()
    for submission in submission_list:
        permalink = submission['permalink']
        title = submission['title'].encode('cp1252')
        url = submission['url']
        author = submission['author'].encode('cp1252')
        num_comments = int(submission['num_comments'])
        sql_query = "insert into submissions values( %s, %s, %s, %s, %s )"
        try:
            csr.execute(sql_query, (permalink, title, url, author, str(num_comments)))
        except UnicodeEncodeError:
            print("unicode emergency: please hold")
    conn.commit()  # without a commit, transactional tables would silently drop the inserts
    csr.close()
    conn.close()
def grab_100_entries(after=None):
    base_url = "http://www.reddit.com/r/mylittlepony.json?limit=100"

    # url assembly: reddit pages its listings with an "after" token
    offset_count_arg = ""
    if after is not None:
        offset_count_arg = "&after=%s" % after
    full_url = base_url + offset_count_arg

    # get the data and tidy it up
    sock = urllib.urlopen(full_url)
    raw_data = sock.read()
    sock.close()
    string_data = raw_data.decode("cp1252")
    json_data = json.loads(string_data)
    return json_data
def store_data(json_coll):
    submission_list = []

    # clean it up into a nice simplified list.
    submissions = json_coll['data']['children']
    submissions = [sub['data'] for sub in submissions]
    submission_list.extend(submissions)

    # now we can dump it to file ("wb" so the pickle survives on any platform)
    pickle_file = open("pony_entries", "wb")
    pickle.dump(submission_list, pickle_file)
    pickle_file.close()
def clear_old_stuff(csr):
    # helper to wipe rows from a previous run; not invoked in the script body below
    csr.execute("delete from submissions")
# first page: the top 100 entries
json_coll = grab_100_entries()
store_data(json_coll)

# note: no need to re-grab entries when I can just reuse a previous fetch from the pickled version.
pickle_file = open("pony_entries", "rb")
submission_list = pickle.load(pickle_file)
pickle_file.close()
store_to_database(submission_list)

# second page: entries 101-200, keyed off the "after" token from the first response
after_arg = json_coll['data']['after']
time.sleep(2)  # brief pause between requests to stay friendly with reddit
json_coll = grab_100_entries(after=after_arg)
store_data(json_coll)

# note: no need to re-grab entries when I can just reuse a previous fetch from the pickled version.
pickle_file = open("pony_entries", "rb")
submission_list = pickle.load(pickle_file)
pickle_file.close()
store_to_database(submission_list)
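
The gist never shows the submissions table itself, so below is a minimal sketch of a schema that would satisfy the five-value INSERT above, plus a sample query for the "analyze the data" step. The file name, column names, and column types are all assumptions inferred from the values bound in store_to_database(), not part of the original.

# setup_schema.py -- hypothetical companion script, not part of the original gist.
# Column names and types are guesses based on the five values the INSERT binds:
# permalink, title, url, author, num_comments.
import MySQLdb

conn = MySQLdb.connect(user='', db='', passwd='', host='')
csr = conn.cursor()
csr.execute("""create table if not exists submissions (
                   permalink    varchar(255),
                   title        varchar(300),
                   url          text,
                   author       varchar(64),
                   num_comments int )""")

# sample analysis once the 200 rows are loaded: the ten most-discussed posts
csr.execute("select title, num_comments from submissions "
            "order by num_comments desc limit 10")
for title, num_comments in csr.fetchall():
    print("%5d  %s" % (num_comments, title))

conn.commit()
csr.close()
conn.close()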