
@akesling
Created March 29, 2013 22:19
# Copyright 2013 echoet@echoet.com
import requests
import urllib
import datetime
import time
import os
import json

# Unix timestamp for this run; used to namespace the output directories.
RUN_ID = int(time.mktime(datetime.datetime.now().timetuple()))
MAX_POSTS_REQUESTED = 1000  # Upper bound on items requested per API call.
HTTP_SUCCESS = 200
CNN_TECH = 'http://rss.cnn.com/rss/cnn_tech.rss'
def sow(feed, delay=0, posts=1000, directory='chunks'):
    """Page through a feed's Reader stream, saving each raw JSON page."""
    path = os.path.join(directory, '%s.%s' % (unicode(RUN_ID), 'items'))
    if not os.path.exists(path):
        os.makedirs(path)

    post_inc = min(MAX_POSTS_REQUESTED, posts)
    num_items = 0
    cur = retrieve(feed, posts=post_inc)
    while cur.status_code == HTTP_SUCCESS and num_items < posts:
        with open(os.path.join(path, '%s.json' % num_items), 'w+') as f:
            f.write(cur.text.encode('utf8'))

        obj = cur.json()
        num_items += len(obj['items'])
        if 'continuation' not in obj:
            # No continuation token: the feed has no further pages.
            break
        cur = retrieve(feed, posts=post_inc, continuation=obj['continuation'])
        time.sleep(delay)

    if cur.status_code != HTTP_SUCCESS:
        print 'RAGE QUIT AT STATUS CODE %s ON PAGE %s!!!!1!' % (
            cur.status_code, cur.url)
    else:
        print 'Gracefully exiting after guzzling %s posts.' % num_items
def retrieve(feed, posts=1000, continuation=None):
    """Fetch one page of a feed's stream from the Google Reader API."""
    url = 'http://www.google.com/reader/api/0/stream/contents/feed/%s'
    get_params = {
        'allcomments': 'false',
        'output': 'json',
        # Current-time value; 'c' below carries the continuation token.
        'ck': int(time.mktime(datetime.datetime.now().timetuple())),
        'ot': 0,
        'n': posts,
        'client': 'scroll',
    }
    if continuation:
        get_params['c'] = continuation
    return requests.get(url % urllib.quote_plus(feed), params=get_params)
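
# Usage sketch for retrieve() (an illustrative addition, not part of the
# original gist; the Reader endpoint above was retired with Google Reader,
# so this only worked while the service was live):
#
#     resp = retrieve(CNN_TECH, posts=10)
#     if resp.status_code == HTTP_SUCCESS:
#         token = resp.json().get('continuation')
#         next_page = retrieve(CNN_TECH, posts=10, continuation=token)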
def scrape(items, directory='chunks', delay=0):
    """Download the canonical page for each feed item and save the HTML."""
    path = os.path.join(directory, '%s.%s' % (unicode(RUN_ID), 'pages'))
    if not os.path.exists(path):
        os.makedirs(path)

    for i in items:
        try:
            href = i['canonical'][0]['href']
            page = requests.get(href)
            if page.status_code == HTTP_SUCCESS:
                with open(os.path.join(path, urllib.quote_plus(href)), 'w+') as f:
                    f.write(page.text.encode('utf8'))
                print 'Great success for %s!' % page.url
            else:
                print 'RAGE QUIT AT STATUS CODE %s ON PAGE %s!!!!1!' % (
                    page.status_code, page.url)
        except (requests.ConnectionError, KeyError):
            # Skip items without a canonical link or that fail to connect.
            pass
        time.sleep(delay)
def reap(directory):
    """Run scrape() over every saved JSON chunk in a sow() output directory."""
    for f in os.listdir(directory):
        path = os.path.join(directory, f)
        if os.path.isfile(path):
            with open(path, 'r') as feedfile:
                feed = json.load(feedfile)
            scrape(feed['items'], directory)
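
# A minimal driver, added here as an illustration (not part of the original
# gist): sow() saves the raw JSON pages for one feed, then reap() fetches
# each saved item's canonical page.
if __name__ == '__main__':
    sow(CNN_TECH, delay=1, posts=100)
    reap(os.path.join('chunks', '%s.items' % RUN_ID))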