
@akesling
Created March 29, 2013 22:19
# Copyright 2013 echoet@echoet.com
import requests
import urllib
import datetime
import time
import os
import json

# Unix timestamp for this run; used to namespace the output directories.
RUN_ID = int(time.mktime(datetime.datetime.now().timetuple()))
MAX_POSTS_REQUESTED = 1000  # Upper bound on items requested per API call.
HTTP_SUCCESS = 200
CNN_TECH = 'http://rss.cnn.com/rss/cnn_tech.rss'
def sow(feed, delay=0, posts=1000, directory='chunks'):
    """Page through a feed's Reader stream, saving each raw JSON page."""
    path = os.path.join(directory, '%s.%s' % (unicode(RUN_ID), 'items'))
    if not os.path.exists(path):
        os.makedirs(path)

    post_inc = min(MAX_POSTS_REQUESTED, posts)
    num_items = 0
    cur = retrieve(feed, posts=post_inc)
    while cur.status_code == HTTP_SUCCESS and num_items < posts:
        with open(os.path.join(path, '%s.json' % num_items), 'w+') as f:
            f.write(cur.text.encode('utf8'))

        obj = cur.json()
        num_items += len(obj['items'])
        if 'continuation' not in obj:
            # No continuation token: the feed has no further pages.
            break
        cur = retrieve(feed, posts=post_inc, continuation=obj['continuation'])
        time.sleep(delay)

    if cur.status_code != HTTP_SUCCESS:
        print 'RAGE QUIT AT STATUS CODE %s ON PAGE %s!!!!1!' % (
            cur.status_code, cur.url)
    else:
        print 'Gracefully exiting after guzzling %s posts.' % num_items
def retrieve(feed, posts=1000, continuation=None):
    """Fetch one page of a feed's stream from the Google Reader API."""
    url = 'http://www.google.com/reader/api/0/stream/contents/feed/%s'
    get_params = {
        'allcomments': 'false',
        'output': 'json',
        # Current-time value; 'c' below carries the continuation token.
        'ck': int(time.mktime(datetime.datetime.now().timetuple())),
        'ot': 0,
        'n': posts,
        'client': 'scroll',
    }
    if continuation:
        get_params['c'] = continuation
    return requests.get(url % urllib.quote_plus(feed), params=get_params)
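
# Usage sketch for retrieve() (an illustrative addition, not part of the
# original gist; the Reader endpoint above was retired with Google Reader,
# so this only worked while the service was live):
#
#     resp = retrieve(CNN_TECH, posts=10)
#     if resp.status_code == HTTP_SUCCESS:
#         token = resp.json().get('continuation')
#         next_page = retrieve(CNN_TECH, posts=10, continuation=token)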
def scrape(items, directory='chunks', delay=0):
    """Download the canonical page for each feed item and save the HTML."""
    path = os.path.join(directory, '%s.%s' % (unicode(RUN_ID), 'pages'))
    if not os.path.exists(path):
        os.makedirs(path)

    for i in items:
        try:
            href = i['canonical'][0]['href']
            page = requests.get(href)
            if page.status_code == HTTP_SUCCESS:
                with open(os.path.join(path, urllib.quote_plus(href)), 'w+') as f:
                    f.write(page.text.encode('utf8'))
                print 'Great success for %s!' % page.url
            else:
                print 'RAGE QUIT AT STATUS CODE %s ON PAGE %s!!!!1!' % (
                    page.status_code, page.url)
        except (requests.ConnectionError, KeyError):
            # Skip items without a canonical link or that fail to connect.
            pass
        time.sleep(delay)
def reap(directory):
    """Run scrape() over every saved JSON chunk in a sow() output directory."""
    for f in os.listdir(directory):
        path = os.path.join(directory, f)
        if os.path.isfile(path):
            with open(path, 'r') as feedfile:
                feed = json.load(feedfile)
            scrape(feed['items'], directory)
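
# A minimal driver, added here as an illustration (not part of the original
# gist): sow() saves the raw JSON pages for one feed, then reap() fetches
# each saved item's canonical page.
if __name__ == '__main__':
    sow(CNN_TECH, delay=1, posts=100)
    reap(os.path.join('chunks', '%s.items' % RUN_ID))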