A scrappy data collector for Goodreads and Amazon pricing information.
Instructions:
- Fill in `CONSUMER_KEY` and `CONSUMER_SECRET` in `scraper.py`
- Run:
$ python scraper.py
def amazon_price(book_type):
    """Build a regex that captures the Amazon price listed for *book_type*.

    book_type -- the format label exactly as it appears in the page markup,
    e.g. 'Kindle', 'Hardcover' or 'Paperback'.

    Returns a raw pattern string whose single capture group is the dollar
    amount (e.g. '9.99' or '1,234.56') that follows the
    '>BookType</span>' label on the product page.
    """
    # '/' needs no escaping in a regex; the original '\/' was an invalid
    # string escape that only worked because Python passes unknown escapes
    # through (and warns on modern versions). Use one consistent raw string.
    return r'>' + book_type + r'</span>[^$]*\n[ ]+[$]([0-9]+[,]*[0-9]*\.[0-9]{2})'
# Pre-built patterns, imported by scraper.py.
re_kindle = amazon_price('Kindle')        # Kindle edition price
re_hardcover = amazon_price('Hardcover')  # Hardcover price
re_paperback = amazon_price('Paperback')  # Paperback price
# Captures the numeric user id from a '/user/show/<id>-<name>' link.
re_friend = r'\/user\/show\/([0-9]*)-'
# Captures a 10-digit ISBN sitting on its own 4-space-indented line.
re_book = r'[ ]{4}([0-9]{10})\n'
import re | |
import csv | |
import requests | |
from time import sleep | |
from random import random | |
from rauth.service import OAuth1Service, OAuth1Session | |
from custom_regex import re_friend, re_book, re_hardcover, re_paperback, re_kindle | |
# Get a real consumer key & secret from: https://www.goodreads.com/api/keys
CONSUMER_KEY = 'FILL IN'
CONSUMER_SECRET = 'FILL IN'

# OAuth 1.0a service descriptor for the Goodreads API (rauth); the three
# URLs are the standard Goodreads OAuth endpoints.
goodreads = OAuth1Service(
    consumer_key=CONSUMER_KEY,
    consumer_secret=CONSUMER_SECRET,
    name='goodreads',
    request_token_url='https://www.goodreads.com/oauth/request_token',
    authorize_url='https://www.goodreads.com/oauth/authorize',
    access_token_url='https://www.goodreads.com/oauth/access_token',
    base_url='https://www.goodreads.com/'
)
# Interactive OAuth dance: fetch a request token, send the user to the
# authorization page in their browser, then trade the approved token for
# an authenticated session.
request_token, request_token_secret = goodreads.get_request_token(header_auth=True)
authorize_url = goodreads.get_authorize_url(request_token)
print 'Visit this URL in your browser: ' + authorize_url
accepted = 'n'
while accepted.lower() == 'n':
    # Block until the user confirms they approved access in the browser.
    accepted = raw_input('Have you authorized me? (y/n) ')
session = goodreads.get_auth_session(request_token, request_token_secret)
# NOTE(review): this response is never read — it is rebound below before
# any use. Presumably a leftover smoke test of the session; confirm before
# removing, since it does issue a live request.
response = session.get('https://www.goodreads.com/friend/user/2365897')
base_url = 'https://www.goodreads.com/' | |
# get Goodreads friend IDs | |
friend_ids = [] | |
page = 1 | |
while True: | |
response = session.get(base_url + 'friend?page=' + `page`) | |
friend_page = re.findall(re_friend, response.text) | |
if len(friend_page) <= 3: | |
break | |
else: | |
friend_ids += friend_page | |
page += 1 | |
friend_ids = list(set(friend_ids)) | |
print 'Friends list includes: ', len(friend_ids) | |
# Get all ISBNs: pull each friend's 'read' shelf and harvest the 10-digit
# ISBNs from the HTML.
isbns = []
for friend_id in friend_ids:
    # Only the first 100 reads per friend are fetched — per_page=100 with
    # no page loop here.
    url = base_url + 'review/list/' + friend_id.encode('utf-8') + '?shelf=read&per_page=100'
    # NOTE(review): plain requests (no OAuth session) — presumably the
    # shelf listing is public; verify for friends with private profiles.
    r = requests.get(url=url)
    my_books = re.findall(re_book, r.text)
    isbns += my_books
print 'All Book IDs: ', isbns
# Grab pricing from Amazon: fetch each ISBN's product page and write the
# Kindle / hardcover / paperback prices to scraped_data.csv.
with open('scraped_data.csv', 'w') as csvfile:
    fieldnames = ['isbn', 'kindle_price', 'hardcover_price', 'paperback_price']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for isbn in isbns:
        isbn = isbn.encode('utf-8')
        url = 'https://www.amazon.com/dp/' + isbn
        # NOTE(review): reuses the Goodreads OAuth session for an Amazon
        # URL; a plain requests.get (as in the shelf loop above) looks
        # more intentional — confirm whether the shared session matters.
        r = session.get(url)
        kindle_price = re.findall(re_kindle, r.text)
        hardcover_price = re.findall(re_hardcover, r.text)
        paperback_price = re.findall(re_paperback, r.text)
        # Default any format that did not appear on the page to '0.00'.
        if len(kindle_price) < 1:
            kindle_price = ['0.00']
        if len(hardcover_price) < 1:
            hardcover_price = ['0.00']
        if len(paperback_price) < 1:
            paperback_price = ['0.00']
        # prevent amazon blockage: no price in any format is presumably a
        # throttling/CAPTCHA page rather than a priceless book, so back
        # off for a minute. TODO confirm — legitimately unpriced ISBNs
        # would also trigger this.
        if kindle_price[0] == '0.00' \
            and hardcover_price[0] == '0.00' \
            and paperback_price[0] == '0.00':
            sleep(60)
        print 'Book ISBN: ', isbn
        print 'Kindle Price', kindle_price[0]
        print 'Hardcover Price', hardcover_price[0]
        print 'Paperback Price', paperback_price[0]
        writer.writerow(
            {
                'isbn': isbn,
                'kindle_price': kindle_price[0],
                'hardcover_price': hardcover_price[0],
                'paperback_price': paperback_price[0]
            })
        # Random 0-10s jitter between requests to look less bot-like.
        sleep(random() * 10)