Skip to content

Instantly share code, notes, and snippets.

@mkausas
Created February 1, 2018 04:03
Show Gist options
  • Save mkausas/20722ecee06a8a8c1906f5590bfcab83 to your computer and use it in GitHub Desktop.

A scrappy data collector for Goodreads and Amazon pricing information.

Instructions:

  1. Save the regex definitions as custom_regex.py and the main script as scraper.py (scraper.py imports from custom_regex)
  2. Fill in CONSUMER_KEY and CONSUMER_SECRET in scraper.py
  3. $ python scraper.py
def amazon_price(book_type):
    """Return a regex pattern capturing the Amazon list price for *book_type*.

    Matches markup like ``>Kindle</span> ... $12.99`` (label, arbitrary
    non-``$`` text, a newline, indentation, then a dollar amount) and
    captures the numeric price -- commas allowed, two decimal places --
    in group 1.  Note the second string fragment is deliberately NOT a
    raw string, so ``\n`` is a literal newline character in the pattern.
    """
    return r'>' + book_type + '<\/span>[^$]*\n[ ]+[$]([0-9]+[,]*[0-9]*\.[0-9]{2})'
# Pre-built price-capturing patterns for each Amazon edition type.
re_kindle = amazon_price('Kindle')
re_hardcover = amazon_price('Hardcover')
re_paperback = amazon_price('Paperback')
# Captures the numeric user id out of Goodreads profile links of the
# form /user/show/<id>-<name>.
re_friend = r'\/user\/show\/([0-9]*)-'
# Captures a 10-digit ISBN that appears on its own line indented by
# exactly four spaces (the layout of the Goodreads review-list markup).
re_book = r'[ ]{4}([0-9]{10})\n'
import re
import csv
import requests
from time import sleep
from random import random
from rauth.service import OAuth1Service, OAuth1Session
from custom_regex import re_friend, re_book, re_hardcover, re_paperback, re_kindle
# Get a real consumer key & secret from: https://www.goodreads.com/api/keys
CONSUMER_KEY = 'FILL IN'
CONSUMER_SECRET = 'FILL IN'
goodreads = OAuth1Service(
consumer_key=CONSUMER_KEY,
consumer_secret=CONSUMER_SECRET,
name='goodreads',
request_token_url='https://www.goodreads.com/oauth/request_token',
authorize_url='https://www.goodreads.com/oauth/authorize',
access_token_url='https://www.goodreads.com/oauth/access_token',
base_url='https://www.goodreads.com/'
)
request_token, request_token_secret = goodreads.get_request_token(header_auth=True)
authorize_url = goodreads.get_authorize_url(request_token)
print 'Visit this URL in your browser: ' + authorize_url
accepted = 'n'
while accepted.lower() == 'n':
accepted = raw_input('Have you authorized me? (y/n) ')
session = goodreads.get_auth_session(request_token, request_token_secret)
response = session.get('https://www.goodreads.com/friend/user/2365897')
base_url = 'https://www.goodreads.com/'
# get Goodreads friend IDs
friend_ids = []
page = 1
while True:
response = session.get(base_url + 'friend?page=' + `page`)
friend_page = re.findall(re_friend, response.text)
if len(friend_page) <= 3:
break
else:
friend_ids += friend_page
page += 1
friend_ids = list(set(friend_ids))
print 'Friends list includes: ', len(friend_ids)
# Get all ISBNs
isbns = []
for friend_id in friend_ids:
url = base_url + 'review/list/' + friend_id.encode('utf-8') + '?shelf=read&per_page=100'
r = requests.get(url=url)
my_books = re.findall(re_book, r.text)
isbns += my_books
print 'All Book IDs: ', isbns
# Grab pricing from Amazon
with open('scraped_data.csv', 'w') as csvfile:
fieldnames = ['isbn', 'kindle_price', 'hardcover_price', 'paperback_price']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for isbn in isbns:
isbn = isbn.encode('utf-8')
url = 'https://www.amazon.com/dp/' + isbn
r = session.get(url)
kindle_price = re.findall(re_kindle, r.text)
hardcover_price = re.findall(re_hardcover, r.text)
paperback_price = re.findall(re_paperback, r.text)
if len(kindle_price) < 1:
kindle_price = ['0.00']
if len(hardcover_price) < 1:
hardcover_price = ['0.00']
if len(paperback_price) < 1:
paperback_price = ['0.00']
# prevent amazon blockage
if kindle_price[0] == '0.00' \
and hardcover_price[0] == '0.00' \
and paperback_price[0] == '0.00':
sleep(60)
print 'Book ISBN: ', isbn
print 'Kindle Price', kindle_price[0]
print 'Hardcover Price', hardcover_price[0]
print 'Paperback Price', paperback_price[0]
writer.writerow(
{
'isbn': isbn,
'kindle_price': kindle_price[0],
'hardcover_price': hardcover_price[0],
'paperback_price': paperback_price[0]
})
sleep(random() * 10)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment