# A web scraper for wish lists on The Book Depository.
#
# TBD offers a mail notification service for books in a wish list whose
# price drops. Unfortunately, you are notified only if a price drops by 10%
# or more; also, the message you receive contains neither the new, lower
# price nor the old, higher one. Finally, when I got one of those
# notifications, I visited the book page on TBD that very day, only to
# discover that the price had gone up again. Not a very effective service,
# you know.
#
# So I decided to write a script myself.
#
# The scraper uses lxml to parse HTML documents; this is the only external
# dependency.
#
# Book data is cached in a JSON file in the user's home directory. The format
# is a dictionary where each key is a book's title and each value is a pair
# of prices: first the current price (i.e. the price of the book as of the
# latest run of the script), then the lowest price to date.
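#
# For instance, after a couple of runs the cache might contain something
# like this (hypothetical titles and prices):
#
#   {"Dune": [7.49, 6.99], "Foundation": [5.59, 5.59]}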
#
# Tested with Python 2.6.5.

import json
import lxml.html
import os.path
from Queue import Queue
from threading import Thread
from urllib2 import urlopen

# Replace CODE and NAME with your own values in the wish list URL.
URL = 'http://www.bookdepository.co.uk/wishlist/CODE/NAME'
# Use the explicit page=1 form of the URL, so that the first wish list page
# is not downloaded a second time when it shows up among the pagination links.
URL += '/?&page=1#pagination'
CACHE = os.path.join(os.path.expanduser('~'), '.bookdepository.json')
# Sentinel price for books that cannot currently be bought.
OUT_OF_STOCK = 9999

class Scraper(Thread):
    def __init__(self, url):
        Thread.__init__(self)
        self.url = url

    def run(self):
        doc = urlopen(self.url)
        self.root = lxml.html.parse(doc).getroot()
        div = self.root.get_element_by_id('account')
        wishlist = div.xpath('.//ul')[0]
        # harvest books in wish list page
        self.books = {}
        for book in wishlist.iterchildren():
            title = book.xpath('div/h3/a')[0].text
            price = book.xpath('.//span[@class="price"]/strong')
            if price:
                # cut the currency symbol before the number
                self.books[title] = float(price[0].text[1:])
            else:
                # book is out of stock
                self.books[title] = OUT_OF_STOCK

    def pages(self):
        '''Get references to wish list pages linked from this page.'''
        pagination = self.root.get_element_by_id('pagination')
        pages = pagination.xpath('.//span[contains(@class, "search pageNumber")]')
        return [page.getchildren()[0].attrib['href']
                for page in pages if 'active' not in page.attrib['class']]
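
# A minimal standalone check of the Scraper class (assuming network access
# and a real wish list URL in place of CODE/NAME); kept commented out so it
# does not run as part of the script:
#
#   s = Scraper(URL)
#   s.start()
#   s.join()
#   print s.books   # e.g. {'Dune': 7.49} -- hypothetical title and price
#   print s.pages() # URLs of the other wish list pages, if any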

def scrape():
    current_books = []
    seen_pages = set()
    seen_pages.add(URL)

    def producer(minions, pages):
        # Spawn a Scraper thread for every page URL queued by the consumer;
        # a None sentinel on an otherwise empty queue means there is nothing
        # left to scrape.
        while len(current_books) < len(seen_pages):
            page = pages.get(True)
            if page is None and pages.empty():
                break
            s = Scraper(page)
            s.start()
            minions.put(s)

    def consumer(minions, pages):
        # Collect finished Scraper threads and feed any newly discovered
        # pages back to the producer.
        while len(current_books) < len(seen_pages):
            m = minions.get(True)
            m.join()
            current_books.append(m.books)
            new_pages = set(m.pages()).difference(seen_pages)
            if not new_pages:
                # no new pages: tell the producer it can stop
                pages.put(None)
                continue
            for page in new_pages:
                seen_pages.add(page)
                pages.put(page)

    minions = Queue()
    pages = Queue()
    pages.put(URL)
    produce = Thread(target=producer, args=(minions, pages))
    consume = Thread(target=consumer, args=(minions, pages))
    produce.start()
    consume.start()
    produce.join()
    consume.join()
    return current_books
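
# scrape() returns one dictionary of title/price pairs per wish list page,
# e.g. [{'Dune': 7.49}, {'Foundation': 9999}] (hypothetical titles; 9999 is
# the OUT_OF_STOCK sentinel). Merge them before comparing against the cache.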
current_books = {}
for books in scrape():
    current_books.update(books)

cached_books = {}
if os.path.exists(CACHE):
    with open(CACHE) as f:
        cached_books = json.load(f)

books = {}
books_with_dropped_price = []
for title, current_price in current_books.items():
    if title in cached_books:
        old_price, lowest_price = cached_books[title]
        if current_price <= lowest_price and current_price != OUT_OF_STOCK:
            lowest_price = current_price
            books_with_dropped_price.append((title, current_price))
        books[title] = (current_price, lowest_price)
    else:
        books[title] = (current_price, current_price)

for title, price in books_with_dropped_price:
    print '%s is at its lowest price of %.2f' % (title, price)

with open(CACHE, 'w') as f:
    json.dump(books, f)
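
# To run (assuming Python 2.6 and lxml are installed, and that the script is
# saved as, say, bookdepository.py with a real wish list URL):
#
#   $ python bookdepository.py
#   Dune is at its lowest price of 6.99
#
# Output shown for a hypothetical book whose price just dropped; the script
# prints nothing when no price is at its lowest.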