# A web scraper for wish lists on The Book Depository.
#
# TBD offers a mail notification service for books in a wish list whose
# price drops. Unfortunately, you are notified only if a price drops by 10%
# or more; also, the message you receive contains neither the new, lower
# price nor the old, higher one. Finally, when I got one of those
# notifications, I visited the book page on TBD that very day, only to
# discover that the price had gone up again. Not a very effective service,
# you know.
#
# So I decided to write a script myself.
#
# The scraper uses lxml to parse HTML documents; this is the only external
# dependency.
#
# Book data is cached in a JSON file in the user's home directory. The format
# is a dictionary where each key is a book's title and each value is a pair
# of prices: first the current price (i.e. the price of the book as of the
# latest run of the script), then the lowest price to date.
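#
# For instance, after a couple of runs the cache might contain something
# like this (hypothetical titles and prices):
#
#   {"Dune": [7.49, 6.99], "Foundation": [5.59, 5.59]}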
#
# Tested with Python 2.6.5.

import json
import lxml.html
import os.path
from Queue import Queue
from threading import Thread
from urllib2 import urlopen

# Replace CODE and NAME with your own values in the wish list URL.
URL = 'http://www.bookdepository.co.uk/wishlist/CODE/NAME'
# Use the explicit page=1 form of the URL, so that the first wish list page
# is not downloaded a second time when it shows up among the pagination links.
URL += '/?&page=1#pagination'
CACHE = os.path.join(os.path.expanduser('~'), '.bookdepository.json')
# Sentinel price for books that cannot currently be bought.
OUT_OF_STOCK = 9999

class Scraper(Thread):
    def __init__(self, url):
        Thread.__init__(self)
        self.url = url

    def run(self):
        doc = urlopen(self.url)
        self.root = lxml.html.parse(doc).getroot()
        div = self.root.get_element_by_id('account')
        wishlist = div.xpath('.//ul')[0]
        # harvest books in wish list page
        self.books = {}
        for book in wishlist.iterchildren():
            title = book.xpath('div/h3/a')[0].text
            price = book.xpath('.//span[@class="price"]/strong')
            if price:
                # cut the currency symbol before the number
                self.books[title] = float(price[0].text[1:])
            else:
                # book is out of stock
                self.books[title] = OUT_OF_STOCK

    def pages(self):
        '''Get references to wish list pages linked from this page.'''
        pagination = self.root.get_element_by_id('pagination')
        pages = pagination.xpath('.//span[contains(@class, "search pageNumber")]')
        return [page.getchildren()[0].attrib['href']
                for page in pages if 'active' not in page.attrib['class']]
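
# A minimal standalone check of the Scraper class (assuming network access
# and a real wish list URL in place of CODE/NAME); kept commented out so it
# does not run as part of the script:
#
#   s = Scraper(URL)
#   s.start()
#   s.join()
#   print s.books   # e.g. {'Dune': 7.49} -- hypothetical title and price
#   print s.pages() # URLs of the other wish list pages, if any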

def scrape():
    current_books = []
    seen_pages = set()
    seen_pages.add(URL)

    def producer(minions, pages):
        # Spawn a Scraper thread for every page URL queued by the consumer;
        # a None sentinel on an otherwise empty queue means there is nothing
        # left to scrape.
        while len(current_books) < len(seen_pages):
            page = pages.get(True)
            if page is None and pages.empty():
                break
            s = Scraper(page)
            s.start()
            minions.put(s)

    def consumer(minions, pages):
        # Collect finished Scraper threads and feed any newly discovered
        # pages back to the producer.
        while len(current_books) < len(seen_pages):
            m = minions.get(True)
            m.join()
            current_books.append(m.books)
            new_pages = set(m.pages()).difference(seen_pages)
            if not new_pages:
                # no new pages: tell the producer it can stop
                pages.put(None)
                continue
            for page in new_pages:
                seen_pages.add(page)
                pages.put(page)

    minions = Queue()
    pages = Queue()
    pages.put(URL)
    produce = Thread(target=producer, args=(minions, pages))
    consume = Thread(target=consumer, args=(minions, pages))
    produce.start()
    consume.start()
    produce.join()
    consume.join()
    return current_books
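
# scrape() returns one dictionary of title/price pairs per wish list page,
# e.g. [{'Dune': 7.49}, {'Foundation': 9999}] (hypothetical titles; 9999 is
# the OUT_OF_STOCK sentinel). Merge them before comparing against the cache.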
current_books = {}
for books in scrape():
    current_books.update(books)

cached_books = {}
if os.path.exists(CACHE):
    with open(CACHE) as f:
        cached_books = json.load(f)

books = {}
books_with_dropped_price = []
for title, current_price in current_books.items():
    if title in cached_books:
        old_price, lowest_price = cached_books[title]
        if current_price <= lowest_price and current_price != OUT_OF_STOCK:
            lowest_price = current_price
            books_with_dropped_price.append((title, current_price))
        books[title] = (current_price, lowest_price)
    else:
        books[title] = (current_price, current_price)

for title, price in books_with_dropped_price:
    print '%s is at its lowest price of %.2f' % (title, price)

with open(CACHE, 'w') as f:
    json.dump(books, f)
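
# To run (assuming Python 2.6 and lxml are installed, and that the script is
# saved as, say, bookdepository.py with a real wish list URL):
#
#   $ python bookdepository.py
#   Dune is at its lowest price of 6.99
#
# Output shown for a hypothetical book whose price just dropped; the script
# prints nothing when no price is at its lowest.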