Skip to content

Instantly share code, notes, and snippets.

@dvoiss
Created March 27, 2012 06:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dvoiss/2213275 to your computer and use it in GitHub Desktop.
Save dvoiss/2213275 to your computer and use it in GitHub Desktop.
Command-line script parses my Amazon wishlist with Nokogiri, gets related ISBNs for each book from the library-thing API, then searches Chicago Public Library for the books
# This script accepts the email address of the Amazon wishlist to retrieve
# and an optional branch ID for the Chicago Public Library system. The
# script parses the wishlist and finds the books that are available for
# *CHECK OUT* (unavailable books, in-transit, on hold, etc. are ignored).
require 'net/https'
require "open-uri"
require 'uri'
require 'zlib'
require "nokogiri"
# The LibraryThing thingISBN API allows us to get related ISBNs,
# so if we're searching for a book and the library has an older or newer
# edition or a reprint, etc. we can search for that as well.
LIBRARY_THING_ISBN_URL = "http://www.librarything.com/api/thingISBN/".freeze

# Chicago Public Library URLs and the marker strings looked for in its pages.
LIBRARY_BASE_URL = "http://www.chipublib.org".freeze
LIBRARY_SEARCH_URL = "#{LIBRARY_BASE_URL}/search/results/".freeze
LIBRARY_REFERER_URL = "#{LIBRARY_BASE_URL}/search/advanced/".freeze
# text searched for in a results page to detect a search without matches
LIBRARY_NO_RESULTS_STRING = "Your search did not produce any results.".freeze
# sentinel the parsers return when a book is available at the chosen branch
LIBRARY_MY_STRING = "My Library".freeze
# text searched for in a table row to detect a copy that can be checked out
LIBRARY_NOT_CHECKED_OUT = "Not checked out".freeze

# ANSI color escape sequences for printing to the terminal
# (Windows support is unknown).
ORANGE_COLOR = "\033[33m".freeze
GREEN_COLOR = "\033[32m".freeze
CLEAR_COLOR = "\033[0m".freeze
# Retrieves the books on the Amazon wishlist registered under +email+.
#
# The registry search is expected to answer with a redirect whose Location
# header is the wishlist URL; that page is then requested (filtered to books,
# compact layout) and parsed. Only this single page is read, further pages of
# a long wishlist are not followed.
#
# email - the email address the wishlist is looked up by
#
# Returns an Array of Hashes of the form { :title => String, :isbn => String }.
# Prints a message and exits the process when no wishlist location is returned.
def get_wishlist(email)
  puts "Retrieving wishlist"

  # escape the email so characters such as "+" or "&" cannot alter the query
  query = "type=wishlist&field-name=#{URI.encode_www_form_component(email)}"
  uri = URI.parse("http://www.amazon.com/registry/search.html?#{query}")
  response = Net::HTTP.start(uri.host, uri.port) do |http|
    http.request(Net::HTTP::Get.new(uri.request_uri))
  end

  wishlist_url = response['location']
  if wishlist_url.nil?
    puts "Cannot find the wishlist for #{email}"
    exit
  end

  # the filter=3 is filter by books; URI#open (provided by open-uri) is used
  # because Kernel#open does not accept URLs on current Ruby versions
  wishlist_uri = URI.parse("#{wishlist_url}&filter=3&layout=compact")
  page = wishlist_uri.open { |io| Nokogiri::HTML(io) }

  books = []
  # every item wrapper holds one wishlist entry; its title link carries the ISBN
  page.css('tbody[class=itemWrapper]').each do |part|
    link_with_isbn = part.css('span[class="small productTitle"] a')
    link = link_with_isbn.first
    next if link.nil?

    href = link.attr('href')
    next if href.nil?

    # the ISBN is the path segment that follows "dp/"
    isbn_available = href.match(/dp\/([\d\w]+)\//)
    next unless isbn_available && isbn_available.captures.one?

    # the author is not used right now, so only title and ISBN are recorded
    books.push({ :title => link_with_isbn.text.strip, :isbn => isbn_available.captures.first })
  end
  books
end
# Fetches a library page, following redirects.
#
# uri_str   - the URL to request
# limit     - number of requests that may still be made; each followed
#             redirect uses one
# page_type - label handed back with the response: the default "search" for a
#             page answered directly, "detail" once a redirect was followed
#
# Returns a Hash { :response => Net::HTTPResponse, :page_type => String }.
# The request advertises gzip support through an explicit Accept-Encoding
# header; the callers in this file gunzip the response body themselves.
#
# Raises RuntimeError ("TOO MANY REDIRECTS") when limit is used up, and
# whatever Net::HTTPResponse#error! raises for any response that is neither a
# success nor a redirect.
def fetch(uri_str, limit = 5, page_type = "search")
  # todo
  raise "TOO MANY REDIRECTS" unless limit > 0

  uri = URI.parse(uri_str)
  header = {
    "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7",
    "Referer" => LIBRARY_REFERER_URL,
    "Host" => uri.host,
    "Accept-Encoding" => "gzip,deflate,sdch"
  }

  # reference: http://ruby-doc.org/stdlib/libdoc/net/http/rdoc/classes/Net/HTTP.html
  request = Net::HTTP::Get.new(uri.request_uri)
  request.initialize_http_header(header)
  # a redirect may lead to an https URL, so enable TLS when the scheme asks for it
  response = Net::HTTP.start(uri.host, uri.port, :use_ssl => uri.scheme == 'https') do |http|
    http.request(request)
  end

  case response
  when Net::HTTPSuccess
    { :response => response, :page_type => page_type }
  when Net::HTTPRedirection
    # Location may be absolute or relative, resolve it against the URL just requested
    fetch(URI.join(uri_str, response['location']).to_s, limit - 1, "detail")
  else
    response.error!
  end
end
# Uses LibraryThing's thingISBN API to retrieve related ISBNs, so we don't
# miss the book we want because of a reprint or different edition; one book
# can have many ISBNs as a result of different versions, etc.
#
# isbn - the ISBN (String) to look up
#
# Returns an Array with the text of every <isbn> element of the API response.
# The lookup is best effort: when the request fails or the response contains
# no ISBN at all, the Array holds just the given isbn so the book can still
# be searched for.
def get_related_isbns(isbn)
  # URI#open (provided by open-uri) is used because Kernel#open does not
  # accept URLs on current Ruby versions
  related_isbns = URI.parse("#{LIBRARY_THING_ISBN_URL}#{isbn}").open do |response|
    Nokogiri::XML(response).css('isbn').map { |related_isbn| related_isbn.text }
  end
  related_isbns.empty? ? [isbn] : related_isbns
rescue OpenURI::HTTPError, SocketError, SystemCallError, Timeout::Error
  # open raises instead of returning nil, fall back to the ISBN we were given
  [isbn]
end
# Parses a "detail page" (the page which results when a book has been found).
#
# body - the HTML of the detail page
#
# Walks the rows of the summary table and keeps those containing
# LIBRARY_NOT_CHECKED_OUT. Returns [LIBRARY_MY_STRING] as soon as such a row
# directly follows a node containing "Your Library"; otherwise returns an
# Array with the text of the first cell of every matching row.
#
# NOTE(review): LIBRARY_BRANCH_LOCATION is only compared against nil, so an
# empty branch id passes this check as well - confirm that is intended.
def parse_detail(body)
  open_branches = []
  summary_rows = Nokogiri::HTML(body).css('table[class=summary] tr')

  summary_rows.each do |row|
    # rows without a copy on the shelf are of no interest
    next unless row.to_s.include?(LIBRARY_NOT_CHECKED_OUT)

    # it's at our library, no need to look at the remaining rows
    if !LIBRARY_BRANCH_LOCATION.nil? && row.previous_sibling.to_s.include?("Your Library")
      return [LIBRARY_MY_STRING]
    end

    # it isn't checked out at this location, remember the library name
    open_branches << row.css('td').first.text
  end

  open_branches
end
# Parses the "search results page".
#
# page - the parsed results page; it is queried with #css for the result links
#
# Every result link is fetched (relative to LIBRARY_BASE_URL), gunzipped and
# handed to parse_detail; the libraries found are collected in link order.
# As soon as the collection contains LIBRARY_MY_STRING the remaining links
# are skipped and [LIBRARY_MY_STRING] alone is returned.
#
# Returns an Array of library names, empty when the page has no result links.
def parse_search_results(page)
  collected = []

  page.css("ol[class=result] li[class=clearfix] h3 a").each do |anchor|
    detail_url = LIBRARY_BASE_URL + anchor.attr('href')
    compressed = StringIO.new(fetch(detail_url)[:response].body)
    collected.concat(parse_detail(Zlib::GzipReader.new(compressed).read))

    # we already know it's available at our library, skip the remaining links
    return [LIBRARY_MY_STRING] if collected.include?(LIBRARY_MY_STRING)
  end

  collected
end
# Goes through the books and prints which of them are available for check out.
#
# books   - Array of Hashes with at least :title and :isbn
# library - branch id; currently unused, the search and the output both rely
#           on the LIBRARY_BRANCH_LOCATION constant instead
#
# For every book up to 20 related ISBNs are searched, 5 at a time (due to
# limits on the chipublib search). The libraries found by all batches of a
# book are accumulated, so a later batch without available copies cannot
# erase what an earlier batch found. Searching a book stops early once it is
# known to be available at our library.
#
# Prints one line per available book, or a single "No books available" line
# when nothing was found. Returns nil.
def find_books(books, library)
  puts "Finding books..."
  book_available = false

  books.each do |book|
    # get related isbns and limit collection to a maximum of 20 ISBNs
    related_isbns = get_related_isbns(book[:isbn]).first(20)
    libraries_available = []

    related_isbns.each_slice(5) do |isbn_search_range|
      isbn_search_string = isbn_search_range.join('+or+')
      fetch_result = fetch("#{LIBRARY_SEARCH_URL}?&isbn=#{isbn_search_string}&location=#{LIBRARY_BRANCH_LOCATION}&format=Book&advancedSearch=submitted")
      # unzip
      body = Zlib::GzipReader.new(StringIO.new(fetch_result[:response].body)).read

      if fetch_result[:page_type] != "search"
        # detail, one result for this book's ISBN(s)
        libraries_available.concat(parse_detail(body))
      elsif !body.include?(LIBRARY_NO_RESULTS_STRING)
        # several results, assemble a collection from their detail pages
        libraries_available.concat(parse_search_results(Nokogiri::HTML(body)))
      end

      # if it's available at our library, don't bother going through any other
      # ISBNs for this book, so the next book can be processed
      break if libraries_available.include?(LIBRARY_MY_STRING)
      # don't make too many requests too fast :)
      # sleep 0.1
    end

    # show where the book is available
    if libraries_available.include?(LIBRARY_MY_STRING)
      book_available = true
      puts "#{GREEN_COLOR}#{book[:title]}#{CLEAR_COLOR} is available at your library."
    elsif !libraries_available.empty? && LIBRARY_BRANCH_LOCATION == ''
      book_available = true
      puts "#{GREEN_COLOR}#{book[:title]}#{CLEAR_COLOR} is available at: #{libraries_available.uniq.join(', ')}"
    end
    # unavailable books are deliberately not listed
  end

  puts "#{ORANGE_COLOR}No books available#{CLEAR_COLOR}" unless book_available
end
# usage: the email is required, the library id is optional
unless [1, 2].include?(ARGV.length)
  puts "Usage: #{$0} <email> [library-id]"
  puts "Library id can be left blank, otherwise an id is needed corresponding to"
  puts "the library you want, example: 70 for Chinatown, 320 for Bucktown-Wicker park"
  puts "when left blank, every library where a book is available is listed"
  puts "(ids are from chipublib.org's catalog search)"
  exit
end

email = ARGV[0]
# second arg not specified? then search without a branch id
library = ARGV[1] || ''
# read by parse_detail and find_books, so it must be set before they are called
LIBRARY_BRANCH_LOCATION = library

# TODO:
# Sinatra-fy into heroku app (view github repo)
books = get_wishlist(email)
find_books(books, library)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment