Skip to content

Instantly share code, notes, and snippets.

@seeRead
Last active January 1, 2016 20:49
Show Gist options
  • Save seeRead/8199309 to your computer and use it in GitHub Desktop.
Save seeRead/8199309 to your computer and use it in GitHub Desktop.
import json
import pprint
import re
import urllib2
import urlparse
from urlparse import urlparse

#http://blog.ianbicking.org/2008/12/10/lxml-an-underappreciated-web-scraping-library/
from lxml.html import parse, tostring, fromstring #for better css selectors than Beautiful Soup
from lxml.html.diff import htmldiff
from lxml import cssselect, etree

import pytz
from pytz import timezone
pp = pprint.PrettyPrinter(indent=4)
#set your wishlist here
wishlist = ''
#set your library catalog search url here
catalog = 'https://catalog.houstonlibrary.org/client/hou/search/results?&dt=list&qu='
(true,false,null) = (True,False,None)
items = []
#set this to working endpoint for https://github.com/doitlikejustin/amazon-wish-lister
url = 'http://wishlist.dev/wishlist.php?id='+wishlist+'&reveal=all'
print url
json = urllib2.urlopen(url).read()
pp.pprint(items)
items += eval(json)
print 'WISHLIST PAGE'
print page
print 'TOTAL ITEMS'
print len(items)
print 'GETTING ISBNs'
for idx, item in enumerate(items):
try:
#item['isbn'] = item['link'].split("/")[5].strip("\\")
item['isbn'] = item['link'].split("/")[2].strip("\\")
items[idx] = item
except Exception, err:
print err
print 'FIRST ISBN WAS'
print items[0]['isbn']
success = []
fail = []
opener = urllib2.build_opener()
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11')]
result = None
#do as enumerable
for item in items:
search = catalog + item['isbn']
result = opener.open(search).read()
page = fromstring(result)
page.make_links_absolute(search)
#if you are adapting to a different catalog, you will need to change this selector. it should match if the page returns a book. it shouldn't match if your library doesn't have the book.
link = page.cssselect('#detail_main_wrapper0')
#TODO add search to item dict
if len(link) > 0:
success.append(item)
print 'success - ', item['name'] ,' - ', item['isbn'],' - ', search
else :
fail.append(item)
print 'fail - ', item['name']
print 'SUCCESS'
for idx, item in enumerate(success):
search = catalog + item['isbn']
print idx, ' - ', item['name'], ' - ',item['isbn'], ' - ', search
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment