Skip to content

Instantly share code, notes, and snippets.

@skyronic
Created September 11, 2012 13:58
Show Gist options
  • Save skyronic/3698698 to your computer and use it in GitHub Desktop.
Save skyronic/3698698 to your computer and use it in GitHub Desktop.
Download the page images of a book from Google Books
#!/usr/bin/env python
# Downloader for the page images of a Google Books title (see main()).
import json
import logging
import os
import re

import requests

logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

# ID of the book to fetch (the "id" query parameter on books.google.com URLs).
BOOK_ID = "l8B2NWPTDWcC"
# Optional requests-style proxies mapping; None means a direct connection.
PROXY = None
# Spoof a desktop browser so Google serves the regular web reader pages.
USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.64 Safari/537.1"
# Paste the Cookie header from an authenticated browser session here.
COOKIE = "Set your google cookie here"
HEADERS = {
    'User-Agent': USER_AGENT,
    'Cookie': COOKIE,
}
def get_contents(url):
    """Return the decoded text body of *url*, fetched with the spoofed headers.

    A timeout is set so a stalled connection cannot hang the download loop
    forever (requests blocks indefinitely by default), and the module-level
    PROXY setting is honoured.
    """
    return requests.get(url, headers=HEADERS, proxies=PROXY, timeout=30).text
def get_binary_contents(url):
    """Return the raw bytes of *url* (for saving images to disk).

    Mirrors get_contents(): same headers, proxy setting, and timeout, but
    yields the undecoded response body.
    """
    return requests.get(url, headers=HEADERS, proxies=PROXY, timeout=30).content
def main():
    """Download every page image of BOOK_ID into a directory named after it.

    Scrapes the book's front-cover page for the embedded _OC_Run(...) JSON,
    extracts the ordered page IDs, then fetches and saves each page image,
    skipping files that already exist so interrupted runs can resume.
    """
    logging.info("Starting GetBook")

    # The front-cover page embeds, inside a call to the JS function
    # _OC_Run(...), JSON arguments describing the book and its pages.
    coverUrl = "http://books.google.com/books?id=%s&hl=en&printsec=frontcover" % BOOK_ID
    coverHtml = get_contents(coverUrl)
    match = re.search(r"_OC_Run\((.*?)\)", coverHtml)
    if not match:
        logging.error("Unable to find information about OC_Run")
        return

    # The captured text is _OC_Run's argument list; wrapping it in brackets
    # turns it into a parseable JSON array.
    oc_args = json.loads("[%s]" % match.group(1))
    if len(oc_args) < 2:
        logging.error("Format of OC_Run must have changed")
        return

    # First argument: page info; second: book info (currently unused beyond
    # the unpack).
    pages_info, book_info = oc_args[:2]

    # Pluck the page IDs, sorted into reading order by their "order" field.
    page_ids = [x["pid"] for x in sorted(pages_info["page"], key=lambda d: d["order"])]
    logging.info("Successfully found information for %d pages", len(page_ids))

    # "prefix" is a URL fragment; appending "&pg=<pid>" yields each page URL.
    # NOTE(review): calling .decode() on the JSON string is a Python 2 idiom
    # (it unescapes \uXXXX sequences); under Python 3 this line raises
    # AttributeError — confirm the intended interpreter version.
    prefix = pages_info["prefix"].decode("raw_unicode_escape")
    prefix += "&pg="
    logging.info("The prefix is %s", prefix)

    # Store all images in a directory named after the book ID.
    baseDir = BOOK_ID  # change to an absolute path if desired
    if not os.path.isdir(baseDir):
        os.mkdir(baseDir)

    for index, pid in enumerate(page_ids):
        logging.info("\n\n\n\nProcessing page #%d. ID - %s", index, pid)
        fileName = os.path.join(baseDir, pid + '.png')

        # Resume support: skip pages that were already downloaded.
        if os.path.isfile(fileName):
            logging.info("File already exists. Skipping.")
            continue

        # Each page's HTML names the actual image URL in "preloadImg.src".
        pageInfoHtml = get_contents(prefix + pid)
        match = re.search(r"preloadImg.src = '([^']*?)'", pageInfoHtml)
        if not match:
            logging.error("Unable to find an image inside the page")
            continue
        imageUrl = match.group(1)

        # Binary mode ("wb") is required for image bytes — the original text
        # mode "w" corrupts data on some platforms and fails outright on
        # Python 3; `with` guarantees the handle is closed even on error.
        with open(fileName, "wb") as fileHandle:
            fileHandle.write(get_binary_contents(imageUrl))
        logging.info("Successfully downloaded %s", pid)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment