# sferacu/getbook.py

## getbook.py
#!/bin/python
import re, requests, logging, os, json
# Log everything from DEBUG upwards as "LEVEL:message".
logging.basicConfig(level=logging.DEBUG, format='%(levelname)s:%(message)s')

# Volume id that main() interpolates into the Google Books cover URL.
BOOK_ID = "duk7k-YoYwEC"

# NOTE(review): not referenced by any code visible in this file.
PROXY = None

# Values sent with every request through HEADERS below.
USER_AGENT = (
	"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.1 (KHTML, like Gecko) "
	"Chrome/21.0.1180.64 Safari/537.1"
)
COOKIE = "GA1.3.617046422.1538474875"

# HTTP headers passed to requests.get() by the fetch helpers.
HEADERS = {'User-Agent': USER_AGENT, 'Cookie': COOKIE}

def get_contents(url, timeout=30):
	""" Gets the contents of a URL as text.

	Issues a GET request carrying the module-level HEADERS.

	Args:
		url: Address to fetch.
		timeout: Seconds to wait for the server, handed to requests.get().
			requests waits indefinitely when no timeout is given, so a
			stalled connection would otherwise hang the whole script.

	Returns:
		The response body as text (``Response.text``).

	Raises:
		requests.exceptions.Timeout: The server did not answer in time.
	"""
	return requests.get(url, headers=HEADERS, timeout=timeout).text

def get_binary_contents(url, timeout=30):
	""" Gets the binary contents of a URL for saving as image.

	Issues a GET request carrying the module-level HEADERS.

	Args:
		url: Address to fetch.
		timeout: Seconds to wait for the server, handed to requests.get().
			requests waits indefinitely when no timeout is given, so a
			stalled connection would otherwise hang the whole script.

	Returns:
		The raw response body as bytes (``Response.content``).

	Raises:
		requests.exceptions.Timeout: The server did not answer in time.
	"""
	return requests.get(url, headers=HEADERS, timeout=timeout).content

def _parse_oc_run_args(html):
	""" Returns the JSON arguments of the first "_OC_Run(" call in html.

	The arguments are decoded one JSON value at a time, so a ")" inside a
	string value cannot cut the argument list short the way a
	"match up to the first closing parenthesis" regex would.

	Returns:
		None when html contains no "_OC_Run(" at all, otherwise a list of
		every leading argument that could be decoded (possibly empty).
	"""
	marker = "_OC_Run("
	start = html.find(marker)
	if start < 0:
		return None

	decoder = json.JSONDecoder()
	args = []
	end = len(html)
	pos = start + len(marker)
	while True:
		# raw_decode() does not skip leading whitespace on its own.
		while pos < end and html[pos].isspace():
			pos += 1
		try:
			value, pos = decoder.raw_decode(html, pos)
		except ValueError:
			# Reached ")" or something that is not JSON: stop collecting.
			break
		args.append(value)
		while pos < end and html[pos].isspace():
			pos += 1
		if pos >= end or html[pos] != ",":
			break
		pos += 1
	return args


def main():
	""" Main entry point of the application.

	Fetches the cover page for BOOK_ID, reads the page list from the
	arguments of the "_OC_Run" call found in it, and saves the image of
	every page as "<pid>.png" inside a directory named after BOOK_ID.
	Pages whose file already exists are skipped, so an interrupted run can
	simply be started again.

	Returns None. Problems with the page format are logged and end the run
	(or skip the page); errors raised by the fetch helpers propagate.
	"""
	logging.info("Starting GetBook")

	# Information is found in the Cover URL
	coverUrl = "http://books.google.com/books?id=%s&hl=en&printsec=frontcover" % BOOK_ID
	coverHtml = get_contents(coverUrl)

	# The cover page calls "_OC_Run" with JSON arguments that describe
	# all the page IDs.
	oc_args = _parse_oc_run_args(coverHtml)
	if oc_args is None:
		logging.error("Unable to find information about OC_Run")
		return

	# Two arguments are expected (page info, then book info); only the
	# page info is used here.
	if len(oc_args) < 2:
		logging.error("Format of OC_Run must have changed")
		return
	pages_info = oc_args[0]

	# Pluck the page ids in reading order.
	ordered_pages = sorted(pages_info["page"], key=lambda d: d["order"])
	page_ids = [page["pid"] for page in ordered_pages]
	logging.info("Successfully found information for %d pages", len(page_ids))

	# Resolve any literal \uXXXX sequences left in the prefix. The
	# encode/decode round trip does on both Python 2 and 3 what
	# unicode.decode("raw_unicode_escape") did on Python 2 only.
	prefix = pages_info["prefix"]
	prefix = prefix.encode("raw_unicode_escape").decode("raw_unicode_escape")
	# Now, simply adding the page id to the prefix will give the URL of the page
	prefix += "&pg="
	logging.info("The prefix is %s", prefix)

	# Store all images in a directory for book id
	baseDir = BOOK_ID # Change this to absolute path of wherever you want files
	if not os.path.isdir(baseDir):
		os.mkdir(baseDir)

	# Compiled once; the pattern is the same for every page.
	image_src = re.compile(r"preloadImg.src = '([^']*?)'")

	for index, pid in enumerate(page_ids):
		logging.info("\n\n\n\nProcessing page #%d. ID - %s", index, pid)
		fileName = os.path.join(baseDir, pid + '.png')

		# Check if the page is already downloaded
		if os.path.isfile(fileName):
			logging.info("File already exists. Skipping.")
			continue

		pageInfoHtml = get_contents(prefix + pid)

		match = image_src.search(pageInfoHtml)
		if not match:
			logging.error("Unable to find an image inside the page")
			continue

		# Download BEFORE creating the file: a failed or empty download must
		# not leave a file behind that later runs would treat as done.
		imageData = get_binary_contents(match.group(1))
		if not imageData:
			logging.error("Downloaded image for %s is empty. Skipping.", pid)
			continue

		# Binary mode: the payload is raw image bytes, not text.
		with open(fileName, "wb") as fileHandle:
			fileHandle.write(imageData)

		logging.info("Successfully downloaded %s", pid)


# Run the download only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
	main()