@santiagobasulto
Last active May 16, 2021 10:13
Download HumbleBundle books in batch with a simple Python script.

Download HumbleBundle books

This is a quick Python script I wrote to download Humble Bundle books in batch. I bought the amazing Machine Learning by O'Reilly bundle, which came with 15 books to download in 3 different file formats each, so I put together a script to grab all of them at once.

(Final Result: books downloaded)

It's a simple script; the only tricky part is extracting the rendered HTML from the Humble Bundle page. Here is a step-by-step guide:

Step 1: Open the download page

After your purchase, open the download page:

Humble Bundle Download Page

This is how mine looks:

Step 2: Inspect element

I'm using Chrome, but Firefox also works for this. Right click anywhere on the page and click on "Inspect Element":

screenshot at 12-46-50

Once you click on Inspect, the developer window should pop up:

screenshot at 12-47-42

Step 3: Scroll all the way up

Scroll up until you see the initial <html> element. Once you've identified it, right click on it and select Copy > Copy Element:

screenshot at 12-49-14

Step 4: Paste the content

Create a new file in your favorite editor and paste the contents that you've just copied from the previous step.

screenshot at 12-50-33

Give the HTML file a descriptive name because we'll use it in the next step. For example: humble_bundle_ml.html
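Optionally, you can sanity-check the saved file before running anything: the script's parser looks for a div with the class js-all-downloads-holder, so a plain substring search is a quick (if rough) way to confirm you copied the right element. This helper is an illustration, not part of the script:

```python
def has_download_holder(html_text):
    # Rough sanity check: the parser relies on a div with this class,
    # so the copied HTML should contain it somewhere.
    return 'js-all-downloads-holder' in html_text
```

For example, has_download_holder(open('humble_bundle_ml.html').read()) should return True if Step 3 worked.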

Step 5: Run the command!

Important: this script requires Python 3

Now you're ready to download those books. In your terminal, create and activate a virtualenv, then install the dependencies:

$ python3 -m venv venv
$ source venv/bin/activate
$ pip install beautifulsoup4 requests

Now you can invoke the actual command:

$ python hb_download.py humble_bundle_ml.html --epub --pdf

By default it'll download the books into a directory named books/. You can change that with the -d option.

Command Usage

❯ python hb_download.py --help
usage: hb_download.py [-h] [-d DESTINATION_DIR] [--epub] [--pdf] [--mobi]
                      html_file

Download

positional arguments:
  html_file             HTML file to download books from

optional arguments:
  -h, --help            show this help message and exit
  -d DESTINATION_DIR, --destination-dir DESTINATION_DIR
                        Directory where books will be saved
  --epub
  --pdf
  --mobi

hb_download.py

import argparse
from pathlib import Path
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup


def parse_download_links(html_file_content):
    soup = BeautifulSoup(html_file_content)
    external_wrapper_div = soup.find('div', class_='js-all-downloads-holder')
    wrapper_div = external_wrapper_div.find('div', class_='whitebox-redux')
    books = []
    for div in wrapper_div.find_all('div'):
        data_div = div.find('div', attrs={'data-human-name': True})
        if not data_div:
            continue
        download_div = div.find('div', class_='download-buttons')
        download_links = {}
        for button_div in download_div.find_all('div', class_='small'):
            label = button_div.find('span', class_='label').text
            download_link = button_div.find(
                'a', class_='a', attrs={'href': True})['href']
            download_links[label] = download_link
        books.append({
            'title': data_div['data-human-name'],
            'slug': data_div['data-human-name'].lower().replace(' ', '-'),
            'download_links': download_links
        })
    return books


def safe_create_dir(path):
    path.mkdir(exist_ok=True)


def download_file_from_url(base_path, url, chunk_size=None):
    chunk_size = chunk_size or (4 * 1024)
    filename = urlparse(url).path.replace('/', '')
    book_path = base_path / filename
    if book_path.exists():
        # book already downloaded
        return (book_path, False)
    with requests.get(url, stream=True) as resp:
        with book_path.open('wb') as fp:
            for chunk in resp.iter_content(chunk_size=chunk_size):
                if chunk:
                    fp.write(chunk)
    return (book_path, True)


def download_books(html_file_content, download_dir='./books', pdf=False, epub=False, mobi=False):
    books_parsed = parse_download_links(html_file_content)
    base_path = Path(download_dir)
    safe_create_dir(base_path)
    for book in books_parsed:
        book_base_path = base_path / book['title']
        safe_create_dir(book_base_path)
        download_urls = [
            url for should_download, url in [
                (pdf, book['download_links'].get('PDF')),
                (mobi, book['download_links'].get('MOBI')),
                (epub, book['download_links'].get('EPUB')),
            ]
            if should_download
        ]
        for url in download_urls:
            result, downloaded = download_file_from_url(book_base_path, url)
            if not downloaded:
                print("Skipped: ", result)
            else:
                print("Downloaded: ", result)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Download ')
    parser.add_argument(
        'html_file', type=argparse.FileType(),
        help='HTML file to download books from')
    parser.add_argument(
        '-d', '--destination-dir', type=str,
        help="Directory where books will be saved", default='books')
    parser.add_argument('--epub', action='store_true', default=True)
    parser.add_argument('--pdf', action='store_true')
    parser.add_argument('--mobi', action='store_true')
    args = parser.parse_args()

    html = args.html_file.read()
    download_books(
        html, args.destination_dir,
        pdf=args.pdf, epub=args.epub, mobi=args.mobi,
    )
Susensio commented Aug 29, 2018

Great work dude! Although the script is failing on "An Introduction to Machine Learning Interpretability" because of the missing PDF and MOBI formats. Adding if url is not None: after line 73 solves the issue.
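The same fix can also be expressed as a tiny filter applied to download_urls before the loop in download_books(); available_urls is a hypothetical helper name, not something from the script:

```python
def available_urls(download_urls):
    # Drop missing formats so download_file_from_url() never sees None.
    return [url for url in download_urls if url is not None]
```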

GhostofGoes commented Sep 11, 2018

Thank you for making this awesome script! I ran into an issue when downloading Automate the Boring Stuff with Python: Practical Programming for Total Beginners from the "Linux Geeks" bundle on Windows 10. An exception was raised, NotADirectoryError: [WinError 267] The directory name is invalid: 'books\\Automate the Boring Stuff with Python: Practical Programming for Total Beginners'. The issue is the : in the path.

Fix: Add .replace(':', '') at the end of line 62, with the full line being book_base_path = base_path / book['title'].replace(':', '')

Also, thank you @Susensio. Your solution fixed the other error I got!
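This fix can be written as a small helper. Extending it beyond : to the other characters Windows rejects in file names is an assumption on my part, not something the script or the comment does:

```python
def sanitize_title(title):
    # Remove characters Windows forbids in directory names.
    # The comment above only strips ':'; the rest of the set is an
    # assumed generalization.
    for ch in '<>:"/\\|?*':
        title = title.replace(ch, '')
    return title
```

Used as book_base_path = base_path / sanitize_title(book['title']).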


bjarnebuchmann commented Aug 6, 2020

Dear Santiago,
I would really like to get this to work, it seems like a great project with a pretty simple interface. However, I do run into problems, and the script carps with the following error:

[]$ python3 hb_download.py -d HumbleBundle_LinuxGeek --mobi HumbleBundle_LinuxGeek/bundle.html
Traceback (most recent call last):
  File "hb_download.py", line 100, in <module>
    pdf=args.pdf, epub=args.epub, mobi=args.mobi,
  File "hb_download.py", line 76, in download_books
    result, downloaded = download_file_from_url(book_base_path, url)
  File "hb_download.py", line 48, in download_file_from_url
    with requests.get(url, stream=True) as resp:
AttributeError: __exit__

Also, I got a warning about BeautifulSoup falling back to a default parser; it can be avoided by passing "lxml" as a second argument to the BeautifulSoup() call in line 10.

I know quite a bit about programming and Python, but I am not at all proficient in BeautifulSoup. For instance, I do not know exactly how your script deals with the login procedure to HumbleBundle.

I have introduced the changes suggested by @Susensio and @GhostofGoes, but that did not seem to change anything in the present case.

All help is most welcome.

EDIT:
Changing (around line 50) with requests.get(url, stream=True) as resp: to resp = requests.get(url, stream=True), and re-indenting the following lines, fixed the above problem for me. However, at this point all downloads are zero-size (empty) files. So subdirectories for books are created, and files too, but no content is actually downloaded.

I suspect that this has to do with login, but I am not sure how the login is performed with this py-script. Trying to use the URLs directly with e.g. wget just results in ERROR 403: Forbidden, so presumably the login cookie needs to be copied over from the Chrome session as well and passed along with the requests calls.

PS: If you figure out how to check the md5sums at some point, that would be a great addition too.
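On the checksum idea: a sketch of an MD5 helper using only the standard library, assuming you can obtain the expected checksum from the download page (md5_of_file is a hypothetical name, not part of the script):

```python
import hashlib

def md5_of_file(path, chunk_size=4 * 1024):
    # Hash the file in chunks so large books don't need to fit in memory.
    digest = hashlib.md5()
    with open(path, 'rb') as fp:
        for chunk in iter(lambda: fp.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()
```

Comparing the result against the expected checksum would catch truncated or empty downloads like the ones described above.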

@hsantos78

> Thank you for making this awesome script! I ran into an issue when downloading Automate the Boring Stuff with Python: Practical Programming for Total Beginners from the "Linux Geeks" bundle on Windows 10. An exception was raised, NotADirectoryError: [WinError 267] The directory name is invalid: 'books\\Automate the Boring Stuff with Python: Practical Programming for Total Beginners'. The issue is the : in the path.
>
> Fix: Add .replace(':', '') at the end of line 62, with the full line being book_base_path = base_path / book['title'].replace(':', '')
>
> Also, thank you @Susensio. Your solution fixed the other error I got!

Hello, I just tried .replace(':', '') and was able to download the books, except it errors out on one book. I get the following message:


Downloaded:  C:\Python\HB\books\20200911M\Illustrated Guide to Home Chemistry Experiments\illustratedguidetohomechemistryexperiments.epub
Downloaded:  C:\Python\HB\books\20200911M\Atmospheric Monitoring with Arduino\atmosphericmonitoringwitharduino.epub
Traceback (most recent call last):
  File "hb_download.py", line 97, in <module>
    download_books(
  File "hb_download.py", line 75, in download_books
    result, downloaded = download_file_from_url(book_base_path, url)
  File "hb_download.py", line 49, in download_file_from_url
    with book_path.open('wb') as fp:
  File "C:\Python\Python38\lib\pathlib.py", line 1218, in open
    return io.open(self, mode, buffering, encoding, errors, newline,
  File "C:\Python\Python38\lib\pathlib.py", line 1074, in _opener
    return self._accessor.open(self, flags, mode)
FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Python\\HB\\books\\20200911M\\Make Inventing a Better Mousetrap \\inventingabettermousetrap.epub'

I am currently using Win 10
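Note the trailing space in 'Make Inventing a Better Mousetrap ' in that traceback: Windows silently drops trailing spaces when creating a directory, so the path later used for the file no longer matches the directory that was created. A likely fix (clean_dir_name is a hypothetical helper, not in the script) is to strip the title before building the path:

```python
def clean_dir_name(title):
    # Windows drops trailing spaces from directory names, so strip
    # surrounding whitespace before using the title as a path component.
    return title.strip()
```

Used as book_base_path = base_path / clean_dir_name(book['title']), alongside the ':' replacement above.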
