Download all ebooks from the Standard eBooks OPDS catalogue, organised into subfolders by author and book title.
#####
#
# Download all of the Standard eBooks catalogue from their OPDS feed.
#
# https://standardebooks.org/
#
# Modified to download all files for each book except the SVG cover and to
# organise the files in subfolders by Author > Book Title.
# The original script downloads only the EPUB file for each book into the
# working directory with no subfolders.
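#
# For example, with this version a single book's files land in folders like
# (hypothetical author/title slugs):
#   ./jane-austen/pride-and-prejudice/pride-and-prejudice.epub
#   ./jane-austen/pride-and-prejudice/pride-and-prejudice.azw3
#   ...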
#
# This means that six files are downloaded per book instead of one, so AnalogJ's
# original script is much faster than mine and uses far less disk space.
#
# As a compromise, narrow the filter conditions in fetch_urls() to download only
# the single EPUB file per book while still organising the files into
# subfolders, as in the commented sketch below.
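# A minimal sketch of that narrower filter (an assumption here: Standard Ebooks
# serves its EPUB acquisition links with the 'application/epub+zip' media type):
#
#   return ["https://standardebooks.org{0}".format(link.get('href'))
#           for link in element.iter('{http://www.w3.org/2005/Atom}link')
#           if (link.get('type') or '').startswith('application/epub+zip')]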
#
# Original script by AnalogJ
# https://gist.github.com/AnalogJ/bfa4a497dedf507beddeb0173c3d98b0
#
#####
import os
import urllib.request
import xml.etree.ElementTree as et
import multiprocessing
import time


def fetch_urls(element):
    """Builds and returns a list of all desired urls to download from.

    :param xml.etree.ElementTree.Element element: The tree to iterate through for urls.
    """
    # Skip the SVG cover and the Atom navigation links; some links carry no
    # 'type' attribute, so default it to '' before the substring test.
    return ["https://standardebooks.org{0}".format(link.get('href'))
            for link in element.iter('{http://www.w3.org/2005/Atom}link')
            if link.get('href') and link.get('type') != "image/svg+xml"
            and "application/atom+xml" not in (link.get('type') or '')]


def download_url(url):
    """Downloads a url into folders.

    Saves to: `./{author}/{title}/{file_name}`

    :param str url: The given url to download.
    """
    # The url is assumed to end in .../{author}/{title}/{file_name}, so the
    # last three path segments give the folder structure.
    author = url.split("/")[-3]
    title = url.split("/")[-2]
    file_name = url.split("/")[-1]
    save_path = f'./{author}/{title}/'
    os.makedirs(save_path, exist_ok=True)
    urllib.request.urlretrieve(url, save_path + file_name)


if __name__ == '__main__':
    # Create the pool before the try block so the KeyboardInterrupt
    # handler below can always reference it.
    pool = multiprocessing.Pool()
    try:
        t0 = time.time()
        # Fetch URLs
        print('Fetching URLs to download...')
        element = et.ElementTree(file=urllib.request.urlopen('https://standardebooks.org/opds/all')).getroot()
        urls = fetch_urls(element)
        # Map the downloads across the pool's workers
        print('Downloading...')
        pool.map(download_url, urls)
        pool.close()
        pool.join()
        print(f'\t...Complete! Took {round(time.time() - t0, 2)} seconds.')
    # 'Listen' for interrupt Ctrl+C
    except KeyboardInterrupt:
        print('Stopping workers...')
        pool.terminate()
        pool.join()
@robert-clayton (Author):
A multi-processed version of Ultrabenosaurus' script.
