Download all ebooks from the Standard eBooks OPDS catalogue, organised into subfolders by author and book title.
#####
#
# Download all of the Standard eBooks catalogue from their OPDS feed.
#
# https://standardebooks.org/
#
# Modified to download all files for each book except the SVG cover and to
# organise the files in subfolders by Author > Book Title.
# The original script downloads only the EPUB file for each book into the
# working directory with no subfolders.
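#
# For example, with this version a single book's files land in folders like
# (hypothetical author/title slugs):
#   ./jane-austen/pride-and-prejudice/pride-and-prejudice.epub
#   ./jane-austen/pride-and-prejudice/pride-and-prejudice.azw3
#   ...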
#
# This means that six files are downloaded per book instead of one, so AnalogJ's
# original script is much faster than mine and uses far less disk space.
#
# As a compromise, narrow the filter conditions in fetch_urls() to download only
# the single EPUB file per book while still organising the files into
# subfolders, as in the commented sketch below.
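# A minimal sketch of that narrower filter (an assumption here: Standard Ebooks
# serves its EPUB acquisition links with the 'application/epub+zip' media type):
#
#   return ["https://standardebooks.org{0}".format(link.get('href'))
#           for link in element.iter('{http://www.w3.org/2005/Atom}link')
#           if (link.get('type') or '').startswith('application/epub+zip')]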
#
# Original script by AnalogJ
# https://gist.github.com/AnalogJ/bfa4a497dedf507beddeb0173c3d98b0
#
#####
import os
import urllib.request
import xml.etree.ElementTree as et
import multiprocessing
import time


def fetch_urls(element):
    """Builds and returns a list of all desired urls to download from.

    :param xml.etree.ElementTree.Element element: The tree to iterate through for urls.
    """
    # Skip the SVG cover and the Atom navigation links; some links carry no
    # 'type' attribute, so default it to '' before the substring test.
    return ["https://standardebooks.org{0}".format(link.get('href'))
            for link in element.iter('{http://www.w3.org/2005/Atom}link')
            if link.get('href') and link.get('type') != "image/svg+xml"
            and "application/atom+xml" not in (link.get('type') or '')]


def download_url(url):
    """Downloads a url into folders.

    Saves to: `./{author}/{title}/{file_name}`

    :param str url: The given url to download.
    """
    # The url is assumed to end in .../{author}/{title}/{file_name}, so the
    # last three path segments give the folder structure.
    author = url.split("/")[-3]
    title = url.split("/")[-2]
    file_name = url.split("/")[-1]
    save_path = f'./{author}/{title}/'
    os.makedirs(save_path, exist_ok=True)
    urllib.request.urlretrieve(url, save_path + file_name)


if __name__ == '__main__':
    # Create the pool before the try block so the KeyboardInterrupt
    # handler below can always reference it.
    pool = multiprocessing.Pool()
    try:
        t0 = time.time()
        # Fetch URLs
        print('Fetching URLs to download...')
        element = et.ElementTree(file=urllib.request.urlopen('https://standardebooks.org/opds/all')).getroot()
        urls = fetch_urls(element)
        # Map the downloads across the pool's workers
        print('Downloading...')
        pool.map(download_url, urls)
        pool.close()
        pool.join()
        print(f'\t...Complete! Took {round(time.time() - t0, 2)} seconds.')
    # 'Listen' for interrupt Ctrl+C
    except KeyboardInterrupt:
        print('Stopping workers...')
        pool.terminate()
        pool.join()
@robert-clayton (Author):
A multi-processed version of Ultrabenosaurus' script.
