Forked from Ultrabenosaurus/standard_ebooks_opds_downloader.py
Last active
September 2, 2019 18:15
-
-
Save robert-clayton/4319eb44688bbc48b4e985f00e202a2a to your computer and use it in GitHub Desktop.
Download all ebooks from the Standard eBooks OPDS catalogue, organised into subfolders by author and book title.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
####
#
# Download all of the Standard eBooks catalogue from their OPDS feed.
#
# https://standardebooks.org/
#
# Modified to download all files for each book except the SVG cover and to
# organise the files in subfolders by Author > Book Title.
# The original script downloads only the EPUB file for each book into the
# working directory with no subfolders.
#
# This means that 6 files are downloaded per book instead of 1, making AnalogJ's
# original script much faster than mine. You also fill a lot less disk space.
#
# As a compromise, swap the IF statements on lines 34/35 to only download the
# single EPUB file per book but still organise them into subfolders.
#
# Original script by AnalogJ
# https://gist.github.com/AnalogJ/bfa4a497dedf507beddeb0173c3d98b0
#
#####
import multiprocessing
import os
import time
import urllib.request
# xml.etree.cElementTree was removed in Python 3.9; plain ElementTree uses
# the C accelerator automatically when it is available.
import xml.etree.ElementTree as et
def fetch_urls(element):
    """Build and return the list of file URLs to download from the OPDS feed.

    Skips SVG cover images and Atom navigation/catalogue links; every other
    ``<link>`` with an ``href`` becomes an absolute standardebooks.org URL.

    :param xml.etree.ElementTree.Element element: Parsed feed root to scan.
    :return: List of absolute URLs, in document order.
    :rtype: list[str]
    """
    urls = []
    for link in element.iter('{http://www.w3.org/2005/Atom}link'):
        href = link.get('href')
        # A <link> without a type attribute used to crash the membership
        # test with TypeError ("in None"); treat missing type as a
        # downloadable file rather than failing the whole run.
        mime = link.get('type') or ''
        if href and mime != "image/svg+xml" and "application/atom+xml" not in mime:
            urls.append("https://standardebooks.org{0}".format(href))
    return urls
def download_url(url): | |
"""Downloads urls into folders. | |
Saves to: `./{author}/{title}/{file_name}` | |
:param str url: The given url to download. | |
""" | |
try: | |
author = url.split("/")[-3] | |
title = url.split("/")[-2] | |
file_name = url.split("/")[-1] | |
save_path = f'./{author}/{title}/' | |
try: | |
os.makedirs(save_path) | |
except: | |
pass | |
urllib.request.urlretrieve(url, save_path + file_name) | |
except Exception as e: | |
raise Exception(e.message) | |
if __name__ == '__main__':
    # Bound up front so the KeyboardInterrupt handler can safely test it;
    # previously an early Ctrl+C raised NameError on `pool`.
    pool = None
    try:
        t0 = time.time()
        # Fetch the full OPDS catalogue and extract every file URL.
        print('Fetching URLs to download...')
        element = et.ElementTree(file=urllib.request.urlopen('https://standardebooks.org/opds/all')).getroot()
        urls = fetch_urls(element)
        # Fan the downloads out across one worker process per CPU.
        pool = multiprocessing.Pool()
        print('Downloading...')
        pool.map(download_url, urls)
        pool.close()
        pool.join()
        print(f'\t...Complete! Took {round(time.time() - t0, 2)} seconds.')
    # 'Listen' for interrupt Ctrl+C
    except KeyboardInterrupt:
        print('Stopping workers...')
        if pool is not None:
            # terminate() kills workers immediately; join() then reaps them.
            # (The old close()-then-terminate() order never joined at all.)
            pool.terminate()
            pool.join()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Multi-processed Ultrabenosaurus' version of the script.