-
-
Save Ultrabenosaurus/7011cbf9fd72e9449fe0f95f6526f7aa to your computer and use it in GitHub Desktop.
Download all ebooks from the Standard eBooks OPDS catalogue, organised into subfolders by author and book title.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#####
#
# Download all of the Standard eBooks catalogue from their OPDS feed.
#
# https://standardebooks.org/
#
# Modified to download all files for each book except the SVG cover and to
# organise the files in subfolders by Author > Book Title.
# The original script downloads only the EPUB file for each book into the
# working directory with no subfolders.
#
# This means that 6 files are downloaded per book instead of 1, making AnalogJ's
# original script much faster than mine. The original also uses a lot less
# disk space.
#
# As a compromise, swap the active IF condition in the download loop below with
# the commented-out one beside it to download only the single EPUB file per
# book but still organise the files into subfolders.
#
# Original script by AnalogJ
# https://gist.github.com/AnalogJ/bfa4a497dedf507beddeb0173c3d98b0
#
#####
import errno
import os
import urllib
import urllib2
import urlparse
import xml.etree.cElementTree as et
# Fetch the OPDS feed, then download every linked file that is neither an SVG
# cover nor another Atom feed into ./<path segment 2>/<path segment 3>/<name>,
# where the segments come from the download URL's path.
FEED_URL = 'https://standardebooks.org/opds/all'
ATOM_LINK_TAG = '{http://www.w3.org/2005/Atom}link'
# Segments that must never be used as a local directory or file name.
UNSAFE_SEGMENTS = ("", ".", "..")

feed = urllib2.urlopen(FEED_URL)
try:
    e = et.ElementTree(file=feed).getroot()
finally:
    # Release the HTTP connection as soon as the feed has been parsed.
    feed.close()
print(e)
print("parsing")
for atype in e.iter(ATOM_LINK_TAG):
    href = atype.get('href')
    # A <link> without a type attribute yields None; normalise it to "" so the
    # substring test below cannot raise TypeError.
    link_type = atype.get('type') or ""
    # EPUB-only alternative (swap with the active condition below):
    # if href and link_type == "application/epub+zip" and href.split('.')[-1] == 'epub':
    if href and link_type != "image/svg+xml" and "application/atom+xml" not in link_type:
        # urljoin resolves root-relative hrefs against the site and leaves
        # absolute hrefs intact, unlike plain string concatenation.
        dl_url = urlparse.urljoin(FEED_URL, href)
        print(dl_url)
        segments = urlparse.urlsplit(dl_url).path.split("/")
        # Segments 2 and 3 name the two folder levels, the last one the file.
        if len(segments) < 4:
            print("skipping, unexpected URL layout: " + dl_url)
            continue
        author_dir, title_dir, basename = segments[2], segments[3], segments[-1]
        if (author_dir in UNSAFE_SEGMENTS or title_dir in UNSAFE_SEGMENTS
                or basename in UNSAFE_SEGMENTS):
            # Empty or dot segments would write outside the intended folders
            # or leave no file name to write to.
            print("skipping, unsafe path segment: " + dl_url)
            continue
        fpath = "./" + author_dir + "/" + title_dir + "/"
        try:
            os.makedirs(fpath)
        except OSError as err:
            # An already-existing directory is fine; anything else is fatal.
            if err.errno != errno.EEXIST or not os.path.isdir(fpath):
                raise
        filename = fpath + basename
        urllib.urlretrieve(dl_url, filename)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment