Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save Ultrabenosaurus/7011cbf9fd72e9449fe0f95f6526f7aa to your computer and use it in GitHub Desktop.
Save Ultrabenosaurus/7011cbf9fd72e9449fe0f95f6526f7aa to your computer and use it in GitHub Desktop.
Download all ebooks from the Standard eBooks OPDS catalogue, organised into subfolders by author and book title.
#####
#
# Download all of the Standard eBooks catalogue from their OPDS feed.
#
# https://standardebooks.org/
#
# Modified to download all files for each book except the SVG cover and to
# organise the files in subfolders by Author > Book Title.
# The original script downloads only the EPUB file for each book into the
# working directory with no subfolders.
#
# This means that 6 files are downloaded per book instead of 1, making AnalogJ's
# original script much faster than mine. You also fill a lot less disk space.
#
# As a compromise, swap the active IF statement in the download loop with the
# commented-out one directly above it to only download the single EPUB file
# per book while still organising the files into subfolders.
#
# Original script by AnalogJ
# https://gist.github.com/AnalogJ/bfa4a497dedf507beddeb0173c3d98b0
#
#####
import urllib2
import os
import urllib, urlparse
import xml.etree.cElementTree as et
# Fetch and parse the complete OPDS feed, making sure the HTTP response is
# closed once the XML has been read (the original left it open).
feed = urllib2.urlopen('https://standardebooks.org/opds/all')
try:
    e = et.ElementTree(file=feed).getroot()
finally:
    feed.close()
print(e)
print("parsing")
# Every Atom <link> element in the feed is a candidate download.
for atype in e.iter('{http://www.w3.org/2005/Atom}link'):
    href = atype.get('href')
    # A <link> without a type attribute yields None; normalise it to '' so the
    # substring test below cannot raise TypeError. Untyped links are skipped.
    mime = atype.get('type') or ''
    # EPUB-only alternative: swap this commented-out IF with the active one
    # below to fetch just the single .epub file per book.
    # if href and mime == "application/epub+zip" and href.split('.')[-1] == 'epub':
    if href and mime and mime != "image/svg+xml" and "application/atom+xml" not in mime:
        # urljoin gives the same result as plain concatenation for
        # root-relative hrefs, and also copes with absolute hrefs.
        dl_url = urlparse.urljoin("https://standardebooks.org", href)
        print(dl_url)
        split = urlparse.urlsplit(dl_url)
        parts = split.path.split("/")
        # The 3rd and 4th path components name the two subfolder levels and
        # the last component is the file name. Skip anything too short to
        # provide them instead of dying with IndexError halfway through.
        if len(parts) < 4 or not parts[-1]:
            continue
        fpath = os.path.join(".", parts[2], parts[3])
        if not os.path.isdir(fpath):
            os.makedirs(fpath)
        filename = os.path.join(fpath, parts[-1])
        urllib.urlretrieve(dl_url, filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment