# Ultrabenosaurus/standard_ebooks_opds_downloader.py

## standard_ebooks_opds_downloader.py
#####
#
# Download all of the Standard eBooks catalogue from their OPDS feed.
#
# https://standardebooks.org/
#
# Modified to download all files for each book except the SVG cover and to
# organise the files in subfolders by Author > Book Title.
# The original script downloads only the EPUB file for each book into the
# working directory with no subfolders.
#
# This means that 6 files are downloaded per book instead of 1, so AnalogJ's
# original script is much faster than this one and uses a lot less disk space.
#
# As a compromise, swap the commented-out and active IF statements at the top
# of the download loop to only download the single EPUB file per book but
# still organise them into subfolders.
#
# Original script by AnalogJ
# https://gist.github.com/AnalogJ/bfa4a497dedf507beddeb0173c3d98b0
#
#####

import urllib2
import os
import urllib, urlparse
import xml.etree.cElementTree as et

# Site root; every href found in the feed is resolved against it.
BASE_URL = "https://standardebooks.org"

# Fetch the complete OPDS feed and parse it into an element tree. The tree is
# fully built by the ElementTree constructor, so the HTTP response can be
# closed as soon as it returns.
response = urllib2.urlopen(BASE_URL + "/opds/all")
try:
    e = et.ElementTree(file=response).getroot()
finally:
    response.close()

print(e)
print("parsing")

# Walk every Atom <link> element in the feed and download the ones that pass
# the filter below into ./<path segment 2>/<path segment 3>/<file name>.
for atype in e.iter('{http://www.w3.org/2005/Atom}link'):
    href = atype.get('href')
    # The type attribute may be absent; fall back to "" so the substring test
    # below cannot raise TypeError on None.
    link_type = atype.get('type') or ""

    # EPUB-only alternative (swap it with the active condition below):
    # if href and link_type == "application/epub+zip" and href.split('.')[-1] == 'epub':
    if href and link_type != "image/svg+xml" and "application/atom+xml" not in link_type:
        # urljoin yields BASE_URL + href for site-relative hrefs and leaves
        # already-absolute hrefs untouched.
        dl_url = urlparse.urljoin(BASE_URL, href)
        print(dl_url)

        segments = urlparse.urlsplit(dl_url).path.split("/")
        # The folder layout needs path segments 2 and 3 plus a non-empty
        # final segment to use as the file name; skip anything shorter
        # instead of dying with IndexError / writing to a directory path.
        if len(segments) < 4 or not segments[-1]:
            print("skipping, unexpected path layout")
            continue

        fpath = os.path.join(".", segments[2], segments[3])
        if not os.path.isdir(fpath):
            os.makedirs(fpath)
        filename = os.path.join(fpath, segments[-1])

        urllib.urlretrieve(dl_url, filename)