Skip to content

Instantly share code, notes, and snippets.

@ayyybe
Last active September 23, 2021 00:22
Show Gist options
  • Save ayyybe/85b431586bb98908a8a3d983886c12a5 to your computer and use it in GitHub Desktop.
Save ayyybe/85b431586bb98908a8a3d983886c12a5 to your computer and use it in GitHub Desktop.
download & reconstruct a local copy of hosted EPUBs (to download online textbooks, etc.)
#!/usr/bin/env python3
import os
import shutil
import tempfile
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
import zipfile
from concurrent.futures import ThreadPoolExecutor
from threading import Thread

try:
    import readline  # input gets limited to 1024 characters for some reason if this isn't imported (macos only?)
except ImportError:
    pass
# EPUB spec:
# https://www.w3.org/publishing/epub3/epub-spec.html
# NOTE: this script is not fully spec-compliant: it does not support EPUBs with multiple renditions/root files, and it ignores the optional META-INF files (encryption.xml, manifest.xml, metadata.xml, rights.xml, signatures.xml).
# That said, I have yet to find an EPUB that actually uses any of these features.
def dl(path):
    """Download one EPUB resource into the staging directory.

    ``path`` is a '/'-separated location relative to the EPUB root, exactly
    as it appears in ``container.xml`` or in a manifest ``href`` (so it may
    be percent-encoded). It is resolved against the module-level ``root``
    URL, and the file is written beneath the module-level ``staging``
    directory at the same relative location.

    Returns the local filesystem path of the downloaded file.

    Raises ``ValueError`` if ``path`` would resolve outside the staging
    directory. Errors from ``urllib.request.urlretrieve`` propagate.
    """
    print('Downloading ' + path)

    # URLs always use '/', so join by hand: os.path.join would insert '\\'
    # on Windows and would discard ``root`` if ``path`` started with '/'.
    url = root.rstrip('/') + '/' + path.lstrip('/')

    # The href is percent-encoded, but the entry name inside the EPUB must be
    # the decoded name, so decode it for the local copy only.
    local_parts = urllib.parse.unquote(path).split('/')
    staging_root = os.path.normpath(staging)
    staging_path = os.path.normpath(os.path.join(staging_root, *local_parts))

    # The path comes from downloaded content; refuse '..' tricks that would
    # write outside the staging directory.
    if os.path.commonpath([staging_root, staging_path]) != staging_root:
        raise ValueError('Refusing to write outside staging directory: ' + path)

    os.makedirs(os.path.dirname(staging_path), exist_ok=True)
    urllib.request.urlretrieve(url, staging_path)
    return staging_path
if __name__ == '__main__':
    # Interactive entry point: ask for the hosted EPUB location, mirror every
    # file it references into a temporary staging directory, then pack the
    # staging directory into a local .epub archive.
    root = input('EPUB Root URL: ') or 'https://education.wiley.com/content/Hughes_Hallett_Calculus_7e/ebook/epub/9781119320494/'
    cookie = input('Cookie: ') or ''
    dest = os.path.abspath(input('Destination filename: ') or 'book.epub')
    staging = tempfile.mkdtemp()

    # Every request made through urllib.request (including dl's urlretrieve)
    # carries the user's cookie.
    opener = urllib.request.build_opener()
    opener.addheaders = [('cookie', cookie)]
    urllib.request.install_opener(opener)

    print('\n====================\n')
    print('EPUB Root URL: ' + root)
    print('Cookie: ' + cookie)
    print('Destination filename: ' + dest)
    print('Staging directory: ' + staging)
    input('\nPress Enter to begin download')
    print('')

    try:
        # required mimetype file
        dl('mimetype')

        # required container.xml file, also contains path to rendition/rootfile
        container = ET.parse(dl('META-INF/container.xml')).getroot()
        # Look the element up by local name instead of by position so extra
        # children or a missing namespace do not break the lookup.
        rootfile_path = next(
            (
                el.attrib['full-path']
                for el in container.iter()
                if isinstance(el.tag, str)
                and el.tag.rsplit('}', 1)[-1] == 'rootfile'
                and 'full-path' in el.attrib
            ),
            None,
        )
        if rootfile_path is None:
            raise SystemExit('No rootfile found in META-INF/container.xml')
        # Paths inside an EPUB always use '/', independent of the host OS.
        root_dir = rootfile_path.rpartition('/')[0]
        print('Found rootfile: ' + rootfile_path)

        # download rootfile & collect all linked resources
        rootfile = ET.parse(dl(rootfile_path)).getroot()
        paths = {}  # dict used as an ordered set: a repeated href is fetched once
        for el in rootfile.iter():
            # rsplit(...)[-1] also works for tags without a namespace.
            if not isinstance(el.tag, str) or el.tag.rsplit('}', 1)[-1] != 'item':
                continue
            href = el.get('href')
            if not href:
                continue
            if urllib.parse.urlsplit(href).scheme:
                # Absolute URLs point outside the container; nothing to mirror.
                print('Skipping remote resource ' + href)
                continue
            paths[root_dir + '/' + href if root_dir else href] = None

        # Downloads are I/O-bound, so threads help, but cap them so a large
        # manifest does not open hundreds of connections at once.
        with ThreadPoolExecutor(max_workers=16) as pool:
            pending = {pool.submit(dl, path): path for path in paths}

        # Best effort: report what failed but still build the archive.
        failed = []
        for future, path in pending.items():
            error = future.exception()
            if error is not None:
                failed.append(path)
                print('Failed to download ' + path + ': ' + str(error))
        if failed:
            print('Warning: ' + str(len(failed)) + ' file(s) could not be downloaded; the EPUB will be incomplete')

        # zip everything up
        print('Creating epub...')
        os.makedirs(os.path.dirname(dest), exist_ok=True)
        with zipfile.ZipFile(dest, 'w', zipfile.ZIP_DEFLATED) as epub:
            # The EPUB container format requires 'mimetype' to be the first
            # entry in the archive and to be stored uncompressed.
            epub.write(os.path.join(staging, 'mimetype'), 'mimetype', compress_type=zipfile.ZIP_STORED)
            for folder, _subdirs, files in os.walk(staging):
                for filename in sorted(files):
                    full_path = os.path.join(folder, filename)
                    arcname = os.path.relpath(full_path, staging).replace(os.sep, '/')
                    if arcname != 'mimetype':
                        epub.write(full_path, arcname)
    finally:
        # delete staging dir, even when a download or the packaging failed
        print('Cleaning up...')
        shutil.rmtree(staging, ignore_errors=True)

    print('\nDone! EPUB has been reconstructed at ' + dest)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment