"""Download all available audio books from DB ICE Portal.""" | |
import json | |
import os | |
import urllib.parse | |
import urllib.request | |
BASE = 'http://iceportal.de/api1/rs/' | |
def load_json(url: str, *, verbose: bool = True): | |
if verbose: | |
print(url) | |
with urllib.request.urlopen(url) as f: | |
doc = json.load(f) | |
return doc | |
def get_page(href: str, *, | |
base: str = urllib.parse.urljoin(BASE, 'page/')): | |
url = urllib.parse.urljoin(base, href.lstrip('/')) | |
return load_json(url) | |
def retrieve(source, target, *, | |
base: str = urllib.parse.urljoin(BASE, 'audiobooks/path/')) -> None: | |
sheet = urllib.parse.urljoin(base, source.lstrip('/')) | |
path = load_json(sheet)['path'] | |
url = urllib.parse.urljoin(base, path) | |
urllib.request.urlretrieve(url, filename=target) | |
audiobooks = get_page('hoerbuecher') | |
for group in audiobooks['teaserGroups']: | |
for item in group['items']: | |
print('', item['title'], sep='\n') | |
page = get_page(item['navigation']['href']) | |
dirname = page['title'] | |
# fix invalid | |
dirname = dirname.replace('.', '_') | |
for remove_char in ('"', '?', '&', '/', '|'): | |
dirname = dirname.replace(remove_char, '') | |
dirname, _, _ = dirname.partition(':') | |
if not os.path.exists(dirname): | |
os.makedirs(dirname) | |
for file in page['files']: | |
url = file['path'] | |
target = os.path.join(dirname, | |
'{:d} - {}'.format(file['serialNumber'], | |
url.rpartition('/')[2])) | |
if not os.path.exists(target): | |
retrieve(url, target) |
Hi! I used this script yesterday, and it worked for quite a while.
But then I saw a file being downloaded, its size shrinking back to zero, being redownloaded, shrinking to zero again, redownloaded, and so on. It was more or less an endless loop until the wifi connection itself was lost.
I debugged and traced the behaviour to the line urllib.request.urlretrieve(url, filename=target) in the retrieve function.
Has anyone else seen this behaviour and/or have an idea how to stop it?
Could it be that urlretrieve gets a redirect while it is loading, does a redownload, gets another redirect, does another redownload, and so on?
Is there a parameter for this function that would make it ignore such redirects/redownloads, or another function which does more or less the same?
I would be happy if urlretrieve threw an exception or returned an error code when this happens, so the script could catch it and download the remaining files. Something along the lines of the sketch below is what I have in mind.
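A minimal sketch of what I mean: replace the urlretrieve() call with urllib.request.urlopen(), which raises urllib.error.HTTPError for 4xx/5xx responses and urllib.error.URLError for network problems, so the script can catch the error, discard the partial file, and move on. Note that urlopen() still follows redirects silently, just like urlretrieve(); the difference is only that failures surface as exceptions. The download() helper is my own, not part of the gist:

import os
import shutil
import urllib.error
import urllib.request

def download(url: str, target: str) -> None:
    """Download url to target; raise on failure instead of looping."""
    try:
        # urlopen() raises HTTPError/URLError on failure, unlike
        # urlretrieve(), which gives no direct hook to notice a
        # stalled or restarted transfer.
        with urllib.request.urlopen(url) as response, \
                open(target, 'wb') as f:
            shutil.copyfileobj(response, f)
    except urllib.error.URLError as e:  # HTTPError is a subclass
        if os.path.exists(target):
            os.remove(target)  # drop the partial file so a rerun retries it
        print('failed: {} ({})'.format(url, e))

Calling download(url, target) in place of urllib.request.urlretrieve(url, filename=target) inside retrieve() would then let the main loop continue with the remaining files after one bad download.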
Thanks @contrequarte, adapted so that the file names now always start with the serial number.