"""Download all available audio books from DB ICE Portal.""" | |
import json | |
import os | |
import urllib.parse | |
import urllib.request | |
BASE = 'http://iceportal.de/api1/rs/' | |
def load_json(url: str, *, verbose: bool = True): | |
if verbose: | |
print(url) | |
with urllib.request.urlopen(url) as f: | |
doc = json.load(f) | |
return doc | |
def get_page(href: str, *, | |
base: str = urllib.parse.urljoin(BASE, 'page/')): | |
url = urllib.parse.urljoin(base, href.lstrip('/')) | |
return load_json(url) | |
def retrieve(source, target, *, | |
base: str = urllib.parse.urljoin(BASE, 'audiobooks/path/')) -> None: | |
sheet = urllib.parse.urljoin(base, source.lstrip('/')) | |
path = load_json(sheet)['path'] | |
url = urllib.parse.urljoin(base, path) | |
urllib.request.urlretrieve(url, filename=target) | |
audiobooks = get_page('hoerbuecher') | |
for group in audiobooks['teaserGroups']: | |
for item in group['items']: | |
print('', item['title'], sep='\n') | |
page = get_page(item['navigation']['href']) | |
dirname = page['title'] | |
# fix invalid | |
dirname = dirname.replace('.', '_') | |
for remove_char in ('"', '?', '&', '/', '|'): | |
dirname = dirname.replace(remove_char, '') | |
dirname, _, _ = dirname.partition(':') | |
if not os.path.exists(dirname): | |
os.makedirs(dirname) | |
for file in page['files']: | |
url = file['path'] | |
target = os.path.join(dirname, | |
'{:d} - {}'.format(file['serialNumber'], | |
url.rpartition('/')[2])) | |
if not os.path.exists(target): | |
retrieve(url, target) |
Thanks. Might take a look (need to check/update next time on board otherwise).
So here is the current json output :)
Thanks. Updated, fingers crossed :)
Working fine. Hint: check the Python version to make sure that 3.x is used, otherwise you get a syntax error in line 10.
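For what it's worth, a small guard near the top of the script can make the requirement explicit. This is only a sketch: under Python 2 the file still fails at parse time because of the keyword-only arguments, so the reliable fix remains invoking it with python3 explicitly.

#!/usr/bin/env python3
import sys

# Fail early with a readable message on unsupported interpreters.
# (Under Python 2 the parser already chokes on the keyword-only
# arguments further down, so running via `python3` is the real fix.)
if sys.version_info < (3, 0):
    raise SystemExit('This script requires Python 3.x')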
Looks like downloading movies is not so easy? Do you know of any way to do that?
@BoKa33: nope, no experience
Just tried that and it works, but not if the filename contains a "pipe", "ampersand", or "forward slash".
So simply add some more replaces in line 40:
dirname = dirname.replace('.', '').replace('"', '').replace('?', '').replace('&', '').replace('|', '').replace('/', '')
Thanks, adapted.
Nice work! It seems that when downloading podcasts, only one episode is downloaded, because the naming convention for podcast episodes differs from the one for audiobooks. Therefore I've added the serial number contained in the JSON to the filename used to save locally. I've added these changes to my fork, as I didn't know whether this behaviour was intended by your code. (If not, please feel free to add it.)
Thanks @contrequarte, adapted so that the file names now always start with the serial number.
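Concretely, the target name is now built like this (illustrative values only; the real serial numbers and paths come from the portal's JSON):

# illustrative values; the real ones come from the portal's JSON
file = {'serialNumber': 3, 'path': '/audio/podcast/episode.mp3'}
target = '{:d} - {}'.format(file['serialNumber'],
                            file['path'].rpartition('/')[2])
# target == '3 - episode.mp3'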
Hi! I used this script yesterday and it worked for quite a while.
But then I saw a file being downloaded, its size shrinking back to zero, being redownloaded, shrinking to zero again, and so on. It was more or less an endless loop until the WiFi connection itself got lost.
I debugged it and traced the behaviour to the line urllib.request.urlretrieve(url, filename=target) in the retrieve() function.
Has anyone else seen this behaviour and/or has an idea how to stop it?
Could it be that urlretrieve gets a redirect while loading, redownloads, gets another redirect, redownloads, and so on?
Is there a parameter for this function that would make it ignore such redirects/redownloads, or another function that does more or less the same?
I would be happy if urlretrieve threw an exception/returned an error code when this happens, so the script could catch it and download the remaining files.
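Not sure what caused it either, but one way to keep a single file from stalling the whole run is to wrap the download in a retry loop and treat short or failed transfers as errors. A sketch, assuming the retrieve() function from the script above; urlretrieve itself raises urllib.error.ContentTooShortError when less data than announced arrives, and network failures surface as urllib.error.URLError:

import os
import urllib.error

def retrieve_with_retry(source, target, *, attempts: int = 3) -> None:
    for attempt in range(1, attempts + 1):
        try:
            retrieve(source, target)  # retrieve() from the script above
        except (urllib.error.ContentTooShortError, urllib.error.URLError) as e:
            print('attempt {} failed: {}'.format(attempt, e))
            if os.path.exists(target):  # drop the truncated file
                os.remove(target)
        else:
            return
    print('giving up on', source)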
It's not working for me currently. From what I see it's "teaserGroups", but this leads to an error:
If needed I can provide the content of audiobooks.
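Without the error message it's hard to say, but if it's a KeyError on 'teaserGroups' the layout of the API response has probably changed. A defensive lookup makes that visible instead of crashing (a sketch, assuming the get_page() function from the script above):

audiobooks = get_page('hoerbuecher')

groups = audiobooks.get('teaserGroups')
if groups is None:
    # the key is gone or renamed -- dump the top-level keys to see what changed
    raise SystemExit('unexpected response, top-level keys: {}'
                     .format(sorted(audiobooks)))

for group in groups:
    ...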