@jelmervdl
Created May 17, 2023 11:16
Resumable downloads with plain Python
#!/usr/bin/env python3
import sys
import os
from argparse import ArgumentParser
from contextlib import ExitStack
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from http.client import HTTPResponse
from shutil import copyfileobj
from tempfile import TemporaryFile
from time import sleep
from typing import cast, BinaryIO, Optional
from urllib.request import urlopen, Request
from urllib.error import HTTPError
from urllib.parse import urlparse
from warnings import warn

BUFSIZE = 2**16  # read responses in 64 KiB chunks

def get_content_length(response: HTTPResponse) -> int:
    """Get whole content length from either a normal or a Range request."""
    # For a 206 response, Content-Length is only the length of the partial
    # body; the full size is the part after the '/' in Content-Range,
    # e.g. 'Content-Range: bytes 1000-47021/47022' -> 47022.
    content_range = response.getheader('Content-Range', '').split('/')
    if len(content_range) == 2 and content_range[1] != '*':
        return int(content_range[1])
    size = response.getheader('Content-Length')
    if size is not None:
        return int(size)
    raise ValueError('No content size')
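
# Retry-After can be either a number of seconds or an HTTP date (RFC 9110
# §10.2.3); parse_retry_after() below handles both forms.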
def parse_retry_after(retry_after: Optional[str]) -> float:
    if retry_after is None:
        raise ValueError('No Retry-After header')
    if retry_after.isdigit():
        return int(retry_after)
    else:
        # parsedate_to_datetime() returns an aware datetime for HTTP dates,
        # so compare against an aware "now"; clamp so we never sleep(<0).
        diff = parsedate_to_datetime(retry_after) - datetime.now(timezone.utc)
        return max(0.0, diff.total_seconds())
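
# download() streams `url` into `file`, resuming from file.tell() via an HTTP
# Range request. On a 5xx response it backs off (starting at `wait` seconds,
# doubling on each attempt, honouring Retry-After on 503) and retries up to
# `retries` times.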
def download(url: str, file: BinaryIO, *, retries: int = 10, wait: float = 30.0) -> None:
    attempt = 0
    timeout = 0.0
    size = None
    while size is None or file.tell() < size:
        attempt += 1
        if attempt > retries:
            raise Exception('Ran out of retries')
        if timeout > 0:
            sleep(timeout)
        request = Request(url, headers={
            'Range': f'bytes={file.tell()}-'
        })
        if file.tell() > 0:
            warn(f'Resuming download from byte {file.tell()}')
        try:
            with urlopen(request) as fin:
                response = cast(HTTPResponse, fin)
                if response.status not in {200, 206}:
                    raise RuntimeError(f'Server responded with {response.status}')
                # Make sure we got a partial response. If not (i.e. 200 instead
                # of 206), start writing our output from the start as well.
                if response.status == 200:
                    warn('Server does not support Range requests')
                    file.seek(0)
                    file.truncate()  # drop any stale bytes from a previous attempt
                # Get the expected full content length (raises if not available)
                size = get_content_length(response)
                # Read downloaded bytes, writing them to the file.
                while True:
                    chunk = fin.read(BUFSIZE)
                    if len(chunk) == 0:
                        break
                    file.write(chunk)
                # If we're somehow past our expected size, something went wrong
                # and we can't recover from that by retrying.
                if file.tell() > size:
                    raise Exception(f'Downloaded too much: {file.tell()} > {size}')
                # Incomplete? Retry without a pause: there was no error, the
                # connection just got closed early.
                if file.tell() < size:
                    warn(f'Server gave incomplete response: {file.tell()} < {size}')
                    timeout = 0.0
        except HTTPError as e:
            if 500 <= e.code < 600:
                # Back off: start at `wait` seconds, double on each retry
                timeout = wait if timeout == 0 else timeout * 2
                # Someone is rate-limiting us; let's try to listen to them
                if e.code == 503:
                    try:
                        timeout = parse_retry_after(e.headers.get('Retry-After'))
                    except ValueError:
                        pass
                warn(f'Server responded with {e.code}, retry {attempt} after {timeout}s pause')
                continue
            else:
                raise
    # At the end of the loop we assume we've got all our data
    assert size is not None and file.tell() == size
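
# Command-line interface: download each url into --output, which can be a
# directory (the filename is taken from the url), a single file, or '-' for
# stdout.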
if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--retries', '-r', type=int, default=10)
    parser.add_argument('--wait', '-w', type=float, default=30.0)
    parser.add_argument('--output', '-o', type=str, default='./')
    parser.add_argument('url', type=str, nargs='+')
    args = parser.parse_args()

    for url in args.url:
        with ExitStack() as ctx:
            use_stdout = args.output in {'-', '/dev/stdout'}
            if use_stdout:
                # Buffer to a temporary file so the download can still be
                # resumed; only copy it to stdout once it is complete.
                dest = ctx.enter_context(TemporaryFile('w+b'))
            else:
                if args.output.endswith('/') and not os.path.exists(args.output):
                    os.makedirs(args.output)
                if os.path.isdir(args.output):
                    filename = os.path.basename(urlparse(url).path.rstrip('/'))
                    output = os.path.join(args.output, filename)
                else:
                    if len(args.url) > 1:
                        raise RuntimeError('Downloading multiple urls to the same output file does not make much sense')
                    output = args.output
                # Open for update without O_APPEND ('a+b' would force every
                # write to the end of the file, breaking the seek(0) restart
                # path in download()), then position at the end to resume.
                mode = 'r+b' if os.path.exists(output) else 'w+b'
                dest = ctx.enter_context(open(output, mode))
                dest.seek(0, os.SEEK_END)
            download(url, dest, retries=args.retries, wait=args.wait)
            if use_stdout:
                dest.seek(0)
                copyfileobj(dest, sys.stdout.buffer)
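
Usage sketch; the script name (download.py), URL, and paths below are made up for illustration:

    ./download.py --retries 5 --wait 10 --output downloads/ https://example.org/big.tar.gz

The download() function can also be used on its own. Note that 'w+b' here means nothing survives across runs, but retries within the one call still resume:

    with open('big.tar.gz', 'w+b') as fh:
        download('https://example.org/big.tar.gz', fh, retries=5, wait=10.0)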