@jelmervdl
Created May 17, 2023 11:16
Resumable downloads with plain Python
#!/usr/bin/env python3
import sys
import os
from argparse import ArgumentParser
from contextlib import ExitStack
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from http.client import HTTPResponse
from shutil import copyfileobj
from tempfile import TemporaryFile
from time import sleep
from typing import cast, BinaryIO, Optional
from urllib.request import urlopen, Request
from urllib.error import HTTPError
from urllib.parse import urlparse
from warnings import warn

BUFSIZE = 2**16  # read responses in 64 KiB chunks

def get_content_length(response: HTTPResponse) -> int:
    """Get whole content length from either a normal or a Range request."""
    # For a 206 response, Content-Length is only the length of the partial
    # body; the full size is the part after the '/' in Content-Range,
    # e.g. 'Content-Range: bytes 1000-47021/47022' -> 47022.
    content_range = response.getheader('Content-Range', '').split('/')
    if len(content_range) == 2 and content_range[1] != '*':
        return int(content_range[1])
    size = response.getheader('Content-Length')
    if size is not None:
        return int(size)
    raise ValueError('No content size')
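
# Retry-After can be either a number of seconds or an HTTP date (RFC 9110
# §10.2.3); parse_retry_after() below handles both forms.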
def parse_retry_after(retry_after: Optional[str]) -> float:
    if retry_after is None:
        raise ValueError('No Retry-After header')
    if retry_after.isdigit():
        return int(retry_after)
    else:
        # parsedate_to_datetime() returns an aware datetime for HTTP dates,
        # so compare against an aware "now"; clamp so we never sleep(<0).
        diff = parsedate_to_datetime(retry_after) - datetime.now(timezone.utc)
        return max(0.0, diff.total_seconds())
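
# download() streams `url` into `file`, resuming from file.tell() via an HTTP
# Range request. On a 5xx response it backs off (starting at `wait` seconds,
# doubling on each attempt, honouring Retry-After on 503) and retries up to
# `retries` times.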
def download(url: str, file: BinaryIO, *, retries: int = 10, wait: float = 30.0) -> None:
    attempt = 0
    timeout = 0.0
    size = None
    while size is None or file.tell() < size:
        attempt += 1
        if attempt > retries:
            raise Exception('Ran out of retries')
        if timeout > 0:
            sleep(timeout)
        request = Request(url, headers={
            'Range': f'bytes={file.tell()}-'
        })
        if file.tell() > 0:
            warn(f'Resuming download from byte {file.tell()}')
        try:
            with urlopen(request) as fin:
                response = cast(HTTPResponse, fin)
                if response.status not in {200, 206}:
                    raise RuntimeError(f'Server responded with {response.status}')
                # Make sure we got a partial response. If not (i.e. 200 instead
                # of 206), start writing our output from the start as well.
                if response.status == 200:
                    warn('Server does not support Range requests')
                    file.seek(0)
                    file.truncate()  # drop any stale bytes from a previous attempt
                # Get the expected full content length (raises if not available)
                size = get_content_length(response)
                # Read downloaded bytes, writing them to the file.
                while True:
                    chunk = fin.read(BUFSIZE)
                    if len(chunk) == 0:
                        break
                    file.write(chunk)
                # If we're somehow past our expected size, something went wrong
                # and we can't recover from that by retrying.
                if file.tell() > size:
                    raise Exception(f'Downloaded too much: {file.tell()} > {size}')
                # Incomplete? Retry without a pause: there was no error, the
                # connection just got closed early.
                if file.tell() < size:
                    warn(f'Server gave incomplete response: {file.tell()} < {size}')
                    timeout = 0.0
        except HTTPError as e:
            if 500 <= e.code < 600:
                # Back off: start at `wait` seconds, double on each retry
                timeout = wait if timeout == 0 else timeout * 2
                # Someone is rate-limiting us; let's try to listen to them
                if e.code == 503:
                    try:
                        timeout = parse_retry_after(e.headers.get('Retry-After'))
                    except ValueError:
                        pass
                warn(f'Server responded with {e.code}, retry {attempt} after {timeout}s pause')
                continue
            else:
                raise
    # At the end of the loop we assume we've got all our data
    assert size is not None and file.tell() == size
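
# Command-line interface: download each url into --output, which can be a
# directory (the filename is taken from the url), a single file, or '-' for
# stdout.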
if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--retries', '-r', type=int, default=10)
    parser.add_argument('--wait', '-w', type=float, default=30.0)
    parser.add_argument('--output', '-o', type=str, default='./')
    parser.add_argument('url', type=str, nargs='+')
    args = parser.parse_args()

    for url in args.url:
        with ExitStack() as ctx:
            use_stdout = args.output in {'-', '/dev/stdout'}
            if use_stdout:
                # Buffer to a temporary file so the download can still be
                # resumed; only copy it to stdout once it is complete.
                dest = ctx.enter_context(TemporaryFile('w+b'))
            else:
                if args.output.endswith('/') and not os.path.exists(args.output):
                    os.makedirs(args.output)
                if os.path.isdir(args.output):
                    filename = os.path.basename(urlparse(url).path.rstrip('/'))
                    output = os.path.join(args.output, filename)
                else:
                    if len(args.url) > 1:
                        raise RuntimeError('Downloading multiple urls to the same output file does not make much sense')
                    output = args.output
                # Open for update without O_APPEND ('a+b' would force every
                # write to the end of the file, breaking the seek(0) restart
                # path in download()), then position at the end to resume.
                mode = 'r+b' if os.path.exists(output) else 'w+b'
                dest = ctx.enter_context(open(output, mode))
                dest.seek(0, os.SEEK_END)
            download(url, dest, retries=args.retries, wait=args.wait)
            if use_stdout:
                dest.seek(0)
                copyfileobj(dest, sys.stdout.buffer)
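
Usage sketch; the script name (download.py), URL, and paths below are made up for illustration:

    ./download.py --retries 5 --wait 10 --output downloads/ https://example.org/big.tar.gz

The download() function can also be used on its own. Note that 'w+b' here means nothing survives across runs, but retries within the one call still resume:

    with open('big.tar.gz', 'w+b') as fh:
        download('https://example.org/big.tar.gz', fh, retries=5, wait=10.0)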