Skip to content

Instantly share code, notes, and snippets.

@AliMirlou
Forked from idolpx/download.py
Last active October 7, 2020 11:26
Show Gist options
  • Save AliMirlou/faadc0b2424ed827349f6bafdb78bda6 to your computer and use it in GitHub Desktop.
Save AliMirlou/faadc0b2424ed827349f6bafdb78bda6 to your computer and use it in GitHub Desktop.
Python HTTP download with resume and optional MD5 hash checking
#!/usr/bin/env python
import logging
logger = logging.getLogger(__name__)
import argparse
import os.path
import requests
import shutil
import hashlib
from multiprocessing.dummy import Pool as ThreadPool
def validate_file(file_path, hash):
"""
Validates a file against an MD5 hash value
:param file_path: path to the file for hash validation
:type file_path: string
:param hash: expected hash value of the file
:type hash: string -- MD5 hash value
"""
m = hashlib.md5()
with open(file_path, 'rb') as f:
while True:
chunk = f.read(1000 * 1000) # 1MB
if not chunk:
break
m.update(chunk)
return m.hexdigest() == hash
def download_with_resume(url, file_path=None, hash=None, timeout=10, block_size=1000 * 1000):
"""
Performs a HTTP(S) download that can be restarted if prematurely terminated.
The HTTP server must support byte ranges.
:param url: url of the file to be downloaded
:param file_path: the path to the file to write to disk
:type file_path: string
:param hash: hash value for file validation
:type hash: string (MD5 hash value)
:param timeout: time to give up on response in seconds
:param block_size: size of the blocks to download
"""
print("started")
# don't download if the file exists
if os.path.exists(file_path):
# still validate file
if hash and not validate_file(file_path, hash):
raise Exception('Error validating the file against its MD5 hash')
logger.info('File passes MD5 validation')
return
tmp_file_path = file_path + '.part'
first_byte = os.path.getsize(tmp_file_path) if os.path.exists(tmp_file_path) else 0
if first_byte != 0:
logging.debug('Resuming download at %.1fMB' % (first_byte / 1e6))
file_mode = 'ab' if first_byte else 'wb'
file_size = -1
try:
file_size = int(requests.head(url).headers['Content-Length'])
logging.debug('File size is %s' % file_size)
headers = {"Range": "bytes=%s-" % first_byte}
r = requests.get(url, headers=headers, stream=True, timeout=timeout)
with open(tmp_file_path, file_mode) as f:
for chunk in r.iter_content(chunk_size=block_size):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
except IOError as e:
logging.debug('IO Error - %s' % e)
finally:
# rename the temp download file to the correct name if fully downloaded
if file_size == os.path.getsize(tmp_file_path):
# if there's a hash value, validate the file
if hash and not validate_file(tmp_file_path, hash):
raise Exception('Error validating the file against its MD5 hash')
logger.info('File passes MD5 validation')
shutil.move(tmp_file_path, file_path)
logger.info("URL '%s' downloaded to '%s' ok", url, file_path)
return True
elif file_size == -1:
raise Exception('Error getting Content-Length from server: %s' % url)
return False
def download_all(urls, file_paths, hashes=None, timeout=10, block_size=1000 * 1000):
length = len(urls)
if hashes is None:
hashes = (None for _ in range(length))
with ThreadPool() as pool:
return all(pool.starmap(download_with_resume, zip(urls, file_paths, hashes, (timeout for _ in range(length)),
(block_size for _ in range(length)))))
def console_logging(level=logging.DEBUG, format="%(message)s"):
for h in list(logger.handlers):
logger.removeHandler(h)
stream = logging.StreamHandler(sys.stdout)
stream.setFormatter(logging.Formatter(format))
logger.addHandler(stream)
logger.setLevel(level)
def main(arg_list=None):
if arg_list is None:
arg_list = []
parser = argparse.ArgumentParser(description='Restarting Downloader')
parser.add_argument('URL',
metavar='<url>',
help='URL to load')
parser.add_argument('OUTFILE',
metavar='<filename>',
help='Filename to write')
parser.add_argument('-t', '--timeout', required=False, help='timeout', metavar='<seconds>', type=int, default='10')
parser.add_argument('-c', '--chunk_size', required=False, help='Chunk size', metavar='<bytes>', type=int,
default=1024 * 1024)
parser.add_argument('-m', '--md5', required=False, help='MD5 hash to check', metavar='<md5>')
parser.add_argument('-v', required=False, help='verbose output', action='store_true')
args = parser.parse_args(arg_list)
if args.v:
logger.setLevel(logging.DEBUG)
download_with_resume(args.URL, args.OUTFILE, hash=args.md5, timeout=args.timeout, block_size=args.chunksize)
if __name__ == "__main__":
import sys
console_logging(level=logging.INFO)
main(sys.argv[1:])
@AliMirlou
Copy link
Author

Combined this fork and this fork because both were great. And improved them a bit.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment