Skip to content

Instantly share code, notes, and snippets.

@mjohnsullivan
Last active December 17, 2022 10:05
Show Gist options
  • Star 12 You must be signed in to star a gist
  • Fork 11 You must be signed in to fork a gist
  • Save mjohnsullivan/9322154 to your computer and use it in GitHub Desktop.
Save mjohnsullivan/9322154 to your computer and use it in GitHub Desktop.
Python HTTP download with resume and optional MD5 hash checking
import os.path
import shutil
import hashlib
import logging
# Support both Python 2 and 3 urllib2 importing
try:
from urllib.request import urlopen, Request
except ImportError:
from urllib2 import urlopen, Request
def validate_file(file_path, hash):
    """
    Validates a file against an MD5 hash value.

    The file is read in 1MB chunks so large downloads are never loaded into
    memory at once. The comparison ignores letter case and surrounding
    whitespace in the expected value, because published checksums are often
    upper-case or carry a trailing newline.

    :param file_path: path to the file for hash validation
    :type file_path: string
    :param hash: expected hash value of the file
    :type hash: string -- MD5 hash value (hex digits)
    :returns: True if the file's MD5 hex digest matches ``hash``, else False
    :rtype: bool
    """
    chunk_size = 1000 * 1000  # 1MB
    m = hashlib.md5()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel stops at the first empty read (end of file)
        for chunk in iter(lambda: f.read(chunk_size), b''):
            m.update(chunk)
    try:
        # hexdigest() is always lower-case, so normalise the expected value
        expected = hash.strip().lower()
    except AttributeError:
        # None or any other non-string value can never match a digest
        return False
    return m.hexdigest() == expected
def download_with_resume(url, file_path, hash=None, timeout=10):
    """
    Performs a HTTP(S) download that can be restarted if prematurely terminated.
    The HTTP server must support byte ranges.

    Data is appended to ``file_path + '.part'`` in 1MB byte-range requests and
    the partial file is moved to ``file_path`` only once its size equals the
    Content-Length reported by the server. If an IO error interrupts the
    transfer, the error is logged, the partial file is kept and the function
    returns without raising, so a later call resumes where this one stopped.

    :param url: the URL to download
    :type url: string
    :param file_path: the path to the file to write to disk
    :type file_path: string
    :param hash: hash value for file validation
    :type hash: string (MD5 hash value)
    :param timeout: timeout for each http request, in seconds
    :type timeout: int
    :raises Exception: if the size of the download cannot be determined, or
        if the completed download does not match ``hash``
    """
    # don't download if the file exists
    if os.path.exists(file_path):
        return
    block_size = 1000 * 1000  # 1MB
    tmp_file_path = file_path + '.part'
    # resume after whatever an earlier, interrupted call already saved
    first_byte = os.path.getsize(
        tmp_file_path) if os.path.exists(tmp_file_path) else 0
    logging.debug('Starting download at %.1fMB' % (first_byte / 1e6))
    file_size = -1
    try:
        # Only the headers are needed here; close the response straight away
        # (try/finally rather than 'with' keeps Python 2 compatibility).
        response = urlopen(url, timeout=timeout)
        try:
            file_size = int(response.info().get('Content-Length', -1))
        finally:
            response.close()
        logging.debug('File size is %s' % file_size)
        while first_byte < file_size:
            # HTTP byte ranges are inclusive at both ends
            last_byte = min(first_byte + block_size, file_size) - 1
            logging.debug('Downloading byte range %d - %d' %
                          (first_byte, last_byte))
            # create the request and set the byte range in the header
            req = Request(url)
            req.add_header('Range', 'bytes=%s-%s' % (first_byte, last_byte))
            response = urlopen(req, timeout=timeout)
            try:
                data_chunk = response.read()
            finally:
                response.close()
            # More bytes than requested means the server ignored the Range
            # header; appending them would corrupt the partial file. An empty
            # body would make this loop spin forever.
            if not data_chunk or len(data_chunk) > last_byte - first_byte + 1:
                raise IOError(
                    'Unexpected response of %d bytes for byte range %d - %d' %
                    (len(data_chunk), first_byte, last_byte))
            # write the data read from the URL to the partial file
            with open(tmp_file_path, 'ab') as f:
                f.write(data_chunk)
            # advance by what was actually received so a short read is
            # followed by a request for the missing bytes, not a gap
            first_byte += len(data_chunk)
    except IOError as e:
        logging.debug('IO Error - %s' % e)

    if file_size == -1:
        raise Exception(
            'Error getting Content-Length from server: %s' % url)
    downloaded = os.path.getsize(
        tmp_file_path) if os.path.exists(tmp_file_path) else 0
    # rename the temp download file to the correct name if fully downloaded
    if downloaded == file_size:
        if not os.path.exists(tmp_file_path):
            # a zero-byte download never enters the loop, so nothing has
            # created the partial file yet
            open(tmp_file_path, 'ab').close()
        # if there's a hash value, validate the file
        if hash and not validate_file(tmp_file_path, hash):
            raise Exception(
                'Error validating the file against its MD5 hash')
        shutil.move(tmp_file_path, file_path)
@datashaman
Copy link

I think you'll get an off-by-one error with this code.

Try running through these three scenarios:

  • File size is 0, should be 100, block size is 100 = 1 block from 0-99
  • File size is 0, should be 200, block size is 100 = 2 blocks from 0-99, and 100-199
  • File size is 50, should be 200, block size is 100 = 2 blocks from 50-149, and 150-199

@idolpx
Copy link

idolpx commented Nov 23, 2017

Thanks for sharing.

I forked and modified your code to use the 'requests' lib instead of 'urllib2' and download using a stream.
It significantly increased the transfer speed.

@yamyamyuo
Copy link

this is great! it works for me

@mjohnsullivan
Copy link
Author

Updated to support Python 3 and fixed a tiny but troublesome bug where the final chunk range was one byte too large

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment