Skip to content

Instantly share code, notes, and snippets.

@jesserobertson
Created March 26, 2019 02:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jesserobertson/061e4dda314fcc83f97cfc6d26997aba to your computer and use it in GitHub Desktop.
Save jesserobertson/061e4dda314fcc83f97cfc6d26997aba to your computer and use it in GitHub Desktop.
Nice downloads in Python for largish files
""" file: download.py
author: Jess Robertson (@jesserobertson)
date: Tuesday, 26 March 2019
description: Download large files with requests and a tqdm progress bar.
Modified lightly from https://stackoverflow.com/questions/37573483
"""
import pathlib
from urllib.parse import urlparse
import requests
from tqdm import tqdm
def download(url, download_folder=None, overwrite=False):
    """
    Stream some large download with a progress bar from tqdm to a download folder

    The data is streamed into a temporary '<filename>.part' file next to the
    destination and only moved into place once the transfer has completed, so
    an interrupted download never leaves a truncated file that a later call
    with overwrite=False would mistake for a finished one.

    Parameters:
        url - the thing to download. The last component of the URL path is
            used as the local filename.
        download_folder - the folder to download to. Will be created (with parents)
            if it doesn't already exist. Defaults to the current directory.
        overwrite - whether to overwrite a local file if already downloaded
            (default False)

    Returns:
        a pathlib.Path pointing to the downloaded file

    Raises:
        ValueError - if no filename can be derived from the URL path
        requests.HTTPError - if the server answers with a 4xx/5xx status
        IOError - if fewer/more bytes arrived than the server advertised
    """
    # Make sure we've got somewhere to download to
    filename = pathlib.Path(urlparse(url).path).name
    if not filename or filename == '..':
        # Without this check the output location would be the folder itself
        raise ValueError(f'Cannot work out a filename to save {url} as')
    if download_folder is not None:
        download_folder = pathlib.Path(download_folder)
    else:
        download_folder = pathlib.Path('.').absolute()
    # exist_ok avoids the race between checking for the folder and creating it
    download_folder.mkdir(parents=True, exist_ok=True)
    output_location = download_folder / filename
    if not overwrite and output_location.exists():
        print(f'{output_location} already exists and overwrite=False, skipping')
        return output_location

    # Stream into a sibling temp file; it only replaces the real target on success
    partial_location = output_location.with_name(filename + '.part')
    block_size, wrote_size = 2048, 0
    try:
        # Open a streaming connection to the endpoint (closed on leaving the block)
        with requests.get(url, stream=True) as response:
            # Don't save an HTML error page as if it were the requested file
            response.raise_for_status()

            # Set up progress bar with total size
            total_size = int(response.headers.get('content-length', 0))
            encoding = response.headers.get('content-encoding', 'identity')
            if encoding.lower() != 'identity':
                # content-length counts compressed bytes but iter_content
                # yields decoded bytes, so the two can't be compared
                total_size = 0
            tqdm_kwargs = {
                'total': total_size or None,  # None == unknown size for tqdm
                'unit': 'B',  # we count bytes; unit_scale adds the k/M/G prefix
                'unit_scale': True,
                'desc': f'Downloading {filename}'
            }

            # Actually stream to file - we can clobber if we're here
            with open(partial_location, 'wb') as sink, \
                    tqdm(**tqdm_kwargs) as pbar:
                for data in response.iter_content(block_size):
                    sink.write(data)
                    wrote_size += len(data)
                    # Chunks may be shorter than block_size (always true for
                    # the last one), so advance by what actually arrived
                    pbar.update(len(data))

        # Check we got everything
        if total_size != 0 and wrote_size != total_size:
            raise IOError(f"Something went wrong downloading {output_location} - we're missing data!")

        # Move the finished file into place (replaces any existing target)
        partial_location.replace(output_location)
    except BaseException:
        # Clean up the incomplete file on any failure, including Ctrl-C,
        # then let the original error propagate
        try:
            partial_location.unlink()
        except FileNotFoundError:
            pass
        raise
    return output_location
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment