A Python script that downloads multiple files in parallel, with support for the s3://, http://, and https:// protocols.

Description

downloader.py

Usage: python downloader.py url1 url2 url3
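For example, a single run can mix protocols. A hypothetical invocation (the URLs are placeholders):

python downloader.py s3://my-bucket/data/input.csv https://example.com/files/report.pdf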


Installation

Mac OS X: A version of Python is already installed.
Windows: You will need to install one of the 2.x versions available at python.org.

Dependencies

A few additional Python packages are required to run the script from the command line. Here is the list:

  • Use pip to install (see the command after this list):
    • boto3
    • requests
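Assuming pip is available on your PATH, both packages can be installed with a single command:

pip install boto3 requests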
#!/usr/bin/env python
import re
import sys
from multiprocessing import Process

import boto3
import botocore
import requests

# Collect the arguments, skipping the first element since it is the script name
urls = sys.argv[1:]

# Dictionary that groups the URLs by scheme, so every URL can be routed to the
# download method that matches its protocol
filtered_urls = {}


def is_downloadable(url):
    """
    Does the url contain a downloadable resource
    """
    h = requests.head(url)
    content_type = h.headers.get('content-type', '')
    if 'text' in content_type.lower():
        return False
    if 'html' in content_type.lower():
        return False
    return True


def s3dl(path):
    try:
        # Match the regex and break the URI down into bucket, file path and file name
        uri = re.match(r's3:\/\/(.+?)\/(.+)', path)
        bucket = uri.group(1)
        file_path = uri.group(2)
        if '/' in file_path:
            file_name = file_path.rsplit('/', 1)[1]
        else:
            file_name = file_path
        s3 = boto3.resource('s3')
        s3.Bucket(bucket).download_file(file_path, file_name)
        return True
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print("The object does not exist.")
        else:
            raise


def httpdl(path):
    try:
        # Check whether the file is downloadable before fetching it
        if is_downloadable(path):
            # Derive the local file name from the last URL segment
            filename = path.rsplit('/', 1)[-1]
            r = requests.get(path, stream=True)
            with open(filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
            return True
        else:
            print('File is not downloadable.')
            return False
    except Exception:
        print("Error downloading file.")
        return False


if __name__ == "__main__":
    # Iterate through the URLs and group them by scheme
    for url in urls:
        explode = url.split('://')
        if explode[0] not in filtered_urls:
            filtered_urls[explode[0]] = []
        filtered_urls[explode[0]].append(explode[1])

    # List that holds the processes responsible for the simultaneous downloads
    processes = []

    # Iterate through the grouped URLs and dispatch each one to the matching method
    for download_type in filtered_urls:
        for path in filtered_urls[download_type]:
            download = download_type + '://' + path
            if download_type == "s3":
                process = Process(target=s3dl, args=(download,))
                processes.append(process)
            if download_type == "http" or download_type == "https":
                process = Process(target=httpdl, args=(download,))
                processes.append(process)

    # Start the processes
    for process in processes:
        process.start()

    # Wait for every process to finish and report its final state
    for process in processes:
        process.join()
        print(process)
    print("All downloads have been completed")