Skip to content

Instantly share code, notes, and snippets.

@lightheaded
Created November 19, 2020 15:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lightheaded/6c95220d47c93e8c2b339baef39b787d to your computer and use it in GitHub Desktop.
Save lightheaded/6c95220d47c93e8c2b339baef39b787d to your computer and use it in GitHub Desktop.
# the input file should contain newline-separated S3 URLs, such as s3://bucket/prefix/file.jpg
import multiprocessing as mp
import os
from urllib.parse import urlparse
import boto3
# Module-level S3 handle and target bucket, shared by the pool workers.
# NOTE(review): boto3 clients/resources are generally not safe to share
# across multiprocessing workers — confirm each worker gets its own
# session (fork inheritance may work here, but spawn would re-import).
s3 = boto3.resource('s3')
my_bucket = s3.Bucket('ubird-assets')
def s3download(object_key_file):
    """Download one object from ``my_bucket`` to the local disk.

    Worker function executed by the multiprocessing pool.

    Args:
        object_key_file: 2-tuple ``(object_key, local_filename)`` — the S3
            object key to fetch and the local file name to write it to.
    """
    # Unpack once instead of repeated magic-number indexing.
    object_key, filename = object_key_file
    my_bucket.download_file(object_key, filename)
    print(f'downloaded file with object name... {object_key}')
    print(f'downloaded file with file name... {filename}')
def parallel_s3_download(file_path, file_list):
    """Download the S3 objects named in *file_list* in parallel.

    Args:
        file_path: Path of the input list file. Unused here; kept so the
            existing caller signature is unchanged.
        file_list: List of S3 URLs, e.g. ``s3://bucket/prefix/file.jpg``.
    """
    object_key_file = []
    for url in file_list:
        # Bucket.download_file() expects the object *key*, not the full
        # s3:// URL — parse the URL and strip the leading '/' to get it.
        # (The original passed the raw URL through; urlparse was computed
        # but never used.)
        parsed = urlparse(url)
        object_key = parsed.path.lstrip('/')
        # Save under the bare file name, else a nested key would point at
        # a local directory that does not exist.
        filename = os.path.basename(object_key)
        object_key_file.append((object_key, filename))
    # NOTE(review): this drops the first entry — presumably the input file
    # starts with a header/non-URL row; confirm that assumption. Guarded
    # so an empty list no longer raises IndexError.
    if object_key_file:
        object_key_file.pop(0)
    pool = mp.Pool(min(mp.cpu_count(), 10))  # cap the worker count at 10
    try:
        pool.map(s3download, object_key_file, chunksize=1)
    finally:
        pool.close()
        pool.join()  # wait for outstanding downloads before returning
if __name__ == '__main__':
    file_path = '/Users/tom/kaizen/scripts/tmp/files-to-cp.txt'
    # Read the newline-separated URL list; 'with' closes the file promptly
    # (the original leaked the open file handle).
    with open(file_path) as fh:
        file_list = fh.read().splitlines()
    parallel_s3_download(file_path, file_list)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment