Created
November 19, 2020 15:00
-
-
Save lightheaded/6c95220d47c93e8c2b339baef39b787d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The input file should contain newline-separated S3 URLs, such as s3://bucket/prefix/file.jpg
import multiprocessing as mp | |
import os | |
from urllib.parse import urlparse | |
import boto3 | |
# Module-level boto3 S3 resource, created once when the module is imported.
s3 = boto3.resource('s3')
# Bucket handle used by s3download(); the bucket name is fixed here.
my_bucket = s3.Bucket('ubird-assets')
def s3download(object_key_file):
    """Download one object from ``my_bucket`` and report what was fetched.

    ``object_key_file`` is an indexable pair: element 0 is the S3 object key,
    element 1 is the local file name the object is written to.
    """
    object_key = object_key_file[0]
    local_filename = object_key_file[1]
    my_bucket.download_file(object_key, local_filename)
    print('downloaded file with object name... {}'.format(object_key))
    print('downloaded file with file name... {}'.format(local_filename))
# TODO | |
def _s3_url_to_key_and_filename(s3_url):
    """Split an S3 URL into an ``(object_key, local_filename)`` pair.

    ``s3://bucket/prefix/file.jpg`` becomes ``('prefix/file.jpg', 'file.jpg')``.
    A value without the ``s3://`` scheme is treated as an object key already
    and is returned unchanged as the key.
    """
    scheme = 's3://'
    if s3_url.startswith(scheme):
        # Everything after the bucket name is the object key. A plain
        # partition (rather than URL parsing) keeps keys that contain
        # '?' or '#' intact.
        _bucket, _, key = s3_url[len(scheme):].partition('/')
    else:
        key = s3_url
    return key, os.path.basename(key)


def parallel_s3_download(file_path, file_list):
    """Download every S3 object named in ``file_list`` using a process pool.

    Args:
        file_path: Path of the file the entries were read from. It is not
            used by this function; the parameter is kept so existing callers
            keep working.
        file_list: Iterable of S3 URLs (``s3://bucket/prefix/file.jpg``) or
            bare object keys. Blank entries are skipped.

    Each entry is converted to an ``(object_key, local_filename)`` pair and
    handed to ``s3download``; the local file name is the base name of the
    key. The bucket component of a URL is discarded here, so the bucket that
    is actually read is whichever one ``s3download`` uses.

    Raises:
        Whatever ``s3download`` raises in a worker is re-raised by
        ``Pool.map`` in the calling process.
    """
    object_key_file = [
        _s3_url_to_key_and_filename(url)
        for url in file_list
        if url.strip()  # ignore empty / whitespace-only lines
    ]
    if not object_key_file:
        # Nothing to download: do not start any worker processes.
        return

    # At most 10 workers, and never more workers than there are files.
    workers = min(mp.cpu_count(), 10, len(object_key_file))
    pool = mp.Pool(workers)
    try:
        pool.map(s3download, object_key_file, chunksize=1)
    finally:
        # close() + join() lets the workers exit normally (flushing their
        # output) instead of being torn down at interpreter shutdown.
        pool.close()
        pool.join()
if __name__ == '__main__':
    import sys

    # The list file defaults to the original hard-coded location; an optional
    # first command-line argument overrides it.
    default_path = '/Users/tom/kaizen/scripts/tmp/files-to-cp.txt'
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_path

    # Read the list inside a context manager so the handle is closed
    # before the downloads start.
    with open(file_path) as list_file:
        file_list = list_file.read().splitlines()

    parallel_s3_download(file_path, file_list)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment