Skip to content

Instantly share code, notes, and snippets.

@vlio20
Created January 17, 2024 19:37
Show Gist options
  • Save vlio20/477908c6951dfce75f1ac01768261b65 to your computer and use it in GitHub Desktop.
import boto3
import os
from concurrent.futures import ThreadPoolExecutor
# Global variables
# Configuration for the bulk download; placeholder values — replace
# with a real bucket name and key prefix before running.
S3_BUCKET_NAME = 'yyyyy'
S3_PREFIX = 'xxxxl'
# Destination directory; created on demand by download_files_from_s3.
LOCAL_DOWNLOAD_DIRECTORY = './dest'
# Worker-thread count for the parallel download pool (I/O-bound work,
# so a high count is reasonable despite the GIL).
NUM_THREADS = 100
def download_file(bucket_name, key, local_directory, s3_client=None):
    """Download a single S3 object into *local_directory*.

    The local filename is the basename of *key*. Failures are printed
    and swallowed so one bad object does not abort a bulk download.

    Args:
        bucket_name: Name of the S3 bucket.
        key: Full object key within the bucket.
        local_directory: Existing local directory to write into.
        s3_client: Optional boto3 S3 client to reuse. boto3 clients are
            thread-safe, so sharing one avoids the cost of constructing
            a new client per download (the original behavior, kept as
            the default for backward compatibility).
    """
    filename = os.path.basename(key)
    if not filename:
        # Keys ending in '/' are S3 "folder" placeholders; there is no
        # file content to write and an empty basename would make
        # download_file fail.
        return
    local_file_path = os.path.join(local_directory, filename)
    s3 = s3_client if s3_client is not None else boto3.client('s3')
    try:
        s3.download_file(bucket_name, key, local_file_path)
        print(f"Downloaded: {key} to {local_file_path}")
    except Exception as e:
        # Best-effort: report and continue; the caller fans out many
        # downloads and should not stop on a single failure.
        print(f"Error downloading {key}: {e}")
def download_files_from_s3(bucket_name, prefix, local_directory, num_threads=10):
    """Download every object under *prefix* from *bucket_name* into
    *local_directory*, fetching up to *num_threads* objects in parallel.

    Handles list_objects_v2 pagination via ContinuationToken, so
    prefixes with more than 1000 keys are fully downloaded.

    Args:
        bucket_name: Name of the S3 bucket.
        prefix: Key prefix to list under.
        local_directory: Local destination directory (created if needed).
        num_threads: Max worker threads for parallel downloads.
    """
    s3 = boto3.client('s3')
    # Create the destination once, up front. exist_ok avoids the
    # check-then-create race the original exists()/makedirs pair had.
    os.makedirs(local_directory, exist_ok=True)
    # One executor for the whole run. The original rebuilt the pool for
    # every 1000-key page, which forced all of a page's downloads to
    # finish before the next page was even listed.
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        continuation_token = None
        while True:
            list_objects_params = {
                'Bucket': bucket_name,
                'Prefix': prefix,
            }
            if continuation_token:
                list_objects_params['ContinuationToken'] = continuation_token
            response = s3.list_objects_v2(**list_objects_params)
            # 'Contents' is absent when the page matched no keys.
            for obj in response.get('Contents', []):
                executor.submit(download_file, bucket_name, obj['Key'],
                                local_directory)
            if response.get('IsTruncated'):
                continuation_token = response['NextContinuationToken']
            else:
                break
if __name__ == "__main__":
    # Script entry point: pull everything under S3_PREFIX from the
    # configured bucket into the local destination, in parallel.
    download_files_from_s3(
        S3_BUCKET_NAME,
        S3_PREFIX,
        LOCAL_DOWNLOAD_DIRECTORY,
        NUM_THREADS,
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment