Skip to content

Instantly share code, notes, and snippets.

@vlio20
Created January 17, 2024 19:37
Show Gist options
  • Save vlio20/477908c6951dfce75f1ac01768261b65 to your computer and use it in GitHub Desktop.
import boto3
import os
from concurrent.futures import ThreadPoolExecutor
# Global variables
# Configuration for the bulk download; placeholder values — replace
# with a real bucket name and key prefix before running.
S3_BUCKET_NAME = 'yyyyy'
S3_PREFIX = 'xxxxl'
# Destination directory; created on demand by download_files_from_s3.
LOCAL_DOWNLOAD_DIRECTORY = './dest'
# Worker-thread count for the parallel download pool (I/O-bound work,
# so a high count is reasonable despite the GIL).
NUM_THREADS = 100
def download_file(bucket_name, key, local_directory, s3_client=None):
    """Download a single S3 object into *local_directory*.

    The local filename is the basename of *key*. Failures are printed
    and swallowed so one bad object does not abort a bulk download.

    Args:
        bucket_name: Name of the S3 bucket.
        key: Full object key within the bucket.
        local_directory: Existing local directory to write into.
        s3_client: Optional boto3 S3 client to reuse. boto3 clients are
            thread-safe, so sharing one avoids the cost of constructing
            a new client per download (the original behavior, kept as
            the default for backward compatibility).
    """
    filename = os.path.basename(key)
    if not filename:
        # Keys ending in '/' are S3 "folder" placeholders; there is no
        # file content to write and an empty basename would make
        # download_file fail.
        return
    local_file_path = os.path.join(local_directory, filename)
    s3 = s3_client if s3_client is not None else boto3.client('s3')
    try:
        s3.download_file(bucket_name, key, local_file_path)
        print(f"Downloaded: {key} to {local_file_path}")
    except Exception as e:
        # Best-effort: report and continue; the caller fans out many
        # downloads and should not stop on a single failure.
        print(f"Error downloading {key}: {e}")
def download_files_from_s3(bucket_name, prefix, local_directory, num_threads=10):
    """Download every object under *prefix* from *bucket_name* into
    *local_directory*, fetching up to *num_threads* objects in parallel.

    Handles list_objects_v2 pagination via ContinuationToken, so
    prefixes with more than 1000 keys are fully downloaded.

    Args:
        bucket_name: Name of the S3 bucket.
        prefix: Key prefix to list under.
        local_directory: Local destination directory (created if needed).
        num_threads: Max worker threads for parallel downloads.
    """
    s3 = boto3.client('s3')
    # Create the destination once, up front. exist_ok avoids the
    # check-then-create race the original exists()/makedirs pair had.
    os.makedirs(local_directory, exist_ok=True)
    # One executor for the whole run. The original rebuilt the pool for
    # every 1000-key page, which forced all of a page's downloads to
    # finish before the next page was even listed.
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        continuation_token = None
        while True:
            list_objects_params = {
                'Bucket': bucket_name,
                'Prefix': prefix,
            }
            if continuation_token:
                list_objects_params['ContinuationToken'] = continuation_token
            response = s3.list_objects_v2(**list_objects_params)
            # 'Contents' is absent when the page matched no keys.
            for obj in response.get('Contents', []):
                executor.submit(download_file, bucket_name, obj['Key'],
                                local_directory)
            if response.get('IsTruncated'):
                continuation_token = response['NextContinuationToken']
            else:
                break
if __name__ == "__main__":
    # Script entry point: pull everything under S3_PREFIX from the
    # configured bucket into the local destination, in parallel.
    download_files_from_s3(
        S3_BUCKET_NAME,
        S3_PREFIX,
        LOCAL_DOWNLOAD_DIRECTORY,
        NUM_THREADS,
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment