Created
January 28, 2021 09:15
-
-
Save liangfu/bc2766986a75e32242ea0a0448bdb9b4 to your computer and use it in GitHub Desktop.
Enable batch operations for recursively copying s3 objects
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import boto3 | |
def recursive_copy(src_bucket, src_prefix, dst_bucket, dst_prefix):
    """Recursively copy every object under s3://{src_bucket}/{src_prefix}
    to s3://{dst_bucket}/{dst_prefix} using an S3 Batch Operations copy job.

    Builds a CSV manifest ("bucket,key" per line) of all source keys, uploads
    it to s3://{src_bucket}/{src_prefix}/manifest.csv, then submits an
    S3PutObjectCopy job through the s3control API. The job runs
    asynchronously; track progress via DescribeJob or the S3 console.

    Args:
        src_bucket: source bucket name.
        src_prefix: key prefix to copy from (a single trailing "/" is ignored).
        dst_bucket: destination bucket name.
        dst_prefix: key prefix to copy into (a single trailing "/" is ignored).
            NOTE(review): the CreateJob API rejects an empty TargetKeyPrefix
            (min length 1), so copying to the bucket root is not supported here.

    Raises:
        ValueError: if no objects exist under the source prefix.
    """
    import json

    session = boto3.session.Session()
    region_name = session.region_name
    # China regions live in the "aws-cn" ARN partition; everything else "aws".
    partition = "aws-cn" if region_name in ("cn-northwest-1", "cn-north-1") else "aws"

    # Normalize prefixes: drop one trailing slash so the joins below produce
    # exactly one "/" separator.
    if src_prefix.endswith("/"):
        src_prefix = src_prefix[:-1]
    if dst_prefix.endswith("/"):
        dst_prefix = dst_prefix[:-1]

    # WARNING: create this role first and grant it read on the source bucket
    # and write on the destination bucket. The partition is derived from the
    # current region so the ARN is valid in both aws and aws-cn partitions.
    batch_operations_role = (
        f"arn:{partition}:iam::123456789012:role/AdminRoleForS3BatchOperations"
    )

    s3 = boto3.client("s3", region_name=region_name)
    s3control = boto3.client("s3control", region_name=region_name)
    account_id = boto3.client("sts").get_caller_identity()["Account"]

    # Collect every key under the source prefix, following pagination.
    paginator = s3.get_paginator("list_objects_v2")
    all_keys = []
    for page in paginator.paginate(Bucket=src_bucket, Prefix=src_prefix):
        # "Contents" is absent when a page (or the whole listing) is empty.
        for obj in page.get("Contents", []):
            all_keys.append(obj["Key"])
    if not all_keys:
        raise ValueError(f"no objects found under s3://{src_bucket}/{src_prefix}/")

    # Batch Operations CSV manifest: one "bucket,key" line per object.
    manifest_csv = "".join(f"{src_bucket},{key}\n" for key in all_keys)

    # Upload the manifest next to the source prefix.
    manifest_key = f"{src_prefix}/manifest.csv"
    manifest_uri = f"s3://{src_bucket}/{manifest_key}"
    manifest_arn = f"arn:{partition}:s3:::{src_bucket}/{manifest_key}"
    put_response = s3.put_object(
        Body=manifest_csv,
        Bucket=src_bucket,
        Key=manifest_key,
    )
    # CreateJob needs the manifest object's ETag; put_object already returns
    # it, so no extra HeadObject round trip is required.
    etag = put_response["ETag"].strip('"')
    print(f"Uploaded manifest to {manifest_uri}.")

    dst_bucket_arn = f"arn:{partition}:s3:::{dst_bucket}"
    kwargs = {
        "AccountId": account_id,
        "ConfirmationRequired": False,
        "RoleArn": batch_operations_role,
        "Priority": 10,
        "Manifest": {
            "Spec": {
                "Format": "S3BatchOperations_CSV_20180820",
                "Fields": ["Bucket", "Key"],
            },
            "Location": {
                "ObjectArn": manifest_arn,
                "ETag": etag,
            },
        },
        "Operation": {
            "S3PutObjectCopy": {
                "TargetResource": dst_bucket_arn,
                "TargetKeyPrefix": dst_prefix,
                "MetadataDirective": "COPY",
                "RequesterPays": False,
                "StorageClass": "STANDARD",
            },
        },
        "Report": {
            "Bucket": dst_bucket_arn,
            "Format": "Report_CSV_20180820",
            "Enabled": False,
            "Prefix": dst_prefix,
            "ReportScope": "AllTasks",
        },
    }
    print(json.dumps(kwargs, indent=4))
    s3control.create_job(**kwargs)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I'm trying to do the same thing. What I want to do is copy the files to the root of the target bucket, so I tried setting TargetKeyPrefix to an empty string "". However, it throws an exception saying "Operation.S3PutObjectCopy.TargetKeyPrefix, value: 0, valid min length: 1". Any idea how to work around this? It seems to be a limitation of the API rather than a bug in boto3. Thanks.