Skip to content

Instantly share code, notes, and snippets.

@liangfu
Created January 28, 2021 09:15
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save liangfu/bc2766986a75e32242ea0a0448bdb9b4 to your computer and use it in GitHub Desktop.
Enable batch operations for recursively copying s3 objects
import boto3
def recursive_copy(src_bucket, src_prefix, dst_bucket, dst_prefix, role_arn=None):
    """Copy every object under ``s3://src_bucket/src_prefix`` to
    ``s3://dst_bucket/dst_prefix`` using an S3 Batch Operations copy job.

    Builds a CSV manifest of all keys under the source prefix, uploads the
    manifest next to the source data, then submits an ``S3PutObjectCopy``
    batch job via the S3 Control API.

    Parameters
    ----------
    src_bucket, src_prefix : str
        Source bucket and key prefix (a trailing "/" is tolerated).
    dst_bucket, dst_prefix : str
        Destination bucket and key prefix.
    role_arn : str, optional
        ARN of the IAM role assumed by S3 Batch Operations. When ``None``,
        an ARN of the form
        ``arn:{partition}:iam::{account}:role/AdminRoleForS3BatchOperations``
        is built from the caller's account.
        WARNING: that role must be created beforehand and must trust
        ``batchoperations.s3.amazonaws.com``.

    Returns
    -------
    dict
        The ``s3control.create_job`` response (contains the ``JobId``).
    """
    import json

    region_name = boto3.session.Session().region_name
    # China regions live in the "aws-cn" partition; everything else handled
    # here is assumed to be in the standard "aws" partition.
    partition = "aws-cn" if region_name in ("cn-northwest-1", "cn-north-1") else "aws"

    # Normalize prefixes: strip trailing slashes, tolerating empty strings
    # (the original indexed prefix[-1], which raises on "").
    src_prefix = src_prefix.rstrip("/")
    dst_prefix = dst_prefix.rstrip("/")

    s3 = boto3.client("s3", region_name=region_name)
    s3control = boto3.client("s3control", region_name=region_name)
    account_id = boto3.client("sts").get_caller_identity()["Account"]

    if role_arn is None:
        # Build the role ARN from the *computed* partition and the caller's
        # real account id (the original hard-coded "aws-cn" and a placeholder
        # account, which could never work outside that exact setup).
        role_arn = (
            f"arn:{partition}:iam::{account_id}:role/AdminRoleForS3BatchOperations"
        )

    # Collect every key under the source prefix, following pagination.
    # "Contents" is absent from a page when nothing matches, hence .get().
    all_keys = []
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=src_bucket, Prefix=src_prefix):
        for obj in page.get("Contents", []):
            all_keys.append(obj["Key"])

    # S3 Batch Operations CSV manifest: one "bucket,key" line per object.
    # join() avoids the quadratic cost of repeated string concatenation.
    manifest_csv = "".join(f"{src_bucket},{key}\n" for key in all_keys)

    # Upload the manifest to s3://{src_bucket}/{src_prefix}/manifest.csv.
    manifest_key = f"{src_prefix}/manifest.csv"
    manifest_uri = f"s3://{src_bucket}/{manifest_key}"
    manifest_arn = f"arn:{partition}:s3:::{src_bucket}/{manifest_key}"
    response = s3.put_object(Body=manifest_csv, Bucket=src_bucket, Key=manifest_key)
    # create_job requires the manifest object's ETag; put_object already
    # returns it, so no extra round trip is needed.
    etag = response["ETag"].strip('"')
    print(f"Uploaded manifest to {manifest_uri}.")

    dst_bucket_arn = f"arn:{partition}:s3:::{dst_bucket}"
    kwargs = {
        "AccountId": account_id,
        "ConfirmationRequired": False,
        "RoleArn": role_arn,
        "Priority": 10,
        "Manifest": {
            "Spec": {
                "Format": "S3BatchOperations_CSV_20180820",
                "Fields": ["Bucket", "Key"],
            },
            "Location": {
                "ObjectArn": manifest_arn,
                "ETag": etag,
            },
        },
        "Operation": {
            "S3PutObjectCopy": {
                "TargetResource": dst_bucket_arn,
                "TargetKeyPrefix": dst_prefix,
                "MetadataDirective": "COPY",
                "RequesterPays": False,
                "StorageClass": "STANDARD",
            },
        },
        # Report is disabled; the bucket/prefix are still required fields.
        "Report": {
            "Bucket": dst_bucket_arn,
            "Format": "Report_CSV_20180820",
            "Enabled": False,
            "Prefix": dst_prefix,
            "ReportScope": "AllTasks",
        },
    }
    print(json.dumps(kwargs, indent=4))
    return s3control.create_job(**kwargs)
@atjshop
Copy link

atjshop commented Apr 2, 2022

I'm trying to do the same thing. What I want to do is copy the files to the root of the target bucket, so I tried setting TargetKeyPrefix to the empty string "". However, it throws an exception saying "Operation.S3PutObjectCopy.TargetKeyPrefix, value: 0, valid min length: 1". Any idea about this? It seems to be a bug in boto3. Thanks.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment