Last active
March 7, 2024 02:30
-
-
Save timoguin/5da5ab385fcbb90036599250d14342ce to your computer and use it in GitHub Desktop.
S3 Batch Job that copies objects from a source to a destination bucket
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# List the objects in an S3 bucket and generate a CSV manifest suitable for
# driving an S3 Batch Operations job. Intended for small buckets. Pass exactly
# one valid S3 URL.
#
# Usage:
#
#   ./create_manifest.sh s3://muh-buckit/muh-prefix/
#
# Require exactly one positional argument (the S3 URL). Zero arguments or
# more than one argument prints a complaint and exits with status 1.
NUM_ARGS=$#
case "$NUM_ARGS" in
  0)
    echo "We need some args, pattycake..."
    exit 1
    ;;
  1)
    # Exactly one argument: carry on with the rest of the script.
    ;;
  *)
    echo "Oh no, boy... what is you doin'?"
    exit 1
    ;;
esac
# Split the S3 URL given as the first argument into SOURCE_BUCKET and
# SOURCE_PREFIX. Exits with status 1 if the argument is not of the form
# s3://<bucket>[/<prefix>].
#
# Plain POSIX parameter expansion is used instead of grep/sed/awk so that:
#   * the whole string must start with "s3://" (no unanchored match),
#   * bucket names containing dots are kept whole instead of being cut at
#     the first character outside the allowed set,
#   * prefixes containing whitespace are preserved,
#   * no GNU-only regex extensions are needed under /bin/sh.
SOURCE_PATH=$1

# Make sure it's a valid S3 URL
case "$SOURCE_PATH" in
  s3://*) ;;
  *)
    echo "Failed to parse input as a valid S3 URL" >&2
    exit 1
    ;;
esac

# Parse S3 URL into bucket name and prefix
s3_remainder=${SOURCE_PATH#s3://}
# Everything up to the first "/" is the bucket.
SOURCE_BUCKET=${s3_remainder%%/*}
# Everything after the first "/" is the prefix, kept verbatim (including any
# trailing slash). A bucket-only URL yields an empty prefix.
case "$s3_remainder" in
  */*) SOURCE_PREFIX=${s3_remainder#*/} ;;
  *) SOURCE_PREFIX= ;;
esac
unset s3_remainder

# Reject an empty bucket name or one containing characters other than
# lowercase letters, digits, dots and hyphens.
case "$SOURCE_BUCKET" in
  '' | *[!a-z0-9.-]*)
    echo "Failed to parse input as a valid S3 URL" >&2
    exit 1
    ;;
esac
# Check that bucket is valid and accessible
#
# Runs head-bucket against SOURCE_BUCKET. On success SOURCE_BUCKET_REGION holds
# whatever the CLI prints for the "BucketRegion" query. On failure the user is
# asked whether to carry on anyway; answering n/N/no exits with status 1, any
# other answer (including just pressing Enter) continues.
#
# Only stderr is discarded: sending stdout to /dev/null inside the command
# substitution would leave SOURCE_BUCKET_REGION permanently empty.
SOURCE_BUCKET_REGION=$(aws s3api head-bucket --bucket "$SOURCE_BUCKET" --query "BucketRegion" --output text 2>/dev/null)
retcode=$?
if [ "$retcode" -ne 0 ]; then
  echo "ERROR: bucket \"${SOURCE_BUCKET}\" is inaccessible (Error code $retcode)" >&2
  # `read -p` is not available in POSIX sh, so print the prompt separately.
  printf '%s' "The head-bucket operation failed. Would you like to attempt listing the bucket contents anyway? (Y/n) " >&2
  read -r ignore_head_bucket_failure
  case "$ignore_head_bucket_failure" in
    [nN] | [nN][oO])
      echo "Quitting..."
      exit 1
      ;;
    *)
      echo "Continuing..."
      ;;
  esac
fi
unset retcode
DATE_STRING=$(date +'%Y%m%d')

# Recursively list all objects under the path and output a CSV manifest file
#
# Writes one "bucket","key" row per object to manifest-<YYYYMMDD>.csv in the
# current directory.
#
# The listing is captured first so that a failing `aws` call aborts the script
# instead of being hidden behind jq's exit status. --output json is forced
# because jq needs JSON regardless of the user's configured default output.
# The prefix is quoted so an empty prefix (bucket-only URL) is passed as an
# empty value rather than dropping the option's argument.
#
# NOTE(review): keys are written as returned by the listing. The S3 Batch
# Operations docs describe URL-encoding requirements for manifest keys -
# confirm whether keys with special characters need encoding here.
object_listing=$(aws s3api list-objects-v2 --bucket "$SOURCE_BUCKET" --prefix "$SOURCE_PREFIX" \
  --query "Contents[]" --output json) || {
  echo "ERROR: failed to list objects in bucket \"${SOURCE_BUCKET}\"" >&2
  exit 1
}

# `(. // [])` turns a null result (no matching objects) into an empty list so
# jq produces an empty manifest instead of failing on "Cannot iterate over null".
printf '%s\n' "$object_listing" \
  | jq -r --arg source_bucket "$SOURCE_BUCKET" '(. // [])[] | [$source_bucket, .Key] | @csv' \
  > "manifest-$DATE_STRING.csv"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# Upload a CSV manifest file to S3 for use in an S3 Batch Job. Pass a local
# path and an S3 URL to upload to.
#
# Usage:
#
#   ./upload_manifest.sh muh-manifest s3://muh-manifest-buckit/foo/
#
# Exits with status 1 when not given exactly two arguments; otherwise the exit
# status is that of `aws s3 cp`.
if [ "$#" -ne 2 ]; then
  echo "Usage: $0 LOCAL_MANIFEST_PATH S3_URL" >&2
  exit 1
fi

# Both arguments are quoted so paths containing spaces or glob characters are
# passed through to the CLI unchanged.
aws s3 cp "$1" "$2"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# Submit an S3 Batch Operations job that performs a CopyObject from a source
# to a destination bucket. The job operates against the manifest configured
# in this script and delivers its reports to the report location configured
# in this script.
#
# TODO: accept these settings as arguments instead of hard-coding them.
# Resolve the destination region, account and bucket.
#
# The region comes from AWS_REGION, falling back to AWS_DEFAULT_REGION. The
# account ID comes from the caller's current credentials. Both are embedded in
# the bucket name, so the script stops here if either is missing rather than
# building a malformed bucket name.
DESTINATION_REGION=${AWS_REGION:-$AWS_DEFAULT_REGION}
if [ -z "$DESTINATION_REGION" ]; then
  echo "ERROR: set AWS_REGION or AWS_DEFAULT_REGION" >&2
  exit 1
fi

DESTINATION_ACCOUNT_ID=$(aws sts get-caller-identity --query "Account" --output text)
if [ $? -ne 0 ] || [ -z "$DESTINATION_ACCOUNT_ID" ]; then
  echo "ERROR: could not determine the AWS account ID" >&2
  exit 1
fi

# Make an arg
DESTINATION_BUCKET="redacty-$DESTINATION_ACCOUNT_ID-$DESTINATION_REGION"
DESTINATION_BUCKET_ARN="arn:aws:s3:::$DESTINATION_BUCKET"
# Static settings for the copy job: where objects are copied to, where the
# completion report goes, and which manifest object drives the job.

# Make an arg
DESTINATION_PREFIX="data/muh-data/"
REPORT_PREFIX="reports"
REPORT_FORMAT="Report_CSV_20180820"

# Make this an arg
MANIFEST_PREFIX="manifests/where-the-manifests-are-stored/"
# Drop one trailing slash: the prefix is joined to the filename with "/", and
# a "//" in the key would name a different S3 object than the uploaded one.
MANIFEST_PREFIX=${MANIFEST_PREFIX%/}
# The filename embeds today's date, so it only matches a manifest that was
# generated with the same date stamp.
MANIFEST_FILENAME="manifest-$(date +'%Y%m%d').csv"
MANIFEST_OBJECT_ARN="$DESTINATION_BUCKET_ARN/$MANIFEST_PREFIX/$MANIFEST_FILENAME"
MANIFEST_FORMAT="S3BatchOperations_CSV_20180820"
# Look up the ETag of the manifest object. The key is built exactly like the
# key portion of MANIFEST_OBJECT_ARN so both refer to the same object.
#
# The CLI output is captured on its own (not piped) so a failed head-object is
# detected instead of being hidden behind the exit status of a filter.
MANIFEST_ETAG=$(aws s3api head-object --bucket "$DESTINATION_BUCKET" --key "$MANIFEST_PREFIX/$MANIFEST_FILENAME" --query "ETag" --output text)
if [ $? -ne 0 ] || [ -z "$MANIFEST_ETAG" ]; then
  echo "ERROR: could not read the ETag of s3://$DESTINATION_BUCKET/$MANIFEST_PREFIX/$MANIFEST_FILENAME" >&2
  exit 1
fi
# Strip the double quotes surrounding the ETag value.
MANIFEST_ETAG=$(printf '%s' "$MANIFEST_ETAG" | sed 's/"//g')

# Make role an arg
ROLE_ARN="arn:aws:iam::$DESTINATION_ACCOUNT_ID:role/batch-operations"
# Submit dem jobbin
#
# Creates the S3 Batch Operations copy job. The three JSON documents are built
# with printf and passed as single quoted arguments, so values containing
# spaces or glob characters cannot be split into several arguments.
# NOTE: values are inserted verbatim; a value containing a double quote or
# backslash would still produce invalid JSON.
operation_json=$(printf '{"S3PutObjectCopy":{"TargetResource":"%s","TargetKeyPrefix":"%s"}}' \
  "$DESTINATION_BUCKET_ARN" "$DESTINATION_PREFIX")
manifest_json=$(printf '{"Spec":{"Format":"%s","Fields":["Bucket","Key"]},"Location":{"ObjectArn":"%s","ETag":"%s"}}' \
  "$MANIFEST_FORMAT" "$MANIFEST_OBJECT_ARN" "$MANIFEST_ETAG")
report_json=$(printf '{"Bucket":"%s","Prefix":"%s","Format":"%s","Enabled":true,"ReportScope":"AllTasks"}' \
  "$DESTINATION_BUCKET_ARN" "$REPORT_PREFIX" "$REPORT_FORMAT")

# A fresh token per invocation; fail early if uuidgen is unavailable rather
# than sending an empty token.
request_token=$(uuidgen)
if [ $? -ne 0 ] || [ -z "$request_token" ]; then
  echo "ERROR: uuidgen failed; cannot create a client request token" >&2
  exit 1
fi

aws s3control create-job \
  --region "$DESTINATION_REGION" \
  --account-id "$DESTINATION_ACCOUNT_ID" \
  --operation "$operation_json" \
  --manifest "$manifest_json" \
  --report "$report_json" \
  --priority 100 \
  --role-arn "$ROLE_ARN" \
  --client-request-token "$request_token" \
  --description "Generate my description via some dynamic metadata plz" \
  --no-confirmation-required
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment