Skip to content

Instantly share code, notes, and snippets.

@timoguin
Last active March 7, 2024 02:30
Show Gist options
  • Save timoguin/5da5ab385fcbb90036599250d14342ce to your computer and use it in GitHub Desktop.
Save timoguin/5da5ab385fcbb90036599250d14342ce to your computer and use it in GitHub Desktop.
S3 Batch Job that copies objects from a source to a destination bucket
#!/bin/sh
# List objects in an S3 bucket and generate a CSV manifest suitable for
# driving an S3 Batch Job. Works for smol buckets. Just pass a valid S3 URL.
#
# Usage:
#
# ./create_manifest.sh s3://muh-buckit/muh-prefix/
#
# Arguments:
#   $1 - S3 URL of the form s3://<bucket>[/<prefix>]
# Outputs:
#   Writes manifest-YYYYMMDD.csv to the current directory, one
#   "bucket","key" row per listed object. Diagnostics go to stderr.
# Requires:
#   aws CLI and jq on PATH.
#
set -u

if [ "$#" -eq 0 ]; then
  echo "We need some args, pattycake..." >&2
  exit 1
elif [ "$#" -gt 1 ]; then
  echo "Oh no, boy... what is you doin'?" >&2
  exit 1
fi

SOURCE_PATH=$1

# Make sure it's a valid S3 URL and parse it into bucket name and prefix.
# Parameter expansion is used instead of grep/sed/awk so the scheme is
# anchored at the start of the input, bucket names containing dots are kept
# whole, and prefixes containing spaces are not truncated.
case $SOURCE_PATH in
  s3://*)
    url_rest=${SOURCE_PATH#s3://}
    ;;
  *)
    echo "Failed to parse input a valid S3 URL" >&2
    exit 1
    ;;
esac

SOURCE_BUCKET=${url_rest%%/*}
case $url_rest in
  */*) SOURCE_PREFIX=${url_rest#*/} ;;
  *) SOURCE_PREFIX= ;;
esac

# Reject an empty bucket name or one containing characters outside the
# lowercase letters, digits, dots and hyphens accepted here.
case $SOURCE_BUCKET in
  '' | *[!a-z0-9.-]*)
    echo "Failed to parse input a valid S3 URL" >&2
    exit 1
    ;;
esac

# Check that bucket is valid and accessible. Only the exit status matters,
# so all output from head-bucket is discarded.
retcode=0
aws s3api head-bucket --bucket "$SOURCE_BUCKET" >/dev/null 2>&1 || retcode=$?
if [ "$retcode" -ne 0 ]; then
  echo "ERROR: bucket \"${SOURCE_BUCKET}\" is inaccessible (Error code $retcode)" >&2
  # `read -p` is not available in POSIX sh, so print the prompt separately.
  printf '%s ' "The head-bucket operation failed. Would you like to attempt listing the bucket contents anyway? (Y/n)" >&2
  ignore_head_bucket_failure=
  read -r ignore_head_bucket_failure || true
  case $ignore_head_bucket_failure in
    [nN]*)
      echo "Quitting..."
      exit 1
      ;;
    *)
      # Anything else (including empty input / EOF) means "yes".
      echo "Continuing..."
      ;;
  esac
fi
unset retcode

DATE_STRING=$(date +'%Y%m%d')
MANIFEST_FILE="manifest-$DATE_STRING.csv"

# Build the list-objects-v2 arguments; --prefix is only passed when the URL
# actually contained one, so a bare s3://bucket URL lists the whole bucket.
set -- --bucket "$SOURCE_BUCKET"
if [ -n "$SOURCE_PREFIX" ]; then
  set -- "$@" --prefix "$SOURCE_PREFIX"
fi

# Recursively list all objects under the path. The listing is captured first
# so that a failed aws call aborts the script instead of being hidden behind
# the pipeline, and JSON output is forced regardless of the user's CLI config.
if ! object_list=$(aws s3api list-objects-v2 "$@" --query "Contents[]" --output json); then
  echo "ERROR: failed to list objects in \"${SOURCE_BUCKET}\"" >&2
  exit 1
fi

# Output a CSV manifest file. `(. // [])` turns the `null` that the query
# yields for an empty listing into an empty array, producing an empty manifest.
# NOTE(review): S3 Batch Operations reportedly expects URL-encoded object keys
# in CSV manifests; keys with special characters may need encoding - confirm.
printf '%s\n' "$object_list" \
  | jq -rc --arg source_bucket "$SOURCE_BUCKET" '(. // [])[] | [$source_bucket, .Key] | @csv' \
  > "$MANIFEST_FILE"
#!/bin/sh
# Upload a CSV manifest file to S3 for use in an S3 Batch Job. Just pass a local path
# and an S3 URL to upload to. This here is the greatest shell script ever written.
#
# Usage:
#
# ./upload_manifest.sh muh-manifest s3://muh-manifest-buckit/foo/
#
# Arguments:
#   $1 - local path of the manifest file
#   $2 - destination S3 URL
#
if [ "$#" -lt 2 ]; then
  echo "Usage: $0 <local-manifest-path> <s3-url>" >&2
  exit 1
fi

# Quote both operands so paths containing spaces or glob characters are
# passed to the CLI as single arguments.
aws s3 cp "$1" "$2"
#!/bin/sh
# Using a destination config for the reports to be delivered,
# pointing to a desired manifest to operate against,
# Submit a batch job that performs a CopyObject from a source
# to a destination bucket.
#
# Environment:
#   AWS_REGION or AWS_DEFAULT_REGION - region used for the job and for the
#   destination bucket name.
# Requires:
#   aws CLI, sed and uuidgen on PATH.
#
# This needs some args, but I'm busy.
set -u

# Fail early when no region is configured; otherwise the bucket name would end
# in a dangling "-" and --region would be passed without a value.
DESTINATION_REGION=${AWS_REGION:-${AWS_DEFAULT_REGION:-}}
if [ -z "$DESTINATION_REGION" ]; then
  echo "ERROR: set AWS_REGION or AWS_DEFAULT_REGION" >&2
  exit 1
fi

if ! DESTINATION_ACCOUNT_ID=$(aws sts get-caller-identity --query "Account" --output text); then
  echo "ERROR: unable to determine the AWS account id" >&2
  exit 1
fi

# Make an arg
DESTINATION_BUCKET="redacty-$DESTINATION_ACCOUNT_ID-$DESTINATION_REGION"
DESTINATION_BUCKET_ARN="arn:aws:s3:::$DESTINATION_BUCKET"
# Make an arg
DESTINATION_PREFIX="data/muh-data/"
REPORT_PREFIX="reports"
REPORT_FORMAT="Report_CSV_20180820"
# Make this an arg
MANIFEST_PREFIX="manifests/where-the-manifests-are-stored/"
MANIFEST_FILENAME="manifest-$(date +'%Y%m%d').csv"
# Strip any trailing slash from the prefix before joining, so the key never
# contains "//" (which S3 treats as a different key than the single-slash one).
MANIFEST_KEY="${MANIFEST_PREFIX%/}/$MANIFEST_FILENAME"
MANIFEST_OBJECT_ARN="$DESTINATION_BUCKET_ARN/$MANIFEST_KEY"
MANIFEST_FORMAT="S3BatchOperations_CSV_20180820"

# Look up the manifest's ETag. The aws call is checked on its own so that a
# missing or inaccessible manifest stops the script instead of submitting a
# job with an empty ETag.
if ! MANIFEST_ETAG=$(aws s3api head-object --bucket "$DESTINATION_BUCKET" --key "$MANIFEST_KEY" --query "ETag" --output text); then
  echo "ERROR: manifest s3://$DESTINATION_BUCKET/$MANIFEST_KEY not found or inaccessible" >&2
  exit 1
fi
# The ETag is returned wrapped in double quotes; drop them.
MANIFEST_ETAG=$(printf '%s' "$MANIFEST_ETAG" | sed 's/"//g')

# Make role an arg
ROLE_ARN="arn:aws:iam::$DESTINATION_ACCOUNT_ID:role/batch-operations"

# Build the JSON documents with printf so every value is expanded inside
# quotes and no word splitting or globbing can alter the arguments.
OPERATION_JSON=$(printf '{"S3PutObjectCopy":{"TargetResource":"%s","TargetKeyPrefix":"%s"}}' \
  "$DESTINATION_BUCKET_ARN" "$DESTINATION_PREFIX")
MANIFEST_JSON=$(printf '{"Spec":{"Format":"%s","Fields":["Bucket","Key"]},"Location":{"ObjectArn":"%s","ETag":"%s"}}' \
  "$MANIFEST_FORMAT" "$MANIFEST_OBJECT_ARN" "$MANIFEST_ETAG")
REPORT_JSON=$(printf '{"Bucket":"%s","Prefix":"%s","Format":"%s","Enabled":true,"ReportScope":"AllTasks"}' \
  "$DESTINATION_BUCKET_ARN" "$REPORT_PREFIX" "$REPORT_FORMAT")

# Submit dem jobbin
aws s3control create-job \
  --region "$DESTINATION_REGION" \
  --account-id "$DESTINATION_ACCOUNT_ID" \
  --operation "$OPERATION_JSON" \
  --manifest "$MANIFEST_JSON" \
  --report "$REPORT_JSON" \
  --priority 100 \
  --role-arn "$ROLE_ARN" \
  --client-request-token "$(uuidgen)" \
  --description "Generate my description via some dynamic metadata plz" \
  --no-confirmation-required
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment