Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
#!/usr/bin/env bash
#
# Split a single S3 Inventory manifest into sequential subsets.
#
# Usage:
#   $ env INVENTORY_BUCKET=my-inventory INVENTORY_PATH=sample-name ./s3.batch.operations.manifest.split.sh
#
# Strict mode: stop on a failing command, on expansion of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail
#######################################
# Report an error and terminate the script.
# Arguments: $* - words of the message (joined with single spaces)
# Outputs:   "Error: <message>" on stderr
# Returns:   never returns; exits the shell with status 1
#######################################
fatal() {
  printf 'Error: %s\n' "$*" >&2
  exit 1
}
# inputs: all taken from the environment. `:=` assigns the default so later
# references do not trip `set -u` when a variable was not exported.
: "${DEBUG:=}"            # non-empty enables command tracing (set -x)
: "${INVENTORY_BUCKET:=}" # inventory S3 bucket name
: "${INVENTORY_PATH:=}"   # inventory S3 path w/o date
: "${INVENTORY_DATE:=}"   # inventory date (default: autodetect)
: "${JOBS_COUNT:=4}"      # number of jobs (default: 4)

[[ -z "${DEBUG}" ]] || set -x
[[ -n "${INVENTORY_BUCKET}" ]] || fatal "env INVENTORY_BUCKET is empty"
[[ -n "${INVENTORY_PATH}" ]] || fatal "env INVENTORY_PATH is empty"
# Reject zero / non-numeric job counts up front instead of failing later in
# the middle of the split arithmetic.
[[ "${JOBS_COUNT}" =~ ^[1-9][0-9]*$ ]] \
  || fatal "env JOBS_COUNT must be a positive integer, got '${JOBS_COUNT}'"

if [[ -z "${INVENTORY_DATE}" ]]; then
  # detect latest inventory date: keep only "PRE <yyyy>-...Z/" prefixes from the
  # listing, pick the lexicographically greatest and drop the trailing slash.
  # The year is matched with four explicit [0-9] classes because `{4}` interval
  # expressions are not supported by every awk implementation.
  INVENTORY_DATE=$(
    aws s3 ls "s3://${INVENTORY_BUCKET}/${INVENTORY_PATH}/" \
      | awk '$1 == "PRE" && $2 ~ /^[0-9][0-9][0-9][0-9]-.*Z\/$/ {print $2}' \
      | LC_ALL=C sort \
      | tail -n1 \
      | sed -e 's/\/$//'
  )
fi
[[ -n "${INVENTORY_DATE}" ]] || fatal "inventory date was not detected"
# Describe the pending operation and wait for Enter; Ctrl+C aborts before the
# inventory is downloaded or anything is written.
echo "Split inventory manifest.json into ${JOBS_COUNT} jobs from s3://${INVENTORY_BUCKET}/${INVENTORY_PATH}/${INVENTORY_DATE}/ location?"
echo "Are you sure? Ctrl+C to abort..."
# -r keeps backslashes literal; the typed line itself is discarded.
read -r _
# Mirror the selected inventory snapshot from S3 into a local directory named
# after the inventory date (sync runs with --delete).
aws s3 sync \
  --delete \
  "s3://${INVENTORY_BUCKET}/${INVENTORY_PATH}/${INVENTORY_DATE}" \
  "${INVENTORY_DATE}"
# prepare for split
DST_DIR="${INVENTORY_DATE}-jobs-${JOBS_COUNT}"
SRC_MANIFEST="${INVENTORY_DATE}/manifest.json"

# Number of entries in the manifest's ".files" array.
TOTAL_COUNT=$(jq -r '.files | length' "${SRC_MANIFEST}")
# With fewer entries than jobs at least one job manifest would be empty.
(( TOTAL_COUNT >= JOBS_COUNT )) \
  || fatal "manifest.json lists ${TOTAL_COUNT} file(s); cannot split into ${JOBS_COUNT} non-empty jobs"

# Start from a clean output directory; `:?` aborts if the name is ever empty.
rm -rf -- "${DST_DIR:?}"
mkdir -- "${DST_DIR}"

# do the split
# Job k (1-based) receives entries [(k-1)*T/J, k*T/J) using integer division.
# The slices are contiguous, cover every entry exactly once and differ in size
# by at most one, so no job ends up empty when T >= J (fixed ceil(T/J)-sized
# chunks could leave the trailing jobs with nothing).
for (( SEQUENCE = 1; SEQUENCE <= JOBS_COUNT; SEQUENCE++ )); do
  DST_MANIFEST="${DST_DIR}/manifest-${SEQUENCE}"
  IDX_FROM=$(( (SEQUENCE - 1) * TOTAL_COUNT / JOBS_COUNT ))
  IDX_TILL=$(( SEQUENCE * TOTAL_COUNT / JOBS_COUNT ))

  # Copy the manifest, replacing ".files" with this job's slice.
  jq --argjson from "${IDX_FROM}" --argjson till "${IDX_TILL}" \
    '.files |= .[$from:$till]' "${SRC_MANIFEST}" > "${DST_MANIFEST}.json"

  # The .checksum file holds only the hex MD5 digest of the job manifest.
  md5sum "${DST_MANIFEST}.json" | awk '{print $1}' > "${DST_MANIFEST}.checksum"

  # Verify the stored digest against the file, feeding md5sum its canonical
  # "<digest>  <path>" (two-space) check format.
  printf '%s  %s\n' "$(head -n1 "${DST_MANIFEST}.checksum")" "${DST_MANIFEST}.json" \
    | md5sum -c -
done
# Upload the generated job manifests and checksums to a DST_DIR-named directory
# under the inventory path, requesting AES256 server-side encryption.
aws s3 sync \
  --sse AES256 \
  "${DST_DIR}" \
  "s3://${INVENTORY_BUCKET}/${INVENTORY_PATH}/${DST_DIR}/"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.