dmabamboo/delete-aws-glacier-vault-archives.sh Secret

## delete-aws-glacier-vault-archives.sh
#!/usr/bin/env bash
# Checking pre-requisites (jq and the AWS CLI v2 must be installed).
# Diagnostics are written to stderr and a missing tool ends the script with a
# non-zero status, so a wrapper can tell that the run never started.
if ! command -v jq > /dev/null 2>&1; then
  echo "jq could not be found - check how to download and install it here https://stedolan.github.io/jq/download/" >&2
  exit 1
fi

# Only the executable name is looked up here: "command -v" resolves names, so
# the extra "--version" word that used to be passed was not a version check.
if ! command -v aws > /dev/null 2>&1; then
  # Quoted delimiter: the help text below is printed literally, no expansion.
  cat >&2 <<'EOF'
AWS CLI could not be found - check how to download and install it here https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html
How to configure the AWS CLI to use secrets for your Glacier IAM - https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html
You need to configure it with your appropriate secrets for an IAM that has full access over your Glacier resources (AmazonGlacierFullAccess).
In JSON: 
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Action": "glacier:*",
      "Effect": "Allow",
      "Resource": "*"
    }
  ]
}
EOF
  exit 1
fi

# Positional parameters: AWS account id, AWS region, Glacier vault name.
# The ":-" default keeps the expansions valid when an argument is omitted.
account_id=${1:-}
region=${2:-}
vault_name=${3:-}

# All three parameters are mandatory; show what was received and how to call
# the script, then stop with a failure status.
if [[ -z "${account_id}" || -z "${region}" || -z "${vault_name}" ]]; then
  {
    echo "#################################################################"
    echo "Attention!!! Parameters required are missing."
    echo "Account: ${account_id}"
    echo "Region: ${region}"
    echo "Vault: ${vault_name}"
    echo "#################################################################"
    # The script relies on bash-only syntax ([[ ... ]]), so it must not be
    # started through plain "sh".
    echo "run this command like: bash ./delete-aws-glacier-vault-archives.sh AWS_ACCOUNT_ID AWS_REGION AWS_GLACIER_VAULT_NAME"
  } >&2
  exit 1
fi

echo "Initiating delete process for the vault."
echo "    Account:${account_id}"
echo "    Region:${region}"
echo "    Vault:${vault_name}"

echo "Starting Step 1/4 - Glacier Inventory Retrieval Job - it's Async and can take hours or days to complete"

# Step 1 - inventory retrieval job for the given vault.
# The initiate-job response is kept on disk so that running the script again
# reuses the pending job instead of requesting a new inventory every time.
job_initiation_file="./glacier-inventory-retrieval-job-file-${account_id}-${region}-${vault_name}.json"
if [[ -f "${job_initiation_file}" ]]; then
  echo "There is already a file for this job. Using it now. If you don't want to use it you need to delete the file ${job_initiation_file}."
else
  echo "No previous job file found for this vault."
  echo "Starting a new Job."
  # Only stdout (the JSON response) is stored. CLI errors stay on stderr so
  # they show up in the log instead of being cached as if they were a job
  # description. --output json makes the result independent of the profile's
  # default output format.
  if ! aws glacier initiate-job --output json \
      --job-parameters '{"Type": "inventory-retrieval"}' \
      --account-id "${account_id}" --region "${region}" \
      --vault-name "${vault_name}" > "${job_initiation_file}"; then
    echo "Failed to start the inventory retrieval job - check parameters passed to this script or aws cli config and connectivity." >&2
    # Do not leave a useless file behind: the next run would pick it up.
    rm -f -- "${job_initiation_file}"
    exit 1
  fi
  echo "Job request made."
fi

echo "Checking if the job initiation file is in good shape."
# jq -e exits non-zero when .jobId is absent or null (not only when the file
# is not JSON), so a response without a job id is rejected here.
if job_id=$(jq -er '.jobId' "${job_initiation_file}"); then
  echo "File is OK, jobId=${job_id}"
else
  echo "Failed to obtain Job Id from file, file may be corrupted or your retrieve-inventory call failed - check parameters passed to this script or aws cli config and connectivity." >&2
  # Without a job id the status polling that follows can never succeed.
  exit 1
fi

echo "Starting Step 2/4 - Checking state of the Job to see if it's completed and can have its inventory retrieved for deletion."
# Poll describe-job until it reports Completed=true. Any response that cannot
# be interpreted is treated as "not completed yet" and retried after a pause.
job_completed_flag=false
job_status_file="./glacier-describe-job-file-${account_id}-${region}-${vault_name}-${job_id}.json"
while [[ "${job_completed_flag}" != "true" ]]; do
  # Store stdout only, so CLI messages on stderr cannot corrupt the JSON file.
  # The "--job-id=" form keeps an id that starts with "-" from being parsed
  # as an option.
  aws glacier describe-job --output json --account-id "${account_id}" \
    --region "${region}" --vault-name "${vault_name}" \
    --job-id="${job_id}" > "${job_status_file}"

  # .Completed is compared as text; the value is never executed as a command.
  job_completed_flag=$(jq -r '.Completed' "${job_status_file}" 2> /dev/null)
  case "${job_completed_flag}" in
    true)
      echo "File is OK. Job completed? ${job_completed_flag}"
      # A finished job is only useful if it did not end in failure.
      job_status_code=$(jq -r '.StatusCode' "${job_status_file}" 2> /dev/null)
      if [[ "${job_status_code}" == "Failed" ]]; then
        echo "$(date) Inventory retrieval job ${job_id} failed: $(jq -r '.StatusMessage' "${job_status_file}" 2> /dev/null)" >&2
        echo "Delete ${job_initiation_file} and run this script again to request a new inventory." >&2
        exit 1
      fi
      break
      ;;
    false)
      echo "File is OK. Job completed? ${job_completed_flag}"
      ;;
    *)
      # Missing key, empty output or invalid JSON: keep waiting.
      job_completed_flag=false
      echo "$(date) Failed to check status from describe job."
      ;;
  esac
  #sleeps for 1/2 hour - 1800 seconds before trying to fetch status again - Glacier is slow...
  echo "$(date) Will try again in 1/2 hour... "
  sleep 1800
done

echo "Starting Step 3/4 - Obtaining output from retrieval job - finally getting archive ids to delete"
# Download the inventory JSON produced by the retrieval job.
inventory_output_file="./glacier-inventory-output-file-${account_id}-${region}-${vault_name}-${job_id}.json"
if ! aws glacier get-job-output --account-id "${account_id}" \
    --region "${region}" --vault-name "${vault_name}" \
    --job-id="${job_id}" "${inventory_output_file}"; then
  echo "Failed to download the inventory of job ${job_id} for vault ${vault_name}." >&2
  exit 1
fi
echo "Output file: ${inventory_output_file} created for vault ${vault_name} and job ${job_id}"

# Extract one archive id per line. An existing list is reused as-is.
inventory_id_file="./glacier-inventory-output-file-${account_id}-${region}-${vault_name}-${job_id}.txt"
echo "Creating archive list from output file at ${inventory_id_file}"
if [[ ! -f "${inventory_id_file}" ]]; then
  # --stream emits [path, value] events, so the whole inventory never has to
  # be held in memory. Only leaf events whose third path component is
  # "ArchiveId" are kept; everything else is skipped without raising a jq
  # error, which lets the exit status be trusted.
  # The list is written to a temporary name and moved into place only on
  # success, so an incomplete list is never picked up by a later run.
  if ! jq -r --stream 'select(length == 2 and .[0][2] == "ArchiveId") | .[1]' \
      "${inventory_output_file}" > "${inventory_id_file}.tmp"; then
    echo "Failed to extract archive ids from ${inventory_output_file}." >&2
    rm -f -- "${inventory_id_file}.tmp"
    exit 1
  fi
  mv -- "${inventory_id_file}.tmp" "${inventory_id_file}"
fi
# Arithmetic expansion strips the padding some wc implementations emit.
total=$(wc -l < "${inventory_id_file}")
total=$((total))
echo "Total archives to delete: ${total} in vault ${vault_name}"

echo "Starting Step 4/4 - Delete process starting now $(date)"

# numCPU limits how many delete requests run at the same time: one per
# logical CPU on Linux and macOS, a single one on any other platform.
case "$(uname -s)" in
  Linux*)  numCPU="$(nproc 2> /dev/null)" ;;
  Darwin*) numCPU="$(sysctl -n hw.logicalcpu 2> /dev/null)" ;;
  *)       numCPU=1 ;;
esac

# If the probe printed nothing usable (tool missing or failing), fall back to
# one job at a time instead of leaving an empty or non-numeric limit.
case "${numCPU}" in
  '' | *[!0-9]* | 0) numCPU=1 ;;
esac

# Delete every archive id listed in ${inventory_id_file}, running up to
# ${numCPU} requests in parallel. Each request reports its own outcome once
# it has actually finished, and failures are counted so the script does not
# claim success when some archives are still in the vault.
num=0
failed=0
batch_pids=()

# Waits for every in-flight delete request and counts the ones that failed.
reap_delete_batch() {
  local pid
  for pid in "${batch_pids[@]}"; do
    wait "${pid}" || failed=$((failed+1))
  done
  batch_pids=()
}

while IFS= read -r archive_id || [[ -n "${archive_id}" ]]; do
  # Skip blank lines rather than sending an empty id to the service.
  [[ -n "${archive_id}" ]] || continue
  num=$((num+1))

  (
    # "--archive-id=" keeps ids that start with "-" from being read as options.
    if aws glacier delete-archive --account-id "${account_id}" \
        --region "${region}" --vault-name "${vault_name}" \
        --archive-id="${archive_id}"; then
      echo "Archive ${num}/${total} deleted at $(date) - id: ${archive_id}"
    else
      echo "Archive ${num}/${total} FAILED to delete at $(date) - id: ${archive_id}" >&2
      exit 1
    fi
  ) < /dev/null &   # stdin detached so the job cannot consume the id list
  batch_pids+=("$!")

  # Once a full batch is running, wait for it before launching more.
  if [[ ${#batch_pids[@]} -ge ${numCPU:-1} ]]; then
    reap_delete_batch
  fi
done < "${inventory_id_file}"

# Collect whatever is left from the last, possibly partial, batch.
reap_delete_batch
echo "Finished at $(date)"
if [[ ${failed} -gt 0 ]]; then
  echo "${failed} of ${num} archive deletions failed - check the log and run the script again to retry them." >&2
  exit 1
fi
echo "Deleted all archives listed in ${inventory_id_file}"

## deleting_all_archives_from_aws_glacier_vault.md

      
    Raw
  

              deleting_all_archives_from_aws_glacier_vault.md
            
          
    Simple script to delete all archives from a given AWS Glacier Vault

I thought it would be good to share this with others facing the inconvenient process of deleting AWS Glacier Vaults - for years it kept me paying AWS for file archives I no longer needed! Now I have finally decided to eliminate this waste.
I wrote my own because I couldn't find a convenient one to use. This script needs only an account id, AWS region and vault name to do its job for you (after you've followed the pre-requisites).
I've borrowed a lot from a previous gist, mentioned in the acknowledgements (including a number of comments from others that used it) but decided to build something more end-to-end and remove any manual steps so I could start it and leave it alone. Hope that's useful to you too!
Pre-requisites


I've run this on Mac and Linux (AWS) - I ended up running it on an EC2 instance as it can be a very long running process.
Install jq https://stedolan.github.io/jq/download/
Install the AWS CLI v2 https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html
Create an IAM user that has full access to AWS Glacier resources (AmazonGlacierFullAccess policy)
Copy the IAM's Access Key ID and Secret Access Key for your cli configuration.
Run aws configure and provide your IAM's Access Key ID and Secret Access Key and default output format to JSON (region in this case is not so important as I've decided to have it as a parameter on the script itself).

Setting it up


Copy the attached file named: delete-aws-glacier-vault-archives.sh
Make it executable by running $ chmod 744 ./delete-aws-glacier-vault-archives.sh

Running it

This script takes three parameters: the AWS Account ID, the AWS Region of the Glacier Vault in question, and the name of the Vault you want to delete archives from.
Based on comments from the gist mentioned in my early acknowledgments, a good way to run it is to keep it running in background and make it log to a file so you can check on progress which can be achieved running the command below:
$ nohup ./delete-aws-glacier-vault-archives.sh AWS_ACCOUNT_ID AWS_REGION AWS_GLACIER_VAULT_NAME > delete_AWS_ACCOUNT_ID_AWS_REGION_AWS_GLACIER_VAULT_NAME.log 2>&1 &

Please note that you should replace AWS_ACCOUNT_ID, AWS_REGION and AWS_GLACIER_VAULT_NAME in both the command and the log file name, so that you can monitor each run and run multiple processes in parallel in case you need to delete multiple vaults.
Hope it works out for you as it did for me :)
Notes on runtime behaviour


It will try to avoid creating unnecessary inventory-retrieval jobs, so it creates a file to store the job information for a given vault.
It will create files for the job output (which contains the JSON returned from Glacier containing all archives on the vault).
It will create an input file containing only the archive ids extracted from the previous JSON file.
Logs will give you a sense of where you are and will contain each archive id deleted.

Notes on Glacier behaviour

Even after you delete all your archives, the AWS Glacier console will still show them as if nothing had happened, because the information there is only computed daily by AWS. You will only be able to delete the Vault in the console after AWS refreshes its information and shows it as an empty vault... Annoying, but there is nothing I can do about it.
Acknowledgements


Vincent Meijer gist at https://gist.github.com/veuncent/ac21ae8131f24d3971a621fac0d95be5
https://gist.github.com/johandebeurs for commenting on the previous version on differences between Mac and Linux regarding number of processing units for parallelism - I've incorporated the feedback.
https://gist.github.com/kellyatkinson for feedback on how to improve status messages (adding timestamps on pauses). Incorporated the feedback.