Last active
May 24, 2023 10:19
-
-
Save jazzl0ver/c6859e1615a0f97b8704052db0745e25 to your computer and use it in GitHub Desktop.
Firecamp Cassandra backup script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# | |
# Firecamp Cassandra backup script | |
# Example: | |
# ./fc_cass_backup.sh -r us-east-1 -c firecamp-qa -s cass-qa -k '14 days' -p | |
# where: | |
# -r - region | |
# -c - firecamp cluster name | |
# -s - firecamp service name | |
# -k - retention period after which a backup may be purged, in "date -d" format (e.g. '14 days')
# -p - purge the expired backups | |
# -d - destination region to copy snapshots to | |
# -g - retention period after which a backup copy in the destination region may be purged, in "date -d" format
# -u - purge the expired backups in the destination region | |
# | |
# Script should be executed on a docker-enabled instance within the same VPC as Firecamp cluster and in AppSecurityGroup | |
# Dependencies: awscli, docker, firecamp-service-cli, jq
# | |
# This script takes snapshots of the Cassandra service volumes (primary and journal) and tags them accordingly
# Use fc_cass_restore.sh to restore from the backup | |
# | |
# Following policy should be assigned to the instance (or a user) where the script is executed: | |
# { | |
# "Version": "2012-10-17", | |
# "Statement": [ | |
# { | |
# "Sid": "VisualEditor0", | |
# "Effect": "Allow", | |
# "Action": [ | |
# "ec2:DescribeVolumes", | |
# "ec2:CreateSnapshot", | |
# "ec2:CopySnapshot", | |
# "ec2:DeleteSnapshot", | |
# "ec2:DescribeSnapshots", | |
# "ec2:CreateTags", | |
# "sts:GetCallerIdentity" | |
# ], | |
# "Resource": "*" | |
# } | |
# ] | |
# } | |
# | |
# | |
# Modify FCCLI var to the actual paths
# (location of the firecamp-service-cli binary this script invokes).
FCCLI=~ec2-user/firecamp/1.6/firecamp-service-cli

#-- Do not modify below

# Abort early when the CLI is missing or not executable. The diagnostic goes
# to stderr and the exit status is non-zero so that cron jobs and wrapper
# scripts can detect the failure (a bare `exit` here would report success).
if [[ ! -x "$FCCLI" ]]; then
  echo "Download $(basename "$FCCLI") tool into $(dirname "$FCCLI") before using this script" >&2
  exit 1
fi
# Defaults for the boolean flags (-p / -u).
purge_snapshots=false
dest_purge_snapshots=false

#######################################
# Parse the command-line options into global variables.
# Globals written:
#   region (-r), cluster (-c), servicename (-s),
#   purge_after_input (-k), purge_snapshots (-p),
#   dest_region (-d), dest_purge_after_input (-g), dest_purge_snapshots (-u)
# Arguments: the script's positional parameters ("$@")
# Exits:     64 on an unknown option or an option missing its argument
#######################################
parse_args() {
  # OPTIND is made local so the function can be called more than once.
  local opt OPTIND=1

  # The leading ':' selects getopts' silent mode; both "unknown option" and
  # "missing argument" then fall through to the catch-all branch below.
  while getopts ':s:c:r:k:d:g:pu' opt; do
    case "$opt" in
      r) region="$OPTARG" ;;
      c) cluster="$OPTARG" ;;
      s) servicename="$OPTARG" ;;
      k) purge_after_input="$OPTARG" ;;
      p) purge_snapshots=true ;;
      d) dest_region="$OPTARG" ;;
      g) dest_purge_after_input="$OPTARG" ;;
      u) dest_purge_snapshots=true ;;
      *)
        echo "=== Error with Options Input. Cause of failure is most likely that an unsupported parameter was passed or a parameter was passed without a corresponding option." 1>&2
        exit 64
        ;;
    esac
  done
}

parse_args "$@"
# Validate the parsed options before any work is done. Each failure prints its
# diagnostic on stderr and exits with 64, the same status the option parser
# uses for bad input (a bare `exit` here would report success).

# Region, cluster name and service name are all mandatory.
if [[ -z "$region" || -z "$cluster" || -z "$servicename" ]]; then
  echo "region, cluster name or service name is not specified, exiting..." >&2
  exit 64
fi

# Purging in the source region requires a retention period.
if [[ "$purge_snapshots" == "true" && -z "$purge_after_input" ]]; then
  echo "retention period is not specified, exiting..." >&2
  exit 64
fi

# Purging in the destination region requires its own retention period.
if [[ "$dest_purge_snapshots" == "true" && -z "$dest_purge_after_input" ]]; then
  echo "retention period for the destination region is not specified, exiting..." >&2
  exit 64
fi
# Values recorded in the snapshot tags: creation day (UTC) and the host that
# initiated the backup.
today=$(date -u +%Y-%m-%d)
hostname=$(hostname)

# to_epoch SPEC
# Prints SPEC (any string accepted by `date -d`) as UTC epoch seconds.
# Returns non-zero, printing nothing, when `date` cannot parse SPEC.
to_epoch() {
  date -u -d "$1" +%s 2>/dev/null
}

# Expiry of the source-region snapshots. The epoch is resolved first and the
# calendar day is derived from it, so the two tag values always describe the
# same instant. An unparsable period aborts the run instead of producing
# snapshots with empty expiry tags.
purge_after_fe=$(to_epoch "$purge_after_input") || {
  echo "invalid retention period '$purge_after_input', exiting..." >&2
  exit 64
}
purge_after=$(date -u -d "@$purge_after_fe" +%Y-%m-%d)

# Expiry of the copies kept in the destination region.
dest_purge_after_fe=$(to_epoch "$dest_purge_after_input") || {
  echo "invalid retention period '$dest_purge_after_input' for the destination region, exiting..." >&2
  exit 64
}
dest_purge_after=$(date -u -d "@$dest_purge_after_fe" +%Y-%m-%d)
# Fetch the JMX user name and password of the Cassandra service from the
# `get-service` output of the Firecamp CLI. They are used later to run
# `nodetool flush` against every member.
echo "Retriving credentials..."

# grep -o emits one KEY=value match per line; the value pattern contains no '='
# so splitting on the first '=' yields exactly the key and the value.
while IFS='=' read -r key val; do
  case "$key" in
    JMX_REMOTE_USER) jmxuser=$val ;;
    JMX_REMOTE_PASSWD) jmxpass=$val ;;
  esac
done < <(
  "$FCCLI" -region="$region" -cluster="$cluster" -op=get-service \
    -service-type=cassandra -service-name="$servicename" \
    | grep -oE "(JMX_REMOTE_USER|JMX_REMOTE_PASSWD)=[a-z0-9-]+"
)

# Without credentials the later nodetool call cannot authenticate; make that
# visible instead of failing silently further down.
if [[ -z "$jmxuser" || -z "$jmxpass" ]]; then
  echo "WARNING: JMX credentials for service $servicename were not found" >&2
fi
echo
#######################################
# Wait until an EBS snapshot leaves the pending state.
# Polls `aws ec2 describe-snapshots` once a minute in the region given by the
# caller (never a hard-coded one).
# Arguments: $1 - region the snapshot lives in
#            $2 - snapshot id
# Returns:   0 when the state is "completed", 1 when it is "error" (so a
#            failed snapshot does not keep the poller running forever)
#######################################
wait_for_snapshot() {
  local snap_region=$1 snap_id=$2 state
  while true; do
    state=$(aws --region "$snap_region" ec2 describe-snapshots --snapshot-ids "$snap_id" \
      --query 'Snapshots[].State' --output text)
    case "$state" in
      completed) return 0 ;;
      error) return 1 ;;
    esac
    sleep 60
  done
}

#######################################
# Print the --tag-specifications value for a snapshot of the current member.
# Globals read: uuid, today, hostname, member, instanceid, az
# Arguments: $1 - volume name (Name tag)
#            $2 - volume role: Primary or Journal
#            $3 - PurgeAfter value (YYYY-MM-DD)
#            $4 - PurgeAfterFE value (epoch seconds)
#            $5 - PurgeAllow value (true/false)
#            $6 - optional extra tags, each written as ",{Key=..,Value=..}"
#######################################
build_tag_spec() {
  local vol_name=$1 destiny=$2 after=$3 after_fe=$4 allow=$5 extra=${6:-}
  local tags
  tags="{Key=Name,Value=$vol_name}"
  tags+=",{Key=VolumeDestiny,Value=$destiny}"
  tags+=",{Key=ServiceUUID,Value=$uuid}"
  tags+=",{Key=Created,Value=$today}"
  tags+=",{Key=InitiatingHost,Value=$hostname}"
  tags+=",{Key=MemberName,Value=$member}"
  tags+=",{Key=Instance,Value=$instanceid}"
  tags+=",{Key=AvailableZone,Value=$az}"
  tags+="$extra"
  tags+=",{Key=PurgeAfter,Value=$after}"
  tags+=",{Key=PurgeAfterFE,Value=$after_fe}"
  tags+=",{Key=PurgeAllow,Value=$allow}"
  printf 'ResourceType=snapshot,Tags=[%s]' "$tags"
}

#######################################
# Snapshot one volume of the current member and, when a destination region is
# configured, copy the finished snapshot there.
# The snapshot request is synchronous; waiting for completion and the optional
# copy run in a background subshell, so the caller continues immediately.
# Globals read: region, dest_region, purge_after, purge_after_fe,
#               purge_snapshots, dest_purge_after, dest_purge_after_fe,
#               dest_purge_snapshots (plus those read by build_tag_spec)
# Arguments: $1 - volume id
#            $2 - volume name
#            $3 - volume role: Primary or Journal
# Exits:     1 (whole script) when the snapshot request itself fails
#######################################
backup_volume() {
  local vol_id=$1 vol_name=$2 destiny=$3
  local snap_id tag_spec

  echo "$(date): Taking $vol_name ($vol_id) snapshot..."
  tag_spec=$(build_tag_spec "$vol_name" "$destiny" \
    "$purge_after" "$purge_after_fe" "$purge_snapshots")
  snap_id=$(aws --region "$region" ec2 create-snapshot --volume-id "$vol_id" \
    --description 'Created by Firecamp Cassandra Backup Script' \
    --tag-specifications "$tag_spec" --query 'SnapshotId' --output text) \
    || {
      echo "$(date): Failed to CreateSnapshot for $vol_id, exiting..." >&2
      exit 1
    }

  # The subshell gets its own copy of the member variables, so the caller may
  # reset them for the next member while this job is still running.
  (
    if ! wait_for_snapshot "$region" "$snap_id"; then
      echo "$(date): $vol_name ($vol_id) snapshot $snap_id failed" >&2
      exit 1
    fi
    echo "$(date): $vol_name ($vol_id) snapshot completed - $snap_id"

    [[ -n "$dest_region" ]] || exit 0

    echo "$(date): Copying snapshot $snap_id to $dest_region region..."
    copy_tag_spec=$(build_tag_spec "$vol_name" "$destiny" \
      "$dest_purge_after" "$dest_purge_after_fe" "$dest_purge_snapshots" \
      ",{Key=SourceRegion,Value=$region},{Key=SourceSnapshotId,Value=$snap_id}")
    copy_id=$(aws --region "$dest_region" ec2 copy-snapshot \
      --description 'Created by Firecamp Cassandra Backup Script' \
      --source-region "$region" --source-snapshot-id "$snap_id" \
      --tag-specifications "$copy_tag_spec" --query 'SnapshotId' --output text) \
      || {
        echo "$(date): Failed to CopySnapshot $snap_id for $vol_name ($vol_id), exiting..." >&2
        exit 1
      }

    if ! wait_for_snapshot "$dest_region" "$copy_id"; then
      echo "$(date): copy $copy_id of snapshot $snap_id in $dest_region region failed" >&2
      exit 1
    fi
    echo "$(date): snapshot $snap_id copying for $vol_name ($vol_id) to $dest_region region completed - $copy_id"
  ) &
}

# Walk the `list-members` output of the Firecamp CLI. grep -o emits one
# "Key:value" pair per line; once all six attributes of a member have been
# collected the member is flushed and both of its volumes are snapshotted.
# The list is read on fd 3 so commands inside the loop cannot consume it.
while IFS=':' read -r -u 3 key val; do
  case "$key" in
    ServiceUUID) uuid=$val ;;
    MemberName) member=$val ;;
    AvailableZone) az=$val ;;
    ServerInstanceID) instanceid=$val ;;
    PrimaryVolumeID) pvolid=$val ;;
    JournalVolumeID) jvolid=$val ;;
  esac

  if [[ -n "$uuid" && -n "$member" && -n "$az" && -n "$instanceid" && -n "$pvolid" && -n "$jvolid" ]]; then
    # The volume Name tags are reused as the snapshot Name tags.
    pvolName=$(aws --region "$region" ec2 describe-volumes --volume-ids "$pvolid" \
      --query 'Volumes[].Tags[?Key==`Name`].Value' --output text)
    jvolName=$(aws --region "$region" ec2 describe-volumes --volume-ids "$jvolid" \
      --query 'Volumes[].Tags[?Key==`Name`].Value' --output text)
    if [[ -z "$pvolName" || -z "$jvolName" ]]; then
      echo "Can't get volume name for $pvolid or $jvolid" >&2
      exit 1
    fi

    # Flush the node's in-memory data to disk before the volumes are snapshotted.
    echo "$(date): Flushing node $member..."
    /bin/docker run --rm harisekhon/cassandra-dev nodetool \
      -h "${member}.${cluster}-firecamp.com" -u "$jmxuser" -pw "$jmxpass" flush \
      || echo "$(date): WARNING: nodetool flush failed for $member" >&2

    backup_volume "$pvolid" "$pvolName" Primary
    backup_volume "$jvolid" "$jvolName" Journal

    # Start collecting the next member from a clean slate.
    uuid=""; member=""; az=""; instanceid=""; pvolid=""; jvolid=""
    echo
  fi
done 3< <(
  "$FCCLI" -region="$region" -cluster="$cluster" -op=list-members \
    -service-type=cassandra -service-name="$servicename" \
    | grep -oE "(ServiceUUID|MemberName|AvailableZone|ServerInstanceID|PrimaryVolumeID|JournalVolumeID):[a-z0-9-]+"
)
#######################################
# Delete the expired, purge-enabled snapshots of one region.
# A snapshot qualifies when it carries a ServiceUUID tag, is tagged
# PurgeAllow=true, and its PurgeAfterFE tag (epoch seconds) is numerically
# below the cutoff. Tags that are not numbers never qualify.
# Globals written: purged_total (incremented per delete request sent)
# Arguments: $1 - region to purge
#            $2 - owning AWS account id
#            $3 - cutoff, epoch seconds
#######################################
purge_expired_snapshots() {
  local target_region=$1 account=$2 cutoff=$3
  local -a expired=()
  local snap

  mapfile -t expired < <(
    aws --region "$target_region" ec2 describe-snapshots --owner-ids "$account" \
      --filters Name=tag-key,Values=ServiceUUID \
                Name=tag:PurgeAllow,Values=true \
      | jq -r --arg cutoff "$cutoff" '
          .Snapshots[]
          | select(any(.Tags[]?;
                       .Key == "PurgeAfterFE"
                       and ((.Value | tonumber?) < ($cutoff | tonumber))))
          | .SnapshotId'
  )

  for snap in "${expired[@]}"; do
    [[ -n "$snap" ]] || continue
    echo -e "\t$snap"
    aws --region "$target_region" ec2 delete-snapshot --snapshot-id "$snap" > /dev/null
    purged_total=$((purged_total + 1))
  done
}

#######################################
# Purge expired snapshots according to the command-line flags:
#   -p (purge_snapshots)       -> source region
#   -u (dest_purge_snapshots)  -> destination region, when -d was given
# Snapshots are purged once their expiry lies more than one day in the past.
# Globals read:    purge_snapshots, dest_purge_snapshots, region, dest_region
# Globals written: purged_total
# Returns: 1 when the AWS account id cannot be determined, 0 otherwise
#######################################
run_purge() {
  local purge_source=false purge_dest=false
  local accountid cutoff

  if [[ "$purge_snapshots" == "true" ]]; then
    purge_source=true
  fi
  if [[ "$dest_purge_snapshots" == "true" && -n "$dest_region" ]]; then
    purge_dest=true
  fi
  if [[ "$purge_source" != "true" && "$purge_dest" != "true" ]]; then
    return 0
  fi

  echo "$(date): Purging expired snapshots..."
  accountid=$(aws sts get-caller-identity --output text --query 'Account') || {
    echo "$(date): Failed to determine the AWS account id, skipping purge" >&2
    return 1
  }
  cutoff=$(( $(date -u +%s) - 86400 ))
  purged_total=0

  if [[ "$purge_source" == "true" ]]; then
    purge_expired_snapshots "$region" "$accountid" "$cutoff"
  fi
  if [[ "$purge_dest" == "true" ]]; then
    purge_expired_snapshots "$dest_region" "$accountid" "$cutoff"
  fi

  # Report once, covering both regions.
  if (( purged_total > 0 )); then
    echo "$(date): Commands to delete expired snapshots have been sent"
  fi
  return 0
}

# Snapshot creation and copying continue in background jobs; only the requests
# have been issued at this point.
echo "Commands to snapshot Cassandra volumes have been sent"
echo
run_purge
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment