Skip to content

Instantly share code, notes, and snippets.

@hdoverobinson
Last active June 8, 2019 02:30
Show Gist options
  • Save hdoverobinson/ab5704ce6bacbace0d50fe77ca3ea211 to your computer and use it in GitHub Desktop.
Save hdoverobinson/ab5704ce6bacbace0d50fe77ca3ea211 to your computer and use it in GitHub Desktop.
Multithreaded tar.gz backups with multiple checksum verifications
#!/bin/bash
###AUTHOR###
#Harry Dove-Robinson 2019-06-07
#harry@doverobinson.me
#https://gist.github.com/hdoverobinson
#https://github.com/hdoverobinson
###NOTES###
#Packages required: cmp, pigz, md5sum (Mac users should also install findutils and gnu-tar via Brew)
#This script will do the following:
#-Make list of MD5 checksums of all files in the directory to be backed up
#-Perform a multithreaded tar.gz backup based on the list of files
#-Test the tar.gz archive by decompressing while making MD5 checksums of the files inside
#-Compare list of files and checksums inside and outside the archive to verify integrity of files
#-Verify integrity of the tar.gz itself with pigz --test
#-Make MD5 checksum of the entire archive
###USAGE###
#Provide two arguments: the directory to be backed up, and a destination directory to put the backup files
#e.g. ./backup.sh /root/mystuff/ /media/backup/mystuff/
#If the destination directory does not exist the script will attempt to create it
#If the archive files are already in the destination directory then the script will exit
#If the directory to be backed up is empty then the script will exit
#The FIND_CRITERIA variable can be edited to change the selection criteria of the files to be backed up
#e.g. FIND_CRITERIA="-newermt 2018-01-01 ! -newermt 2018-02-01" to back up only the files modified between 2018-01-01 and 2018-02-01
#The COMPRESSION_LEVEL variable can be edited to set the pigz compression level between -1 and -9 where -9 is the highest.
#optional criteria for FIND_CMD
#this string is appended to the find command line, which is later run through eval,
#so the inner single quotes stop the shell from glob-expanding the -name patterns
#default: leave out files named .DS_Store and files whose names start with ._
FIND_CRITERIA="! -name '.DS_Store' ! -name '._*'"
#compression level option passed to pigz
#expanded unquoted on the pigz command line, so it must be a single option such as -1 ... -9
COMPRESSION_LEVEL="-9"
#Prints a summary of the finished backup: duration, file count, input and output size, and space saved
#Globals read: SECONDS, SRC_FILES (line count produced by backup), INPUT, OUTPUT
#Globals written: TIME_RUNNING, NUMBER_OF_SOURCE_FILES, BYTES_SIZE_OF_INPUT, BYTES_SIZE_OF_OUTPUT,
#                 PERCENTAGE_SIZE_REDUCTION, MEGABYTES_SIZE_OF_INPUT, MEGABYTES_SIZE_OF_OUTPUT
#Outputs: the summary on stdout, in green when tput is usable
#Sizes are disk usage as reported by du, not apparent file sizes
backup_stats ()
{
TIME_RUNNING="$SECONDS"
#wc pads its count with spaces on some platforms; '%s' keeps the value from being used as a printf format
NUMBER_OF_SOURCE_FILES="$(printf '%s' "$SRC_FILES" | sed 's/ //g')"
#-k pins the unit to 1024 bytes (GNU du defaults to 1024-byte units, BSD du to 512-byte units)
#the grand total of du -c is always the last line, so take that instead of grepping for the word "total",
#which would also match any path that contains it
BYTES_SIZE_OF_INPUT="$(du -Lck "$INPUT" | tail -n 1 | awk '{printf "%.0f", $1*1024}')"
BYTES_SIZE_OF_OUTPUT="$(du -Lk "$OUTPUT.tar.gz" | awk '{printf "%.0f", $1*1024}')"
#awk does the arithmetic so no extra tool is needed; a zero-sized input reports 0 instead of dividing by zero
PERCENTAGE_SIZE_REDUCTION="$(awk -v in_bytes="$BYTES_SIZE_OF_INPUT" -v out_bytes="$BYTES_SIZE_OF_OUTPUT" 'BEGIN {if (in_bytes+0 > 0) printf "%.3f", 100-((out_bytes/in_bytes)*100); else printf "%.3f", 0}')"
MEGABYTES_SIZE_OF_INPUT="$(awk -v bytes="$BYTES_SIZE_OF_INPUT" 'BEGIN {printf "%.3f", bytes/1048576}')"
MEGABYTES_SIZE_OF_OUTPUT="$(awk -v bytes="$BYTES_SIZE_OF_OUTPUT" 'BEGIN {printf "%.3f", bytes/1048576}')"
echo "$(tput setaf 2 2> /dev/null) "
echo "Duration: ${TIME_RUNNING}s"
echo "Files in source: ${NUMBER_OF_SOURCE_FILES}"
echo "Input size: ${MEGABYTES_SIZE_OF_INPUT} MB"
echo "Output size: ${MEGABYTES_SIZE_OF_OUTPUT} MB"
echo "Percentage size reduction: ${PERCENTAGE_SIZE_REDUCTION}%$(tput sgr0 2> /dev/null)"
}
#Archives the files selected by FIND_CMD into $OUTPUT.tar.gz and verifies the result
#Steps: checksum every source file, tar+pigz the listed files, checksum every file inside the archive,
#compare both lists, test the gzip stream, then checksum the archive itself
#Globals read: INPUT, OUTPUT, FIND_CMD, TAR_CMD, COMPRESSION_LEVEL
#Globals written: SRC_FILES (read by backup_stats), BACK_FILES
#Files written: $OUTPUT.tar.gz, $OUTPUT.tar.gz.files-in-source-md5.txt,
#               $OUTPUT.tar.gz.files-in-archive-md5.txt, $OUTPUT.tar.gz-md5.txt
#Returns: 0 when the archive was written and verified, non-zero otherwise
backup ()
{
local SOURCE_LIST="$OUTPUT.tar.gz.files-in-source-md5.txt"
local ARCHIVE_LIST="$OUTPUT.tar.gz.files-in-archive-md5.txt"
local PARENT_DIR PARENT_PREFIX SOURCE_FILE SOURCE_SUM
#run by tar for every archived file: checksum stdin and pair the digest with the member name
#awk takes the name from the environment, so characters such as & | \ in a file name are never interpreted
local ARCHIVE_SUM_CMD='md5sum | awk "{ print \$1 \"  \" ENVIRON[\"TAR_FILENAME\"] }"'
echo "$(tput setaf 2 2> /dev/null)Archiving files in $INPUT to $OUTPUT.tar.gz...$(tput sgr0 2> /dev/null)" &&
#paths are stored relative to the parent of INPUT; resolve it once instead of once per file
PARENT_DIR="$(realpath "$INPUT/../")" &&
#dropping a trailing slash first keeps the prefix correct when the parent is /
PARENT_PREFIX="${PARENT_DIR%/}/" &&
#make sorted list of files to back up with their checksums
: > "$SOURCE_LIST" &&
eval "$FIND_CMD 2> /dev/null" | while IFS= read -r SOURCE_FILE
do
test -d "$SOURCE_FILE" && continue
#checksum from stdin so the list has the same "digest  name" layout as the archive-side list
SOURCE_SUM="$(md5sum < "$SOURCE_FILE" | cut -d ' ' -f 1)"
#a file that could not be read yields no digest and is left out of the list
if [[ -n "$SOURCE_SUM" ]]
then
#strip the parent directory as a literal prefix rather than as a sed pattern
printf '%s  %s\n' "$SOURCE_SUM" "${SOURCE_FILE#"$PARENT_PREFIX"}" >> "$SOURCE_LIST"
fi
done &&
sort -s -k 2,2 "$SOURCE_LIST" -o "$SOURCE_LIST" &&
#multithreaded tar.gz backup of files from list of files
#COMPRESSION_LEVEL is deliberately unquoted so that an empty value adds no argument
"$TAR_CMD" -cf - -C "$PARENT_DIR" --files-from=<(cut -d ' ' -f 3- "$SOURCE_LIST") | pigz --stdout $COMPRESSION_LEVEL > "$OUTPUT.tar.gz" &&
#make sorted list of md5 checksums of files inside tar.gz
"$TAR_CMD" -xzf "$OUTPUT.tar.gz" --to-command="sh -c '$ARCHIVE_SUM_CMD'" > "$ARCHIVE_LIST" &&
sort -s -k 2,2 "$ARCHIVE_LIST" -o "$ARCHIVE_LIST" &&
#check whether all of the expected output files exist
if [[ -f "$OUTPUT.tar.gz" ]] && [[ -f "$SOURCE_LIST" ]] && [[ -f "$ARCHIVE_LIST" ]] &&
#check whether the number of files in the source is the same as the number of files in the archive
SRC_FILES="$(wc -l < "$SOURCE_LIST")" &&
BACK_FILES="$(wc -l < "$ARCHIVE_LIST")" &&
[[ "$SRC_FILES" -gt 0 ]] && [[ "$BACK_FILES" -gt 0 ]] && [[ "$SRC_FILES" -eq "$BACK_FILES" ]] &&
#check whether the output tar.gz is a file with size greater than 0
#-s looks at the byte size; a du block count can be 0 for small files on some filesystems
[[ -s "$OUTPUT.tar.gz" ]] &&
#check whether the sorted list of input files and the sorted list of files in the archive are the same
cmp -s "$SOURCE_LIST" "$ARCHIVE_LIST" &&
#check whether the archive itself is valid
pigz --test "$OUTPUT.tar.gz"
then
#if above tests pass, make md5 checksum of entire tar.gz archive and exit
#the digest is paired with the bare file name so the checksum file does not depend on the destination path
printf '%s  %s\n' "$(md5sum < "$OUTPUT.tar.gz" | cut -d ' ' -f 1)" "$(basename "$OUTPUT.tar.gz")" > "$OUTPUT.tar.gz-md5.txt" &&
echo "$(tput setaf 2 2> /dev/null)Archive $OUTPUT.tar.gz complete!$(tput sgr0 2> /dev/null)" &&
backup_stats &&
return 0
else
echo "$(tput setaf 1 2> /dev/null)Archive failed: $OUTPUT.tar.gz.$(tput sgr0 2> /dev/null)" &&
return 1
fi
}
###RUN###
#Validates the arguments and the environment, then runs backup
#The steps are joined with && so that any unexpected failure falls through to the final error message
#exactly two non-empty arguments are accepted
if [[ -z "$2" ]] || [[ -n "$3" ]]
then
echo "$(tput setaf 1 2> /dev/null)Please provide all 2 arguments: $(basename "$0") INPUT_DIR OUTPUT_DIR
e.g. ./$(basename "$0") /path/to/directory/to/be/backed/up/ /path/to/directory/to/put/backup/files/$(tput sgr0 2> /dev/null)" &&
exit 1
fi &&
#macOS needs the GNU tools, which are installed with a g prefix
if [[ "$(uname -s)" == "Darwin" ]]
then
XARGS_CMD="gxargs"
TAR_CMD="gtar"
else
XARGS_CMD="xargs"
TAR_CMD="tar"
fi &&
#NOTE(review): XARGS_CMD is required here but is not used anywhere in the visible script - confirm whether the check is still needed
if ! command -v cmp > /dev/null 2>&1 || ! command -v pigz > /dev/null 2>&1 || ! command -v md5sum > /dev/null 2>&1 || ! command -v "$XARGS_CMD" > /dev/null 2>&1 || ! command -v "$TAR_CMD" > /dev/null 2>&1
then
echo "$(tput setaf 1 2> /dev/null)This script requires $XARGS_CMD, $TAR_CMD, cmp, pigz, and md5sum!$(tput sgr0 2> /dev/null)" &&
exit 1
fi &&
#-m resolves the paths even when they do not exist yet
INPUT="$(realpath -m "$1" 2> /dev/null)" &&
OUTDIR="$(realpath -m "$2" 2> /dev/null)" &&
#OUTDIR is already canonical, so it can be used as is
OUTPUT="$OUTDIR/$(basename "$INPUT")" &&
#FIND_CMD is run through eval; %q quotes the path so that characters such as " $ and backticks stay literal
FIND_CMD="find $(printf '%q' "$INPUT") -type f $FIND_CRITERIA" &&
if ! [[ -d "$INPUT" ]]
then
echo "$(tput setaf 1 2> /dev/null)Cannot find input directory $INPUT!$(tput sgr0 2> /dev/null)" &&
exit 1
fi &&
#stop at the first non-empty file that matches; one hit is enough to know there is something to back up
if ! eval "$FIND_CMD ! -empty -print -quit 2> /dev/null" | grep -q '.'
then
echo "$(tput setaf 1 2> /dev/null)Cannot find files to back up in $INPUT!$(tput sgr0 2> /dev/null)" &&
exit 1
fi &&
#never overwrite the results of an earlier run
if [[ -f "$OUTPUT.tar.gz" ]] || [[ -f "$OUTPUT.tar.gz.files-in-source-md5.txt" ]] || [[ -f "$OUTPUT.tar.gz.files-in-archive-md5.txt" ]]
then
echo "$(tput setaf 1 2> /dev/null)Output files already exist in $OUTDIR!$(tput sgr0 2> /dev/null)" &&
exit 1
fi &&
if ! mkdir -p "$OUTDIR"
then
echo "$(tput setaf 1 2> /dev/null)Cannot create output directory $OUTDIR!$(tput sgr0 2> /dev/null)" &&
exit 1
fi &&
#braces instead of a subshell so that exit 1 ends the script itself
backup && exit 0 || { echo "$(tput setaf 1 2> /dev/null)Something went wrong!$(tput sgr0 2> /dev/null)"; exit 1; }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment