Last active
June 8, 2019 02:30
-
-
Save hdoverobinson/ab5704ce6bacbace0d50fe77ca3ea211 to your computer and use it in GitHub Desktop.
Multithreaded tar.gz backups with multiple checksum verifications
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
###AUTHOR###
#Harry Dove-Robinson 2019-06-07
#harry@doverobinson.me
#https://gist.github.com/hdoverobinson
#https://github.com/hdoverobinson
###NOTES###
#Packages required: cmp, pigz, md5sum (Mac users should also install findutils and gnu-tar via Brew)
#This script will do the following:
#-Make list of MD5 checksums of all files in the directory to be backed up
#-Perform a multithreaded tar.gz backup based on the list of files
#-Test the tar.gz archive by decompressing while making MD5 checksums of the files inside
#-Compare list of files and checksums inside and outside the archive to verify integrity of files
#-Verify integrity of the tar.gz itself with pigz --test
#-Make MD5 checksum of the entire archive
###USAGE###
#Provide two arguments: the directory to be backed up, and a destination directory to put the backup files
#e.g. ./backup.sh /root/mystuff/ /media/backup/mystuff/
#If the destination directory does not exist the script will attempt to create it
#If the archive files are already in the destination directory then the script will exit
#If the directory to be backed up is empty then the script will exit
#The FIND_CRITERIA variable can be edited to change the selection criteria of the files to be backed up
#e.g. FIND_CRITERIA="-newermt 2018-01-01 ! -newermt 2018-02-01" to back up only the files modified between 2018-01-01 and 2018-02-01
#The COMPRESSION_LEVEL variable can be edited to set the pigz compression level between -1 and -9 where -9 is the highest.
#optional extra find tests appended to FIND_CMD
#the string is spliced into a command line that is run through eval, so the inner single quotes must stay
FIND_CRITERIA="! -name '.DS_Store' ! -name '._*'"
#compression level option passed to pigz (-1 to -9, where -9 is the highest)
COMPRESSION_LEVEL="-9"
#Print a summary of a finished backup: duration, file count, input/output sizes and size reduction
#Globals read: SECONDS, SRC_FILES, INPUT, OUTPUT
#Globals written: TIME_RUNNING, NUMBER_OF_SOURCE_FILES, BYTES_SIZE_OF_INPUT, BYTES_SIZE_OF_OUTPUT,
#                 PERCENTAGE_SIZE_REDUCTION, MEGABYTES_SIZE_OF_INPUT, MEGABYTES_SIZE_OF_OUTPUT
#Outputs: the summary on stdout
backup_stats ()
{
  TIME_RUNNING="$SECONDS"
  #the count may be padded with spaces; strip them without ever using the value as a printf format
  NUMBER_OF_SOURCE_FILES="${SRC_FILES// /}"
  #du -k always reports 1024-byte units (the default unit differs between du implementations)
  #du -s prints one summary line, so no grep for 'total' is needed (it also matched paths containing "total")
  BYTES_SIZE_OF_INPUT="$(du -Lsk "$INPUT" | awk 'NR == 1 {printf "%.0f", $1 * 1024}')"
  BYTES_SIZE_OF_OUTPUT="$(du -Lk "$OUTPUT.tar.gz" | awk 'NR == 1 {printf "%.0f", $1 * 1024}')"
  #awk does the floating point math; an input size of zero yields 0.000 instead of a division by zero
  PERCENTAGE_SIZE_REDUCTION="$(awk -v i="$BYTES_SIZE_OF_INPUT" -v o="$BYTES_SIZE_OF_OUTPUT" \
    'BEGIN { if (i + 0 > 0) printf "%.3f", 100 - (o / i) * 100; else printf "0.000" }')"
  MEGABYTES_SIZE_OF_INPUT="$(awk -v b="$BYTES_SIZE_OF_INPUT" 'BEGIN { printf "%.3f", b / (1024 * 1024) }')"
  MEGABYTES_SIZE_OF_OUTPUT="$(awk -v b="$BYTES_SIZE_OF_OUTPUT" 'BEGIN { printf "%.3f", b / (1024 * 1024) }')"
  echo "$(tput setaf 2 2> /dev/null) "
  echo "Duration: ${TIME_RUNNING}s"
  echo "Files in source: ${NUMBER_OF_SOURCE_FILES}"
  echo "Input size: ${MEGABYTES_SIZE_OF_INPUT} MB"
  echo "Output size: ${MEGABYTES_SIZE_OF_OUTPUT} MB"
  echo "Percentage size reduction: ${PERCENTAGE_SIZE_REDUCTION}%$(tput sgr0 2> /dev/null)"
}
#Write one "<md5>  <path relative to directory $2>" line per file selected by FIND_CMD into list file $1, sorted by path
#Globals read: FIND_CMD
#Returns: non-zero when the list cannot be written or a file cannot be checksummed,
#         so an unreadable file is never silently left out of the backup
_backup_list_source_files ()
{
  local list_file="$1"
  local parent_dir="$2"
  #"${parent_dir%/}/" is "/" when the parent is the root directory and "<parent>/" otherwise
  local parent_prefix="${parent_dir%/}/"
  local source_file relative_path checksum unreadable=0
  #start from an empty list
  : > "$list_file" || return 1
  #FIND_CMD is a command string, hence the eval; IFS= keeps leading/trailing blanks of file names
  while IFS= read -r source_file
  do
    test -d "$source_file" && continue
    if checksum="$(md5sum < "$source_file")"
    then
      #strip the parent directory literally (no regex) so the path matches the member name tar will store
      relative_path="${source_file#"$parent_prefix"}"
      printf '%s  %s\n' "${checksum%% *}" "$relative_path" >> "$list_file" || unreadable=1
    else
      unreadable=1
    fi
  done < <(eval "$FIND_CMD 2> /dev/null")
  [[ "$unreadable" -eq 0 ]] && sort -s -k 2,2 "$list_file" -o "$list_file"
}

#Stream the files named in list file $1 (paths relative to directory $2) through tar and pigz into archive $3
#Globals read: TAR_CMD, COMPRESSION_LEVEL
_backup_write_archive ()
{
  local list_file="$1"
  local parent_dir="$2"
  local archive="$3"
  #the pipeline reports pigz's status only; files tar could not add show up later as a list mismatch
  #COMPRESSION_LEVEL stays unquoted so that an empty value adds no argument
  "$TAR_CMD" -cf - -C "$parent_dir" --files-from=<(cut -d ' ' -f 3- "$list_file") | pigz --stdout $COMPRESSION_LEVEL > "$archive"
}

#Decompress archive $1 and write one "<md5>  <member name>" line per member into list file $2, sorted by name
#Globals read: TAR_CMD
_backup_list_archive_files ()
{
  local archive="$1"
  local list_file="$2"
  #each member is piped to md5sum; sed swaps md5sum's "-" placeholder for the member name in TAR_FILENAME
  #NOTE(review): names containing |, & or \ are presumably mangled by this sed and would fail verification - confirm
  "$TAR_CMD" -xzf "$archive" --to-command='sh -c "md5sum | sed \"s|-|\$TAR_FILENAME|\""' > "$list_file" &&
  sort -s -k 2,2 "$list_file" -o "$list_file"
}

#Verify archive $1 against source list $2 and archive list $3
#Globals written: SRC_FILES, BACK_FILES (line counts of the two lists)
#Returns: 0 only if all files exist, both lists are non-empty and identical, the archive is non-empty and pigz --test passes
_backup_verify ()
{
  local archive="$1"
  local source_list="$2"
  local archive_list="$3"
  #check whether all of the expected output files exist
  [[ -f "$archive" && -f "$source_list" && -f "$archive_list" ]] || return 1
  #check whether the number of files in the source is the same as the number of files in the archive
  SRC_FILES="$(wc -l < "$source_list")" || return 1
  BACK_FILES="$(wc -l < "$archive_list")" || return 1
  [[ "$SRC_FILES" -gt 0 && "$BACK_FILES" -gt 0 && "$SRC_FILES" -eq "$BACK_FILES" ]] || return 1
  #check whether the output tar.gz has a size greater than 0
  [[ -s "$archive" ]] || return 1
  #check whether the sorted list of input files and the sorted list of files in the archive are the same
  cmp -s "$source_list" "$archive_list" || return 1
  #check whether the archive itself is valid
  pigz --test "$archive"
}

#Archive the files selected by FIND_CMD into $OUTPUT.tar.gz and verify the result
#Globals read: INPUT, OUTPUT, FIND_CMD, TAR_CMD, COMPRESSION_LEVEL
#Globals written: SRC_FILES, BACK_FILES
#Outputs: progress and result messages on stdout; writes the archive, both per-file checksum lists
#         and the archive checksum next to $OUTPUT
#Returns: 0 when the archive was written and every check passed, non-zero otherwise
backup ()
{
  local archive="$OUTPUT.tar.gz"
  local source_list="$archive.files-in-source-md5.txt"
  local archive_list="$archive.files-in-archive-md5.txt"
  local parent_dir archive_checksum
  echo "$(tput setaf 2 2> /dev/null)Archiving files in $INPUT to $OUTPUT.tar.gz...$(tput sgr0 2> /dev/null)"
  if parent_dir="$(realpath "$INPUT/../")" &&
    _backup_list_source_files "$source_list" "$parent_dir" &&
    _backup_write_archive "$source_list" "$parent_dir" "$archive" &&
    _backup_list_archive_files "$archive" "$archive_list" &&
    _backup_verify "$archive" "$source_list" "$archive_list" &&
    #if above tests pass, make md5 checksum of entire tar.gz archive, recorded under its bare file name
    archive_checksum="$(md5sum < "$archive")" &&
    printf '%s  %s\n' "${archive_checksum%% *}" "${archive##*/}" > "$archive-md5.txt"
  then
    echo "$(tput setaf 2 2> /dev/null)Archive $OUTPUT.tar.gz complete!$(tput sgr0 2> /dev/null)" &&
    backup_stats &&
    return 0
  else
    echo "$(tput setaf 1 2> /dev/null)Archive failed: $OUTPUT.tar.gz.$(tput sgr0 2> /dev/null)"
    return 1
  fi
}
###RUN###
#Validate the two arguments and the required tools, derive INPUT, OUTDIR, OUTPUT and FIND_CMD, then run backup
#Exits 0 when backup succeeds and 1 on any failure

#print $1 in red on stdout and exit with status 1
abort_run ()
{
  echo "$(tput setaf 1 2> /dev/null)$1$(tput sgr0 2> /dev/null)"
  exit 1
}

#exactly two non-empty arguments are required
if [[ "$#" -ne 2 || -z "$1" || -z "$2" ]]
then
  abort_run "Please provide all 2 arguments: $(basename "$0") INPUT_DIR OUTPUT_DIR
e.g. ./$(basename "$0") /path/to/directory/to/be/backed/up/ /path/to/directory/to/put/backup/files/"
fi

#on macOS the g-prefixed tool names are used for xargs and tar
case "$(uname -s)" in
  Darwin)
    XARGS_CMD="gxargs"
    TAR_CMD="gtar"
    ;;
  *)
    XARGS_CMD="xargs"
    TAR_CMD="tar"
    ;;
esac

for required_cmd in cmp pigz md5sum "$XARGS_CMD" "$TAR_CMD"
do
  command -v "$required_cmd" > /dev/null 2>&1 ||
    abort_run "This script requires $XARGS_CMD, $TAR_CMD, cmp, pigz, and md5sum!"
done

#canonical absolute paths; -m accepts path components that do not exist yet
INPUT="$(realpath -m "$1" 2> /dev/null)" || abort_run "Something went wrong!"
OUTDIR="$(realpath -m "$2" 2> /dev/null)" || abort_run "Something went wrong!"
OUTPUT="${OUTDIR%/}/$(basename "$INPUT")"
#FIND_CMD is later run through eval, so the directory is shell-quoted with %q;
#inside plain double quotes a $, backtick or " in its name would be expanded or executed
FIND_CMD="find $(printf '%q' "$INPUT") -type f $FIND_CRITERIA"

[[ -d "$INPUT" ]] || abort_run "Cannot find input directory $INPUT!"

#stop at the first non-empty file that matches; grep only tests whether anything was printed
if ! eval "$FIND_CMD ! -empty -print -quit 2> /dev/null" | grep -q '.'
then
  abort_run "Cannot find files to back up in $INPUT!"
fi

#refuse to overwrite anything a previous run left behind, including the archive checksum file
for existing_output in "$OUTPUT.tar.gz" "$OUTPUT.tar.gz.files-in-source-md5.txt" "$OUTPUT.tar.gz.files-in-archive-md5.txt" "$OUTPUT.tar.gz-md5.txt"
do
  if [[ -f "$existing_output" ]]
  then
    abort_run "Output files already exist in $OUTDIR!"
  fi
done

mkdir -p "$OUTDIR" || abort_run "Cannot create output directory $OUTDIR!"

if backup
then
  exit 0
else
  abort_run "Something went wrong!"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment