Last active
October 4, 2023 00:13
-
-
Save ctberthiaume/fa8cc545e590d726a59157910b335efa to your computer and use it in GitHub Desktop.
Script to calculate MD5 checksums for file paths read on stdin, optionally handling .gz files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Script to calculate MD5 checksums for file paths read on stdin, optionally
# handling .gz files
#
# Dependencies:
# Just basic Linux/BSD programs: bash, xargs, openssl, gzip.
#
# The skeleton for the option parsing section of this script was pulled from
# https://mywiki.wooledge.org/BashFAQ/035.
# Print an error message to stderr and terminate the script.
# Arguments: $1 - message to print
# Outputs:   the message followed by a newline, on stderr
# Returns:   never returns; exits the shell with status 1
die() {
  printf '%s\n' "$1" >&2
  exit 1
}
# Option defaults. Assigning every option variable up front ensures values
# inherited from the environment cannot leak into the script.
procs=1
gz=0
gz2=0
nullterm=""

# Number of paths handed to each openssl invocation when only plain MD5
# checksums are calculated.
batch_size=20

# Parse command-line options. The loop consumes one option (plus its argument,
# where applicable) per iteration and stops at "--" or at the first word that
# does not look like an option.
while :; do
  case $1 in
    -h|-\?|--help)
      printf "md5gz\n"
      printf "\n"
      printf "Calculate MD5 checksums for files paths on stdin, optionally handling .gz files.\n"
      printf "\n"
      printf "Options:\n"
      printf -- "--procs N: Processes to use, default is one.\n"
      printf -- "--gz: Calculate checksum for uncompressed contents of .gz files as well.\n"
      printf -- "--gz2: Same as --gz but don't calculate checksum for the original gzipped file.\n"
      printf -- "-0: Input paths are NUL-terminated (passed to xargs as -0).\n"
      printf "\n"
      printf "Example:\n"
      printf "find mypath -type f -name '*.myfile.gz' -print0 | md5gz -0 --gz --procs 2\n"
      exit
      ;;
    --procs) # Takes an option argument; ensure it has been specified.
      if [ "$2" ]; then
        procs=$2
        shift
      else
        die 'ERROR: "--procs" requires a non-empty option argument.'
      fi
      ;;
    --procs=?*)
      procs=${1#*=} # Delete everything up to "=" and assign the remainder.
      ;;
    --procs=) # Handle the case of an empty --procs=
      die 'ERROR: "--procs" requires a non-empty option argument.'
      ;;
    --gz) # Get checksum for uncompressed .gz files as well
      # Reject the combination regardless of the order the flags were given in.
      if [ "$gz2" -eq 1 ]; then
        die 'ERROR: "--gz" is mutually exclusive with "--gz2".'
      fi
      gz=1
      ;;
    --gz2) # Get checksum for uncompressed .gz files but not original .gz file
      if [ "$gz" -eq 1 ]; then
        die 'ERROR: "--gz2" is mutually exclusive with "--gz".'
      fi
      gz2=1
      ;;
    -0) # Forwarded verbatim to xargs so it reads NUL-terminated paths.
      nullterm="-0"
      ;;
    --) # End of all options.
      shift
      break
      ;;
    -?*)
      printf 'WARN: Unknown option (ignored): %s\n' "$1" >&2
      ;;
    *) # Default case: No more options, so break out of the loop.
      break
      ;;
  esac
  shift
done
# Hash every path read from stdin. All output is in the form "<MD5> <path>" to
# match basic GNU md5sum output.
#
# In the --gz and --gz2 modes each path is handed to the inline bash script as
# a positional parameter ($1) instead of being spliced into the script text.
# Splicing would let a path containing spaces, quotes or shell metacharacters
# be parsed as shell code. It also keeps the replaced argument short: in some
# versions of xargs, string replacement (-I {}) only works up to 255 bytes, and
# here only the path argument itself is subject to replacement.
#
# ${nullterm:+"$nullterm"} expands to a single quoted word when -0 was
# requested and to nothing at all otherwise.

# When openssl hashes stdin it labels the digest "(stdin)= " or "MD5(stdin)= "
# depending on the version; this strips that label.
strip_stdin_label='s/^(MD5)?\(stdin\)= //'

if [ "$gz" -eq 1 ]; then
  # Hash the uncompressed contents of .gz files and also the file itself.
  xargs -P "$procs" ${nullterm:+"$nullterm"} -I {} bash -c '
    x=$1
    if [[ "$x" == *.gz ]]; then
      printf "%s %s\n" "$(gzip -dc "$x" | openssl md5)" "${x%.gz}"
    fi
    printf "%s %s\n" "$(openssl md5 <"$x")" "$x"
  ' _ {} | sed -E "$strip_stdin_label"
elif [ "$gz2" -eq 1 ]; then
  # Hash the uncompressed contents of .gz files; hash other files as they are.
  xargs -P "$procs" ${nullterm:+"$nullterm"} -I {} bash -c '
    x=$1
    if [[ "$x" == *.gz ]]; then
      printf "%s %s\n" "$(gzip -dc "$x" | openssl md5)" "${x%.gz}"
    else
      printf "%s %s\n" "$(openssl md5 <"$x")" "$x"
    fi
  ' _ {} | sed -E "$strip_stdin_label"
else
  # Only hash the file itself, batch_size paths per openssl invocation.
  # Given file arguments, openssl prints "MD5(<path>)= <MD5>"; this sed
  # substitution rearranges that into "<MD5> <path>".
  xargs -n "$batch_size" -P "$procs" ${nullterm:+"$nullterm"} openssl md5 \
    | sed -E 's/^MD5\((.*)\)= ([0-9a-fA-F]+)$/\2 \1/'
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment