Skip to content

Instantly share code, notes, and snippets.

@dargmuesli
Created June 21, 2022 12:04
Show Gist options
  • Save dargmuesli/58a2c1c58350d2c1587ee00fa35a4ceb to your computer and use it in GitHub Desktop.
Compare all filenames inside a folder for similarity.
#!/bin/bash
# Strict mode, one option per line:
set -o errexit   # abort as soon as a command fails
set -o pipefail  # a pipeline fails when any of its stages fails
set -o noclobber # '>' refuses to overwrite an existing file
set -o nounset   # expanding an unset variable is an error
# Enable recursive '**' globs and let unmatched globs expand to nothing.
shopt -s globstar
shopt -s nullglob
# Escape sequences for coloured output; they are stored unexpanded and only
# turn into colours when printed with `echo -e`.
NC='\e[0m'
LIGHT_RED='\e[91m'
LIGHT_GREEN='\e[92m'
LIGHT_YELLOW='\e[93m'
LIGHT_BLUE='\e[94m'
# Make sure the fstrcmp command is available. On Debian and Ubuntu it is
# installed through apt-get; on any other distribution the script gives up
# and asks for a manual installation. All diagnostics go to stderr.
if ! hash fstrcmp 2>/dev/null; then
  echo -e "${LIGHT_BLUE}fstrcmp${LIGHT_YELLOW} is not available.${NC}" >&2
  if ! hash lsb_release 2>/dev/null; then
    echo -e "${LIGHT_RED}Could not determine OS distribution!${NC}" >&2
    echo -e "${LIGHT_BLUE}lsb_release${NC} is not available." >&2
    exit 1
  fi
  # Query the distributor id once; a failing lsb_release is treated like an
  # unknown distribution instead of aborting through errexit.
  distribution="$(lsb_release -is)" || distribution=""
  case "$distribution" in
    Debian | Ubuntu)
      sudo apt-get install -y fstrcmp
      ;;
    *)
      echo -e "${LIGHT_RED}Could not install ${LIGHT_BLUE}fstrcmp${LIGHT_RED} automatically!${NC}" >&2
      echo -e "Please install it manually." >&2
      exit 1
      ;;
  esac
fi
# Default settings; the command line options -d, -i and -p override them.
IS_PROGRESS_SHOWN='false'
INPUT_PATH=''
DISTANCE_MINIMUM='0.9'
#######################################
# Print the command line help and terminate the shell.
# Globals:
#   LIGHT_YELLOW, NC (read)
# Arguments:
#   None
# Outputs:
#   The help text on stdout.
# Returns:
#   Never returns; exits with status 1.
#######################################
function usage() {
  local -a help_lines=(
    "usage: ${0##*/} ${LIGHT_YELLOW}<options>${NC}"
    ""
    "${LIGHT_YELLOW}options${NC}"
    " -d, --distance The minimum distance for comparison."
    " -i, --input-path * The files' source path."
    " -h, --help Display this help."
    " -p, --progress Display progress."
    ""
    "*=required"
  )
  local help_line
  for help_line in "${help_lines[@]}"; do
    echo -e "$help_line"
  done
  exit 1
}
# Base names of all files found by scan.
FILES=()
#######################################
# Append the base name of every regular file found below a directory
# (searched recursively by find) to FILES.
# Globals:
#   INPUT_PATH (written), FILES (appended to)
# Arguments:
#   $1 - directory to search
# Outputs:
#   A status message on stdout.
#######################################
function scan() {
  INPUT_PATH="$1"
  local path
  echo -e "Scanning directory..."
  # IFS= stops read from trimming leading/trailing whitespace that belongs to
  # the file name; -d '' consumes the NUL-delimited records of -print0.
  while IFS= read -r -d '' path; do
    # Strip everything up to the last slash instead of forking basename for
    # every single file.
    FILES+=("${path##*/}")
  done < <(find "$INPUT_PATH" -type f -print0)
}
function compare() {
DISTANCE_MINIMUM="$1"
IS_PROGRESS_SHOWN="$2"
comparison_count=$((((${#FILES[@]})*(${#FILES[@]}-1))/2))
comparison_index=0
echo -e "Comparing with minimum distance $DISTANCE_MINIMUM (${#FILES[@]} files, $comparison_count comparisons)..."
for i in $(eval echo "{0..$((${#FILES[@]}-1))}")
do
if [ "$i" = $((${#FILES[@]}-1)) ]; then
break
fi
for j in $(eval echo "{$(("$i"+1))..$((${#FILES[@]}-1))}")
do
distance=$(fstrcmp "${FILES[$i]}" "${FILES[$j]}")
if [ "$IS_PROGRESS_SHOWN" = "true" ]; then
comparison_index=$(("$comparison_index"+1))
echo -ne "$((100*"$comparison_index"/"$comparison_count"))% ($comparison_index/$comparison_count)\r"
fi
if (( $(echo "$distance > $DISTANCE_MINIMUM" | bc -l) )); then
echo -e "\n${FILES[$i]}\n${FILES[$j]}\n$distance"
fi
done
done
}
# Check that the enhanced getopt is available: `getopt --test` reports it by
# exiting with status 4. The status is captured through `||` so that errexit
# does not abort the script on this expected non-zero exit.
getopt_status=0
getopt --test >/dev/null || getopt_status=$?
if [[ $getopt_status -ne 4 ]]; then
  echo -e "${LIGHT_RED}Cannot parse parameters!${NC}" >&2
  exit 1
fi
# Parse command line parameters
OPTIONS=d:hi:p
LONGOPTS=distance:,help,input-path:,progress
# getopt prints its own message for invalid options; only the exit status is
# added here.
if ! PARSED=$(getopt --options="$OPTIONS" --longoptions="$LONGOPTS" \
  --name "$0" -- "$@"); then
  exit 2
fi
# getopt emits shell-quoted words; eval is needed to turn them back into
# positional parameters.
eval set -- "$PARSED"
while true; do
  case "$1" in
    -d | --distance)
      DISTANCE_MINIMUM="$2"
      shift 2
      ;;
    -h | --help)
      echo -e "Find similar filenames."
      echo -e ""
      # usage terminates the shell it runs in with a non-zero status; confine
      # that to a subshell, because asking for help is not an error.
      (usage) || true
      exit 0
      ;;
    -i | --input-path)
      INPUT_PATH="$2"
      shift 2
      ;;
    -p | --progress)
      IS_PROGRESS_SHOWN=true
      shift 1
      ;;
    --)
      shift
      break
      ;;
    *)
      echo -e "${LIGHT_RED}Programming error!${NC}" >&2
      exit 2
      ;;
  esac
done
if [ -z "$INPUT_PATH" ]; then
  echo -e "${LIGHT_RED}Input path not provided!${NC}" >&2
  # usage exits with status 1 itself.
  usage >&2
fi
if [ ! -d "$INPUT_PATH" ]; then
  echo -e "${LIGHT_RED}Input path is not a directory!${NC}" >&2
  exit 1
fi
# Reject anything that is not a plain unsigned decimal number up front,
# instead of failing once per comparison later on.
number_pattern='^([0-9]+\.?[0-9]*|\.[0-9]+)$'
if [[ ! $DISTANCE_MINIMUM =~ $number_pattern ]]; then
  echo -e "${LIGHT_RED}Distance must be a non-negative number!${NC}" >&2
  exit 1
fi
scan "$INPUT_PATH"
compare "$DISTANCE_MINIMUM" "$IS_PROGRESS_SHOWN"
echo -e "\n${LIGHT_GREEN}Done${NC}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment