@augustohp · Created June 11, 2019
Remove duplicated files inside a folder
#!/usr/bin/env bash
APP_NAME=$(basename "$0")
APP_VERSION="1.0.0"
APP_REQUIREMENTS="awk find grep md5sum mktemp sed sort tail uniq"
TMP_HASHES="$(mktemp tmp-hashes-XXX)"
TMP_HASHES_DUPLICATED="$(mktemp tmp-duplicated-hashes-XXX)"
TMP_FILES_DUPLICATED="$(mktemp tmp-duplicated-files-XXX)"
trap cleanup EXIT HUP INT QUIT ABRT
# Usage: verbose <message>
# Prints a progress message to STDOUT.
verbose ()
{
    echo "$@"
}

# Usage: error_and_exit <message> [exit-code]
# Prints a message to STDERR and exits (default exit code: 2).
error_and_exit ()
{
    exit_code=${2:-2}
    echo "$1" >&2
    exit "$exit_code"
}

# Usage: indent
# Indents every line read from STDIN by one space.
indent ()
{
    sed 's/^/ /'
}
# Usage: cleanup
# Removes temporary files and other messes created.
cleanup ()
{
    rm -f "$TMP_HASHES"
    rm -f "$TMP_HASHES_DUPLICATED"
    rm -f "$TMP_FILES_DUPLICATED"
}
# Usage: command_deduplicate_files <directory>
# Removes duplicated filenames, then files with duplicated contents.
command_deduplicate_files ()
{
    search_directory="${1%%/}"
    if [ ! -d "$search_directory" ]
    then
        error_and_exit "Search must be performed on a directory ('$search_directory' does not exist)."
    fi
    verbose "Removing files with (1) or (2) suffixes..."
    command_remove_duplicate_filenames "$search_directory" | indent
    verbose "Checking file contents for duplication..."
    command_deduplicate_by_contents "$search_directory" | indent
}
# Usage: command_remove_duplicate_filenames <directory>
# Removes files whose names contain "(", e.g. "photo (1).jpg" copies.
# Uses NUL-delimited find output so filenames with spaces are handled.
command_remove_duplicate_filenames ()
{
    search_directory="${1%%/}"
    find "${search_directory}" -name "*(*" -print0 \
        | while IFS= read -r -d '' f
    do
        verbose "Removing '$f'."
        rm -f "$f"
    done
}
# Usage: command_deduplicate_by_contents <directory>
# Hashes every file and removes all but the first file of each group
# sharing the same content.
command_deduplicate_by_contents ()
{
    search_directory="${1%%/}"
    # Generate a md5 of each file (skipping sub-directories)
    for f in "${search_directory}"/*
    do
        [ -f "$f" ] || continue
        md5sum "${f}" >> "$TMP_HASHES"
    done
    # Filter only repeated hashes
    cat "$TMP_HASHES" \
        | print_first_column \
        | count_duplicates \
        | more_than_one \
        | print_second_column \
        > "$TMP_HASHES_DUPLICATED"
    while read -r duplicated_hash
    do
        # All files found with the same content but the first one
        grep "$duplicated_hash" "$TMP_HASHES" \
            | tail -n +2 \
            | awk '{$1=""; print $0}' \
            | remove_leading_spaces \
            >> "$TMP_FILES_DUPLICATED"
    done < "$TMP_HASHES_DUPLICATED"
    while IFS= read -r duplicated_file
    do
        verbose "Removing ${duplicated_file}."
        rm -f "${duplicated_file}"
    done < "$TMP_FILES_DUPLICATED"
}
# Usage: ... | count_duplicates
# Prefixes each distinct input line with its number of occurrences.
count_duplicates ()
{
    sort \
        | uniq -c \
        | sort -rn
}

# Usage: ... | more_than_one
# Keeps only `uniq -c` lines whose count is greater than one.
more_than_one ()
{
    grep -vE "^[ ]+1 "
}

# Usage: ... | remove_leading_spaces
remove_leading_spaces ()
{
    sed 's/^[ \t]*//'
}

# Usage: ... | print_first_column
print_first_column ()
{
    awk '{print $1}'
}

# Usage: ... | print_second_column
print_second_column ()
{
    awk '{print $2}'
}
# Usage: display_help
display_help ()
{
    cat <<EOT
Usage: $APP_NAME [directory]
EOT
}
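
# Usage: check_requirements
# Not in the original gist: a minimal sketch of a dependency check using
# the otherwise unused APP_REQUIREMENTS list, assuming the script should
# fail fast when one of the required commands is missing.
check_requirements ()
{
    for requirement in $APP_REQUIREMENTS
    do
        if ! command -v "$requirement" > /dev/null 2>&1
        then
            error_and_exit "Required command '$requirement' not found."
        fi
    done
}
check_requirements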
# Entry point: show help when asked for, or when no directory is given.
case "${1:-}" in
    ""|-h|--help)
        display_help
        exit 0
        ;;
esac
command_deduplicate_files "$1"
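
# Example invocation (the script filename below is hypothetical):
#   bash deduplicate-files.sh ~/Downloads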