@augustohp · Created June 11, 2019
Remove duplicated files inside a folder
#!/usr/bin/env bash
APP_NAME=$(basename "$0")
APP_VERSION="1.0.0"
APP_REQUIREMENTS="awk find grep md5sum mktemp sed sort tail uniq"
TMP_HASHES="$(mktemp tmp-hashes-XXX)"
TMP_HASHES_DUPLICATED="$(mktemp tmp-duplicated-hashes-XXX)"
TMP_FILES_DUPLICATED="$(mktemp tmp-duplicated-files-XXX)"
trap cleanup EXIT HUP INT QUIT ABRT
# Usage: verbose <message>
# Prints a progress message to STDOUT.
verbose ()
{
    echo "$@"
}

# Usage: error_and_exit <message> [exit-code]
# Prints a message to STDERR and exits (default exit code: 2).
error_and_exit ()
{
    exit_code=${2:-2}
    echo "$1" >&2
    exit "$exit_code"
}

# Usage: indent
# Indents every line read from STDIN by one space.
indent ()
{
    sed 's/^/ /'
}
# Usage: cleanup
# Removes temporary files and other messes created.
cleanup ()
{
    rm -f "$TMP_HASHES"
    rm -f "$TMP_HASHES_DUPLICATED"
    rm -f "$TMP_FILES_DUPLICATED"
}
# Usage: command_deduplicate_files <directory>
# Removes duplicated filenames, then files with duplicated contents.
command_deduplicate_files ()
{
    search_directory="${1%%/}"
    if [ ! -d "$search_directory" ]
    then
        error_and_exit "Search must be performed on a directory ('$search_directory' does not exist)."
    fi
    verbose "Removing files with (1) or (2) suffixes..."
    command_remove_duplicate_filenames "$search_directory" | indent
    verbose "Checking file contents for duplication..."
    command_deduplicate_by_contents "$search_directory" | indent
}
# Usage: command_remove_duplicate_filenames <directory>
# Removes files whose names contain "(", e.g. "photo (1).jpg" copies.
# Uses NUL-delimited find output so filenames with spaces are handled.
command_remove_duplicate_filenames ()
{
    search_directory="${1%%/}"
    find "${search_directory}" -name "*(*" -print0 \
        | while IFS= read -r -d '' f
    do
        verbose "Removing '$f'."
        rm -f "$f"
    done
}
# Usage: command_deduplicate_by_contents <directory>
# Hashes every file and removes all but the first file of each group
# sharing the same content.
command_deduplicate_by_contents ()
{
    search_directory="${1%%/}"
    # Generate a md5 of each file (skipping sub-directories)
    for f in "${search_directory}"/*
    do
        [ -f "$f" ] || continue
        md5sum "${f}" >> "$TMP_HASHES"
    done
    # Filter only repeated hashes
    cat "$TMP_HASHES" \
        | print_first_column \
        | count_duplicates \
        | more_than_one \
        | print_second_column \
        > "$TMP_HASHES_DUPLICATED"
    while read -r duplicated_hash
    do
        # All files found with the same content but the first one
        grep "$duplicated_hash" "$TMP_HASHES" \
            | tail -n +2 \
            | awk '{$1=""; print $0}' \
            | remove_leading_spaces \
            >> "$TMP_FILES_DUPLICATED"
    done < "$TMP_HASHES_DUPLICATED"
    while IFS= read -r duplicated_file
    do
        verbose "Removing ${duplicated_file}."
        rm -f "${duplicated_file}"
    done < "$TMP_FILES_DUPLICATED"
}
# Usage: ... | count_duplicates
# Prefixes each distinct input line with its number of occurrences.
count_duplicates ()
{
    sort \
        | uniq -c \
        | sort -rn
}

# Usage: ... | more_than_one
# Keeps only `uniq -c` lines whose count is greater than one.
more_than_one ()
{
    grep -vE "^[ ]+1 "
}

# Usage: ... | remove_leading_spaces
remove_leading_spaces ()
{
    sed 's/^[ \t]*//'
}

# Usage: ... | print_first_column
print_first_column ()
{
    awk '{print $1}'
}

# Usage: ... | print_second_column
print_second_column ()
{
    awk '{print $2}'
}
# Usage: display_help
display_help ()
{
    cat <<EOT
Usage: $APP_NAME [directory]
EOT
}
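
# Usage: check_requirements
# Not in the original gist: a minimal sketch of a dependency check using
# the otherwise unused APP_REQUIREMENTS list, assuming the script should
# fail fast when one of the required commands is missing.
check_requirements ()
{
    for requirement in $APP_REQUIREMENTS
    do
        if ! command -v "$requirement" > /dev/null 2>&1
        then
            error_and_exit "Required command '$requirement' not found."
        fi
    done
}
check_requirements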
# Entry point: show help when asked for, or when no directory is given.
case "${1:-}" in
    ""|-h|--help)
        display_help
        exit 0
        ;;
esac
command_deduplicate_files "$1"
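
# Example invocation (the script filename below is hypothetical):
#   bash deduplicate-files.sh ~/Downloads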