Skip to content

Instantly share code, notes, and snippets.

@aravindkumarsvg
Last active July 13, 2018 18:49
Show Gist options
  • Save aravindkumarsvg/5e1e074c56a8244f4fa4f1991b39a940 to your computer and use it in GitHub Desktop.
Save aravindkumarsvg/5e1e074c56a8244f4fa4f1991b39a940 to your computer and use it in GitHub Desktop.
Replaces duplicate files with hard links to the file that comes first in lexical order
#!/bin/bash
#################################################################################
# #
# - Replaces the duplicate regular files in the given directory with hard #
# links to the one file of each duplicate set that is kept. #
# - The non removed file is the one which comes first in the lexical ordering #
# - Script uses fdupes utility to find out the duplicate files #
# #
# = Usage: bash script.sh duplicate_files_folder #
# #
#################################################################################
# Validates the inputted directory
validate_input_directory () {
  # Accepts the first argument only when it names an existing directory.
  #
  # Arguments: $1 - path of the directory that should be scanned
  # Globals:   input_dir (written) - set to $1 when validation succeeds
  # Exits:     with status 1, after printing a message on stderr, when $1 is
  #            empty/missing or is not a directory
  #
  # The expansions are quoted so that a path containing spaces or glob
  # characters is tested as one word instead of being split into several.
  if [[ -n "${1:-}" && -d "$1" ]]
  then
    input_dir=$1
  else
    echo "Input is not valid" >&2
    exit 1
  fi
}
# Checks for the availability of fdupes command
checks_fdupes () {
  # Verifies that an `fdupes` command can be resolved by the shell.
  #
  # Exits: with status 1, after printing a message on stderr, when fdupes
  #        cannot be found.
  #
  # `command -v` is a shell builtin, so the check does not depend on the
  # external `which` program being installed (a missing `which` would
  # otherwise be misreported as a missing fdupes).
  if ! command -v fdupes > /dev/null 2>&1
  then
    echo "fdupes is not available" >&2
    exit 1
  fi
}
# Executes fdupes on the given directory to get all the duplicates.
# In each set of duplicates, keeps the file that comes first in lexical order
# and replaces the others with hard links to it.
# Helper for duplicates_remover.
# Replaces every regular file passed as an argument, except the one that
# compares lowest with the `[[ a > b ]]` string comparison, with a hard link
# to that lowest file.
#
# Arguments: $@ - the paths that make up one set of duplicate files
# Outputs:   a message on stderr for every file that could not be linked
_hardlink_duplicate_batch () {
  local keeper="" candidate staging

  # Pick the regular file that sorts first; non-regular entries are ignored.
  for candidate in "$@"
  do
    [[ -f "$candidate" ]] || continue
    if [[ -z "$keeper" || "$keeper" > "$candidate" ]]
    then
      keeper=$candidate
    fi
  done

  # Nothing usable in this batch.
  [[ -n "$keeper" ]] || return 0

  for candidate in "$@"
  do
    [[ "$candidate" != "$keeper" && -f "$candidate" ]] || continue
    # Already the same inode as the kept file: nothing to do.
    [[ "$candidate" -ef "$keeper" ]] && continue

    # Create the link under a temporary name first and then rename it over
    # the duplicate. If linking fails (for example across file systems) the
    # duplicate is left untouched instead of having been deleted already.
    staging="${candidate}.hardlink.$$"
    if ln -- "$keeper" "$staging"
    then
      mv -f -- "$staging" "$candidate" || rm -f -- "$staging"
    else
      printf 'Could not link %s to %s\n' "$candidate" "$keeper" >&2
    fi
  done
  return 0
}

# Runs fdupes on ${input_dir} and, for every set of duplicates it reports,
# keeps the file that comes first in lexical order and turns the remaining
# regular files into hard links to it.
#
# Globals: input_dir (read) - directory handed to fdupes
#
# fdupes is run without --sameline: in that mode it prints one path per line
# and ends each set with an empty line, so paths containing spaces survive
# intact. Paths containing newlines are still not supported.
# The global IFS is never modified.
duplicates_remover () {
  local line
  local -a batch=()

  while IFS= read -r line
  do
    if [[ -n "$line" ]]
    then
      batch+=("$line")
      continue
    fi
    # An empty line closes the current set of duplicates.
    if (( ${#batch[@]} > 0 ))
    then
      _hardlink_duplicate_batch "${batch[@]}"
    fi
    batch=()
  done < <(fdupes "${input_dir}")

  # Handle a last set that was not followed by an empty line.
  if (( ${#batch[@]} > 0 ))
  then
    _hardlink_duplicate_batch "${batch[@]}"
  fi
  return 0
}
# Function which acts as the starting point of execution
main () {
  # Entry point: runs the three stages of the script in order.
  #
  # Arguments: $@ - the script's command-line arguments; they are handed to
  #                 validate_input_directory, which uses the first one as the
  #                 directory to process

  # Checks for the availability of the fdupes binary
  checks_fdupes
  # Validates the inputted directory. "$@" is quoted so that a directory
  # name containing spaces reaches the validator as a single argument.
  validate_input_directory "$@"
  # Modifies the duplicates as hard links to the non removed file
  duplicates_remover
}
# program starting point
# "$@" is quoted so every command-line argument is forwarded to main as one
# word, even when it contains spaces or glob characters.
main "$@"
# Exits the program with exit status - 0
exit 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment