aravindkumarsvg/delete-dupfiles.sh

## delete-dupfiles.sh
#!/bin/bash

#################################################################################
#                                                                               #
#  - Replaces duplicate regular files in the given directory with hard links    #
#    to the one copy that is kept.                                              #
#  - The kept copy is the one that comes first in lexical ordering.             #
#  - Uses the fdupes utility to find the duplicate files.                       #
#                                                                               #
#  = Usage:   bash script.sh duplicate_files_folder                             #
#                                                                               #
#################################################################################

# Validates the inputted directory.
# Globals:   input_dir (written) - set to the validated directory path
# Arguments: $1 - path of the directory to scan for duplicates
# Outputs:   prints "Input is not valid" when the validation fails
# Returns:   exits the script with status 1 if $1 is empty or not a directory
validate_input_directory () {
  # The expansions are quoted so that a path containing spaces or glob
  # characters is tested as one word instead of being split into several
  # arguments of the test
  if [[ -n "${1:-}" && -d "$1" ]]
  then
    input_dir=$1
  else
    echo "Input is not valid"
    exit 1
  fi
}

# Checks for the availability of the fdupes command.
# Outputs:   prints "fdupes is not available" when the lookup fails
# Returns:   exits the script with status 1 if fdupes cannot be found
checks_fdupes () {
  # command -v is a shell builtin, so the lookup cannot fail merely because
  # the external `which` program is not installed on the system
  if ! command -v fdupes > /dev/null 2>&1
  then
    echo "fdupes is not available"
    exit 1
  fi
}

# Replaces the duplicates of one set with hard links to the kept file.
# The kept file is the regular file of the set which comes first in lexical
# ordering.
# Arguments: $@ - paths of the files that form one set of duplicates
# Outputs:   writes a message to stderr for every file that could not be linked
# Returns:   0 if every duplicate was replaced, 1 otherwise
_link_duplicate_group () {
  local keep="" candidate temp_link status=0
  # Gets the regular file which is least on lexical comparison
  for candidate in "$@"
  do
    if [[ -f "$candidate" ]] && [[ -z "$keep" || "$keep" > "$candidate" ]]
    then
      keep=$candidate
    fi
  done
  # Nothing to do when the set holds no regular file
  if [[ -z "$keep" ]]
  then
    return 0
  fi
  for candidate in "$@"
  do
    # Skips the kept file itself, anything which is not a regular file and
    # names which already refer to the kept file
    if [[ "$candidate" == "$keep" || ! -f "$candidate" || "$candidate" -ef "$keep" ]]
    then
      continue
    fi
    # The link is created under a temporary name and then renamed over the
    # duplicate, so the duplicate is never removed unless its replacement
    # already exists
    temp_link="${candidate}.dupfiles-tmp.$$"
    if ! ln -- "$keep" "$temp_link"
    then
      echo "Could not create a hard link for ${candidate}" >&2
      status=1
      continue
    fi
    if ! mv -f -- "$temp_link" "$candidate"
    then
      # Removes only the temporary link which was created above
      rm -f -- "$temp_link"
      echo "Could not replace ${candidate} with a hard link" >&2
      status=1
    fi
  done
  return "$status"
}

# Executes fdupes on the given directory to get all the duplicates.
# In each set of duplicates, keeps the file which comes first in lexical
# ordering and makes the others hard links to it.
# Globals:   input_dir (read) - directory which is scanned for duplicates
# Outputs:   error messages on stderr for files which could not be linked
# Returns:   0 if every duplicate was replaced, 1 otherwise
duplicates_remover () {
  local -a batch=()
  local line status=0
  # Reads the default fdupes listing: one path per line and an empty line
  # after each set of duplicates. Reading whole lines keeps paths with
  # spaces intact and leaves the global IFS untouched.
  while IFS= read -r line
  do
    if [[ -n "$line" ]]
    then
      batch+=("$line")
    elif (( ${#batch[@]} > 0 ))
    then
      _link_duplicate_group "${batch[@]}" || status=1
      batch=()
    fi
  done < <(fdupes "$input_dir")
  # Handles a last set which is not followed by an empty line
  if (( ${#batch[@]} > 0 ))
  then
    _link_duplicate_group "${batch[@]}" || status=1
  fi
  return "$status"
}

# Function which acts as the starting point of execution.
# Arguments: $1 - path of the directory to scan for duplicates
main () {
  # Checks for the availability of the fdupes binary
  checks_fdupes
  # Validates the inputted directory. "$@" hands every argument over
  # unsplit, so a directory name containing spaces stays one word.
  validate_input_directory "$@"
  # Modifies the duplicates as hard links to the non removed file
  duplicates_remover
}

# program starting point; the arguments are quoted for the same reason
main "$@"
# Exits the program with exit status - 0
exit 0