Last active
May 13, 2020 18:45
-
-
Save JoeyBurzynski/6d69d0829db9b3617440b5c059370460 to your computer and use it in GitHub Desktop.
Ruby: Identify Duplicate Files in a Directory
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# frozen_string_literal: true

require 'pathname'
require 'digest/md5'

# * Colorize Gem (https://github.com/fazibear/colorize)
require 'colorize'
# * HashDot Gem (https://github.com/adsteel/hash_dot)
require 'hash_dot'
# * Amazing Print Gem (https://github.com/amazing-print/amazing_print)
require 'amazing_print'
# * Set a custom process title so the running scan is easy to spot in
# * process listings (ps, top, ...).
Process.setproctitle('find-duplicate-files')
# * Identify _exact_ duplicate files in a specified directory/file path. | |
# * Supports dotfiles and recursive search. | |
# * Identify _exact_ duplicate files in a specified directory/file path.
# * Supports dotfiles and recursive search.
# ? Files are compared by the MD5 digest of their contents; the first file seen
# ? with a given digest is kept and every later file with the same digest is
# ? treated as a duplicate.
module DuplicateFiles
  module_function

  CONFIG = {
    # * Configuration options are merged with defaults upon initialization.
    active: {},

    # = Configuration Defaults
    defaults: {
      # * Default options
      opts: {
        # = delete_duplicates {Boolean}
        # * Toggles deletion of all identified duplicate files.
        # ? Disabled by default.
        delete_duplicates: false,

        # = recursive {Boolean}
        # * Toggles recursive search functionality.
        # ? Disabled by default.
        recursive: false,

        # = verbose {Boolean}
        # * Toggle verbose logging to console.
        # ? Enabled by default.
        verbose: true
      },

      # * Template for the hash that tracks scanned files and duplicates.
      # ? Never mutated: every scan works on its own copy.
      scan_log: {
        # * MD5 hash => Array of Pathnames that duplicate an already scanned file.
        duplicates: {},

        # * MD5 hash => Pathname of the first file seen with that content.
        scanned: {}
      }
    }
  }.to_dot

  # + Fetch files from provided file path.
  # ? target_path {String, Pathname} Directory to search.
  # ? Returns an Array of Pathname objects (files AND directories; callers filter).
  # ? Raises RuntimeError when no path is given or the path is not a directory.
  def files_in_directory(target_path: nil)
    puts 'Initializing scan for duplicate files using target file path:'.light_yellow
    ap target_path

    # * Ensure file path was provided.
    if target_path.nil?
      error = "\nError! No file path specified.\n"
      puts error.light_red
      puts ' - Usage: ' + 'find-duplicate-files <directory_path>'.light_cyan
      puts ' - Example: ' + "find-duplicate-files ./sample/path\n".light_yellow
      raise error
    end

    # * Ensure file path is a directory that exists.
    directory = target_path.to_s
    unless File.directory?(directory)
      error = "\n\tError! Invalid path specified: #{directory}"
      puts error.light_red
      raise error
    end

    # * Enable recursive search, if active.
    # ? File.join guarantees a separator between the directory and the
    # ? pattern; a bare "dir**/*" would not descend into "dir" at all.
    pattern = option(:recursive) ? File.join('**', '*') : '*'

    # * Return Pathname objects representing entries in the target file path.
    # ? FNM_DOTMATCH includes hidden entries that start with a dot (.).
    # ? Symlinks are filtered out later, in identify_duplicates.
    Pathname.glob(File.join(directory, pattern), File::FNM_DOTMATCH)
  end

  # + Scan directory of files (array of file paths) for duplicates
  # ? file_paths {Array<Pathname, String>} Candidate paths.
  # ? Returns a dot-accessible Hash with:
  # ?   scanned    - MD5 hash => Pathname of the first file with that content
  # ?   duplicates - MD5 hash => Array<Pathname> of every later file with that content
  # ?   start_time - Time the scan began
  def identify_duplicates(file_paths = [])
    log = fresh_scan_log
    verbose = option(:verbose)

    # * Only regular files are hashed. Directories and special files are skipped,
    # * and symlinks are never followed, so a link and its target cannot be
    # * mistaken for two copies (and the real file deleted).
    candidates = file_paths.map { |path| Pathname(path) }
                           .select { |path| !path.symlink? && path.file? }

    candidates.each.with_index(1) do |file_path, position|
      # * Calculate MD5 hash by streaming the file instead of loading it whole.
      # ? NOTE(review): equality is decided by MD5 alone; crafted collisions
      # ? would be reported (and deleted) as duplicates.
      md5_hash = Digest::MD5.file(file_path).hexdigest

      if log[:scanned].key?(md5_hash)
        # * Content already seen: record every additional copy.
        puts " - Duplicate found: #{file_path}".light_red
        (log[:duplicates][md5_hash] ||= []) << file_path
      else
        # * First file with this content is the one that is kept.
        log[:scanned][md5_hash] = file_path
      end

      log_progress(log, file_path, md5_hash, position, candidates.length) if verbose
    end

    log
  end

  # + Identify duplicates in a specified file/directory path.
  # ? target_file_path {String} Directory to scan.
  # ? opts {Hash} Overrides for CONFIG.defaults.opts
  # ?             (delete_duplicates, recursive, verbose).
  def identify(target_file_path: nil, **opts)
    # * Create hash of active configuration options by merging
    # * provided options with defaults.
    CONFIG.active.merge!(CONFIG.defaults.opts.merge(opts))

    # * Fetch files in the specified target path, scan them, report the result.
    report(identify_duplicates(files_in_directory(target_path: target_file_path)))
  end

  # + Report scan status to console on completion.
  # ? scan {Hash} Result of identify_duplicates.
  # ? When the delete_duplicates option is active, every reported duplicate
  # ? is deleted from disk.
  def report(scan)
    duplicate_total = duplicate_count(scan)
    scanned_total = scan[:scanned].length + duplicate_total
    delete = option(:delete_duplicates)

    puts "\n================================== Scan Complete ==================================\n".light_green
    puts " - Files Scanned: #{scanned_total}".light_white
    puts " - Duplicates Identified: #{duplicate_total}".light_white
    puts " - File Paths:\n".light_white

    # * Log result to console
    # ? md5_hash {String}
    # ? file_paths {Array<Pathname>} (a single Pathname is accepted as well)
    scan[:duplicates].each_pair do |md5_hash, file_paths|
      Array(file_paths).each { |file_path| report_duplicate(md5_hash, file_path, delete: delete) }
    end

    # * Log execution time to console as whole minutes plus remaining seconds.
    minutes, seconds = (Time.now - scan[:start_time]).divmod(60)
    puts "\n✔ Scanned #{scanned_total} files in #{minutes} minutes, #{seconds.round(2)} seconds.".light_green
  end

  # + Read an option from the active configuration, falling back to the
  # + default when identify has not populated CONFIG.active yet.
  def option(name)
    CONFIG.active.fetch(name) { CONFIG.defaults.opts.fetch(name) }
  end

  # + Build an independent scan log from the template in CONFIG.defaults so
  # + results never leak from one scan into the next.
  def fresh_scan_log
    CONFIG.defaults.scan_log
          .transform_values(&:dup)
          .merge(start_time: Time.now)
          .to_dot
  end

  # + Total number of duplicate files recorded in a scan log.
  def duplicate_count(scan)
    scan[:duplicates].each_value.sum { |file_paths| Array(file_paths).length }
  end

  # + Print per-file progress while scanning (verbose mode only).
  def log_progress(log, file_path, md5_hash, position, total)
    puts "Scanning file #{position} of #{total}..".light_yellow
    puts " - File Path: #{file_path.realpath}".light_white
    puts " - MD5 Hash: #{md5_hash}".light_cyan
    puts " - Scanned: #{position}".light_green
    puts " - Duplicates: #{duplicate_count(log)}\n".light_red
  end

  # + Print the details of one duplicate file and delete it when requested.
  def report_duplicate(md5_hash, file_path, delete: false)
    puts
    puts "========================================== Duplicate File (#{file_path.basename}) ==========================================".light_yellow
    puts ' - Path: '.light_white + file_path.realpath.to_s.light_blue
    puts ' - Name: '.light_white + file_path.basename.to_s.light_yellow
    puts ' - Type: '.light_white + file_path.extname.to_s.light_magenta
    puts ' - Size: '.light_white + file_path.size.to_s.light_green
    puts ' - MD5 Hash: '.light_white + md5_hash.light_cyan

    # * If file deletion is enabled, remove the file and log to console.
    return unless delete

    puts ' NOTICE: Duplicate deletion enabled. Deleting file.'.light_red
    file_path.delete
  end

  private_class_method :option, :fresh_scan_log, :duplicate_count,
                       :log_progress, :report_duplicate
end
# ? Sample CLI Usage:
# ?   Scan current directory for duplicate files:
# ?     - ruby find-duplicate-files.rb .
# ?
# ?   Scan current directory for duplicate files (execute as bash command)
# ?     - find-duplicate-files .

# + Identify duplicate files in provided directory file path.
DuplicateFiles.identify(
  # = target_file_path {String}
  # * File path/directory (relative or absolute) to start the search from.
  target_file_path: ARGV[0],

  # = delete_duplicates {Boolean}
  # * Toggles deletion of all identified duplicate files.
  # ? Default: false
  # ! WARNING: overridden to true here, so running this script DELETES every
  # ! duplicate it finds without asking. Set to false for a report-only run.
  delete_duplicates: true,

  # = recursive {Boolean}
  # * Toggles recursive search functionality.
  # ? Default: false
  recursive: false,

  # = verbose {Boolean}
  # * Toggle verbose logging to console.
  # ? Default: true (overridden to false here to keep the output short)
  verbose: false
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment