Skip to content

Instantly share code, notes, and snippets.

@JoeyBurzynski
Last active May 13, 2020 18:45
Show Gist options
  • Save JoeyBurzynski/6d69d0829db9b3617440b5c059370460 to your computer and use it in GitHub Desktop.
Save JoeyBurzynski/6d69d0829db9b3617440b5c059370460 to your computer and use it in GitHub Desktop.
Ruby: Identify Duplicate Files in a Directory
#!/usr/bin/env ruby
# frozen_string_literal: true
require 'pathname'
require 'digest/md5'
# * Colorize Gem (https://github.com/fazibear/colorize)
require 'colorize'
# * HashDot Gem (https://github.com/adsteel/hash_dot)
require 'hash_dot'
# * Amazing Print Gem (https://github.com/amazing-print/amazing_print)
require 'amazing_print'
# * Set custom process title.
# ? Gives the running script the fixed title 'find-duplicate-files' instead of
# ? whatever command line it happened to be launched with.
Process.setproctitle('find-duplicate-files')
# * Identify _exact_ duplicate files in a specified directory/file path.
# * Supports dotfiles and recursive search.
# ? Files are compared by the MD5 digest of their content. The first file seen with a
# ? given digest is treated as the original; every later file with the same digest is
# ? a duplicate. Symlinks and non-regular files (directories, FIFOs, ...) are ignored.
module DuplicateFiles
  module_function

  CONFIG = {
    # * Configuration options are merged with defaults upon initialization.
    active: {},
    # = Configuration Defaults
    defaults: {
      # * Default options
      opts: {
        # = delete_duplicates {Boolean}
        # * Toggles deletion of all identified duplicate files.
        # ? Disabled by default.
        delete_duplicates: false,
        # = recursive {Boolean}
        # * Toggles recursive search functionality.
        # ? Disabled by default.
        recursive: false,
        # = verbose {Boolean}
        # * Toggle verbose logging to console.
        # ? Enabled by default.
        verbose: true
      },
      # * Template for the hash tracking scanned files and duplicates.
      # ? Never mutated: every scan works on its own copy (see identify_duplicates).
      scan_log: {
        # * Hash used to store the MD5 hash and file name of duplicate files.
        duplicates: {},
        # * Hash used to store the MD5 hash/file name of scanned files.
        scanned: {}
      }
    }
  }.to_dot

  # + Look up a configuration option.
  # ? name {Symbol} One of the keys of CONFIG.defaults.opts.
  # ? Returns the active value, or the default when `identify` has not populated
  # ? CONFIG.active yet (e.g. when a method of this module is called directly).
  def option(name)
    CONFIG.active.fetch(name) { CONFIG.defaults.opts.fetch(name) }
  end
  private_class_method :option

  # + Fetch files from provided file path.
  # ? target_path {String, Pathname} Directory to search.
  # ? Returns {Array<Pathname>} every entry matched beneath target_path, including
  # ? dotfiles. Directories are part of the result; callers filter them out.
  # ? Raises RuntimeError when no path is given or the path is not a directory.
  def files_in_directory(target_path: nil)
    puts 'Initializing scan for duplicate files using target file path:'.light_yellow
    ap target_path

    # * Ensure file path was provided.
    if target_path.nil?
      error = "\nError! No file path specified.\n"
      puts error.light_red
      puts ' - Usage: ' + 'find-duplicate-files <directory_path>'.light_cyan
      puts ' - Example: ' + "find-duplicate-files ./sample/path\n".light_yellow
      raise error
    end

    # * Ensure file path is a directory that exists.
    root = Pathname.new(target_path.to_s)
    unless root.directory?
      error = "\n\tError! Invalid path specified: #{root}"
      puts error.light_red
      raise error
    end

    # * Enable recursive search, if active.
    pattern = option(:recursive) ? '**/*' : '*'

    # * Glob relative to the directory instead of splicing it into the pattern, so
    # * glob metacharacters in the directory name (e.g. "[", "*") are taken literally
    # * and String and Pathname arguments behave the same.
    # ? FNM_DOTMATCH includes hidden files that start with a dot (.) in the search.
    # ? File::NOFOLLOW is an open(2) flag, not a glob flag, so it is not passed here;
    # ? symlinks are skipped in identify_duplicates instead.
    root.glob(pattern, File::FNM_DOTMATCH)
  end

  # + Scan directory of files (array of file paths) for duplicates
  # ? file_paths {Array<Pathname, String>} Candidate paths; anything that is not a
  # ? regular file, and every symlink, is skipped.
  # ? Returns a dot-accessible Hash with the keys:
  # ?   scanned         {md5 => Pathname}         first file seen for each digest
  # ?   duplicates      {md5 => Pathname}         first redundant copy for each digest
  # ?   duplicate_paths {md5 => Array<Pathname>}  every redundant copy for each digest
  # ?   start_time      {Time}                    when the scan started
  def identify_duplicates(file_paths = [])
    # * Copy the nested template hashes so repeated scans never share state with
    # * CONFIG.defaults (a plain `dup` would only copy the outer hash).
    log = CONFIG.defaults.scan_log.transform_values(&:dup)
    log.merge!(start_time: Time.now, duplicate_paths: {})
    log.to_dot

    # * Skip directories, special files and symlinks. A symlink to a file has the
    # * same content as its target and a broken symlink cannot be read at all.
    candidates = file_paths.map { |path| Pathname(path) }
                           .select { |path| path.file? && !path.symlink? }
    duplicate_total = 0

    candidates.each_with_index do |file_path, index|
      # * Calculate MD5 Hash (streamed, so large files are not loaded into memory).
      md5_hash = Digest::MD5.file(file_path.to_s).hexdigest

      if log.scanned.key?(md5_hash)
        # * If duplicate is identified, log to console and record it.
        puts " - Duplicate found: #{file_path}".light_red
        # ? `duplicates` keeps only the first copy, for callers relying on that shape.
        log.duplicates[md5_hash] ||= file_path
        (log.duplicate_paths[md5_hash] ||= []) << file_path
        duplicate_total += 1
      else
        # * First file with this content: remember it as the original.
        log.scanned[md5_hash] = file_path
      end

      # * Log stats to console.
      next unless option(:verbose)

      puts "Scanning file #{index + 1} of #{candidates.length}..".light_yellow
      puts " - File Path: #{file_path.realpath}".light_white
      puts " - MD5 Hash: #{md5_hash}".light_cyan
      puts " - Scanned: #{index + 1}".light_green
      puts " - Duplicates: #{duplicate_total}\n".light_red
    end

    log
  end

  # + Identify duplicates in a specified file/directory path.
  # ? target_file_path {String, Pathname} Directory to scan.
  # ? opts {Hash} Overrides for CONFIG.defaults.opts
  # ?   (delete_duplicates, recursive, verbose).
  # ? Raises RuntimeError when the path is missing or not a directory.
  def identify(target_file_path: nil, **opts)
    # * Create hash of active configuration options by merging
    # * provided options with defaults.
    CONFIG.active.merge!(CONFIG.defaults.opts.merge(opts))

    # * Fetch files in the specified target path, scan them, then report.
    report(
      identify_duplicates(
        files_in_directory(target_path: target_file_path)
      )
    )
  end

  # + Report scan status to console on completion.
  # ? scan {Hash} Dot-accessible result of identify_duplicates.
  # ? When the delete_duplicates option is active, every reported duplicate is
  # ? deleted; the first file seen for each digest is kept.
  def report(scan)
    # * Accept scan hashes without `duplicate_paths` (one duplicate per digest).
    groups = scan.fetch(:duplicate_paths) do
      scan.duplicates.transform_values { |path| [path] }
    end
    duplicate_total = groups.values.sum(&:length)
    # * `scanned` holds one entry per unique content, so add the redundant copies
    # * to get the number of files actually processed.
    files_total = scan.scanned.length + duplicate_total

    puts "\n================================== Scan Complete ==================================\n".light_green
    puts " - Files Scanned: #{files_total}".light_white
    puts " - Duplicates Identified: #{duplicate_total}".light_white
    puts " - File Paths:\n".light_white

    # * Log result to console
    # ? md5_hash {String}
    # ? file_paths {Array<Pathname>}
    groups.each_pair do |md5_hash, file_paths|
      file_paths.each do |file_path|
        puts
        puts "========================================== Duplicate File (#{file_path.basename}) ==========================================".light_yellow
        puts ' - Path: '.light_white + file_path.realpath.to_s.light_blue
        puts ' - Name: '.light_white + file_path.basename.to_s.light_yellow
        puts ' - Type: '.light_white + file_path.extname.to_s.light_magenta
        puts ' - Size: '.light_white + file_path.size.to_s.light_green
        puts ' - MD5 Hash: '.light_white + md5_hash.light_cyan

        # * If file deletion is enabled, remove the file and log to console.
        next unless option(:delete_duplicates)

        puts ' NOTICE: Duplicate deletion enabled. Deleting file.'.light_red
        file_path.delete
      end
    end

    # * Log execution time to console (one measurement, shown in both units).
    elapsed = Time.now - scan.start_time
    puts "\n✔ Scanned #{files_total} files in #{elapsed.round(4)} seconds (#{(elapsed / 60.0).round(2)} minutes).".light_green
  end
end
# ? Sample CLI Usage:
# ? Scan current directory for duplicate files (report only, nothing is deleted):
# ? - ruby find-duplicate-files.rb .
# ?
# ? Scan recursively and delete every redundant copy:
# ? - ruby find-duplicate-files.rb --recursive --delete .
# ?
# ? Scan current directory for duplicate files (execute as bash command)
# ? - find-duplicate-files .
# ?
# ? Switches (all off unless given): --delete, --recursive, --verbose

# * Arguments starting with `--` are switches; the first remaining argument is the
# * directory to scan.
cli_switches, cli_paths = ARGV.partition { |arg| arg.start_with?('--') }

# * Report typos instead of silently ignoring them.
unknown_switches = cli_switches - %w[--delete --recursive --verbose]
warn "Ignoring unknown option(s): #{unknown_switches.join(', ')}" unless unknown_switches.empty?

# + Identify duplicate files in provided directory file path.
DuplicateFiles.identify(
  # = target_file_path {String}
  # * File path/directory (relative or absolute) to start the search from.
  target_file_path: cli_paths.first,
  # = delete_duplicates {Boolean}
  # * Toggles deletion of all identified duplicate files.
  # ? Destructive, so it must be requested explicitly with --delete.
  delete_duplicates: cli_switches.include?('--delete'),
  # = recursive {Boolean}
  # * Toggles recursive search functionality.
  # ? Enabled with --recursive.
  recursive: cli_switches.include?('--recursive'),
  # = verbose {Boolean}
  # * Toggle verbose logging to console.
  # ? Enabled with --verbose.
  verbose: cli_switches.include?('--verbose')
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment