Last active
January 27, 2018 00:55
-
-
Save yesmar/7292f0ebd133dee79784e4483278c3ab to your computer and use it in GitHub Desktop.
Ruby script to find and remove duplicate files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# dupes.rb — Given one or more target directories, identify duplicate files through
# cryptographic hashing. The exit status is the number of duplicates identified.
# Copyright © 2016,2017, Ramsey Dow. All rights reserved.
# SPDX-License-Identifier: BSD-2-Clause
# 20160811 yesmar@gmail.com
# 20160822 added color output and deletion support
# 20160829 fixed crash on nonexistent directory bug
# 20171215 fixed several serious problems
require 'digest/sha2'
require 'find'
require 'optparse'
require 'set'
# Smartly erase trailing content.
#
# Overwrites +previous_length+ columns at the cursor position with spaces and
# then emits the same number of backspaces, so the cursor ends up where it
# started and stale progress text is blanked out.
#
# @param previous_length [Integer, nil] number of columns to blank out; nil or
#   a non-positive value prints nothing
# @return [nil]
def clean_up_line(previous_length)
  return nil if previous_length.nil? || previous_length <= 0

  print ' ' * previous_length
  print "\x08" * previous_length
  nil
end
# Name used in usage and error messages: this script's filename without ".rb".
SCRIPT = File.basename(__FILE__, '.rb')
# Pause after each progress update while scanning directories.
DELAY = 1.0 / 24.0 # 24fps
# Record describing one duplicate file: its pathname and its digest.
# Registered under the Struct namespace, i.e. Struct::Duplicate.
Struct.new('Duplicate', :pathname, :digest)

# Runtime switches; the command-line flags parsed below override these defaults.
options = {
  delete: false,
  quiet: false,
  verbose: false
}

# Color output is optional: it is enabled only when the colorize gem loads.
begin
  require 'colorize'
  options[:color] = true
rescue LoadError
  options[:color] = false
end

optparse = OptionParser.new do |opts|
  # The banner lists the switches that are actually defined below.
  opts.banner = "Usage: #{SCRIPT} [-d] [-n] [-q | -v] <directory> […]"
  opts.on('-d', '--delete', 'Delete duplicates (permanent and uncaring)') { options[:delete] = true }
  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
  opts.on('-n', '--no-color', 'Disable color output') { options[:color] = false }
  opts.on('-q', '--quiet', 'Output less information') { options[:quiet] = true }
  opts.on('-v', '--verbose', 'Output more information') { options[:verbose] = true }
end

# parse! removes recognized switches from ARGV, leaving only the directories.
# An unknown or malformed switch is reported as a usage error instead of
# terminating the script with an uncaught exception.
begin
  optparse.parse!
rescue OptionParser::ParseError => e
  puts "#{SCRIPT}: #{e.message}"
  puts optparse
  exit 1
end

if options[:quiet] && options[:verbose]
  puts "#{SCRIPT}: the -q and -v options are mutually exclusive"
  exit 1
end

if ARGV.empty?
  puts "#{SCRIPT}: please specify at least one directory"
  exit 1
end
# Explain why +target+ cannot be scanned.
#
# @param target [String] path given on the command line or queued from a symlink
# @return [String, nil] "not found", "not a directory" or "permission denied";
#   nil when the target exists, is a directory and is readable
def target_problem(target)
  return 'not found' unless File.exist?(target)
  return 'not a directory' unless File.directory?(target)
  return 'permission denied' unless File.readable?(target)

  nil
end

# Show +path+ on the progress line, blanking whatever remains of a longer
# previous entry, then return the cursor to the start of the line.
#
# @param path [String] directory currently being scanned
# @param previous_length [Integer] width of the text printed by the previous call
# @return [Integer] width of the text just printed
def show_progress(path, previous_length)
  text = "≫ #{path}"
  print text
  leftover = previous_length - text.length
  if leftover > 0
    print ' ' * leftover
    print "\x08" * leftover
  end
  sleep DELAY
  print "\r"
  text.length
end

# Decide whether the file at +path+ duplicates one that was already seen.
#
# The first file with a given SHA256 digest is remembered in +files+; any later
# file with the same digest is a duplicate. A path that resolves (through
# symlinks) to a file that was already examined is skipped, so a file is never
# reported as a duplicate of itself.
#
# @param path [String] candidate path (anything that is not a directory)
# @param files [Hash{String=>String}] digest => first pathname seen; updated in place
# @param seen [Set<String>] real paths already examined; updated in place
# @param verbose [Boolean] print the reason whenever a path is skipped
# @return [String, nil] the hex digest when +path+ is a duplicate, otherwise nil
def duplicate_digest(path, files, seen, verbose)
  # Ensure file is regular and has non-zero length.
  unless File.file?(path)
    puts "#{path}: not a regular file; skipping…" if verbose
    return nil
  end
  if File.zero?(path)
    puts "#{path}: empty file; skipping…" if verbose
    return nil
  end
  # Set#add? returns nil when the real path is already present.
  unless seen.add?(File.realpath(path))
    puts "#{path}: already examined; skipping…" if verbose
    return nil
  end

  digest = Digest::SHA256.file(path).hexdigest
  return digest if files.key?(digest)

  files[digest] = path
  nil
rescue SystemCallError => e
  # The file vanished or could not be read; skip it and keep scanning.
  puts "#{path}: #{e.message}" if verbose
  nil
end

files = {}           # digest => pathname of the first file seen with that content
dupes = []           # Struct::Duplicate records, in discovery order
previous_length = 0  # width of the progress text currently on screen
seen = Set.new       # real paths of directories and files already examined
search_dirs = ARGV

until search_dirs.empty?
  current = search_dirs.shift

  # Ensure target exists, is a directory and is readable.
  problem = target_problem(current)
  if problem
    puts "#{current}: #{problem}" unless options[:quiet]
    next
  end

  # Recursively process target directories.
  begin
    Find.find(current) do |path|
      # Ensure path is readable.
      unless File.readable?(path)
        puts "#{path}: permission denied" if options[:verbose]
        next
      end

      if File.directory?(path)
        if File.symlink?(path)
          # Find.find does not descend into symbolically linked directories,
          # so queue the link's target; it is checked against `seen` when its
          # own scan starts, which also stops symlink cycles.
          previous_length = show_progress(path, previous_length) unless options[:quiet]
          search_dirs << File.realdirpath(path)
        elsif seen.add?(File.realpath(path))
          # Print out the directory we are currently scanning…
          previous_length = show_progress(path, previous_length) unless options[:quiet]
        else
          # Already walked (overlapping arguments or a symlink into a scanned
          # tree): skip the whole subtree so its files are not hashed twice.
          Find.prune
        end
      else
        # The file target is a duplicate if we have seen its digest before.
        digest = duplicate_digest(path, files, seen, options[:verbose])
        dupes << Struct::Duplicate.new(path, digest) if digest
      end
    end
  rescue SystemCallError => e
    # Unreadable targets and paths are already filtered by the checks above.
    # If we get here then it's likely we are running on macOS and have hit
    # something protected by SIP. We ignore the error unless running in
    # verbose mode and move on to the next target.
    clean_up_line(previous_length) unless options[:quiet]
    puts "#{current}: #{e}" if options[:verbose]
  end
end
# Erase whatever is left of the progress line before printing results.
clean_up_line(previous_length) unless options[:quiet]

# Report (and possibly delete) duplicates…
dupes.each do |dupe|
  unless options[:quiet]
    original = files[dupe.digest]
    if options[:color]
      print "#{dupe.digest} #{dupe.pathname.red} duplicates #{original.green}"
    else
      print "#{dupe.digest} #{dupe.pathname} duplicates #{original}"
    end
  end

  if options[:delete]
    # File.delete raises on failure instead of returning a status, so the
    # failure has to be rescued for the ERROR marker to ever be shown.
    outcome =
      begin
        File.delete(dupe.pathname)
        ' ≫ DELETED'
      rescue SystemCallError
        ' ≫ ERROR'
      end
    print(options[:color] ? outcome.red : outcome) unless options[:quiet]
  end

  puts unless options[:quiet]
end

# The exit status is the number of duplicates identified. Only the low 8 bits
# of an exit status reach the parent process, so cap it at 255; otherwise 256
# duplicates would be reported as 0 (success).
exit [dupes.size, 255].min
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This latest version fixes several serious defects:
- `-v`/`--verbose` switch so you can see more error diagnostics
- `Find.find` doesn't follow symlinks
- `EACCES` errors resulted in a crash
- `EACCES` errors raised as a result of SIP on macOS caused a crash