Skip to content

Instantly share code, notes, and snippets.

@edt11x
Last active January 12, 2019 19:27
Show Gist options
  • Save edt11x/d738532c077ea15aba7f5b1a251b56bf to your computer and use it in GitHub Desktop.
Save edt11x/d738532c077ea15aba7f5b1a251b56bf to your computer and use it in GitHub Desktop.
A Perl version of the script to delete shallow file copies with more options.
#!/usr/bin/env perl
use strict;
use warnings;
use Digest::SHA qw(sha1_base64 sha256_base64 sha512_base64);
use File::Basename;
use File::Find ();
use Getopt::Long;
use IO::All;
use Pod::Usage;
use Scalar::MoreUtils qw( empty );
# Set the variable $File::Find::dont_use_nlink if you're using AFS,
# since AFS cheats.
# for the convenience of &wanted calls, including -eval statements:
use vars qw/*name *dir *prune/;
*name = *File::Find::name;
*dir = *File::Find::dir;
*prune = *File::Find::prune;
my ( $man,$help,$verbose,$thisFile );
my $startDir = ".";
my %fileHash;
my ( @a, @pruneList );
sub prompt {
my ($query) = @_; # take a prompt string as argument
local $| = 1; # activate autoflush to immediately show the prompt
print $query;
chomp(my $answer = <STDIN>);
return $answer;
}
sub prompt_yn {
my ($query) = @_;
my $answer = prompt("$query (y/n): ");
return lc($answer) eq 'y';
}
sub wanted {
my ($dev,$ino,$mode,$nlink,$uid,$gid);
my $pruneThis = 0;
$thisFile = $_;
foreach my $thisPrune (@pruneList) {
if (/^\Q$thisPrune\E\z/s) {
$pruneThis = 1;
print("Pruning $thisFile\n") if $verbose;
}
}
if ($pruneThis) {
$File::Find::prune = 1;
} elsif ((($dev,$ino,$mode,$nlink,$uid,$gid) = lstat($thisFile)) &&
(-f $thisFile) && (-r $thisFile)) {
# for each file, we need to collect in a list of information
$fileHash{$name}{sha1} = sha1_base64(io("$thisFile")->all);
$fileHash{$name}{sha256} = sha256_base64(io("$thisFile")->all);
$fileHash{$name}{sha512} = sha512_base64(io("$thisFile")->all);
$fileHash{$name}{fileSize} = -s $thisFile;
if ($verbose) {
print("$name\n");
print(" Size : ", $fileHash{$name}{fileSize}, "\n");
print(" SHA-1 : ", $fileHash{$name}{sha1}, "\n");
print(" SHA-256 : ", $fileHash{$name}{sha256}, "\n");
print(" SHA-512 : ", $fileHash{$name}{sha512}, "\n");
print("\n");
}
}
}
GetOptions("help|?" => \$help,
"man" => \$man,
"start=s" => \$startDir,
"exclude=s{2}" => \@a,
"prune=s" => \@pruneList,
"verbose" => \$verbose) or pod2usage(2);
pod2usage(-exitval => 0) if $help;
pod2usage(-exitval => 0, -verbose => 2) if $man;
pod2usage(-msg => "Unexpected argument : " . $ARGV[0], -exitval => 3) if (@ARGV != 0);
print("Starting File Find...\n") if ($verbose);
# Traverse desired filesystems
File::Find::find({wanted => \&wanted}, $startDir);
print("File Find DONE.\n\n") if ($verbose);
print("Starting File Match...\n") if ($verbose);
# for each file
foreach my $key (keys %fileHash) {
# see if we can find a match
# The psuedo code for this
#
# for each file
# if there is a file with a matching basename whose path is longer
# if the SHA256 hash of the two files match
# ask if you want to delete the more shallow file, the shorter file path
#
foreach my $matchKey (keys %fileHash) {
if ((basename($key) eq basename($matchKey)) &&
(!empty(basename($key))) &&
(!empty(basename($matchKey))) &&
($key ne $matchKey)) {
my $excludeThisCompare = 0;
# loop through the excluded combinations
for (my $i = 0; $i < $#a; $i += 2) {
if ((!empty($a[$i])) &&
(!empty($a[$i+1]))) {
print("Checking exclusion ", $a[$i], " vs ", $a[$i+1], "\n") if $verbose;
if (($a[$i] eq substr($key, 0, length($a[$i]))) &&
($a[$i+1] eq substr($matchKey, 0, length($a[$i+1])))) {
$excludeThisCompare = 1;
}
if (($a[$i] eq substr($matchKey, 0, length($a[$i]))) &&
($a[$i+1] eq substr($key, 0, length($a[$i+1])))) {
$excludeThisCompare = 1;
}
}
}
print "This match is excluded\n$key\n$matchKey\n\n" if (($verbose) && ($excludeThisCompare));
# note we need to check again if the file exists,
# because we may have deleted it.
# I really really want to make sure these are the
# same files, that they have the same contents.
# So I do overly redundant checks to make sure the
# files are the same.
if ((!$excludeThisCompare) &&
(exists $fileHash{$key}{sha1}) &&
(exists $fileHash{$matchKey}{sha1}) &&
($fileHash{$key}{sha1} eq $fileHash{$matchKey}{sha1}) &&
(exists $fileHash{$key}{sha256}) &&
(exists $fileHash{$matchKey}{sha256}) &&
($fileHash{$key}{sha256} eq $fileHash{$matchKey}{sha256}) &&
(exists $fileHash{$key}{sha512}) &&
(exists $fileHash{$matchKey}{sha512}) &&
($fileHash{$key}{sha512} eq $fileHash{$matchKey}{sha512}) &&
(exists $fileHash{$key}{fileSize}) &&
(exists $fileHash{$matchKey}{fileSize}) &&
($fileHash{$key}{fileSize} == $fileHash{$matchKey}{fileSize}) &&
(-f $key) && (-f $matchKey)) {
print "\nFile match\n$key\n$matchKey\n\n";
if (length($key) >= length($matchKey)) {
if (prompt_yn("\n$matchKey appears to be the shallow path, delete this path (y/n) ? ")) {
print "unlink($matchKey)\n" if ($verbose);
unlink($matchKey);
}
} else {
if (prompt_yn("\n$key appears to be the shallow path, delete this path (y/n) ? ")) {
print "unlink($key)\n" if ($verbose);
unlink($key);
}
}
print("\n");
}
}
}
}
print("File Match DONE.\n\n") if ($verbose);
print("All Done.\n\n") if ($verbose);
exit(0);
__END__
MANUAL_PAGE
=head1 NAME
deleteShallowFileCopies.pl - delete shallow copies of file duplicates
=head1 SYNOPSIS
deleteShallowFileCopies.pl
This perl script interactively deletes shallow copies of files that are
duplicates of other files that are stored deeper in the file system. It
is not uncommon for me to decide to organize an overly crowded directory
into subdirectories. It is also not uncommon for me to leave copies behind.
This perl scripts looks at each file and looks to see if there is a duplicate
deeper in the file system. Since some of my file sync operations such as
Dropbox modify file times, I need to do this comparison based on something
more intrinsic like a sha256 hash of the file.
There are some extra checks in here, but I really want to know that there
are two reliable copies before I delete one of them.
EXAMPLES
$ cd some_directory_to_evaluate
$ deleteShallowFileCopies.sh
remove file foo.bar?
$ deleteShallowFileCopies.sh --start . --exclude ./a ./b --prune .DS_Store
$ deleteShallowFileCopies.sh --exclude ./a ./b --exclude ./c ./d
Options:
--help brief help message
--man manual page
--verbose info about everything, even in the into dir
--start dir_to_start_in specify the directory to start in
--exclude ./a ./b exclude comparing files in directories a & b
--prune name_to_prune directory or file names to disregard
=head1 OPTIONS
=over 8
=item B<help>
Print a brief help message and exit
=item B<man>
Print the manual page and exit
=item B<verbose>
This will cause something to be printed about every file, even the ones in the
into directory.
=item B<start>
The argument to this will specify the directory to start in.
=item B<exclude>
This takes a pair of arguments, two directories that you do not want to compare. For
example if you do not want to compare directories ./a and ./b you can add the
argument "--exclude ./a ./b". Any other combinations will be compared, for example
"./c" and "./a". You can specify the exclude argument many times with separate pairs.
=item B<prune>
This takes an argument, a directory or a file name that we will truncate or prune
the search when that name is encountered. If I specify "--prune .svn", it will not
evaluate any of the files or directories under any directory named ".svn".
=back
=head1 DESCRIPTION
This perl script interactively deletes shallow copies of files that are
duplicates of other files that are stored deeper in the file system.
=cut
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment