Skip to content

Instantly share code, notes, and snippets.

@putnamhill
Last active June 19, 2021 14:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save putnamhill/8740809 to your computer and use it in GitHub Desktop.
Save putnamhill/8740809 to your computer and use it in GitHub Desktop.
Print groups of files that are duplicates.
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
use Digest::MD5;
#use diagnostics;
# ---------------------------------------------------------------------------
# Main script: parse the command line, digest every candidate file via
# group_dupe(), then print each group of files sharing an MD5 digest.
# ---------------------------------------------------------------------------

# Minimum number of duplicates (files beyond the first one) a group must
# contain to be printed.  1 means "any group of two or more identical files",
# which is what the help text documents as the default.
my $minimum = 1;

# Output decoration.  The defaults produce the plain-text report; --xml
# replaces them further down.
my $header       = '';
my $footer       = '';
my $group_open_a = 'duplicate files (md5: ';
my $group_open_b = ')';
my $group_close  = '';
my $file_open    = '';
my $file_close   = '';

# digest (hex string) => array ref of paths with that digest.
# Filled in by group_dupe().
my %md5_hash = ();

my $help;
my $xml;

# GetOptions() already prints a diagnostic for each bad option; stop instead
# of carrying on with a command line that was not understood.
GetOptions(
    'help|h'      => \$help,
    'minimum|m=i' => \$minimum,
    'xml|x'       => \$xml,
) or die "Try 'group-dupes.pl --help' for usage.\n";

if (defined $help) {
    print <<'EOT';
Usage: group-dupes.pl [options] file1 file2 ...
Print groups of files that are duplicates.
If no files are passed on the command line, files are read from stdin (tip: feed with find).
Anything that is not a regular file is ignored.
Options:
-m, --minimum minimum number of duplicates to print, default is 1
-x, --xml print as xml
-h, --help print this message
EOT
    exit;
}

if (defined $xml) {
    $header       = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<duplicates>\n";
    $footer       = "</duplicates>\n";
    $group_open_a = '<group md5="';
    $group_open_b = '">';
    $group_close  = "</group>\n";
    $file_open    = '<file>';
    $file_close   = '</file>';
}

if (@ARGV) {
    # Process every file named on the command line.
    for my $path (@ARGV) {
        group_dupe($path);
    }
}
else {
    # Nothing on the command line: read one path per line from standard in.
    while (my $line = <STDIN>) {
        chomp $line;
        group_dupe($line);
    }
}

print $header;

# Sorted so that the report is the same from one run to the next.
for my $md5 (sort keys %md5_hash) {
    my @files = @{ $md5_hash{$md5} };

    # The first file of a group is the "original"; the rest are duplicates.
    my $duplicate_count = @files - 1;
    next if $duplicate_count < $minimum;

    print "$group_open_a$md5$group_open_b\n";
    for my $file (@files) {
        # Paths may contain markup characters; only the XML report needs
        # them escaped.
        my $shown = defined $xml ? _xml_escape($file) : $file;
        print "\t$file_open$shown$file_close\n";
    }
    print $group_close;
}

print $footer;

# Return a copy of $text with the characters that are special in XML
# element content (&, <, >) replaced by their entity references.
sub _xml_escape {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;    # must come first so later entities survive
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}
# Compute the MD5 digest of one file and file its path under that digest.
#
# Parameters:
#   $path - path of a candidate file.
#
# Anything that is not a regular file is silently ignored.  Files that
# cannot be read, opened or digested are reported on STDERR and skipped.
#
# Side effect: on success, appends $path to @{ $md5_hash{$digest} }.
# Returns nothing useful.
sub group_dupe {
    my ($path) = @_;

    return if !-f $path;    # skip anything that's not a regular file

    if (!-r $path) {
        print STDERR "Can't read file: $path ... skipping\n";
        return;
    }

    # The -r test above is only advisory; open() is the real check.  Skip the
    # file on failure, the same way an unreadable file is skipped.
    my $fh;
    if (!open($fh, '<', $path)) {
        print STDERR "Can't open $path: $! ... skipping\n";
        return;
    }
    binmode($fh);

    # addfile() croaks when reading fails, so trap that rather than letting
    # one bad file abort the whole run.  Success is decided by the eval
    # completing, never by the truthiness of the path (a file may be
    # named "0").
    my $digest;
    my $ok = eval {
        $digest = Digest::MD5->new->addfile($fh)->hexdigest;
        1;
    };
    my $error = $ok ? '' : ($@ || 'unknown error');
    close($fh);

    if ($ok) {
        push(@{ $md5_hash{$digest} }, $path);
    }
    else {
        $error =~ s/\s+\z//;    # croak's message already ends in a newline
        print STDERR "Can't make md5 digest of file $path: $error\n";
    }
    return;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment