Skip to content

Instantly share code, notes, and snippets.

@schplurtz
Created May 17, 2023 13:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save schplurtz/72916c5f820bdf478ce8b584e7d3813e to your computer and use it in GitHub Desktop.
Save schplurtz/72916c5f820bdf478ce8b584e7d3813e to your computer and use it in GitHub Desktop.
perl script to find files with same md5 fingerprint
#!/bin/sh
# encoding: utf-8
#!/usr/bin/perl
# Shell/perl polyglot bootstrap. When this file is started by /bin/sh, the
# shell executes the eval below and replaces itself with perl: "-x" tells perl
# to skip everything before the "#!...perl" line and "-S" makes it look the
# script up through PATH. Perl itself reads the two lines as one statement
# guarded by $running_under_some_shell, which is never set in this file, so
# under perl the eval is not executed.
eval 'exec perl -x -S "$0" ${1+"$@"}'
if $running_under_some_shell;
# Ce fichier est la propriété exclusive de Schplurtz le Déboulonné.
# Copyright © 2009-2023 Schplurtz le Déboulonné <Schplurtz -AT- laposte • net>
# Fichier sous licence CeCILL 2.1.
# Distributed under the CeCILL 2.1 license.
# https://cecill.info/licences/Licence_CeCILL_V2.1-en.html
# https://cecill.info/licences/Licence_CeCILL_V2.1-fr.html
$main::VERSION='1.2';
use strict;
use warnings;
use File::Find;
use Digest::MD5 qw( md5_hex );
use Getopt::Std;
use POSIX qw(strftime);
use Data::Dumper;
# --- Global state shared by the File::Find callback and the main script ---

# First-pass groups: quick digest (followed by " <size>" with -S) => [ file names ].
my %mdsums;

# Groups that end up being printed: digest => [ file names ].
my %mdsumsure;

# Declared here; not referenced by the code visible in this file.
my %size;

# Long-listing data (-L): file name => [ mode, nlink, uid, gid, size, mtime ].
my %stats;

# Directory most recently echoed to STDERR in verbose mode.
my $dir;

# Verbose-mode counters: groups holding duplicates, and files in such groups.
my $same  = 0;
my $total = 0;

# Run-time settings. These are the defaults; the option parsing further down
# overrides them from the command line.
my %conf = (
    verb      => 0,            # -v : progress messages on STDERR
    size      => 20 * 1024,    # -s : number of bytes digested at the start of a file
    printmd5  => 1,            # 1: digest heads each group, 0: never (-M), 2: on every line (-m)
    printmax  => 0,            # -n : files printed per group, 0 = no limit
    readwhole => 1,            # cleared by -w : digest whole files in a second pass
    printsize => 0,            # -S : file size becomes part of the grouping key
    fromend2  => 0,            # -e : also digest a chunk taken from the end of the file
    Long      => 0,            # -L : long listing
    long      => 0,            # not read by the code visible in this file
);
# somme - File::Find "wanted" callback used for the first pass.
#
# File::Find calls it once per directory entry with $_ set to the entry's
# base name, $File::Find::dir to the directory being visited and
# $File::Find::name to the full path. It takes no arguments and its return
# value is not used.
#
# For every plain file (symbolic links are skipped) it:
#   * records [ mode, nlink, uid, gid, size, mtime ] in %stats when
#     $conf{'Long'} is set;
#   * computes the MD5 digest of the first $conf{'size'} bytes, extended with
#     a chunk from the end of the file when $conf{'fromend2'} is set;
#   * appends " <size>" to that digest when $conf{'printsize'} is set;
#   * pushes the full path onto the matching group in %mdsums;
#   * in verbose mode, keeps $same (groups with duplicates) and $total
#     (files in such groups) up to date.
#
# Files that cannot be opened or read are reported with warn() and left out
# of %mdsums, so they are never grouped with genuinely empty files.
sub somme {
    use Fcntl qw(SEEK_END);

    # Echo each directory once; $dir is still undefined on the first call.
    if ( $conf{'verb'} && ( !defined $dir || $dir ne $File::Find::dir ) ) {
        print STDERR "\r$File::Find::dir";
        $dir = $File::Find::dir;
    }

    return if -l $_ || !-f $_;

    # stat(_) reuses the stat buffer filled by the -f test just above.
    my ( $mode, $nlink, $uid, $gid, $size, $mtime ) = ( stat(_) )[ 2, 3, 4, 5, 7, 9 ];
    $stats{$File::Find::name} = [ $mode, $nlink, $uid, $gid, $size, $mtime ]
        if $conf{'Long'};

    # Three-argument open: the file name is never interpreted as a mode, a
    # pipe or a dup specification, and surrounding whitespace is preserved.
    my $fic;
    if ( !open( $fic, '<', $_ ) ) {
        warn "\nCan't open file ${File::Find::name} : $!\n";
        return;
    }
    binmode $fic;

    my $data = '';
    if ( !defined sysread( $fic, $data, $conf{'size'} ) ) {
        warn "\nCan't read file ${File::Find::name} : $!\n";
        close $fic;
        return;
    }

    if ( $conf{'fromend2'} && $size > $conf{'size'} ) {

        # Read at most $conf{'size'} bytes from the end, but never more than
        # what the first read left untouched, so that a file shorter than
        # twice the chunk size is digested exactly once from start to end.
        my $unread = $size - $conf{'size'};
        my $toread = $unread < $conf{'size'} ? $unread : $conf{'size'};

        # sysseek, not seek: the handle is only ever used with sysread.
        my $tail = '';
        if (   defined sysseek( $fic, -$toread, SEEK_END )
            && defined sysread( $fic, $tail, $toread ) )
        {
            $data .= $tail;
        }
        else {
            warn "\nCan't read end of file ${File::Find::name} : $!\n";
        }
    }
    close $fic;

    my $sum = md5_hex($data);

    # The size comes from the stat done above; no second stat is needed.
    $sum .= ' ' . $size if $conf{'printsize'};

    push @{ $mdsums{$sum} }, $File::Find::name;

    # The second member of a group reveals a new group and accounts for two
    # files; every further member adds one more file.
    if ( $conf{'verb'} && @{ $mdsums{$sum} } > 1 ) {
        $total++;
        if ( @{ $mdsums{$sum} } == 2 ) {
            $same++;
            $total++;
        }
    }
    return;
}
# Main program, step 1: parse the command line into %conf, then walk the
# directories given as arguments (default: the current directory) with
# File::Find, which calls somme() for every entry found.
MAIN:
$Getopt::Std::STANDARD_HELP_VERSION=1;
my %opts;

# getopts returns false when it meets an unknown option or a missing option
# argument; stop instead of running with a half-understood command line.
getopts( 'vs:MLmSn:we', \%opts )
    or die "Usage: $0 [-v] [-s size] [-n max] [-M] [-m] [-S] [-w] [-e] [-L] [directory...]\n";

# -s: chunk size in KiB. A non-numeric value would silently become 0 (every
# file would then share one digest) and a negative one would make sysread die,
# so reject both. 0 keeps the built-in default, as before.
if ( defined $opts{'s'} ) {
    die "$0: option -s expects a non-negative number of KiB, got '$opts{'s'}'\n"
        unless $opts{'s'} =~ /\A(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)\z/;
    $conf{'size'} = int( 1024 * $opts{'s'} ) if $opts{'s'} > 0;
}

# -n: maximum number of files printed per group; 0 means no limit.
if ( defined $opts{'n'} ) {
    die "$0: option -n expects an integer, got '$opts{'n'}'\n"
        unless $opts{'n'} =~ /\A[-+]?[0-9]+\z/;
    $conf{'printmax'} = int( $opts{'n'} ) if $opts{'n'};
}

$conf{'verb'} = $opts{'v'} ? 1 : 0;
$conf{'Long'} = $opts{'L'} ? 1 : 0;
$conf{'printmd5'} = 0 if $opts{'M'};    # -M: never print the digest
$conf{'printmd5'} = 2 if $opts{'m'};    # -m: digest on every line (wins over -M)
$conf{'readwhole'} = 0 if $opts{'w'};   # -w: trust the quick digest, skip pass 2

# -S and -e only make sense together with -w; they are ignored otherwise.
$conf{'printsize'} = ( $opts{'S'} && $opts{'w'} ) ? 1 : 0;
$conf{'fromend2'}  = ( $opts{'e'} && $opts{'w'} ) ? 1 : 0;

push @ARGV, '.' if 0 == @ARGV;
find( \&somme, @ARGV );
# Main program, step 2: turn the first-pass groups (%mdsums) into the final
# groups (%mdsumsure).
#
# With $conf{'readwhole'} set, every file that shares its quick digest with
# at least one other file is digested in full and regrouped by that full
# digest. Otherwise (-w) the quick-digest groups holding more than one file
# are taken over unchanged. In verbose mode progress goes to STDERR.
print STDERR "\npass 2\ngroups to examine : $same\nfiles to examine : $total\n"
    if $conf{'verb'};
my $firsttotal = $total;
if ( $conf{'readwhole'} ) {
    $same = $total = 0;
    my $n   = 1;
    my $md5 = Digest::MD5->new;
    foreach my $k ( keys %mdsums ) {
        next if @{ $mdsums{$k} } <= 1;
        foreach my $f ( @{ $mdsums{$k} } ) {
            if ( $conf{'verb'} ) {
                print STDERR "\r$n/$firsttotal";
                $n++;
            }

            # Three-argument open: $f was collected from the file system and
            # must never be parsed as a mode or a command (a file called
            # "cmd |" would otherwise be executed, ">x" would truncate x).
            my $fic;
            if ( !open( $fic, '<', $f ) ) {
                warn "\nCan't open file $f : $!\n";
                next;
            }
            binmode $fic;

            # addfile() croaks when reading fails; skip that file only
            # instead of aborting the whole run.
            $md5->reset;
            my $sum = eval { $md5->addfile($fic)->hexdigest };
            close $fic;
            if ( !defined $sum ) {
                warn "\nCan't read file $f : $@";
                next;
            }

            push @{ $mdsumsure{$sum} }, $f;

            # Same bookkeeping as in the first pass: the second member of a
            # group counts the group and two files, later members one file.
            if ( $conf{'verb'} && @{ $mdsumsure{$sum} } > 1 ) {
                $total++;
                if ( @{ $mdsumsure{$sum} } == 2 ) {
                    $same++;
                    $total++;
                }
            }
        }
    }
}
else {
    my $n = 0;
    foreach my $sum ( keys %mdsums ) {
        if ( @{ $mdsums{$sum} } > 1 ) {
            $mdsumsure{$sum} = $mdsums{$sum};
            if ( $conf{'verb'} ) {
                $n += @{ $mdsums{$sum} };
                print STDERR "\r$n/$firsttotal";
            }
        }
    }
}
print STDERR "\nAt last\ngroups : $same\nfiles : $total\n"
    if $conf{'verb'};
# Main program, step 3: print every final group that does not consist of a
# single file.
#
# $conf{'printmd5'} selects the layout: 1 prints the digest on its own line
# followed by tab-indented file names, 2 prefixes every file name with the
# digest, 0 prints file names only. With $conf{'Long'} each name is preceded
# by mode, link count, uid, gid, size and modification time taken from %stats.
# $conf{'printmax'} limits the number of files printed per group (0: no limit).
my $tab = '';
$tab = "\t" if $conf{'printmd5'} == 1;

foreach my $digest ( keys %mdsumsure ) {
    my $members = $mdsumsure{$digest};
    next if @{$members} == 1;

    print "$digest\n" if $conf{'printmd5'} == 1;

    # Counts down to zero; starting from 0 it never gets there, hence "no limit".
    my $left = $conf{printmax};
    foreach my $file ( @{$members} ) {
        my $line = '';
        $line .= "$digest " if $conf{'printmd5'} == 2;
        $line .= $tab;
        if ( $conf{'Long'} ) {
            $line .= sprintf( "%04o %2d %5d %5d %10s ",
                $stats{$file}[0] & 07777,
                @{ $stats{$file} }[ 1 .. 4 ] );
            $line .= sprintf( "%24s\t", scalar localtime( $stats{$file}[5] ) );
        }
        print $line . "$file\n";

        --$left;
        last if $left == 0;
    }
}
__END__
=encoding utf-8
=head1 NAME
double-doc - find probably duplicate files in directory trees
=head1 SYNOPSIS
double-doc [--help] [--version] [-v] [-s size] [-n max] [-M] [-m] [-S] [-w] [-e] [-L] [directory directory...]
=head1 DESCRIPTION
C<double-doc> is a little perl script that tries to find files with
probably the same content within some directories and their
subdirectories. It does so by computing the I<MD5> sum of the first
20 KiB (adjustable via an option) of each file; then, for files whose
beginnings have the same I<MD5> sum, it computes the I<MD5> sum of the
whole file. Files that happen to have the same I<MD5> sum are printed
in paragraphs, each paragraph preceded by the I<MD5> sum. It is important
to realize that C<double-doc> finds files whose contents have the same
I<MD5> sum, B<NOT> files that have identical content. Although this is
unlikely to happen, two or more completely different files may have the
same I<MD5> sum.
Sample default output:
a7e40313ff29aa57bc2ee1ec6da78831
/usr/share/doc/evolution-data-server-common/changelog.Debian.gz
/usr/share/doc/libedataserver1.2-9/changelog.Debian.gz
/usr/share/doc/libegroupwise1.2-13/changelog.Debian.gz
/usr/share/doc/libgdata1.2-1/changelog.Debian.gz
be3c3fe9427010bee4a0493b16808f44
/usr/share/doc/libgphoto2-2/copyright
/usr/share/doc/libgphoto2-port0/copyright
dde08874a41ed02d26649c228bbaea8c
/usr/share/doc/iputils-arping/RELNOTES.gz
/usr/share/doc/iputils-ping/RELNOTES.gz
/usr/share/doc/iputils-tracepath/RELNOTES.gz
=head1 OPTIONS
=over
=item --help
Give help and terminate.
=item --version
Print versions and exit.
=item -v
Enter verbose mode. All verbose messages are printed on standard
error. In verbose mode, directories are printed as they are searched
during the first pass. Then the number of groups and files found so
far is printed. During the second pass, a counter indicates the number
of files processed.
=item -s size
Set the size, in KiB, of the beginning of each file used for the
computation of the I<MD5> sum.
=item -M
Don't print the I<MD5> sum; print nothing instead.
=item -m
Print the I<MD5> sum on each line, like this:
be3c3fe9427010bee4a0493b16808f44 /usr/share/doc/libgphoto2-2/copyright
be3c3fe9427010bee4a0493b16808f44 /usr/share/doc/libgphoto2-port0/copyright
dde08874a41ed02d26649c228bbaea8c /usr/share/doc/iputils-arping/RELNOTES.gz
dde08874a41ed02d26649c228bbaea8c /usr/share/doc/iputils-ping/RELNOTES.gz
dde08874a41ed02d26649c228bbaea8c /usr/share/doc/iputils-tracepath/RELNOTES.gz
=item -w
Do not read the whole file in the second phase. Assume that files that
have the same I<MD5> sum for their beginnings are identical.
=item -e
When B<-w> is also set, the I<MD5> sum is computed for the beginning and
the end of the file. Files that have the same beginning but a different
end should then have different I<MD5> sums.
=item -S
When B<-w> is in use, also stat every file and use the file size as
part of the key to group files. It is intended so that files with the same
beginning but different sizes are recognized as different. The output is
slightly different from the default output. Each group is preceded by the
computed I<MD5> sum, a space, and the file size, like this:
dde08874a41ed02d26649c228bbaea8c 7228
/usr/share/doc/iputils-arping/RELNOTES.gz
/usr/share/doc/iputils-ping/RELNOTES.gz
/usr/share/doc/iputils-tracepath/RELNOTES.gz
=item -L
B<-L> is the “long listing” mode. In this mode, the file's mode bits,
number of links, uid, gid, size and mtime are printed before the
filename. Each line starts with a tab character (ASCII 9), then
the file information fields separated by blanks, then another tab
character and finally the filename.
82b26659c7b1a979fa288b4f88c78e59
0644 1 0 0 4659 Sat Feb 5 05:10:38 2022 /usr/share/doc/iputils-ping/copyright
0644 1 0 0 4659 Sat Feb 5 05:10:38 2022 /usr/share/doc/iputils-tracepath/copyright
=begin none
=item -S
When B<-w> is in use, prints on a line the size of the file, a space,
then the filename, like this :
dde08874a41ed02d26649c228bbaea8c
7228 /usr/share/doc/iputils-arping/RELNOTES.gz
7228 /usr/share/doc/iputils-ping/RELNOTES.gz
7228 /usr/share/doc/iputils-tracepath/RELNOTES.gz
=end none
=item -n max
Print only the first B<max> files of each group. 0 means infinity.
=back
=head1 EXAMPLE
double-doc -w -e -S -s 50 /dir1 /dir2 /dir3
=head1 AUTHOR
Schplurtz le Déboulonné <schplurtz@laposte.net>
=cut
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment