-
-
Save schplurtz/72916c5f820bdf478ce8b584e7d3813e to your computer and use it in GitHub Desktop.
perl script to find files with same md5 fingerprint
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh | |
# encoding: utf-8 | |
#!/usr/bin/perl | |
eval 'exec perl -x -S "$0" ${1+"$@"}' | |
if $running_under_some_shell; | |
# Ce fichier est la propriété exclusive de Schplurtz le Déboulonné. | |
# Copyright © 2009-2023 Schplurtz le Déboulonné <Schplurtz -AT- laposte • net> | |
# Fichier sous licence CeCILL 2.1. | |
# Distributed under the CeCILL 2.1 license. | |
# https://cecill.info/licences/Licence_CeCILL_V2.1-en.html | |
# https://cecill.info/licences/Licence_CeCILL_V2.1-fr.html | |
$main::VERSION='1.2'; | |
use strict; | |
use warnings; | |
use File::Find; | |
use Digest::MD5 qw( md5_hex ); | |
use Getopt::Std; | |
use POSIX qw(strftime); | |
use Data::Dumper; | |
# Pass-1 candidate groups: partial fingerprint (plus " <size>" with -S) => [ paths ].
my %mdsums;

# Groups that end up in the report: fingerprint => [ paths ].
my %mdsumsure;

# Filled only in long-listing mode: path => [ mode, nlink, uid, gid, size, mtime ].
my %stats;

# Last directory announced in verbose mode. Starts as '' (not undef) so that
# the very first comparison against $File::Find::dir raises no warning.
my $dir = '';

# Verbose-mode statistics: number of groups, and number of files in them.
my ( $same, $total ) = ( 0, 0 );

# Run-time settings. These are the defaults; the command line overrides them.
my %conf = (
    verb      => 0,            # true: progress messages on STDERR
    size      => 20 * 1024,    # bytes of each file hashed during pass 1
    printmd5  => 1,            # 0: never, 1: once per group, 2: on every line
    printmax  => 0,            # max files printed per group, 0 = no limit
    readwhole => 1,            # true: pass 2 hashes the whole file
    printsize => 0,            # true: file size is appended to the group key
    fromend2  => 0,            # true: pass 1 also hashes the end of the file
    Long      => 0,            # true: long listing (mode, links, uid, ...)
    long      => 0,            # not read anywhere in this file
);
# File::Find callback for pass 1: fingerprint the beginning of one file.
#
# find() calls it once per directory entry, with $_ set to the entry name
# relative to the current directory, $File::Find::dir to the directory being
# visited and $File::Find::name to the full path. It takes no arguments and
# its return value is ignored.
#
# Effects on file-level state:
#   %mdsums  - the path is pushed onto the list stored under the fingerprint
#              (md5 of the bytes read, plus " <size>" when printsize is set);
#   %stats   - in long-listing mode, remembers the file's stat fields;
#   $same, $total - verbose-mode counters of groups / files in groups;
#   $dir     - last directory announced in verbose mode.
#
# Files that cannot be opened or read are reported with warn() and skipped.
sub somme {
    # Verbose mode: announce each directory once, when the traversal enters it.
    if ( $conf{'verb'} && ( !defined $dir || $dir ne $File::Find::dir ) ) {
        # ESC [ K erases to the end of the line, wiping a longer previous name.
        print STDERR "\r${File::Find::dir}\e[K";
        $dir = $File::Find::dir;
    }

    # Only regular files are examined; symbolic links are never followed.
    return if -l $_ || !-f $_;

    # stat(_) reuses the stat buffer filled by the -f test just above.
    my ( $mode, $nlink, $uid, $gid, $size, $mtime )
        = ( stat(_) )[ 2, 3, 4, 5, 7, 9 ];
    $stats{$File::Find::name} = [ $mode, $nlink, $uid, $gid, $size, $mtime ]
        if $conf{'Long'};

    # Three-argument open: the file name is never interpreted as a mode or
    # a command, whatever characters it contains.
    my $fic;
    unless ( open( $fic, '<', $_ ) ) {
        warn "\nCan't open file ${File::Find::name} : $!\n";
        return;    # do not let an unreadable file pass for an empty one
    }
    binmode $fic;

    my $data = '';
    unless ( defined sysread( $fic, $data, $conf{'size'} ) ) {
        warn "\nCan't read file ${File::Find::name} : $!\n";
        close $fic;
        return;
    }

    # Optionally add bytes taken from the end of files larger than the head.
    if ( $conf{'fromend2'} && $size > $conf{'size'} ) {
        my $toread = $conf{'size'};

        # NOTE(review): for files between 1x and 2x the head size this reads
        # (2 * head - file size) bytes, which is not the part left unread by
        # the head. Kept unchanged because altering it changes fingerprints;
        # confirm the intent.
        $toread = 2 * $conf{'size'} - $size if $size < 2 * $conf{'size'};

        # sysseek, not seek: the handle is read with sysread (unbuffered).
        # Whence 2 means "relative to the end of the file".
        my $tail = '';
        if ( sysseek( $fic, -$toread, 2 )
            && defined sysread( $fic, $tail, $toread ) )
        {
            $data .= $tail;
        }
        else {
            warn "\nCan't read end of file ${File::Find::name} : $!\n";
        }
    }
    close $fic;

    my $sum = md5_hex($data);
    $sum .= ' ' . $size if $conf{'printsize'};

    push @{ $mdsums{$sum} }, $File::Find::name;

    # A group starts to exist with its second member: at that moment count
    # the group itself and its first member as well.
    if ( $conf{'verb'} && @{ $mdsums{$sum} } > 1 ) {
        $total++;
        if ( @{ $mdsums{$sum} } == 2 ) {
            $same++;
            $total++;
        }
    }
    return;
}
MAIN:
# With this set, --help and --version print on STDOUT and exit (Getopt::Std).
$Getopt::Std::STANDARD_HELP_VERSION = 1;
my %opts;

# getopts() has already complained on STDERR when it returns false; stop
# there instead of scanning with options the user did not mean.
getopts( 'vs:MLmSn:we', \%opts )
    or die "Usage: $0 [--help] [--version] [-v] [-s size] [-n max] "
    . "[-M] [-m] [-L] [-S] [-w] [-e] [directory...]\n";

# -s and -n take numbers; refuse anything else rather than silently using 0.
{
    my $number = qr/\A(?:\d+(?:\.\d*)?|\.\d+)\z/;
    for my $letter (qw( s n )) {
        next unless defined $opts{$letter};
        die "Option -$letter needs a non-negative number, not '$opts{$letter}'\n"
            unless $opts{$letter} =~ $number;
    }
}

$conf{'verb'}     = $opts{'v'};
$conf{'size'}     = 1024 * $opts{'s'} if $opts{'s'};    # -s is given in KiB
$conf{'Long'}     = $opts{'L'};
$conf{'printmd5'} = 0 if $opts{'M'};
$conf{'printmd5'} = 2 if $opts{'m'};                    # -m wins over -M
$conf{'printmax'} = int( $opts{'n'} ) if $opts{'n'};
$conf{'readwhole'} = 0 if $opts{'w'};

# -S and -e only have an effect together with -w.
$conf{'printsize'} = 1 if $opts{'S'};
$conf{'printsize'} = 0 unless $opts{'w'};
$conf{'fromend2'}  = 1 if $opts{'e'};
$conf{'fromend2'}  = 0 unless $opts{'w'};

# Pass 1: fingerprint the beginning of every file below the given
# directories (the current directory when none is given).
push @ARGV, '.' if 0 == @ARGV;
find( \&somme, @ARGV );
# Pass 2: turn the candidate groups of pass 1 (%mdsums) into the final
# groups (%mdsumsure), either by hashing whole files or, with readwhole
# off, by trusting the partial fingerprints.
print STDERR "\npass 2\ngroups to examine : $same\nfiles to examine : $total\n"
    if $conf{'verb'};
my $firsttotal = $total;
if ( $conf{'readwhole'} ) {
    ( $same, $total ) = ( 0, 0 );
    my $n   = 1;
    my $md5 = Digest::MD5->new;
    foreach my $k ( keys %mdsums ) {
        # A fingerprint seen only once cannot belong to a duplicate.
        next if @{ $mdsums{$k} } <= 1;
        foreach my $f ( @{ $mdsums{$k} } ) {
            if ( $conf{'verb'} ) {
                print STDERR "\r$n/$firsttotal";
                $n++;
            }

            # Three-argument open: names such as "cmd|" or ">x" or names
            # with surrounding blanks must be taken literally.
            my $fic;
            unless ( open( $fic, '<', $f ) ) {
                warn "\nCan't open file $f : $!\n";
                next;
            }
            binmode $fic;

            # addfile() croaks on a read error; trap it so that one bad
            # file does not abort the whole run.
            $md5->reset;
            my $sum = eval { $md5->addfile($fic); $md5->hexdigest };
            close $fic;
            unless ( defined $sum ) {
                warn "\nCan't read file $f : $@";
                next;
            }

            push @{ $mdsumsure{$sum} }, $f;

            # Same bookkeeping as in pass 1: a group is counted, together
            # with its first member, when its second member shows up.
            if ( $conf{'verb'} && @{ $mdsumsure{$sum} } > 1 ) {
                $total++;
                if ( @{ $mdsumsure{$sum} } == 2 ) {
                    $same++;
                    $total++;
                }
            }
        }
    }
}
else {
    # Whole files are not read: every pass-1 group with at least two
    # members is final. $same and $total keep their pass-1 values.
    my $n = 0;
    foreach my $sum ( keys %mdsums ) {
        next unless @{ $mdsums{$sum} } > 1;
        $mdsumsure{$sum} = $mdsums{$sum};
        if ( $conf{'verb'} ) {
            $n += @{ $mdsums{$sum} };
            print STDERR "\r$n/$firsttotal";
        }
    }
}
print STDERR "\nAt last\ngroups : $same\nfiles : $total\n"
    if $conf{'verb'};
# Report: print every final group that has at least two members.
#
# printmd5 == 1 : the fingerprint on a line of its own, members indented by a tab
# printmd5 == 2 : the fingerprint in front of every member
# printmd5 == 0 : members only
# Groups are printed in sorted fingerprint order so that two runs over the
# same files give the same output.
my $tab = ( $conf{'printmd5'} == 1 ) ? "\t" : '';
foreach my $sum ( sort keys %mdsumsure ) {
    my @members = @{ $mdsumsure{$sum} };
    next if @members < 2;

    print "$sum\n" if $conf{'printmd5'} == 1;

    # Countdown of lines still allowed for this group. Starting from 0 it
    # goes negative and never hits zero again, which means "no limit".
    my $left = $conf{'printmax'};
    foreach my $file (@members) {
        print "$sum " if $conf{'printmd5'} == 2;
        print $tab;
        if ( $conf{'Long'} ) {
            my ( $mode, $nlink, $uid, $gid, $size, $mtime ) = @{ $stats{$file} };

            # Only the permission bits of the mode are shown.
            printf( "%04o %2d %5d %5d %10s ",
                $mode & 07777, $nlink, $uid, $gid, $size );
            printf( "%24s\t", scalar localtime $mtime );
        }
        print "$file\n";
        last if !--$left;
    }
}
__END__ | |
=encoding utf-8

=head1 NAME

double-doc - find probably duplicate files in directory trees

=head1 SYNOPSIS

double-doc [--help] [--version] [-v] [-s size] [-n max] [-M] [-m] [-L] [-S] [-w] [-e] [directory directory...]

=head1 DESCRIPTION

C<double-doc> is a little perl script that tries to find files with
probably the same content within some directories and their subtrees. It
does so by computing the I<MD5> sum of the first 20 KiB (adjustable
via an option) of each file; then, for files whose beginnings have the
same I<MD5> sum, it computes the I<MD5> sum of the whole file. Files
that happen to have the same I<MD5> sum are printed as a paragraph, each
paragraph preceded by the I<MD5> sum. It is important to realize that
C<double-doc> finds files whose contents have the same I<MD5> sum, B<NOT>
files that have identical content. Although this is unlikely to happen,
two or more completely different files may have the same I<MD5> sum.

Sample default output:

a7e40313ff29aa57bc2ee1ec6da78831 | |
/usr/share/doc/evolution-data-server-common/changelog.Debian.gz | |
/usr/share/doc/libedataserver1.2-9/changelog.Debian.gz | |
/usr/share/doc/libegroupwise1.2-13/changelog.Debian.gz | |
/usr/share/doc/libgdata1.2-1/changelog.Debian.gz | |
be3c3fe9427010bee4a0493b16808f44 | |
/usr/share/doc/libgphoto2-2/copyright | |
/usr/share/doc/libgphoto2-port0/copyright | |
dde08874a41ed02d26649c228bbaea8c | |
/usr/share/doc/iputils-arping/RELNOTES.gz | |
/usr/share/doc/iputils-ping/RELNOTES.gz | |
/usr/share/doc/iputils-tracepath/RELNOTES.gz | |
=head1 options | |
=over | |
=item --help | |
Give help and terminate. | |
=item --version | |
Print versions and exit. | |
=item -v | |
Enter verbose mode. All verbose messages are printed on standard | |
error. In verbose mode, directories are printed as they are searched | |
during the first pass. Then the number of groups and files found so | |
far is printed. During the second pass, a counter indicates the number | |
of files processed. | |
=item -s size

Set the size, in KiB, of the beginning of each file used for the
computation of the I<MD5> sum.

=item -M | |
Don't print the I<MD5>sum; print nothing instead. | |
=item -m | |
Print the I<MD5>sum on each line. like this: | |
be3c3fe9427010bee4a0493b16808f44 /usr/share/doc/libgphoto2-2/copyright | |
be3c3fe9427010bee4a0493b16808f44 /usr/share/doc/libgphoto2-port0/copyright | |
dde08874a41ed02d26649c228bbaea8c /usr/share/doc/iputils-arping/RELNOTES.gz | |
dde08874a41ed02d26649c228bbaea8c /usr/share/doc/iputils-ping/RELNOTES.gz | |
dde08874a41ed02d26649c228bbaea8c /usr/share/doc/iputils-tracepath/RELNOTES.gz | |
=item -w

Do not read the whole file in the second phase. Assume that files whose
beginnings have the same I<MD5> sum are identical.

=item -e

When B<-w> is also set, the I<MD5> sum is computed over the beginning and
the end of the file. Files that have the same beginning but a different
end should then get different I<MD5> sums.

=item -S

When B<-w> is in use, also use the file size as part of the key that
groups files, so that files with the same beginning but different sizes
are recognized as different. The output is slightly different from the
default output: each group is preceded by the computed I<MD5> sum, a
space, and the file size, like this:

dde08874a41ed02d26649c228bbaea8c 7228 | |
/usr/share/doc/iputils-arping/RELNOTES.gz | |
/usr/share/doc/iputils-ping/RELNOTES.gz | |
/usr/share/doc/iputils-tracepath/RELNOTES.gz | |
=item -L

B<-L> is the “long listing” mode. In this mode, each file name is
preceded by the file's permission bits, number of links, uid, gid, size
and mtime. In the default output, each line starts with a tab character
(ASCII 9), then come the file infos separated by blanks, then another
tab character and finally the file name.

82b26659c7b1a979fa288b4f88c78e59 | |
0644 1 0 0 4659 Sat Feb 5 05:10:38 2022 /usr/share/doc/iputils-ping/copyright | |
0644 1 0 0 4659 Sat Feb 5 05:10:38 2022 /usr/share/doc/iputils-tracepath/copyright | |
=begin none | |
=item -S | |
When B<-w> is in use, prints on a line the size of the file, a space, | |
then the filename, like this : | |
dde08874a41ed02d26649c228bbaea8c | |
7228 /usr/share/doc/iputils-arping/RELNOTES.gz | |
7228 /usr/share/doc/iputils-ping/RELNOTES.gz | |
7228 /usr/share/doc/iputils-tracepath/RELNOTES.gz | |
=end none | |
=item -n max | |
Print only the first B<max> files of each group. 0 means infinity. | |
=back | |
=head1 EXAMPLE

    double-doc -w -e -S -s 50 /dir1 /dir2 /dir3

=head1 AUTHOR | |
Schplurtz le Déboulonné <schplurtz@laposte.net> | |
=cut | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment