schplurtz/double-doc Secret

## double-doc
#!/bin/sh
# encoding: utf-8
#!/usr/bin/perl
  eval 'exec perl -x -S "$0" ${1+"$@"}'
    if $running_under_some_shell;

# Ce fichier est la propriété exclusive de Schplurtz le Déboulonné.
# Copyright © 2009-2023 Schplurtz le Déboulonné <Schplurtz -AT- laposte • net>
# Fichier sous licence CeCILL 2.1.
# Distributed under the CeCILL 2.1 license.
# https://cecill.info/licences/Licence_CeCILL_V2.1-en.html
# https://cecill.info/licences/Licence_CeCILL_V2.1-fr.html

$main::VERSION='1.2';

use strict;
use warnings;
use File::Find;
use Digest::MD5 qw( md5_hex );
use Getopt::Std;
use POSIX qw(strftime);
use Data::Dumper;

# Pass-1 groups: partial-MD5 key => list of file names sharing that key.
my %mdsums;
# Final groups: MD5 key => list of file names, as printed by the report.
my %mdsumsure;
# Declared but not referenced by the code below.
my %size;
# Long-listing data: file name => [ mode, nlink, uid, gid, size, mtime ].
my %stats;
# Last directory shown on STDERR in verbose mode.
my $dir;
# Verbose counters: groups holding at least two files, and files in them.
my $same  = 0;
my $total = 0;
# Run-time settings; the defaults below are overridden by the option parser.
my %conf = (
  'verb'      => 0,          # verbose progress on STDERR
  'size'      => 20 * 1024,  # bytes hashed from the start of each file
  'printmd5'  => 1,          # 0: never, 1: once per group, 2: on every line
  'printmax'  => 0,          # files printed per group (0: no limit)
  'readwhole' => 1,          # hash whole files in the second pass
  'printsize' => 0,          # append the file size to the grouping key
  'fromend2'  => 0,          # also hash a chunk taken from the end of file
  'Long'      => 0,          # long listing (stat fields before each name)
  'long'      => 0,          # not read by the code below
);
# File::Find callback for pass 1: fingerprint the entry find() is visiting.
#
# Takes no arguments; it works on $_ (the entry name, opened relative to the
# current directory), $File::Find::dir and $File::Find::name.
# Symbolic links and anything that is not a plain file are skipped.  For a
# plain file it hashes the first $conf{'size'} bytes -- plus a chunk taken
# from the end of the file when $conf{'fromend2'} is set -- and appends the
# file's full name to the %mdsums list stored under that MD5 key (the key
# also carries the file size when $conf{'printsize'} is set).
# Side effects: fills %stats when $conf{'Long'} is set; in verbose mode it
# prints the directory being scanned on STDERR and maintains $same (groups
# holding at least two files) and $total (files in such groups).
# A file that cannot be opened or read is reported with warn and left out.
sub somme() {
  # $dir is undefined on the very first call, hence the defined() guard.
  if ( $conf{'verb'} && ( !defined $dir || $dir ne $File::Find::dir ) ) {
    # "\e[K" erases the rest of the terminal line, so a shorter directory
    # name fully overwrites the previous one.
    print STDERR "\r$File::Find::dir\e[K";
    $dir = $File::Find::dir;
  }
  return if -l $_ || ! -f $_;
  # stat(_) reuses the stat buffer filled by the -f test just above.
  my ( $mode, $nlink, $uid, $gid, $size, $mtime )
    = ( stat(_) )[ 2, 3, 4, 5, 7, 9 ];
  $stats{$File::Find::name} = [ $mode, $nlink, $uid, $gid, $size, $mtime ]
    if $conf{'Long'};

  # Three-argument open: the name is never parsed for mode characters, so
  # names such as "&1" or names ending in blanks are opened literally.
  my $fic;
  if ( !open( $fic, '<', $_ ) ) {
    warn "\nCan't open file ${File::Find::name} : $!\n";
    return;    # nothing to hash; do not group it under the empty-data MD5
  }
  binmode $fic;

  my $data = '';
  if ( !defined sysread( $fic, $data, $conf{'size'} ) ) {
    warn "\nCan't read file ${File::Find::name} : $!\n";
    close $fic;
    return;
  }

  if ( $conf{'fromend2'} && $size > $conf{'size'} ) {
    # Hash the tail as well: at most $conf{'size'} bytes, and never bytes
    # the head already covered, so a file shorter than twice the chunk
    # size is hashed exactly once in full.
    my $toread = $size - $conf{'size'};
    $toread = $conf{'size'} if $toread > $conf{'size'};
    # sysseek is the positioning call that pairs with sysread; whence 2
    # means "relative to the end of the file".
    if ( sysseek( $fic, -$toread, 2 ) ) {
      my $got = sysread( $fic, my $d2, $toread );
      $data .= $d2 if $got;
    }
  }
  close $fic;

  my $sum = md5_hex( $data );
  # The size comes from the stat done above; no second stat is needed.
  $sum .= ' ' . $size
    if $conf{'printsize'};
  push @{$mdsums{ $sum }}, ${File::Find::name};
  if( $conf{'verb'} && @{$mdsums{ $sum }} > 1 ) {
    $total++;
    # The second member of a group creates the group and also accounts for
    # the first member, which was not counted when it was alone.
    if( @{$mdsums{ $sum }} == 2 ) {
      $same++;
      $total++;
    }
  }
}

MAIN:
# Command-line handling: translate the switches into %conf and default the
# directory list to the current directory.
# Setting STANDARD_HELP_VERSION makes Getopt::Std print --help / --version
# on standard output and exit.
$Getopt::Std::STANDARD_HELP_VERSION=1;
my %opts;
# getopts() returns false after complaining about an unknown switch or a
# missing argument; stop there instead of scanning with a half-read
# command line.
getopts( 'vs:MLmSn:we', \%opts ) or do {
  print STDERR "Usage: $0 [--help] [--version] [-v] [-s size] [-n max] "
    . "[-M] [-m] [-L] [-S] [-w] [-e] [directory...]\n";
  exit 2;
};
$conf{'verb'}=$opts{'v'};
if ( defined $opts{'s'} ) {
  # -s is a size in KiB; a negative or non-numeric value would otherwise
  # reach sysread as a bogus length.
  die "$0: -s expects a positive number of KiB, got '$opts{s}'\n"
    unless $opts{'s'} =~ /\A(?:\d+(?:\.\d*)?|\.\d+)\z/;
  my $bytes = int( 1024 * $opts{'s'} );
  $conf{'size'} = $bytes if $bytes > 0;    # 0 keeps the default chunk size
}
$conf{'Long'}=$opts{'L'};
$conf{'printmd5'} = 0 if $opts{'M'};
$conf{'printmd5'} = 2 if $opts{'m'};    # -m wins when both -M and -m are given
if ( defined $opts{'n'} ) {
  die "$0: -n expects a number, got '$opts{n}'\n"
    unless $opts{'n'} =~ /\A[-+]?(?:\d+(?:\.\d*)?|\.\d+)\z/;
  $conf{'printmax'} = int( $opts{'n'} ) if $opts{'n'};
}
$conf{'readwhole'} = 0 if $opts{'w'};
# -S and -e only have an effect together with -w.
$conf{'printsize'} = 1 if $opts{'S'};
$conf{'printsize'} = 0 unless $opts{'w'};
$conf{'fromend2'} = 1 if $opts{'e'};
$conf{'fromend2'} = 0 unless $opts{'w'};
push @ARGV, '.' if 0 == @ARGV;

# Pass 1: somme() files every plain file under its partial-MD5 key in %mdsums.
find( \&somme, @ARGV );

print STDERR "\npass 2\ngroups to examine : $same\nfiles to examine : $total\n"
  if( $conf{'verb'} );
my $firsttotal=$total;
if( $conf{'readwhole'} ) {
  # Pass 2: hash the whole content of every file that shares a pass-1 key
  # with at least one other file, and regroup by that full MD5 in
  # %mdsumsure.  The counters restart so they describe the final groups.
  $same=$total=0;
  my $n=1;
  my $md5 = Digest::MD5->new;
  foreach my $k (keys %mdsums) {
    next if( @{$mdsums{$k}} <= 1 );
    for my $f (@{$mdsums{$k}}) {
      if( $conf{'verb'} ) {
        print STDERR "\r$n/$firsttotal";
        $n++;
      }
      # Three-argument open: the names come from the file system, and the
      # two-argument form would treat a leading ">" or a trailing "|" in a
      # name as a request to truncate a file or run a command.
      my $fic;
      if ( !open( $fic, '<', $f ) ) {
        warn "\nCan't open file $f : $!\n";
        next;
      }
      binmode $fic;
      $md5->reset;
      # addfile() croaks on a read error; trap it so that one bad file does
      # not abort the whole run.
      my $sum = eval { $md5->addfile($fic); $md5->hexdigest };
      my $err = $@;
      close $fic;
      if ( !defined $sum ) {
        warn "\nCan't read file $f : $err";
        next;
      }
      push @{$mdsumsure{$sum}}, $f;
      if( $conf{'verb'} && @{$mdsumsure{$sum}} > 1 ) {
        $total++;
        # The second member creates the group and accounts for the first.
        if( @{$mdsumsure{$sum}} == 2 ) {
          $same++;
          $total++;
        }
      }
    }
  }
}
else {
  # No second pass: every pass-1 group with at least two files is taken as
  # final, and the counters keep their pass-1 values.
  my $n = 0;
  foreach my $sum (keys %mdsums) {
    if( @{$mdsums{$sum}} > 1 ) {
      $mdsumsure{$sum} = $mdsums{$sum};
      if( $conf{'verb'} ) {
        $n += @{$mdsums{$sum}};
        print STDERR "\r$n/$firsttotal";
      }
    }
  }
}

# Final report: the verbose summary goes to STDERR, then each group of
# %mdsumsure is listed on STDOUT.
if ( $conf{'verb'} ) {
  print STDERR "\nAt last\ngroups : $same\nfiles : $total\n";
}
# Group members are tab-indented only when the MD5 heads the group.
my $tab = '';
$tab = "\t" if $conf{'printmd5'} == 1;
for my $key ( keys %mdsumsure ) {
  my $group = $mdsumsure{$key};
  next if 1 == @{$group};    # a lone file is not a duplicate
  if ( $conf{'printmd5'} == 1 ) {
    print "$key\n";
  }
  # Countdown of names still allowed for this group; starting from 0 it
  # goes negative and never hits zero, which is how 0 means "no limit".
  my $left = $conf{printmax};
  for my $path ( @{$group} ) {
    if ( $conf{'printmd5'} == 2 ) {
      print "$key ";
    }
    print $tab;
    if ( $conf{'Long'} ) {
      # Permission bits in octal, then nlink, uid, gid, size, then mtime.
      printf( "%04o %2d %5d %5d %10s ",
        $stats{$path}[0] & 07777, @{ $stats{$path} }[ 1 .. 4 ] );
      printf( "%24s\t", scalar localtime $stats{$path}[5] );
    }
    print "$path\n";
    $left--;
    last if !$left;
  }

}

__END__

=encoding utf-8

=head1 NAME

double-doc - find probably duplicate files in directory trees

=head1 SYNOPSIS

double-doc [--help] [--version] [-v] [-s size] [-n max] [-M] [-m] [-L] [-S] [-w] [-e] [directory directory...]

=head1 DESCRIPTION

C<double-doc> is a small Perl script that looks for files that probably
have the same content within one or more directory trees. It does so by
computing the I<MD5> sum of the first 20 KiB (adjustable via the B<-s>
option) of each file; then, for files whose beginnings have the same
I<MD5> sum, it computes the I<MD5> sum of the whole file. Files that
end up with the same I<MD5> sum are printed as a group, each group
preceded by the I<MD5> sum.  It is important to realize that
C<double-doc> finds files whose contents have the same I<MD5> sum, B<NOT>
files that have identical content. Although this is unlikely to happen,
two or more completely different files may have the same I<MD5> sum.

sample default output :

 a7e40313ff29aa57bc2ee1ec6da78831
 	/usr/share/doc/evolution-data-server-common/changelog.Debian.gz
 	/usr/share/doc/libedataserver1.2-9/changelog.Debian.gz
 	/usr/share/doc/libegroupwise1.2-13/changelog.Debian.gz
 	/usr/share/doc/libgdata1.2-1/changelog.Debian.gz
 be3c3fe9427010bee4a0493b16808f44
 	/usr/share/doc/libgphoto2-2/copyright
 	/usr/share/doc/libgphoto2-port0/copyright
 dde08874a41ed02d26649c228bbaea8c
 	/usr/share/doc/iputils-arping/RELNOTES.gz
 	/usr/share/doc/iputils-ping/RELNOTES.gz
 	/usr/share/doc/iputils-tracepath/RELNOTES.gz

=head1 OPTIONS

=over

=item --help

Give help and terminate.

=item --version

Print version information and exit.

=item -v

Enter verbose mode. All verbose messages are printed on standard
error. In verbose mode, directories are printed as they are searched
during the first pass. Then the number of groups and  files found so
far is printed. During the second pass, a counter indicates the number
of files processed.

=item -s size

Set the size, in KiB, of the beginning of each file used for the
computation of the I<MD5> sum. The default is 20.

=item -M

Don't print the I<MD5> sum at all; only the file names are printed.

=item -m

Print the I<MD5> sum on each line, like this:

 be3c3fe9427010bee4a0493b16808f44 /usr/share/doc/libgphoto2-2/copyright
 be3c3fe9427010bee4a0493b16808f44 /usr/share/doc/libgphoto2-port0/copyright
 dde08874a41ed02d26649c228bbaea8c /usr/share/doc/iputils-arping/RELNOTES.gz
 dde08874a41ed02d26649c228bbaea8c /usr/share/doc/iputils-ping/RELNOTES.gz
 dde08874a41ed02d26649c228bbaea8c /usr/share/doc/iputils-tracepath/RELNOTES.gz

=item -w

Do not read the whole file in the second phase. Assume that files whose
beginnings have the same I<MD5> sum are identical.

=item -e

When B<-w> is also set, the I<MD5> sum is computed over the beginning and
the end of the file. Files that have the same beginning but a different
end should then get different I<MD5> sums.

=item -S

When B<-w> is in use, also stat every file and use the file size as
part of the key used to group files. This is intended so that files with
the same beginning but different sizes are recognized as different. The
output is slightly different from the default output: each group is
preceded by the computed I<MD5> sum, a space, and the file size, like this:

 dde08874a41ed02d26649c228bbaea8c 7228
 	/usr/share/doc/iputils-arping/RELNOTES.gz
 	/usr/share/doc/iputils-ping/RELNOTES.gz
 	/usr/share/doc/iputils-tracepath/RELNOTES.gz

=item -L

B<-L> is the “long listing” mode. In this mode, each filename is preceded
by the file's permission bits (in octal), number of links, uid, gid, size
and mtime. Each line starts with a tab character (ASCII 9), then come the
file information fields separated by blanks, then another tab character
and finally the filename.

 82b26659c7b1a979fa288b4f88c78e59
 	0644  1     0     0       4659 Sat Feb  5 05:10:38 2022	/usr/share/doc/iputils-ping/copyright
 	0644  1     0     0       4659 Sat Feb  5 05:10:38 2022	/usr/share/doc/iputils-tracepath/copyright

=begin none

=item -S

When B<-w> is in use, prints on a line the size of the file, a space,
then the filename, like this :

 dde08874a41ed02d26649c228bbaea8c
 	7228 /usr/share/doc/iputils-arping/RELNOTES.gz
 	7228 /usr/share/doc/iputils-ping/RELNOTES.gz
 	7228 /usr/share/doc/iputils-tracepath/RELNOTES.gz

=end none

=item -n max

Print only the first B<max> files of each group. 0 means infinity.

=back

=head1 EXAMPLE

  double-doc -w -e -S -s 50 /dir1 /dir2 /dir3

=head1 AUTHOR

Schplurtz le Déboulonné <schplurtz@laposte.net>

=cut
	#!/bin/sh
	# encoding: utf-8
	#!/usr/bin/perl
	eval 'exec perl -x -S "$0" ${1+"$@"}'
	if $running_under_some_shell;

	# Ce fichier est la propriété exclusive de Schplurtz le Déboulonné.
	# Copyright © 2009-2023 Schplurtz le Déboulonné <Schplurtz -AT- laposte • net>
	# Fichier sous licence CeCILL 2.1.
	# Distributed under the CeCILL 2.1 license.
	# https://cecill.info/licences/Licence_CeCILL_V2.1-en.html
	# https://cecill.info/licences/Licence_CeCILL_V2.1-fr.html

	$main::VERSION='1.2';

	use strict;
	use warnings;
	use File::Find;
	use Digest::MD5 qw( md5_hex );
	use Getopt::Std;
	use POSIX qw(strftime);
	use Data::Dumper;

	my( %mdsums, %mdsumsure, %size, %stats, $dir );
	my( $same, $total ) = ( 0, 0 );
	my %conf = (
	verb => 0,
	size => 20 * 1024,
	printmd5 => 1,
	printmax => 0,
	readwhole => 1,
	printsize => 0,
	fromend2 => 0,
	Long => 0,
	long => 0,
	);
	sub somme() {
	if ( $conf{'verb'} && $dir ne $File::Find::dir ) {
	print STDERR "\r$File::Find::dir[K";
	$dir=$File::Find::dir;
	}
	return if -l $_ || ! -f $_;
	# remember: stat(_) reuse last stat obtained (-l and -f did a stat() call)
	my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
	$atime,$mtime,$ctime,$blksize,$blocks) = stat(_);
	$stats{${File::Find::name}}=[$mode, $nlink, $uid, $gid, $size, $mtime] if $conf{'Long'};
	open( my $fic, "< ./$_" ) or # ./$_ to avoid interpretation by perl of
	# certain filenames such as "&1"
	warn "\nCan't open file ${File::Find::name} : $!\n";
	sysread $fic, my $data, $conf{'size'};
	if( $conf{'fromend2'} && ($size > $conf{'size'}) ) {
	my $toread = $conf{'size'};
	$toread = 2 * $conf{'size'} - $size if $size < (2 * $conf{'size'});
	if( $size > $conf{'size'} ) {
	seek( $fic, -1 * $toread, 2 );
	sysread $fic, my ($d2), $toread;
	$data .= $d2;
	}
	}
	my $sum=md5_hex( $data );
	$sum .= ' ' . (stat( "./$_" ))[7]
	if $conf{'printsize'};
	push @{$mdsums{ $sum }}, ${File::Find::name};
	if( $conf{'verb'} && @{$mdsums{ $sum }} > 1 ) {
	$total++;
	if( @{$mdsums{ $sum }} == 2 ) {
	$same++;
	$total++;
	}
	}
	}

	MAIN:
	$Getopt::Std::STANDARD_HELP_VERSION=1;
	my %opts;
	getopts( 'vs:MLmSn:we', \%opts );
	$conf{'verb'}=$opts{'v'};
	$conf{'size'}=1024 * $opts{'s'} if $opts{'s'};
	$conf{'Long'}=$opts{'L'};
	$conf{'printmd5'} = 0 if $opts{'M'};
	$conf{'printmd5'} = 2 if $opts{'m'};
	$conf{'printmax'} = int( $opts{'n'} ) if $opts{'n'};
	$conf{'readwhole'} = 0 if $opts{'w'};
	$conf{'printsize'} = 1 if $opts{'S'};
	$conf{'printsize'} = 0 unless $opts{'w'};
	$conf{'fromend2'} = 1 if $opts{'e'};
	$conf{'fromend2'} = 0 unless $opts{'w'};
	push @ARGV, '.' if 0 == @ARGV;

	find( \&somme, @ARGV );

	print STDERR "\npass 2\ngroups to examine : $same\nfiles to examine : $total\n"
	if( $conf{'verb'} );
	my $firsttotal=$total;
	if( $conf{'readwhole'} ) {
	$same=$total=0;
	my $n=1;
	my $md5 = Digest::MD5->new;
	foreach my $k (keys %mdsums) {
	my ($i, $sum, $f) = ( 0 );
	next if( @{$mdsums{$k}} <= 1 );
	for $f (@{$mdsums{$k}}) {
	if( $conf{'verb'} ) {
	print STDERR "\r$n/$firsttotal";
	$n++;
	}
	my $fic;
	next unless open( $fic, $f );
	$md5->reset;
	$md5->addfile($fic);
	$sum=$md5->hexdigest;
	push @{$mdsumsure{$sum}}, $f;
	if( $conf{'verb'} && @{$mdsumsure{$sum}} > 1 ) {
	$total++;
	if( @{$mdsumsure{$sum}} == 2 ) {
	$same++;
	$total++;
	}
	}
	}
	}
	}
	else {
	my $n = 0;
	foreach my $sum (keys %mdsums) {
	if( @{$mdsums{$sum}} > 1 ) {
	$mdsumsure{$sum} = $mdsums{$sum};
	if( $conf{'verb'} ) {
	$n += @{$mdsums{$sum}};
	print STDERR "\r$n/$firsttotal";
	}
	}
	}
	}

	print STDERR "\nAt last\ngroups : $same\nfiles : $total\n"
	if( $conf{'verb'} );
	my $tab = ($conf{'printmd5'} == 1) ? "\t" : '';
	foreach my $sum (keys %mdsumsure) {
	next if( @{$mdsumsure{$sum}} == 1 );
	print "$sum\n" if $conf{'printmd5'} == 1;
	my $n = $conf{printmax};
	for (@{$mdsumsure{$sum}}) {
	print "$sum " if $conf{'printmd5'} == 2;
	print "$tab";
	printf( "%04o %2d %5d %5d %10s ", ${$stats{$_}}[0] & 07777, @{$stats{$_}}[1..4] ) if $conf{'Long'};
	printf( "%24s\t", scalar localtime ${$stats{$_}}[5] ) if $conf{'Long'};
	print "$_\n";
	last if ! --$n;
	}

	}

	__END__

	=encoding utf-8

	=head1 NAME

	double-doc - find probably duplicate files in directory trees

	=head1 SYNOPSIS

	double-doc [--help] [--version] [-v] [-s size] [-n max] [-M] [-m] [-S] [-w] [directory directory...]

	=head1 DESCRIPTION

	C<double-doc> is a little perl script that tries to find files with
	probably same content within some directories and their arborescence. It
	does so by computing the I<MD5> sum of the first 20 kib (adjustable
	via an option) of each file, then for files whose begining have the
	same I<MD5> sum, it computes the I<MD5> sum for the whole file. files
	that happen to have the same I<MD5> sum are printed in paragraph, each
	paragraph preceded by the I<MD5> sum. It is important to realize that
	C<double-doc> finds files whose content have same I<MD5> sum, B<NOT>
	files that have identical content. Although this is unlikely to happen,
	two or more completely different files may have the same I<MD5> sum.

	sample default output :

	a7e40313ff29aa57bc2ee1ec6da78831
	/usr/share/doc/evolution-data-server-common/changelog.Debian.gz
	/usr/share/doc/libedataserver1.2-9/changelog.Debian.gz
	/usr/share/doc/libegroupwise1.2-13/changelog.Debian.gz
	/usr/share/doc/libgdata1.2-1/changelog.Debian.gz
	be3c3fe9427010bee4a0493b16808f44
	/usr/share/doc/libgphoto2-2/copyright
	/usr/share/doc/libgphoto2-port0/copyright
	dde08874a41ed02d26649c228bbaea8c
	/usr/share/doc/iputils-arping/RELNOTES.gz
	/usr/share/doc/iputils-ping/RELNOTES.gz
	/usr/share/doc/iputils-tracepath/RELNOTES.gz

	=head1 options

	=over

	=item --help

	Give help and terminate.

	=item --version

	Print versions and exit.

	=item -v

	Enter verbose mode. All verbose messages are printed on standard
	error. In verbose mode, directories are printed as they are searched
	during the first pass. Then the number of groups and files found so
	far is printed. During the second pass, a counter indicates the number
	of files processed.

	=item -s size

	Set the size in kib of the begining of file used for the computation
	of the I<MD5> sum.

	=item -M

	Don't print the I<MD5>sum; print nothing instead.

	=item -m

	Print the I<MD5>sum on each line. like this:

	be3c3fe9427010bee4a0493b16808f44 /usr/share/doc/libgphoto2-2/copyright
	be3c3fe9427010bee4a0493b16808f44 /usr/share/doc/libgphoto2-port0/copyright
	dde08874a41ed02d26649c228bbaea8c /usr/share/doc/iputils-arping/RELNOTES.gz
	dde08874a41ed02d26649c228bbaea8c /usr/share/doc/iputils-ping/RELNOTES.gz
	dde08874a41ed02d26649c228bbaea8c /usr/share/doc/iputils-tracepath/RELNOTES.gz

	=item -w

	Do not read the whole file in the second phase. Assume that files that
	have same I<MD5> sum for their beginings are identical.

	=item -e

	When B<-w> is also set, the I<MD5> sum is computed for the begining and
	the end of the file. Files that have same beginning, but different end,
	should have different I<MD5> sum.

	=item -S

	When B<-w> is in use, also stat every file and use the file size as
	part of the key to group files. It is intended so that files with same
	begining but different sizes are recognized as different. The output is
	slightly different from default output. Each group is preceded by the
	I<MD5SUM> computed, a space, the file size, like this :

	dde08874a41ed02d26649c228bbaea8c 7228
	/usr/share/doc/iputils-arping/RELNOTES.gz
	/usr/share/doc/iputils-ping/RELNOTES.gz
	/usr/share/doc/iputils-tracepath/RELNOTES.gz

	=item -L

	B<-L> is the “long listing” mode. In this mode, you get
	file basic filemode, number of links, uid, gid, size and mtime before
	the filename. The output starts with a tab character (ASCII 9), then
	the file infos are blank separated, then there is again a tab character
	and finally the filename.

	82b26659c7b1a979fa288b4f88c78e59
	0644 1 0 0 4659 Sat Feb 5 05:10:38 2022 /usr/share/doc/iputils-ping/copyright
	0644 1 0 0 4659 Sat Feb 5 05:10:38 2022 /usr/share/doc/iputils-tracepath/copyright

	=begin none

	=item -S

	When B<-w> is in use, prints on a line the size of the file, a space,
	then the filename, like this :

	dde08874a41ed02d26649c228bbaea8c
	7228 /usr/share/doc/iputils-arping/RELNOTES.gz
	7228 /usr/share/doc/iputils-ping/RELNOTES.gz
	7228 /usr/share/doc/iputils-tracepath/RELNOTES.gz

	=end none

	=item -n max

	Print only the first B<max> files of each group. 0 means infinity.

	=back

	=head1 EXAMPLE

	doubledoc -w -e -S -s 50 /dir1 /dir2 /dir3

	=head1 AUTHOR

	Schplurtz le Déboulonné <schplurtz@laposte.net>

	=cut