Skip to content

Instantly share code, notes, and snippets.

@STrRedWolf
Created October 1, 2017 12:33
Show Gist options
  • Save STrRedWolf/77e7bb2cdcab35766e89917f4b18352e to your computer and use it in GitHub Desktop.
Save STrRedWolf/77e7bb2cdcab35766e89917f4b18352e to your computer and use it in GitHub Desktop.
Deduplication tool
#/usr/bin/perl
#use Digest::FarmHash qw(farmhash64);
use File::Find;
use DBI;
use Digest::SHA qw(sha256_hex);
#use bigint;
$|=1;
# Connect...
my $dbh = DBI->connect("DBI:mysql:database=tygris:host=localhost",
"tygris","lunataur") || die "$!";
$dbh->{RaiseError} = 1;
my $prefix=`pwd`;
chomp $prefix;
# Find what's in this directory.
#find(\&hashthis, "1.Import");
find(\&hashthis, "00.Master");
# main routine for each file...
sub hashthis
{
return unless -f;
my $file = $_;
my $path = $File::Find::dir;
my @s = stat;
my $mtime = $s[9];
my $fsize = $s[7];
# Which file are we on...
my $filename=$File::Find::name;
# print $filename, "\n";
# Is it already registered?
my $id;
my @row=$dbh->selectrow_array("SELECT fileid FROM dedupe_files WHERE name=? AND path=?",
undef, $file, $path);
if($row[0]) {
$dbh->do("UPDATE dedupe_files SET mtime=? WHERE fileid=?",undef,$row[0],$mtime);
print "Skip: $filename\n";
return;
}
print $filename,"\n";
# Now hash it. Do an overall hash.
my $bighash = Digest::SHA->new(256);
open(IN,"<",$file) || die "Can't open < $file: $!";
my $buff;
while(read IN,$buff,4096*1024) # 4M chunks
{
$bighash->add($buff);
}
# Now check if we hashed an exact dupe.
# Match it by file size and hash
@row=$dbh->selectrow_array("SELECT MIN(fileid) FROM dedupe_files WHERE size=? AND hash=?",
undef, $fsize, $bighash);
unless($row[0]) { # No collision...
$dbh->do("INSERT INTO dedupe_files (name,path,mtime,size,hash) VALUES (?,?,?,?,?)",
undef, $file, $path, $mtime, $fsize, $bighash);
# print "Reg'd: $filename\n";
return;
}
# at this point, the file size and hash collided, so verify byte-by-byte.
my $srcid=$row[0];
@row=$dbh->selectrow_array("SELECT fileid, name, path FROM dedupe_files WHERE fileid=?",
undef, $srcid);
my $srcfile=$prefix."/".$row[2]."/".$row[1];
print "Likey dupe w/$srcfile\n";
system('cmp','-s',$srcfile,$file);
if($? == -1) {
die "cmp failed: $!";
} elsif($? & 127) {
die sprintf("child died w/signal %d, %s coredump",
$? & 127, ($? & 128) ? 'with' : 'without');
} else {
my $rv=$? >> 8;
if($rv) { # cmp exit(1)'s when different, so skip.
print "### cmp says no match ($rv)\n";
$dbh->do("INSERT INTO dedupe_files (name,path,mtime,size,hash) VALUES (?,?,?,?,?)",
undef, $file, $path, $mtime, $fsize, $bighash);
return;
}
}
# print "Exact match.\n";
# It's an exact match. Ugh. Toss it.
# toss($id,$filename,$path);
print "DUPE: ",$filename,"\n";
}
### Toss files away (into a dedicated trash folder. We'll nuke 'em later)
sub toss {
my $id=shift;
my $filename=shift;
my $path=shift;
my $dest='zz.Trash/'.$path;
system('mkdir','-p',$dest);
system('mv',$filename,$dest);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment