Skip to content

Instantly share code, notes, and snippets.

@zed9h
Created July 18, 2009 15:20
Show Gist options
  • Save zed9h/149582 to your computer and use it in GitHub Desktop.
Save zed9h/149582 to your computer and use it in GitHub Desktop.
suggests duplicate-file removal and small-file backup
#!/usr/bin/perl
use strict;
# NOTE(review): "use warnings;" is deliberately not added -- the code relies
# on several undef-tolerant idioms (e.g. uninitialized $max_width, lc($1))
# that would flood the output with warnings.

# Global state shared by every phase of the script.
my @starting_points;
# ^ root dirs/files to scan (non-option @ARGV words and -s: options)
my ($recycle);
# ^ reuse data from previous log files instead of re-scanning?
my ($del_check, $checksum, $no_name, $smaller_tree, $keep_path_pattern);
# ^ duplicate-check options (-del, -sum, -name, -keep:...)
my ($zip_check, $zip_sizefactor, $zip_minsize);
# ^ zip-suggestion options (-zip:<min_avg_filesize>:<min_total_dirsize>)
my ($denied_pattern, $allowed_pattern);
# ^ case-insensitive path filters (-deny: / -allow:)
my %byPath;
# ^ $byPath{tag}{path} = "name|count|size|sum"  (the main per-path index)
my %byKey;
# ^ $byKey{tag}{"name|count|size|sum"} = [ paths sharing that key ]
my @debug;
# ^ extra lines appended verbatim to every report
# TO DO
# *make a scan cache and a sum cache, incremental..
# -[no]cache (to replace [log] -recycle)
## RELEASE NOTES #################################### 20040628 0911
#
# DISCLAIMER:
# It's not just a legal disclaimer: this program really
# has not been tested enough, so be careful with what you delete.
# Use it as a tool to find duplicates, not to free disk
# space automatically.
# progname: dup.pl (perl 5.x for windows/and probably unix)
# programmer: carlo a caputo (aka 9H.zED) <carlo.caputo@gmail.com>
# description: find duplicates, zero length and zippable dirs(&files).
# developed on ActivePerl Build 518 in a Win98 running on a P2-350
# 128Mb 12Gb of a replete-of-messy-crap HD.
# tested with 3145 dirs and 44168 files on devel comp.
# tested on a 1979 dirs and 40587 files NT server IIS4 IS2 SQLSrv
# to begin: Search for: main()
#################################################################
# UTIL
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
sub escape_path_in_regexp($)
{
    # Backslash-escape a literal path for safe interpolation into a regex:
    # backslashes first, then slashes, then the metachars ^ $ + : ( ) [ ] { }.
    my ($literal) = @_;
    $literal =~ s/\\/\\\\/g;
    $literal =~ s/\//\\\//g;
    $literal =~ s/([\^\$\+\:\(\)\[\]\{\}])/\\$1/g;
    return $literal;
}
sub escape_name_in_filename($) {
    # Make a path/title safe for use inside a log filename: spaces become '_'
    # and ':' '.' '/' '\' become '%'.
    # FIX: work on a copy.  The original edited $_[0] in place, which mutates
    # the *caller's* variable through @_ aliasing -- a side effect no caller
    # in this file needs.
    my ($name) = @_;
    $name =~ s/ /_/g;
    $name =~ s/[\:\.\/\\]/\%/g;
    ($name);
}
sub extract_name($)
{
    # Return the last path component ('a/b/c' and 'a/b/c/' both yield 'c').
    # FIX: capture into a lexical via list-context match, so a path that does
    # not match (no '/' at all) yields undef instead of whatever stale value a
    # previous successful regex happened to leave in $1.
    my ($path) = @_;
    my ($name) = $path =~ m{/([^/]+)/?$};
    ($name);
}
sub extract_parent_path($)
{
    # Drop the final path component, keeping the parent's trailing slash
    # ('a/b/c' and 'a/b/c/' both yield 'a/b/').
    my ($full) = @_;
    $full =~ m{^(.*?/)[^/]+?/?$};
    return $1;
}
### PROGRESS INDICATOR ###################
# Shared state for the two progress displays (percentage and depth gauge).
my ($acc_progress, $actual_progress, $total_progress);
# ^ percentage display: pending / displayed / expected totals
my ($scan_deep, $scan_max_deep, $scan_path); # alternative for directories in blind scanning
my $progress_display_period = 1; #sec
my $progress_last_display;
# ^ epoch seconds of the last redraw (0 forces an immediate redraw)
sub reset_progress_indicator_byPath($$)
{
    # Prime the progress counters for a pass over $byPath{$src}.
    # $type selects what each entry contributes to $total_progress:
    # its size, its count, or 1 per item.
    my ($src, $type) = @_;
    $total_progress = 0;
    foreach my $path (keys %{$byPath{$src}}) {
        my (undef, $count, $size) = split /\|/, $byPath{$src}{$path};
        my $amount = $type eq 'size'  ? $size
                   : $type eq 'count' ? $count
                   :                    1;
        $total_progress += $amount;
    }
    ($acc_progress, $actual_progress) = (0, 0);
    $progress_last_display = 0;
}
sub reset_progress_indicator_byKey($$)
{
    # Prime the progress counters for a pass over $byKey{$src}.  Each key's
    # amount (size, count, or 1 per item) is multiplied by the number of
    # paths sharing that key.
    my ($src, $type) = @_;
    $total_progress = 0;
    foreach my $key (keys %{$byKey{$src}}) {
        my (undef, $count, $size) = split /\|/, $key;
        my $amount = $type eq 'size'  ? $size
                   : $type eq 'count' ? $count
                   :                    1;
        $total_progress += $amount * @{$byKey{$src}{$key}};
    }
    ($acc_progress, $actual_progress) = (0, 0);
    $progress_last_display = 0;
}
sub reset_deep_indicator()
{
    # Reset the blind-scan depth display (depth, deepest level seen, current
    # path) and force the next deep_update() to redraw immediately.
    # FIX: removed "my ($rez) = @_;" -- the sub has an empty prototype, no
    # caller passes arguments, and $rez was never used.
    ($scan_deep, $scan_max_deep, $scan_path) = (0, 0, '');
    $progress_last_display = 0;
}
sub progress_acc($)
{
    # Accumulate $done units of progress; at most once per display period,
    # fold the accumulator into the running total and print a percentage
    # (carriage return keeps it on one console line).
    my ($done) = @_;
    $acc_progress += $done;
    return unless $total_progress;
    if (time > $progress_last_display + $progress_display_period) {
        $actual_progress += $acc_progress;
        $acc_progress = 0;
        printf "%3.2f%%\r", $actual_progress * 100.0 / $total_progress;
        $progress_last_display = time;
    }
}
sub deep_update()
{
# Render a one-line depth gauge for the blind directory scan:
# "#" per active depth level, ":" up to the deepest level reached so far,
# followed by the current path (middle-elided if too long), padded to 79
# columns and printed with "\r" so it overwrites itself.
if ($scan_deep>$scan_max_deep) {
$scan_max_deep=$scan_deep;
$progress_last_display = 0; # force redisplay
}
if ($progress_last_display+$progress_display_period <= time) {
my ($width,$min_width) = (79,10);
my ($full, $part);
$full .= $part= ("#" x $scan_deep).(":" x ($scan_max_deep-$scan_deep));
$width -= length($part);
$part = " (\"".$scan_path."\")";
if ($width > length($part)) {
# the whole path fits on the line
$full .= $part;
$width -= length($part);
}
else {
# elide the middle of the path, keeping both ends -- but only if at
# least $min_width characters of path would remain visible
$part = " (\"\"...\"\")";
my $path_width = $width-length($part);
if ($path_width > $min_width) {
my $path_half = $path_width/2;
$part = " (\"".substr($scan_path,0,$path_half)."\"...";
$part .= "\"".substr($scan_path,length($scan_path)-$path_half+1,$path_half)."\")";
$full .= $part;
$width -= length($part);
}
}
$full .= " " x $width; # blank out leftovers from a previous, longer line
print "$full\r";
$progress_last_display = time;
}
}
#################################################################
# BASIC OPERATIONS
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
sub clear_byPath(@)
{
    # Drop the entire byPath index for each listed source tag.
    delete $byPath{$_} for @_;
}
sub clear_byKey(@)
{
    # Drop the entire byKey index for each listed source tag.
    delete $byKey{$_} for @_;
}
sub copy_byPath($$)
{
    # Copy every path=>key entry of $src into $dest (overwriting clashes;
    # unrelated entries already in $dest are kept).
    my ($dest, $src) = @_;
    while (my ($path, $key) = each %{$byPath{$src}}) {
        $byPath{$dest}{$path} = $key;
    }
}
sub copy_byKey($$)
{
    # Append each of $src's path lists onto the matching list in $dest.
    my ($dest, $src) = @_;
    for my $key (keys %{$byKey{$src}}) {
        push @{ $byKey{$dest}{$key} }, @{ $byKey{$src}{$key} };
    }
}
sub indexing_byKey_to_byPath($)
{
    # Rebuild $byPath{$type} from $byKey{$type}: one path=>key entry per path.
    my ($type) = @_;
    delete $byPath{$type};
    while (my ($key, $paths) = each %{$byKey{$type}}) {
        $byPath{$type}{$_} = $key for @$paths;
    }
}
sub indexing_byPath_to_byKey($)
{
    # Rebuild $byKey{$type} from $byPath{$type}: paths sharing a key are
    # grouped under it, pushed in sorted path order.
    my ($type) = @_;
    clear_byKey($type);
    for my $path (sort keys %{$byPath{$type}}) {
        push @{ $byKey{$type}{ $byPath{$type}{$path} } }, $path;
    }
}
#remove_path_pattern_byPath($src, $del, "/^$pattern./");
#remove_path_pattern_byPath($src, $del, "!/$allowed_pattern/i");
#remove_path_pattern_byPath($src, $del, "/$denied_pattern/i");
# <code below copied to locations above for speed optimization>
sub remove_path_pattern_byPath($$$)
{
# Move every $src path for which the string-eval'd $condition holds (with
# the path in $_) over to $del.  $condition is a literal chunk of Perl such
# as "/^pattern./" -- a string eval per grep element, which is why the hot
# call sites inline the loop instead (see the commented-out calls above).
# NOTE(review): string eval on an interpolated pattern -- acceptable here
# only because $condition is always built from this script's own option
# values, never from scanned file content.
my ($src, $del, $condition)=@_;
return if (!exists $byPath{$src});
foreach my $path (sort grep {eval($condition)} keys %{$byPath{$src}}) {
$byPath{$del}{$path} = $byPath{$src}{$path};
delete $byPath{$src}{$path};
}
}
sub remove_path_list_byPath($$$)
{
    # Move every path listed in $to_remove that also exists in $src over
    # to $del.
    my ($src, $del, $to_remove) = @_;
    return unless exists $byPath{$src};
    for my $path (sort keys %{$byPath{$to_remove}}) {
        next unless exists $byPath{$src}{$path};
        $byPath{$del}{$path} = delete $byPath{$src}{$path};
    }
}
#################################################################
# SCANNING
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
my ($dir_count, $file_count)=(0,0);
# ^ statistics only (printed after scanning)
my %path_cache;
# ^ path -> "name|count|size|sum" (filled by scan() or recycled log files)
my %sum_cache;
# ^ path -> content checksum (filled by scan_for_sum() or recycled log files)
my $checksum_width = 4294967295; #32-bit - 1
sub sum_name($)
{
    # Positional checksum of an uppercased name: sum of byte*position,
    # kept within $checksum_width.
    my ($name) = @_;
    my $sum = 0;
    my $pos = 0;
    for my $byte (unpack "C*", uc $name) {
        $sum = ($sum + $byte * ++$pos) % $checksum_width;
    }
    return $sum;
}
sub sum_filename($)
{
    # Checksum of just the final component of a path.
    my ($path) = @_;
    return sum_name(extract_name($path));
}
sub sum_file($)
{
    # Positional 32-bit checksum of a file's contents, read in 64KB records.
    # Returns 0 when the file cannot be opened (the scan continues
    # best-effort, as before).
    my ($path) = @_;
    my $sum = 0;
    # FIX: 3-arg open with a lexical handle (was a bareword 2-arg open).
    if (open my $fh, '<', $path) {
        binmode $fh;
        # FIX: the original did "$/ = 65536", which sets the input record
        # separator to the *string* "65536" (splitting the file wherever that
        # substring occurs) and leaked the change into every later line read.
        # A reference to an integer requests fixed-size records, and local()
        # restores $/ on scope exit.
        local $/ = \65536;
        my $n = 1;
        while (<$fh>) {
            $sum = ($sum + unpack("%32C*", $_) * ($n++)) % $checksum_width;
        }
        close $fh;
    }
    ($sum);
}
######################
sub scan($) # get name, count, size
{
# Recursively walk $path, producing a "name|count|size|sum" key per entry:
#   count = entries in the subtree (1 for a file),
#   size  = bytes/1000 (the script's "kb"),
#   sum   = cheap name-based checksum, mixed order-sensitively over children.
# Results go into $byPath{'scan'} and %path_cache.
my ($path) = @_;
$path =~ s|\\|/|g;
$path =~ s|^(.*?)/?$|$1|;
return if (-l $path); # don't follow symlinks
$path .= '/' if (-d $path);
# NOTE(review): the defaults below are computed before the cache test, so a
# cache hit still pays for one -s stat and a name checksum.
my ($name,$count,$size,$sum) = (extract_name($path), 1, (-s $path)/1000.0, sum_filename($path));
# cache check
if (exists $path_cache{$path}) {
return ($name,$count,$size,$sum) = split /\|/, $path_cache{$path};
}
$scan_deep++; # go deep
$scan_path = $path;
deep_update();
if (-f $path) {
# all defaults apply
$file_count++; # statistics only
}
if (-d $path) {
if (opendir DIR, "$path") {
# or die "could not open dir: \"$path\"";
my @entry_list = readdir DIR;
foreach my $entry (sort @entry_list) {
next if($entry =~ /^\.{1,2}$/); # skip '.' and '..'
my (undef,$dcount,$dsize,$dsum) = scan("$path$entry");
$count += $dcount;
$size += $dsize;
# mix child sums weighted by the running count (order-sensitive,
# hence the sorted readdir above)
$sum = ($sum + $dsum*$count) % $checksum_width;
}
closedir DIR;
}
$dir_count++; # statistics only
}
$scan_deep--; # back
my $key = "$name|$count|$size|$sum";
$byPath{'scan'}{$path} = $key;
$path_cache{$path} = $key;
($name,$count,$size,$sum);
}
sub scanning_to_byPath()
{
# Fill $byPath{'scan'}: either by recycling a previous "scanned" log file
# (-recycle) or by walking every starting point with scan().
my $src = 'scan';
print "gathering info...\n";
my $logfile = report_short_filename('scanned');
if($recycle and open FILE, "$logfile") {
# get all pre-scanned data from log (wait for synched data)
# NOTE(review): bareword handle, 2-arg open, and $/ assigned globally
# rather than local'ized -- kept as-is; later readers reset $/ themselves.
print " recycling old log file (\"$logfile\")...\n";
$/ = "\n";
while(<FILE>) {
#[scan] 0.000kb 1 00000000 c:/_/_chaos/_download/1/
if (/^\[[^\]]+\]\s+([\d\.]+)kb\s+(\d+)\s+([0-9A-F]+)\*?\s+(.*?)$/) {
# $1=size(kb) $2=count $3=hex checksum $4=path
my $path = $4;
my ($name,$count,$size,$sum) = (extract_name($path), $2, 0+$1, hex $3);
my $key = "$name|$count|$size|$sum";
$byPath{$src}{$path} = $key;
$path_cache{$path} = $key;
($count==1?$file_count:$dir_count)++; # statistics only
}
}
close FILE;
}
else {
# launch scan
foreach my $starting_point (@starting_points) {
my $msg = " scanning (\"$starting_point\")...";
print $msg . " " x (79-length($msg)) . "\n";
reset_deep_indicator();
scan($starting_point);
}
report('scanned',0, $src);
}
# final statistics line, padded to 79 cols to overwrite the progress display
my $msg = " got $dir_count dir".($dir_count!=1?'s':'')." and ".
"$file_count file".($file_count!=1?'s':'').".";
print $msg . " " x (79-length($msg)) . "\n";
}
########################
sub scan_for_sum($) # requires 'scan' indexing
{
# Content checksum of $path: sum_file() for a file; for a directory, the
# (order-independent) sum of every file below it in the 'scan' index.
# Results are memoized in %sum_cache.
my ($path) = @_;
my ($sum) = (0);
# cache check
if (exists $sum_cache{$path}) {
return ($sum_cache{$path});
}
if (-f $path) {
$sum = sum_file("$path") % $checksum_width;
}
if (-d $path) {
my $pattern = escape_path_in_regexp($path);
# every scanned path strictly below this dir (the inner $path lexical
# deliberately shadows the outer one)
foreach my $path (sort grep /^$pattern./, keys %{$byPath{scan}}) {
if (-f $path) {
$sum = ($sum + sum_file("$path")) % $checksum_width;
}
}
}
$sum_cache{$path} = $sum;
($sum);
}
sub scanning_for_sum_to_byPath($) # fill-in the sum field (in key)
{
# Replace the cheap name-based sum in every $src key with a real content
# checksum.  A previous "summed" log file can pre-seed %sum_cache
# (-recycle); log data is used as cache only, never trusted as final.
my ($src) = @_;
print "verifying full-checksum ($src)...\n";
my $logfile = report_short_filename('summed');
if($recycle and open FILE, "$logfile") {
# just precache cos' cant be sure of the synch between log's and needed data
print " recycling old log file (\"$logfile\")...\n";
$/ = "\n";
while(<FILE>) {
#[keeping] 0.003kb 1 00000048* c:/_/_chaos/_download/tmp2/.xx
if (/^\[[^\]]+\]\s+([\d\.]+)kb\s+(\d+)\s+([0-9A-F]+)\*?\s+(.*?)$/) {
my $path = $4;
my ($sum) = (hex $3);
$sum_cache{$path} = $sum;
}
}
close FILE;
}
reset_progress_indicator_byPath($src, 'size');
foreach my $path (sort keys %{$byPath{$src}}) {
my $key = $byPath{$src}{$path};
my ($name,$count,$size,$sum) = split /\|/, $key;
$sum = scan_for_sum($path); # new, and improved, checksum
my $new_key = "$name|$count|$size|$sum";
$byPath{$src}{$path} = $new_key;
progress_acc($size);
}
report('summed',0, $src);
}
######################
sub clear_key_fields_byPath($$$$$) # usually used before scanning_for_sum_to_byPath()
{
    # Blank out the selected fields of every "name|count|size|sum" key in
    # $src, so that later duplicate grouping ignores those fields.
    my ($src, $clear_name, $clear_count, $clear_size, $clear_sum) = @_;
    print "clearing name in key-info ($src)...\n";
    reset_progress_indicator_byPath($src, 'item');
    for my $path (keys %{$byPath{$src}}) {
        my ($name, $count, $size, $sum) = split /\|/, $byPath{$src}{$path};
        $name  = '' if $clear_name;
        $count = '' if $clear_count;
        $size  = '' if $clear_size;
        $sum   = '' if $clear_sum;
        $byPath{$src}{$path} = "$name|$count|$size|$sum";
        progress_acc(1);
    }
}
###############
sub tree_size($$) {
    # Weighted "bulk" of the branch holding $path in index $base: walk from
    # $path up to the root, adding each ancestor's (size + count + tiny
    # name-hash tiebreaker), weighted so levels nearer the root count more.
    # Used by duplicate_removal_byPath to pick which copy to keep.
    my ($base, $path) = @_;
    # FIX: count slashes with tr (the original used the no-op substitution
    # s/\//\//g for its return count) and drop the unused $leaf_path /
    # $leaf_level locals.
    my $level = ($path =~ tr{/}{}) + 1;
    my $factor = 1.1;
    my $lv_wgt = 1 / ($factor ** $level);
    my $tree_size = 0;
    while ($level and exists $byPath{$base}{$path}) {
        $lv_wgt *= $factor;
        $level--;
        my ($name, $count, $size) = split /\|/, $byPath{$base}{$path};
        $tree_size += ($size + $count + sum_name($name) * 0.00001) * $lv_wgt;
        $path = extract_parent_path($path);
    }
    ($tree_size);
}
#################################################################
# ANALYSIS AUX OPERATIONS
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
sub nonduplicate_removal_byPath($$)
{
# Move every $src entry whose key is unique (no other path shares it) into
# $del, leaving only groups of 2+ identical keys in $src.
my ($src, $del)=@_;
print "non-duplicate removal (+$src/-$del)...\n";
indexing_byPath_to_byKey($src);
reset_progress_indicator_byKey($src, 'item');
# deleting during this loop is safe: keys() returned a snapshot list
foreach my $key (keys %{$byKey{$src}}) {
if (@{$byKey{$src}{$key}} <= 1) { # not a duplicate
foreach my $path (@{$byKey{$src}{$key}}) { $byPath{$del}{$path} = $key; }
delete $byKey{$src}{$key};
}
progress_acc(1);
}
indexing_byKey_to_byPath($src);
clear_byKey($src);
}
sub dir_children_removal_byPath($$$)
{
# For every directory listed in $base (count>1 implies a dir), move all of
# its children found in $src over to $del -- so reports mention only the
# highest containing directory, not every file inside it.
my ($base, $src, $del)=@_;
print " dir cropping ([$base] +$src/-$del)...\n";
reset_progress_indicator_byPath($base, 'item');
foreach my $path (sort keys %{$byPath{$base}}) {
my $key = $byPath{$base}{$path};
my (undef,$count) = split /\|/, $key;
if ($count>1) { # then it's a dir, may contain children
my $pattern = escape_path_in_regexp($path);
#remove_path_pattern_byPath($src, $del, "/^$pattern./"); # too slow
# inner $path shadows the outer loop variable on purpose
foreach my $path (grep /^$pattern./, keys %{$byPath{$src}}) {
$byPath{$del}{$path} = $byPath{$src}{$path};
delete $byPath{$src}{$path};
}
}
progress_acc(1);
}
}
#################################################################
# ANALYSIS OPERATIONS
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
sub pattern_removal_byPath($$)
{
# Apply the -allow / -deny path filters: move non-matching (allow) or
# matching (deny) paths from $src into $del.  -allow wins when both are set
# (documented in the usage text).
my($src, $del) = @_;
return if ( !($allowed_pattern or $denied_pattern) );
print "removing forbiden patterns (+$src/-$del)...\n";
if ($allowed_pattern) {
print " allow /$allowed_pattern/i\n";
#remove_path_pattern_byPath($src, $del, "!/$allowed_pattern/i");
foreach my $path (grep !/$allowed_pattern/i, keys %{$byPath{$src}}) {
$byPath{$del}{$path} = $byPath{$src}{$path};
delete $byPath{$src}{$path};
}
}
elsif ($denied_pattern) {
print " deny /$denied_pattern/i\n";
#remove_path_pattern_byPath($src, $del, "/$denied_pattern/i");
foreach my $path (grep /$denied_pattern/i, keys %{$byPath{$src}}) {
$byPath{$del}{$path} = $byPath{$src}{$path};
delete $byPath{$src}{$path};
}
}
# dir_children_removal_byPath($del, $del, 'void');
# dir_children_removal_byPath($del, $src, 'void');
# does not apply here, e.g. allow *.url but does not allow c:/urls/
}
sub zero_removal_byPath($$)
{
    # Move every zero-size entry of $src into $del, then crop the children
    # of any moved directories from both $del and $src.
    my ($src, $del) = @_;
    print "removing zero length (+$src/-$del)...\n";
    reset_progress_indicator_byPath($src, 'item');
    for my $path (keys %{$byPath{$src}}) {
        my (undef, undef, $size) = split /\|/, $byPath{$src}{$path};
        if ($size == 0) {
            $byPath{$del}{$path} = delete $byPath{$src}{$path};
        }
        progress_acc(1);
    }
    dir_children_removal_byPath($del, $del, 'void');
    dir_children_removal_byPath($del, $src, 'void');
}
sub duplicate_removal_byPath($$$)
{
# For each group of identical keys in $src keep exactly one path -- the one
# whose branch has the biggest (default) or smallest (-keep:small) weighted
# tree_size, restricted to -keep:<pattern> matches when any exist -- and
# move the rest into $del.
my($base, $src, $del) = @_;
print "removing duplicates (+$src/-$del)...\n";
indexing_byPath_to_byKey($src);
reset_progress_indicator_byKey($src, 'item');
my $max_tree_size = 1e+306;
foreach my $key (keys %{$byKey{$src}}) {
# start from ~+inf when minimizing, 0 when maximizing
my ($the_size, $the_path)=(($smaller_tree?$max_tree_size:0), '');
my @candidate = ();
if($keep_path_pattern) {
@candidate = grep /$keep_path_pattern/, @{$byKey{$src}{$key}};
}
# fall back to all paths when the keep-pattern matched none
@candidate = @{$byKey{$src}{$key}} unless @candidate;
foreach my $path (@candidate) {
my $tree_size = tree_size($base, $path);
# xor flips the comparison: picks the max normally, the min for -keep:small
if ($smaller_tree xor ($tree_size>$the_size)) {
$the_size = $tree_size;
$the_path = $path;
}
}
foreach my $path (grep {$_ ne $the_path} @{$byKey{$src}{$key}}) {
$byPath{$del}{$path} = $key;
}
@{$byKey{$src}{$key}} = ($the_path);
progress_acc(1);
}
indexing_byKey_to_byPath($src);
clear_byKey($src);
dir_children_removal_byPath($del, $del, 'void');
dir_children_removal_byPath($del, $src, 'void');
}
sub archiving_removal_byPath($$) # requires zero_removal_byPath
{
    # Move entries worth archiving -- bigger than $zip_minsize but with an
    # average kb-per-entry below $zip_sizefactor -- from $src into $del,
    # then crop children of the moved directories.
    my ($src, $del) = @_;
    print "removing archiving possibilities (+$src/-$del)...\n";
    reset_progress_indicator_byPath($src, 'item');
    for my $path (keys %{$byPath{$src}}) {
        my (undef, $count, $size) = split /\|/, $byPath{$src}{$path};
        if ($size > $zip_minsize and $size / $count < $zip_sizefactor) {
            $byPath{$del}{$path} = delete $byPath{$src}{$path};
        }
        progress_acc(1);
    }
    dir_children_removal_byPath($del, $del, 'void');
    dir_children_removal_byPath($del, $src, 'void');
}
#################################################################
# REPORT FUNCS
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
my $report_totals = 0;
sub report_byKey(@)
{
# Format one report line per (source, key, path) from the byKey indexes of
# the listed sources, sorted; only useful for sources still holding a byKey
# view.  $max_width pads the [source] tag column.
my @out;
my $max_width;
foreach my $src (@_) {
next if (!keys %{$byKey{$src}});
$max_width = $max_width>length($src)?$max_width:length($src);
}
foreach my $src (@_) {
next if (!keys %{$byKey{$src}});
foreach my $key (keys %{$byKey{$src}}) {
my ($name,$count,$size,$sum) = split /\|/, $key;
foreach my $path ( @{$byKey{$src}{$key}}) {
# "[src]    size kb count checksum(*=content sum) path"
push @out, sprintf "[%-".$max_width."s] %12skb %5d %08X%1s %s", ($src, sprintf("%8.3f", $size),$count,$sum,($checksum?'*':''), $path);
}
}
}
(sort @out)
}
sub report_byPath(@)
{
# Format one report line per (source, path) from the byPath indexes of the
# listed sources, sorted, plus an optional per-source TOTAL line when
# $report_totals is set.
my @out;
my $max_width;
foreach my $src (@_) {
next if (!keys %{$byPath{$src}});
$max_width = $max_width>length($src)?$max_width:length($src);
}
foreach my $src (@_) {
next if (!keys %{$byPath{$src}});
my ($total_count, $total_size) = (0,0);
foreach my $path (keys %{$byPath{$src}}) {
my $key = $byPath{$src}{$path};
my ($name,$count,$size,$sum) = split /\|/, $key;
# "[src]    size kb count checksum(*=content sum) path"
push @out, sprintf "[%-".$max_width."s] %12skb %5d %08X%1s %s", ($src, sprintf("%8.3f", $size),$count,$sum,($checksum?'*':''), $path);
# push @out, sprintf "[%-".$max_width."s] %s (%s|%d|%d|%08X)", ($src, $path, $size,$count,$sum,$name);
$total_size += $size, $total_count += $count;
}
push @out, sprintf "[%-".$max_width."s] %12skb %5d %s", ($src, sprintf("%8.3f", $total_size),$total_count, "TOTAL ". ('=' x 40)) if($report_totals);
}
(sort @out)
}
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
sub report_options() {
# One line per active option -- echoed to the console at startup and written
# into the header of every report file.
my @out;
push @out, ' starting points: "'.join('", "', @starting_points).'"';
push @out, sprintf ' localtime: %s', (scalar localtime(time));
push @out, sprintf ' dupcheck: use_checksum=%s, use_names=%s, keep files in the %s branch;', (($checksum?'yes':'no'), (!$no_name?'yes':'no'), ($smaller_tree?'smaller':'bigger')) if($del_check);
push @out, sprintf ' zipcheck: min_avg_file_size=%.3f kbytes, min_size_to_zip_dir=%.3f kbytes;', ($zip_sizefactor, $zip_minsize) if($zip_check);
if ($allowed_pattern) { push @out, ' allow only paths like /'.$allowed_pattern.'/i'; }
elsif ($denied_pattern) { push @out, ' deny all paths like /'.$denied_pattern.'/i'; }
push @out, "";
(@out);
}
###########################
sub report_filename_pieces($) {
    # Build the (prefix, session, date, ext) pieces shared by the short and
    # long report filenames.  The session encodes the optional title and the
    # joined starting points, filename-escaped.
    my ($title) = @_;
    my $prefix = 'dup';
    my $path = join(',', @starting_points);
    my $session = ($title ? escape_name_in_filename($title).'_in_' : '')
                . '(' . escape_name_in_filename($path) . ')';
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime(time);
    # FIX: localtime months are 0-based -- add 1 so July stamps as "07",
    # not "06" (the original date stamp was one month off).
    my $date = sprintf "%04d%02d%02d_%02d%02d%02d",
                       (1900 + $year, $mon + 1, $mday, $hour, $min, $sec);
    my $ext = 'log';
    ($prefix, $session, $date, $ext);
}
sub report_short_filename($) {
    # Date-free report filename: "dup_<session>.log" -- stable across runs,
    # which is what lets -recycle find the previous log.
    my ($title) = @_;
    my ($prefix, $session, undef, $ext) = report_filename_pieces($title);
    return sprintf '%s_%s.%s', $prefix, $session, $ext;
}
sub report_long_filename($) {
    # Timestamped report filename: "dup__<date>__<session>.log" -- one kept
    # per run, for history.
    my ($title) = @_;
    my ($prefix, $session, $date, $ext) = report_filename_pieces($title);
    return sprintf '%s__%s__%s.%s', $prefix, $date, $session, $ext;
}
###########################
my @types_to_report;
sub report_write_file($)
{
    # Assemble the report (its own filename as header, the option lines, the
    # byPath and byKey listings, then any debug lines) and write it.
    my ($filename) = @_;
    my @out;
    push @out, "$filename";                      # header: the log's own name
    push @out, report_options();
    push @out, report_byPath(@types_to_report);  # analysis
    push @out, report_byKey(@types_to_report);   # debug
    push @out, @debug;
    # FIX: 3-arg open on a lexical handle, and the open is now checked -- the
    # original used an unchecked bareword "open LOG, >$filename" and printed
    # the success message even when the file could not be created.  Warn
    # rather than die so one bad path doesn't abort the whole run.
    if (open my $log, '>', $filename) {
        print {$log} join("\n", @out);
        close $log;
        print " log wrote to file $filename\n";
    }
    else {
        warn " could not write log file \"$filename\": $!\n";
    }
}
sub report($$@)
{
    # Write the short (recyclable) and long (timestamped) log files for the
    # given source tags; with no tags, report every tag present in either
    # index.  $totals toggles the per-source TOTAL lines.
    my ($title, $totals, @types) = @_;
    $report_totals = $totals;
    print "reporting result".($title?" \"$title\"":"")." (".join(",",@types).")...\n";
    if (@types) {
        @types_to_report = @types;
    }
    else {
        # FIX: the original did "map { $uniq_types{$_} } keys %byPath" --
        # a plain rvalue hash read in void context that never stored
        # anything, so the "report all types" fallback reported nothing.
        my %uniq_types;
        $uniq_types{$_} = 1 for keys %byPath, keys %byKey;
        @types_to_report = sort keys %uniq_types;
    }
    report_write_file(report_short_filename($title));
    report_write_file(report_long_filename($title));
}
#################################################################
# TOP LEVEL FUNCTION
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
sub analyse()
{
# Analysis pipeline: copy the 'scan' index into a working set ('keeping'),
# progressively move filtered / zero-length / zippable / duplicate entries
# into their own tags, and write a report after each interesting stage.
my $src = 'scan';
my $work = 'keeping';
print "\nDATA ANALISYS ($src)...\n";
copy_byPath($work, $src);
# -noname: blank name (and the now-stale sum) so grouping ignores filenames
clear_key_fields_byPath($work,1,0,0,1) if($no_name);
pattern_removal_byPath($work, 'filtered_out');
zero_removal_byPath($work, 'zero');
archiving_removal_byPath($work, 'zip') if($zip_check);
if ($del_check) {
if ($checksum) {
# cheap name/size keys first, then content checksums on what remains
nonduplicate_removal_byPath($work, 'non_dup');
scanning_for_sum_to_byPath($work);
}
nonduplicate_removal_byPath($work, 'non_dup');
copy_byPath('dup2', $work);
dir_children_removal_byPath('dup2', 'dup2', 'void');
report('duplicate',1, 'dup2'); # all duplicates (for human decisions)
duplicate_removal_byPath($src, $work, 'dup');
# don't double-report deletable dup children already listed as zero/zip
dir_children_removal_byPath('dup', 'zero', 'void');
dir_children_removal_byPath('dup', 'zip', 'void') if($zip_check);
clear_byPath('void');
}
dir_children_removal_byPath($work, $work, 'void');
report('to_be_keept',1, $work); # for [human] size cmps
######
clear_byPath($work,'non_dup','filtered_out');
report('deletable',1, 'zero','dup') if($del_check);
report('zippable',1, 'zip') if($zip_check);
clear_byPath('zero','dup','zip');
}
#################################################################
# STARTING POINT
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
#### SETTIN'UP PARAMS #################################
my $timer_start;
sub init()
{
# Parse @ARGV: bare words become starting points, dash-words set options.
# Prints usage and exits when no starting point is given.
print "\ninitializing...\n";
@starting_points = grep !/^-/, @ARGV;
if (!@starting_points) {
print 'usage:'."\n".
' perl dup.pl [-s:]<starting_dir> [[-s:]<starting_dir>] '."\n".
' [-[no]recycle] [-(deny|allow):<pattern>]'."\n".
' [-[no][check]sum] [-[no][file]name]'."\n".
' [-[no]del] [-keep:(big[ger]|small[er]|<path_pattern>)]'."\n".
' [-[no]zip] [-zip:<min_avg_filesize>:<min_total_dirsize>]'."\n".
"\n".
'examples:'."\n".
' perl dup.pl c:/ "-deny:\\.(c|h|hpp|cpp)\$"'."\n".
' perl dup.pl c: d: -nochecksum -noname'."\n".
' perl dup.pl \'c:\\My Documents\' -sum'."\n".
' perl dup.pl c:\\Archive\\ -zip:10000:200000 -keep:small'."\n".
"\n".
'notes:'."\n".
' 1) some reports will be generaded, no action will be taken;'."\n".
' 2) data scanned and hold on old log files may be reused '."\n".
' (use "-recycle"), there\'s no need to wait a long scan all over;'."\n".
' 3) better zip-test after delete the duplicates;'."\n".
' 4) "-keep:small" means the dups in the less populated branch '."\n".
' would be kept, while while the other may be deleted, '."\n".
' the default is "-keep:big", or -keep:^/start/dir/1;'."\n".
' 5) "-zip:8:50" means that dirs with less than 8kb/file will be '."\n".
' recommended to be zipped, but only if the dir sum 50kb in size; '."\n".
' 6) "-allow" overrides "-deny", they cant be used together in this version '."\n".
' recommended to be zipped, but only if the dir sum 50kb in size; '."\n".
' 7) this is a open-src change as much as u want.'."\n".
"\n".
' 9H.zED ICQ#2405375 zed@9hells.org'."\n".
' RELEASED IN 20040628 0911 Brazil'."\n".
"\n";
exit;
}
# option defaults
$recycle = 0; # gather info from old logs?
$del_check = 1; # perform dup check?
$checksum = 0; # perform full binary checksum?
$no_name = 0; # use filenames in the hash-key to find dups?
$smaller_tree = 0; # keeps dup files on smaller or bigger branchs?
$keep_path_pattern = 0; # keeps dup files that match that path pattern
$zip_check = 1; # perform zip check?
$zip_sizefactor = 8.000; # minimum kbytes per file avg
$zip_minsize = 50.000; # minimum size of a dir to allow zipping
$allowed_pattern = '';
$denied_pattern = '';
# Option dispatch: each line is "match and assign or try-next".
# NOTE(review): the and/or chain has a quirk -- when an assignment yields a
# false value (e.g. -norecycle sets $recycle to ''), "or" falls through and
# tries the remaining patterns against the same word.  Harmless here only
# because no later pattern can match the same option word.
map {
/^-(no)?recycle$/i and $recycle = (lc($1) ne 'no') or
/^-(no)?del$/i and $del_check = (lc($1) ne 'no') or
/^-(no)?zip$/i and $zip_check = (lc($1) ne 'no') or
/^-(no)?(check)?sum$/i and $checksum = (lc($1) ne 'no') or
/^-(no)?(file)?name$/i and $no_name = (lc($1) eq 'no') or
/^-allow[\:=](.*)$/i and $allowed_pattern = $1 or
/^-deny[\:=](.*)$/i and $denied_pattern = $1 or
/^-(?:keep[\:=])?(big|small).*$/i and $smaller_tree=(lc($1) eq 'small') or
/^-keep[\:=](.*)$/i and $keep_path_pattern=$1 or
/^-zip[\:=](.*)\:(.*)$/i and ($zip_sizefactor,$zip_minsize) = ($1,$2) or
/^-s(?:tart(?:ting(?:_point|_dir)?)?)?[\:=](.*)$/i and push @starting_points, $1
} @ARGV;
@starting_points = sort @starting_points;
print join("\n", report_options()) . "\n";
$timer_start = time;
}
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
sub finish()
{
# Print the total elapsed time as " took X days Y hours MM min SS sec."
# (or " took nothing." for a sub-second run).
print "\nfinishing...\n";
print " took";
my ($day,$hour,$min,$sec);
$sec = (time-$timer_start);
if($sec>0) {
# break seconds down into day/hour/min remainders
$min = int($sec/60); $sec-=$min*60;
$hour = int($min/60); $min-=$hour*60;
$day = int($hour/24); $hour-=$day*24;
# each unit prints only when non-zero; the separate "s" pluralizes it
printf " %d day", ($day) if($day); print "s" if($day>1);
printf " %d hour", ($hour) if($hour); print "s" if($hour>1);
printf " %02d min", ($min) if($min); print "s" if($min>1);
printf " %02d sec", ($sec) if($sec); print "s" if($sec>1);
}
else {
print " nothing";
}
print ".\n";
}
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
# main()
# Pipeline: parse options, build the 'scan' index, analyse it into report
# tags, print elapsed time.  Reports only -- nothing is ever deleted.
init();
scanning_to_byPath(); # output to 'scan'
analyse();
finish();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment