Skip to content

Instantly share code, notes, and snippets.

@zed9h
Created July 18, 2009 15:20
Show Gist options
  • Save zed9h/149582 to your computer and use it in GitHub Desktop.
Save zed9h/149582 to your computer and use it in GitHub Desktop.
suggests duplicate-file removal and small-file backup
#!/usr/bin/perl
use strict;
# NOTE(review): "use warnings;" is deliberately not added -- the code relies
# on several undef-tolerant idioms (e.g. uninitialized $max_width, lc($1))
# that would flood the output with warnings.

# Global state shared by every phase of the script.
my @starting_points;
# ^ root dirs/files to scan (non-option @ARGV words and -s: options)
my ($recycle);
# ^ reuse data from previous log files instead of re-scanning?
my ($del_check, $checksum, $no_name, $smaller_tree, $keep_path_pattern);
# ^ duplicate-check options (-del, -sum, -name, -keep:...)
my ($zip_check, $zip_sizefactor, $zip_minsize);
# ^ zip-suggestion options (-zip:<min_avg_filesize>:<min_total_dirsize>)
my ($denied_pattern, $allowed_pattern);
# ^ case-insensitive path filters (-deny: / -allow:)
my %byPath;
# ^ $byPath{tag}{path} = "name|count|size|sum"  (the main per-path index)
my %byKey;
# ^ $byKey{tag}{"name|count|size|sum"} = [ paths sharing that key ]
my @debug;
# ^ extra lines appended verbatim to every report
# TO DO
# *make a scan cache and a sum cache, incremental..
# -[no]cache (to replace [log] -recycle)
## RELEASE NOTES #################################### 20040628 0911
#
# DISCLAIMER:
# It's not just a legal disclaimer: this program really
# has not been tested enough, so be careful with what you delete.
# Use it as a tool to find duplicates, not to free disk
# space automatically.
# progname: dup.pl (perl 5.x for windows/and probably unix)
# programmer: carlo a caputo (aka 9H.zED) <carlo.caputo@gmail.com>
# description: find duplicates, zero length and zippable dirs(&files).
# developed on ActivePerl Build 518 in a Win98 running on a P2-350
# 128Mb 12Gb of a replete-of-messy-crap HD.
# tested with 3145 dirs and 44168 files on devel comp.
# tested on a 1979 dirs and 40587 files NT server IIS4 IS2 SQLSrv
# to begin: Search for: main()
#################################################################
# UTIL
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
sub escape_path_in_regexp($)
{
    # Backslash-escape a literal path for safe interpolation into a regex:
    # backslashes first, then slashes, then the metachars ^ $ + : ( ) [ ] { }.
    my ($literal) = @_;
    $literal =~ s/\\/\\\\/g;
    $literal =~ s/\//\\\//g;
    $literal =~ s/([\^\$\+\:\(\)\[\]\{\}])/\\$1/g;
    return $literal;
}
sub escape_name_in_filename($) {
    # Make a path/title safe for use inside a log filename: spaces become '_'
    # and ':' '.' '/' '\' become '%'.
    # FIX: work on a copy.  The original edited $_[0] in place, which mutates
    # the *caller's* variable through @_ aliasing -- a side effect no caller
    # in this file needs.
    my ($name) = @_;
    $name =~ s/ /_/g;
    $name =~ s/[\:\.\/\\]/\%/g;
    ($name);
}
sub extract_name($)
{
    # Return the last path component ('a/b/c' and 'a/b/c/' both yield 'c').
    # FIX: capture into a lexical via list-context match, so a path that does
    # not match (no '/' at all) yields undef instead of whatever stale value a
    # previous successful regex happened to leave in $1.
    my ($path) = @_;
    my ($name) = $path =~ m{/([^/]+)/?$};
    ($name);
}
sub extract_parent_path($)
{
    # Drop the final path component, keeping the parent's trailing slash
    # ('a/b/c' and 'a/b/c/' both yield 'a/b/').
    my ($full) = @_;
    $full =~ m{^(.*?/)[^/]+?/?$};
    return $1;
}
### PROGRESS INDICATOR ###################
# Shared state for the two progress displays (percentage and depth gauge).
my ($acc_progress, $actual_progress, $total_progress);
# ^ percentage display: pending / displayed / expected totals
my ($scan_deep, $scan_max_deep, $scan_path); # alternative for directories in blind scanning
my $progress_display_period = 1; #sec
my $progress_last_display;
# ^ epoch seconds of the last redraw (0 forces an immediate redraw)
sub reset_progress_indicator_byPath($$)
{
    # Prime the progress counters for a pass over $byPath{$src}.
    # $type selects what each entry contributes to $total_progress:
    # its size, its count, or 1 per item.
    my ($src, $type) = @_;
    $total_progress = 0;
    foreach my $path (keys %{$byPath{$src}}) {
        my (undef, $count, $size) = split /\|/, $byPath{$src}{$path};
        my $amount = $type eq 'size'  ? $size
                   : $type eq 'count' ? $count
                   :                    1;
        $total_progress += $amount;
    }
    ($acc_progress, $actual_progress) = (0, 0);
    $progress_last_display = 0;
}
sub reset_progress_indicator_byKey($$)
{
    # Prime the progress counters for a pass over $byKey{$src}.  Each key's
    # amount (size, count, or 1 per item) is multiplied by the number of
    # paths sharing that key.
    my ($src, $type) = @_;
    $total_progress = 0;
    foreach my $key (keys %{$byKey{$src}}) {
        my (undef, $count, $size) = split /\|/, $key;
        my $amount = $type eq 'size'  ? $size
                   : $type eq 'count' ? $count
                   :                    1;
        $total_progress += $amount * @{$byKey{$src}{$key}};
    }
    ($acc_progress, $actual_progress) = (0, 0);
    $progress_last_display = 0;
}
sub reset_deep_indicator()
{
    # Reset the blind-scan depth display (depth, deepest level seen, current
    # path) and force the next deep_update() to redraw immediately.
    # FIX: removed "my ($rez) = @_;" -- the sub has an empty prototype, no
    # caller passes arguments, and $rez was never used.
    ($scan_deep, $scan_max_deep, $scan_path) = (0, 0, '');
    $progress_last_display = 0;
}
sub progress_acc($)
{
    # Accumulate $done units of progress; at most once per display period,
    # fold the accumulator into the running total and print a percentage
    # (carriage return keeps it on one console line).
    my ($done) = @_;
    $acc_progress += $done;
    return unless $total_progress;
    if (time > $progress_last_display + $progress_display_period) {
        $actual_progress += $acc_progress;
        $acc_progress = 0;
        printf "%3.2f%%\r", $actual_progress * 100.0 / $total_progress;
        $progress_last_display = time;
    }
}
sub deep_update()
{
# Render a one-line depth gauge for the blind directory scan:
# "#" per active depth level, ":" up to the deepest level reached so far,
# followed by the current path (middle-elided if too long), padded to 79
# columns and printed with "\r" so it overwrites itself.
if ($scan_deep>$scan_max_deep) {
$scan_max_deep=$scan_deep;
$progress_last_display = 0; # force redisplay
}
if ($progress_last_display+$progress_display_period <= time) {
my ($width,$min_width) = (79,10);
my ($full, $part);
$full .= $part= ("#" x $scan_deep).(":" x ($scan_max_deep-$scan_deep));
$width -= length($part);
$part = " (\"".$scan_path."\")";
if ($width > length($part)) {
# the whole path fits on the line
$full .= $part;
$width -= length($part);
}
else {
# elide the middle of the path, keeping both ends -- but only if at
# least $min_width characters of path would remain visible
$part = " (\"\"...\"\")";
my $path_width = $width-length($part);
if ($path_width > $min_width) {
my $path_half = $path_width/2;
$part = " (\"".substr($scan_path,0,$path_half)."\"...";
$part .= "\"".substr($scan_path,length($scan_path)-$path_half+1,$path_half)."\")";
$full .= $part;
$width -= length($part);
}
}
$full .= " " x $width; # blank out leftovers from a previous, longer line
print "$full\r";
$progress_last_display = time;
}
}
#################################################################
# BASIC OPERATIONS
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
sub clear_byPath(@)
{
    # Drop the entire byPath index for each listed source tag.
    delete $byPath{$_} for @_;
}
sub clear_byKey(@)
{
    # Drop the entire byKey index for each listed source tag.
    delete $byKey{$_} for @_;
}
sub copy_byPath($$)
{
    # Copy every path=>key entry of $src into $dest (overwriting clashes;
    # unrelated entries already in $dest are kept).
    my ($dest, $src) = @_;
    while (my ($path, $key) = each %{$byPath{$src}}) {
        $byPath{$dest}{$path} = $key;
    }
}
sub copy_byKey($$)
{
    # Append each of $src's path lists onto the matching list in $dest.
    my ($dest, $src) = @_;
    for my $key (keys %{$byKey{$src}}) {
        push @{ $byKey{$dest}{$key} }, @{ $byKey{$src}{$key} };
    }
}
sub indexing_byKey_to_byPath($)
{
    # Rebuild $byPath{$type} from $byKey{$type}: one path=>key entry per path.
    my ($type) = @_;
    delete $byPath{$type};
    while (my ($key, $paths) = each %{$byKey{$type}}) {
        $byPath{$type}{$_} = $key for @$paths;
    }
}
sub indexing_byPath_to_byKey($)
{
    # Rebuild $byKey{$type} from $byPath{$type}: paths sharing a key are
    # grouped under it, pushed in sorted path order.
    my ($type) = @_;
    clear_byKey($type);
    for my $path (sort keys %{$byPath{$type}}) {
        push @{ $byKey{$type}{ $byPath{$type}{$path} } }, $path;
    }
}
#remove_path_pattern_byPath($src, $del, "/^$pattern./");
#remove_path_pattern_byPath($src, $del, "!/$allowed_pattern/i");
#remove_path_pattern_byPath($src, $del, "/$denied_pattern/i");
# <code below copied to locations above for speed optimization>
sub remove_path_pattern_byPath($$$)
{
# Move every $src path for which the string-eval'd $condition holds (with
# the path in $_) over to $del.  $condition is a literal chunk of Perl such
# as "/^pattern./" -- a string eval per grep element, which is why the hot
# call sites inline the loop instead (see the commented-out calls above).
# NOTE(review): string eval on an interpolated pattern -- acceptable here
# only because $condition is always built from this script's own option
# values, never from scanned file content.
my ($src, $del, $condition)=@_;
return if (!exists $byPath{$src});
foreach my $path (sort grep {eval($condition)} keys %{$byPath{$src}}) {
$byPath{$del}{$path} = $byPath{$src}{$path};
delete $byPath{$src}{$path};
}
}
sub remove_path_list_byPath($$$)
{
    # Move every path listed in $to_remove that also exists in $src over
    # to $del.
    my ($src, $del, $to_remove) = @_;
    return unless exists $byPath{$src};
    for my $path (sort keys %{$byPath{$to_remove}}) {
        next unless exists $byPath{$src}{$path};
        $byPath{$del}{$path} = delete $byPath{$src}{$path};
    }
}
#################################################################
# SCANNING
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
my ($dir_count, $file_count)=(0,0);
# ^ statistics only (printed after scanning)
my %path_cache;
# ^ path -> "name|count|size|sum" (filled by scan() or recycled log files)
my %sum_cache;
# ^ path -> content checksum (filled by scan_for_sum() or recycled log files)
my $checksum_width = 4294967295; #32-bit - 1
sub sum_name($)
{
    # Positional checksum of an uppercased name: sum of byte*position,
    # kept within $checksum_width.
    my ($name) = @_;
    my $sum = 0;
    my $pos = 0;
    for my $byte (unpack "C*", uc $name) {
        $sum = ($sum + $byte * ++$pos) % $checksum_width;
    }
    return $sum;
}
sub sum_filename($)
{
    # Checksum of just the final component of a path.
    my ($path) = @_;
    return sum_name(extract_name($path));
}
sub sum_file($)
{
    # Positional 32-bit checksum of a file's contents, read in 64KB records.
    # Returns 0 when the file cannot be opened (the scan continues
    # best-effort, as before).
    my ($path) = @_;
    my $sum = 0;
    # FIX: 3-arg open with a lexical handle (was a bareword 2-arg open).
    if (open my $fh, '<', $path) {
        binmode $fh;
        # FIX: the original did "$/ = 65536", which sets the input record
        # separator to the *string* "65536" (splitting the file wherever that
        # substring occurs) and leaked the change into every later line read.
        # A reference to an integer requests fixed-size records, and local()
        # restores $/ on scope exit.
        local $/ = \65536;
        my $n = 1;
        while (<$fh>) {
            $sum = ($sum + unpack("%32C*", $_) * ($n++)) % $checksum_width;
        }
        close $fh;
    }
    ($sum);
}
######################
sub scan($) # get name, count, size
{
# Recursively walk $path, producing a "name|count|size|sum" key per entry:
#   count = entries in the subtree (1 for a file),
#   size  = bytes/1000 (the script's "kb"),
#   sum   = cheap name-based checksum, mixed order-sensitively over children.
# Results go into $byPath{'scan'} and %path_cache.
my ($path) = @_;
$path =~ s|\\|/|g;
$path =~ s|^(.*?)/?$|$1|;
return if (-l $path); # don't follow symlinks
$path .= '/' if (-d $path);
# NOTE(review): the defaults below are computed before the cache test, so a
# cache hit still pays for one -s stat and a name checksum.
my ($name,$count,$size,$sum) = (extract_name($path), 1, (-s $path)/1000.0, sum_filename($path));
# cache check
if (exists $path_cache{$path}) {
return ($name,$count,$size,$sum) = split /\|/, $path_cache{$path};
}
$scan_deep++; # go deep
$scan_path = $path;
deep_update();
if (-f $path) {
# all defaults apply
$file_count++; # statistics only
}
if (-d $path) {
if (opendir DIR, "$path") {
# or die "could not open dir: \"$path\"";
my @entry_list = readdir DIR;
foreach my $entry (sort @entry_list) {
next if($entry =~ /^\.{1,2}$/); # skip '.' and '..'
my (undef,$dcount,$dsize,$dsum) = scan("$path$entry");
$count += $dcount;
$size += $dsize;
# mix child sums weighted by the running count (order-sensitive,
# hence the sorted readdir above)
$sum = ($sum + $dsum*$count) % $checksum_width;
}
closedir DIR;
}
$dir_count++; # statistics only
}
$scan_deep--; # back
my $key = "$name|$count|$size|$sum";
$byPath{'scan'}{$path} = $key;
$path_cache{$path} = $key;
($name,$count,$size,$sum);
}
sub scanning_to_byPath()
{
# Fill $byPath{'scan'}: either by recycling a previous "scanned" log file
# (-recycle) or by walking every starting point with scan().
my $src = 'scan';
print "gathering info...\n";
my $logfile = report_short_filename('scanned');
if($recycle and open FILE, "$logfile") {
# get all pre-scanned data from log (wait for synched data)
# NOTE(review): bareword handle, 2-arg open, and $/ assigned globally
# rather than local'ized -- kept as-is; later readers reset $/ themselves.
print " recycling old log file (\"$logfile\")...\n";
$/ = "\n";
while(<FILE>) {
#[scan] 0.000kb 1 00000000 c:/_/_chaos/_download/1/
if (/^\[[^\]]+\]\s+([\d\.]+)kb\s+(\d+)\s+([0-9A-F]+)\*?\s+(.*?)$/) {
# $1=size(kb) $2=count $3=hex checksum $4=path
my $path = $4;
my ($name,$count,$size,$sum) = (extract_name($path), $2, 0+$1, hex $3);
my $key = "$name|$count|$size|$sum";
$byPath{$src}{$path} = $key;
$path_cache{$path} = $key;
($count==1?$file_count:$dir_count)++; # statistics only
}
}
close FILE;
}
else {
# launch scan
foreach my $starting_point (@starting_points) {
my $msg = " scanning (\"$starting_point\")...";
print $msg . " " x (79-length($msg)) . "\n";
reset_deep_indicator();
scan($starting_point);
}
report('scanned',0, $src);
}
# final statistics line, padded to 79 cols to overwrite the progress display
my $msg = " got $dir_count dir".($dir_count!=1?'s':'')." and ".
"$file_count file".($file_count!=1?'s':'').".";
print $msg . " " x (79-length($msg)) . "\n";
}
########################
sub scan_for_sum($) # requires 'scan' indexing
{
# Content checksum of $path: sum_file() for a file; for a directory, the
# (order-independent) sum of every file below it in the 'scan' index.
# Results are memoized in %sum_cache.
my ($path) = @_;
my ($sum) = (0);
# cache check
if (exists $sum_cache{$path}) {
return ($sum_cache{$path});
}
if (-f $path) {
$sum = sum_file("$path") % $checksum_width;
}
if (-d $path) {
my $pattern = escape_path_in_regexp($path);
# every scanned path strictly below this dir (the inner $path lexical
# deliberately shadows the outer one)
foreach my $path (sort grep /^$pattern./, keys %{$byPath{scan}}) {
if (-f $path) {
$sum = ($sum + sum_file("$path")) % $checksum_width;
}
}
}
$sum_cache{$path} = $sum;
($sum);
}
sub scanning_for_sum_to_byPath($) # fill-in the sum field (in key)
{
# Replace the cheap name-based sum in every $src key with a real content
# checksum.  A previous "summed" log file can pre-seed %sum_cache
# (-recycle); log data is used as cache only, never trusted as final.
my ($src) = @_;
print "verifying full-checksum ($src)...\n";
my $logfile = report_short_filename('summed');
if($recycle and open FILE, "$logfile") {
# just precache cos' cant be sure of the synch between log's and needed data
print " recycling old log file (\"$logfile\")...\n";
$/ = "\n";
while(<FILE>) {
#[keeping] 0.003kb 1 00000048* c:/_/_chaos/_download/tmp2/.xx
if (/^\[[^\]]+\]\s+([\d\.]+)kb\s+(\d+)\s+([0-9A-F]+)\*?\s+(.*?)$/) {
my $path = $4;
my ($sum) = (hex $3);
$sum_cache{$path} = $sum;
}
}
close FILE;
}
reset_progress_indicator_byPath($src, 'size');
foreach my $path (sort keys %{$byPath{$src}}) {
my $key = $byPath{$src}{$path};
my ($name,$count,$size,$sum) = split /\|/, $key;
$sum = scan_for_sum($path); # new, and improved, checksum
my $new_key = "$name|$count|$size|$sum";
$byPath{$src}{$path} = $new_key;
progress_acc($size);
}
report('summed',0, $src);
}
######################
sub clear_key_fields_byPath($$$$$) # usually used before scanning_for_sum_to_byPath()
{
    # Blank out the selected fields of every "name|count|size|sum" key in
    # $src, so that later duplicate grouping ignores those fields.
    my ($src, $clear_name, $clear_count, $clear_size, $clear_sum) = @_;
    print "clearing name in key-info ($src)...\n";
    reset_progress_indicator_byPath($src, 'item');
    for my $path (keys %{$byPath{$src}}) {
        my ($name, $count, $size, $sum) = split /\|/, $byPath{$src}{$path};
        $name  = '' if $clear_name;
        $count = '' if $clear_count;
        $size  = '' if $clear_size;
        $sum   = '' if $clear_sum;
        $byPath{$src}{$path} = "$name|$count|$size|$sum";
        progress_acc(1);
    }
}
###############
sub tree_size($$) {
    # Weighted "bulk" of the branch holding $path in index $base: walk from
    # $path up to the root, adding each ancestor's (size + count + tiny
    # name-hash tiebreaker), weighted so levels nearer the root count more.
    # Used by duplicate_removal_byPath to pick which copy to keep.
    my ($base, $path) = @_;
    # FIX: count slashes with tr (the original used the no-op substitution
    # s/\//\//g for its return count) and drop the unused $leaf_path /
    # $leaf_level locals.
    my $level = ($path =~ tr{/}{}) + 1;
    my $factor = 1.1;
    my $lv_wgt = 1 / ($factor ** $level);
    my $tree_size = 0;
    while ($level and exists $byPath{$base}{$path}) {
        $lv_wgt *= $factor;
        $level--;
        my ($name, $count, $size) = split /\|/, $byPath{$base}{$path};
        $tree_size += ($size + $count + sum_name($name) * 0.00001) * $lv_wgt;
        $path = extract_parent_path($path);
    }
    ($tree_size);
}
#################################################################
# ANALYSIS AUX OPERATIONS
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
sub nonduplicate_removal_byPath($$)
{
# Move every $src entry whose key is unique (no other path shares it) into
# $del, leaving only groups of 2+ identical keys in $src.
my ($src, $del)=@_;
print "non-duplicate removal (+$src/-$del)...\n";
indexing_byPath_to_byKey($src);
reset_progress_indicator_byKey($src, 'item');
# deleting during this loop is safe: keys() returned a snapshot list
foreach my $key (keys %{$byKey{$src}}) {
if (@{$byKey{$src}{$key}} <= 1) { # not a duplicate
foreach my $path (@{$byKey{$src}{$key}}) { $byPath{$del}{$path} = $key; }
delete $byKey{$src}{$key};
}
progress_acc(1);
}
indexing_byKey_to_byPath($src);
clear_byKey($src);
}
sub dir_children_removal_byPath($$$)
{
# For every directory listed in $base (count>1 implies a dir), move all of
# its children found in $src over to $del -- so reports mention only the
# highest containing directory, not every file inside it.
my ($base, $src, $del)=@_;
print " dir cropping ([$base] +$src/-$del)...\n";
reset_progress_indicator_byPath($base, 'item');
foreach my $path (sort keys %{$byPath{$base}}) {
my $key = $byPath{$base}{$path};
my (undef,$count) = split /\|/, $key;
if ($count>1) { # then it's a dir, may contain children
my $pattern = escape_path_in_regexp($path);
#remove_path_pattern_byPath($src, $del, "/^$pattern./"); # too slow
# inner $path shadows the outer loop variable on purpose
foreach my $path (grep /^$pattern./, keys %{$byPath{$src}}) {
$byPath{$del}{$path} = $byPath{$src}{$path};
delete $byPath{$src}{$path};
}
}
progress_acc(1);
}
}
#################################################################
# ANALYSIS OPERATIONS
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
sub pattern_removal_byPath($$)
{
# Apply the -allow / -deny path filters: move non-matching (allow) or
# matching (deny) paths from $src into $del.  -allow wins when both are set
# (documented in the usage text).
my($src, $del) = @_;
return if ( !($allowed_pattern or $denied_pattern) );
print "removing forbiden patterns (+$src/-$del)...\n";
if ($allowed_pattern) {
print " allow /$allowed_pattern/i\n";
#remove_path_pattern_byPath($src, $del, "!/$allowed_pattern/i");
foreach my $path (grep !/$allowed_pattern/i, keys %{$byPath{$src}}) {
$byPath{$del}{$path} = $byPath{$src}{$path};
delete $byPath{$src}{$path};
}
}
elsif ($denied_pattern) {
print " deny /$denied_pattern/i\n";
#remove_path_pattern_byPath($src, $del, "/$denied_pattern/i");
foreach my $path (grep /$denied_pattern/i, keys %{$byPath{$src}}) {
$byPath{$del}{$path} = $byPath{$src}{$path};
delete $byPath{$src}{$path};
}
}
# dir_children_removal_byPath($del, $del, 'void');
# dir_children_removal_byPath($del, $src, 'void');
# does not apply here, e.g. allow *.url but does not allow c:/urls/
}
sub zero_removal_byPath($$)
{
    # Move every zero-size entry of $src into $del, then crop the children
    # of any moved directories from both $del and $src.
    my ($src, $del) = @_;
    print "removing zero length (+$src/-$del)...\n";
    reset_progress_indicator_byPath($src, 'item');
    for my $path (keys %{$byPath{$src}}) {
        my (undef, undef, $size) = split /\|/, $byPath{$src}{$path};
        if ($size == 0) {
            $byPath{$del}{$path} = delete $byPath{$src}{$path};
        }
        progress_acc(1);
    }
    dir_children_removal_byPath($del, $del, 'void');
    dir_children_removal_byPath($del, $src, 'void');
}
sub duplicate_removal_byPath($$$)
{
# For each group of identical keys in $src keep exactly one path -- the one
# whose branch has the biggest (default) or smallest (-keep:small) weighted
# tree_size, restricted to -keep:<pattern> matches when any exist -- and
# move the rest into $del.
my($base, $src, $del) = @_;
print "removing duplicates (+$src/-$del)...\n";
indexing_byPath_to_byKey($src);
reset_progress_indicator_byKey($src, 'item');
my $max_tree_size = 1e+306;
foreach my $key (keys %{$byKey{$src}}) {
# start from ~+inf when minimizing, 0 when maximizing
my ($the_size, $the_path)=(($smaller_tree?$max_tree_size:0), '');
my @candidate = ();
if($keep_path_pattern) {
@candidate = grep /$keep_path_pattern/, @{$byKey{$src}{$key}};
}
# fall back to all paths when the keep-pattern matched none
@candidate = @{$byKey{$src}{$key}} unless @candidate;
foreach my $path (@candidate) {
my $tree_size = tree_size($base, $path);
# xor flips the comparison: picks the max normally, the min for -keep:small
if ($smaller_tree xor ($tree_size>$the_size)) {
$the_size = $tree_size;
$the_path = $path;
}
}
foreach my $path (grep {$_ ne $the_path} @{$byKey{$src}{$key}}) {
$byPath{$del}{$path} = $key;
}
@{$byKey{$src}{$key}} = ($the_path);
progress_acc(1);
}
indexing_byKey_to_byPath($src);
clear_byKey($src);
dir_children_removal_byPath($del, $del, 'void');
dir_children_removal_byPath($del, $src, 'void');
}
sub archiving_removal_byPath($$) # requires zero_removal_byPath
{
    # Move entries worth archiving -- bigger than $zip_minsize but with an
    # average kb-per-entry below $zip_sizefactor -- from $src into $del,
    # then crop children of the moved directories.
    my ($src, $del) = @_;
    print "removing archiving possibilities (+$src/-$del)...\n";
    reset_progress_indicator_byPath($src, 'item');
    for my $path (keys %{$byPath{$src}}) {
        my (undef, $count, $size) = split /\|/, $byPath{$src}{$path};
        if ($size > $zip_minsize and $size / $count < $zip_sizefactor) {
            $byPath{$del}{$path} = delete $byPath{$src}{$path};
        }
        progress_acc(1);
    }
    dir_children_removal_byPath($del, $del, 'void');
    dir_children_removal_byPath($del, $src, 'void');
}
#################################################################
# REPORT FUNCS
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
my $report_totals = 0;
sub report_byKey(@)
{
# Format one report line per (source, key, path) from the byKey indexes of
# the listed sources, sorted; only useful for sources still holding a byKey
# view.  $max_width pads the [source] tag column.
my @out;
my $max_width;
foreach my $src (@_) {
next if (!keys %{$byKey{$src}});
$max_width = $max_width>length($src)?$max_width:length($src);
}
foreach my $src (@_) {
next if (!keys %{$byKey{$src}});
foreach my $key (keys %{$byKey{$src}}) {
my ($name,$count,$size,$sum) = split /\|/, $key;
foreach my $path ( @{$byKey{$src}{$key}}) {
# "[src]    size kb count checksum(*=content sum) path"
push @out, sprintf "[%-".$max_width."s] %12skb %5d %08X%1s %s", ($src, sprintf("%8.3f", $size),$count,$sum,($checksum?'*':''), $path);
}
}
}
(sort @out)
}
sub report_byPath(@)
{
# Format one report line per (source, path) from the byPath indexes of the
# listed sources, sorted, plus an optional per-source TOTAL line when
# $report_totals is set.
my @out;
my $max_width;
foreach my $src (@_) {
next if (!keys %{$byPath{$src}});
$max_width = $max_width>length($src)?$max_width:length($src);
}
foreach my $src (@_) {
next if (!keys %{$byPath{$src}});
my ($total_count, $total_size) = (0,0);
foreach my $path (keys %{$byPath{$src}}) {
my $key = $byPath{$src}{$path};
my ($name,$count,$size,$sum) = split /\|/, $key;
# "[src]    size kb count checksum(*=content sum) path"
push @out, sprintf "[%-".$max_width."s] %12skb %5d %08X%1s %s", ($src, sprintf("%8.3f", $size),$count,$sum,($checksum?'*':''), $path);
# push @out, sprintf "[%-".$max_width."s] %s (%s|%d|%d|%08X)", ($src, $path, $size,$count,$sum,$name);
$total_size += $size, $total_count += $count;
}
push @out, sprintf "[%-".$max_width."s] %12skb %5d %s", ($src, sprintf("%8.3f", $total_size),$total_count, "TOTAL ". ('=' x 40)) if($report_totals);
}
(sort @out)
}
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
sub report_options() {
# One line per active option -- echoed to the console at startup and written
# into the header of every report file.
my @out;
push @out, ' starting points: "'.join('", "', @starting_points).'"';
push @out, sprintf ' localtime: %s', (scalar localtime(time));
push @out, sprintf ' dupcheck: use_checksum=%s, use_names=%s, keep files in the %s branch;', (($checksum?'yes':'no'), (!$no_name?'yes':'no'), ($smaller_tree?'smaller':'bigger')) if($del_check);
push @out, sprintf ' zipcheck: min_avg_file_size=%.3f kbytes, min_size_to_zip_dir=%.3f kbytes;', ($zip_sizefactor, $zip_minsize) if($zip_check);
if ($allowed_pattern) { push @out, ' allow only paths like /'.$allowed_pattern.'/i'; }
elsif ($denied_pattern) { push @out, ' deny all paths like /'.$denied_pattern.'/i'; }
push @out, "";
(@out);
}
###########################
sub report_filename_pieces($) {
    # Build the (prefix, session, date, ext) pieces shared by the short and
    # long report filenames.  The session encodes the optional title and the
    # joined starting points, filename-escaped.
    my ($title) = @_;
    my $prefix = 'dup';
    my $path = join(',', @starting_points);
    my $session = ($title ? escape_name_in_filename($title).'_in_' : '')
                . '(' . escape_name_in_filename($path) . ')';
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime(time);
    # FIX: localtime months are 0-based -- add 1 so July stamps as "07",
    # not "06" (the original date stamp was one month off).
    my $date = sprintf "%04d%02d%02d_%02d%02d%02d",
                       (1900 + $year, $mon + 1, $mday, $hour, $min, $sec);
    my $ext = 'log';
    ($prefix, $session, $date, $ext);
}
sub report_short_filename($) {
    # Date-free report filename: "dup_<session>.log" -- stable across runs,
    # which is what lets -recycle find the previous log.
    my ($title) = @_;
    my ($prefix, $session, undef, $ext) = report_filename_pieces($title);
    return sprintf '%s_%s.%s', $prefix, $session, $ext;
}
sub report_long_filename($) {
    # Timestamped report filename: "dup__<date>__<session>.log" -- one kept
    # per run, for history.
    my ($title) = @_;
    my ($prefix, $session, $date, $ext) = report_filename_pieces($title);
    return sprintf '%s__%s__%s.%s', $prefix, $date, $session, $ext;
}
###########################
my @types_to_report;
sub report_write_file($)
{
    # Assemble the report (its own filename as header, the option lines, the
    # byPath and byKey listings, then any debug lines) and write it.
    my ($filename) = @_;
    my @out;
    push @out, "$filename";                      # header: the log's own name
    push @out, report_options();
    push @out, report_byPath(@types_to_report);  # analysis
    push @out, report_byKey(@types_to_report);   # debug
    push @out, @debug;
    # FIX: 3-arg open on a lexical handle, and the open is now checked -- the
    # original used an unchecked bareword "open LOG, >$filename" and printed
    # the success message even when the file could not be created.  Warn
    # rather than die so one bad path doesn't abort the whole run.
    if (open my $log, '>', $filename) {
        print {$log} join("\n", @out);
        close $log;
        print " log wrote to file $filename\n";
    }
    else {
        warn " could not write log file \"$filename\": $!\n";
    }
}
sub report($$@)
{
    # Write the short (recyclable) and long (timestamped) log files for the
    # given source tags; with no tags, report every tag present in either
    # index.  $totals toggles the per-source TOTAL lines.
    my ($title, $totals, @types) = @_;
    $report_totals = $totals;
    print "reporting result".($title?" \"$title\"":"")." (".join(",",@types).")...\n";
    if (@types) {
        @types_to_report = @types;
    }
    else {
        # FIX: the original did "map { $uniq_types{$_} } keys %byPath" --
        # a plain rvalue hash read in void context that never stored
        # anything, so the "report all types" fallback reported nothing.
        my %uniq_types;
        $uniq_types{$_} = 1 for keys %byPath, keys %byKey;
        @types_to_report = sort keys %uniq_types;
    }
    report_write_file(report_short_filename($title));
    report_write_file(report_long_filename($title));
}
#################################################################
# TOP LEVEL FUNCTION
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
sub analyse()
{
# Analysis pipeline: copy the 'scan' index into a working set ('keeping'),
# progressively move filtered / zero-length / zippable / duplicate entries
# into their own tags, and write a report after each interesting stage.
my $src = 'scan';
my $work = 'keeping';
print "\nDATA ANALISYS ($src)...\n";
copy_byPath($work, $src);
# -noname: blank name (and the now-stale sum) so grouping ignores filenames
clear_key_fields_byPath($work,1,0,0,1) if($no_name);
pattern_removal_byPath($work, 'filtered_out');
zero_removal_byPath($work, 'zero');
archiving_removal_byPath($work, 'zip') if($zip_check);
if ($del_check) {
if ($checksum) {
# cheap name/size keys first, then content checksums on what remains
nonduplicate_removal_byPath($work, 'non_dup');
scanning_for_sum_to_byPath($work);
}
nonduplicate_removal_byPath($work, 'non_dup');
copy_byPath('dup2', $work);
dir_children_removal_byPath('dup2', 'dup2', 'void');
report('duplicate',1, 'dup2'); # all duplicates (for human decisions)
duplicate_removal_byPath($src, $work, 'dup');
# don't double-report deletable dup children already listed as zero/zip
dir_children_removal_byPath('dup', 'zero', 'void');
dir_children_removal_byPath('dup', 'zip', 'void') if($zip_check);
clear_byPath('void');
}
dir_children_removal_byPath($work, $work, 'void');
report('to_be_keept',1, $work); # for [human] size cmps
######
clear_byPath($work,'non_dup','filtered_out');
report('deletable',1, 'zero','dup') if($del_check);
report('zippable',1, 'zip') if($zip_check);
clear_byPath('zero','dup','zip');
}
#################################################################
# STARTING POINT
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
#### SETTIN'UP PARAMS #################################
my $timer_start;
sub init()
{
# Parse @ARGV: bare words become starting points, dash-words set options.
# Prints usage and exits when no starting point is given.
print "\ninitializing...\n";
@starting_points = grep !/^-/, @ARGV;
if (!@starting_points) {
print 'usage:'."\n".
' perl dup.pl [-s:]<starting_dir> [[-s:]<starting_dir>] '."\n".
' [-[no]recycle] [-(deny|allow):<pattern>]'."\n".
' [-[no][check]sum] [-[no][file]name]'."\n".
' [-[no]del] [-keep:(big[ger]|small[er]|<path_pattern>)]'."\n".
' [-[no]zip] [-zip:<min_avg_filesize>:<min_total_dirsize>]'."\n".
"\n".
'examples:'."\n".
' perl dup.pl c:/ "-deny:\\.(c|h|hpp|cpp)\$"'."\n".
' perl dup.pl c: d: -nochecksum -noname'."\n".
' perl dup.pl \'c:\\My Documents\' -sum'."\n".
' perl dup.pl c:\\Archive\\ -zip:10000:200000 -keep:small'."\n".
"\n".
'notes:'."\n".
' 1) some reports will be generaded, no action will be taken;'."\n".
' 2) data scanned and hold on old log files may be reused '."\n".
' (use "-recycle"), there\'s no need to wait a long scan all over;'."\n".
' 3) better zip-test after delete the duplicates;'."\n".
' 4) "-keep:small" means the dups in the less populated branch '."\n".
' would be kept, while while the other may be deleted, '."\n".
' the default is "-keep:big", or -keep:^/start/dir/1;'."\n".
' 5) "-zip:8:50" means that dirs with less than 8kb/file will be '."\n".
' recommended to be zipped, but only if the dir sum 50kb in size; '."\n".
' 6) "-allow" overrides "-deny", they cant be used together in this version '."\n".
' recommended to be zipped, but only if the dir sum 50kb in size; '."\n".
' 7) this is a open-src change as much as u want.'."\n".
"\n".
' 9H.zED ICQ#2405375 zed@9hells.org'."\n".
' RELEASED IN 20040628 0911 Brazil'."\n".
"\n";
exit;
}
# option defaults
$recycle = 0; # gather info from old logs?
$del_check = 1; # perform dup check?
$checksum = 0; # perform full binary checksum?
$no_name = 0; # use filenames in the hash-key to find dups?
$smaller_tree = 0; # keeps dup files on smaller or bigger branchs?
$keep_path_pattern = 0; # keeps dup files that match that path pattern
$zip_check = 1; # perform zip check?
$zip_sizefactor = 8.000; # minimum kbytes per file avg
$zip_minsize = 50.000; # minimum size of a dir to allow zipping
$allowed_pattern = '';
$denied_pattern = '';
# Option dispatch: each line is "match and assign or try-next".
# NOTE(review): the and/or chain has a quirk -- when an assignment yields a
# false value (e.g. -norecycle sets $recycle to ''), "or" falls through and
# tries the remaining patterns against the same word.  Harmless here only
# because no later pattern can match the same option word.
map {
/^-(no)?recycle$/i and $recycle = (lc($1) ne 'no') or
/^-(no)?del$/i and $del_check = (lc($1) ne 'no') or
/^-(no)?zip$/i and $zip_check = (lc($1) ne 'no') or
/^-(no)?(check)?sum$/i and $checksum = (lc($1) ne 'no') or
/^-(no)?(file)?name$/i and $no_name = (lc($1) eq 'no') or
/^-allow[\:=](.*)$/i and $allowed_pattern = $1 or
/^-deny[\:=](.*)$/i and $denied_pattern = $1 or
/^-(?:keep[\:=])?(big|small).*$/i and $smaller_tree=(lc($1) eq 'small') or
/^-keep[\:=](.*)$/i and $keep_path_pattern=$1 or
/^-zip[\:=](.*)\:(.*)$/i and ($zip_sizefactor,$zip_minsize) = ($1,$2) or
/^-s(?:tart(?:ting(?:_point|_dir)?)?)?[\:=](.*)$/i and push @starting_points, $1
} @ARGV;
@starting_points = sort @starting_points;
print join("\n", report_options()) . "\n";
$timer_start = time;
}
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
sub finish()
{
# Print the total elapsed time as " took X days Y hours MM min SS sec."
# (or " took nothing." for a sub-second run).
print "\nfinishing...\n";
print " took";
my ($day,$hour,$min,$sec);
$sec = (time-$timer_start);
if($sec>0) {
# break seconds down into day/hour/min remainders
$min = int($sec/60); $sec-=$min*60;
$hour = int($min/60); $min-=$hour*60;
$day = int($hour/24); $hour-=$day*24;
# each unit prints only when non-zero; the separate "s" pluralizes it
printf " %d day", ($day) if($day); print "s" if($day>1);
printf " %d hour", ($hour) if($hour); print "s" if($hour>1);
printf " %02d min", ($min) if($min); print "s" if($min>1);
printf " %02d sec", ($sec) if($sec); print "s" if($sec>1);
}
else {
print " nothing";
}
print ".\n";
}
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
# main()
# Pipeline: parse options, build the 'scan' index, analyse it into report
# tags, print elapsed time.  Reports only -- nothing is ever deleted.
init();
scanning_to_byPath(); # output to 'scan'
analyse();
finish();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment