zawy12/author_compare.pl

## author_compare.pl
#!/usr/bin/perl
BEGIN { use CGI::Carp qw(carpout);  open(LOG, ">>_error2.txt") or  die("Unable to open mycgi-log: $!\n");  carpout(LOG);  }
BEGIN { $SIG{"__DIE__"}  = $SIG{"__WARN__"} = sub { my $error = shift; chomp $error; $error =~ s/[<&>]/"&#".ord($&).";"/ge; print "Content-type: text/html\n\n$error\n"; exit 0; } }
$|=1;
$baselinefile='author_baseline.txt'; # unknown author. Stays in directory with this program
$baselinesize=-s $baselinefile; # get size of file in bytes
$buffer=1.2; # helps assure enough words are pulled in from known files
$dir='authors'; # all files > 30% bigger than baseline file to make sure enough words are retireved.
$firstrun=1;
$lastrun=17;
print "== Output and instructions are printed to author__out.txt ==";
########          PRINT HTML HEADER         #######
if (-e "author_ignore_words.txt") { open(J,"<author_ignore_words.txt"); @ignore_words=<J>; close J; }
open(G,">author__out.txt") or die $!;
print G "=== Author Comparison ===
This takes the text of 'author_baseline.txt' located in same directory as the executable and calculates the word-entropy difference (in bits per word single, pair, triple, or quad) between it and all files with a txt
extension located in sub-directory 'authors'. The output ranks the most similar texts first and is displayed and sent to author__out.txt in this programs directory. The equation is: for each word in baseline divide its count by the word count from the current file and then take the log base 2 of the ratio. If the word was not found in current file, assign its count value 0.25. Sum for all words and divide by words. Words in this context can be word singles, pairs, triples, or quads, and punctuation maybe counted as a word. The final numbers giving the rank are in bits per 'word'.  Apostrophes are moved to the right of word outside of it. All letters are made lowercase. Numbers are removed.  The reverse test on suspect authors should be done, but a true author writing in a different mode can rank much lower on the reverse test if the baseline text is on a specific topic.

The detailed output files give results in this format
log base 10 result, baseline count, file count, word name

Words to ignore are placed in authors_ignore_words.txt, one word per line.

The smallest txt file in authors directory determines the number of words pulled from the beginning of all the other
files. It should be at least 20\% greater than the author_baseline.txt file and will give unstable results if it is not. This makes the comparisons fair without a size bias. You need at least 10kb files to get an idea and preferably > 100kb to get good results that are hard to improve upon.


The summary of the results are below.
=============
";
#######       RUN PROGRAM        ######
opendir(DIR, $dir) or die $!;
while ($file = readdir(DIR)) {
  next unless ($file =~ m/\.txt$/);
  push(@files,$file);
}
closedir(DIR);
$smallest =10000000000;
foreach $file (@files) { 	if (-s ".\\$dir\\$file" < $smallest) { $smallest=-s ".\\$dir\\$file"; } }
$oversize=$smallest/$baselinesize/$buffer;
# $oversize=1;  $smallest=$baselinesize*$buffer;   #  THIS IS FOR TESTING. It came out size. Check.
print G "baseline text:  " . int($baselinesize/1000+.5) ."KB\nUsing first " . int($smallest/1000+.5) . " KB of known files\nOversize value is $oversize\nBuffer is $buffer\n\n";

print "\n\nDoing the following tests. \n\n";

for ($run=$firstrun;$run<$lastrun+1;$run++)  {
$first=''; # needed at bottom
open(F,"<$baselinefile") or die $!; read(F,$c,$baselinesize); close F;
get_word_counts($c); # stores count (value) of each word (key).
%baseline_count=%count;   undef %count;
opendir(DIR, $dir) or die $!;
while ($file = readdir(DIR)) {
  next unless ($file =~ m/\.txt$/);
    open(F,"<.\\$dir\\$file") or die $!;   read(F,$c,$smallest) or die $!;  close F;
	# print G "$c"; close G; exit;
    get_word_counts($c);
	%known_count=%count;    undef %count;
     foreach $word (keys %baseline_count) {  $r++;
		$m=$baseline_count{$word};
		 if ($known_count{$word} < 1 ) { $k=.25/$oversize; }
		else { $k =$known_count{$word}/$oversize; }
		$score=abs(log($m/$k));
		$scores{$file}+=$score;
		 $data.= int($score*10000+0.5)/1000 ." = $m = " . int($k+0.5) . " = $word\n";
	} # next word
	open(H,">authors_${run}_$file") or die $!; print H "$data"; close H;
	$data='';   undef %known_count;
} # next file
closedir(DIR);
$firsttime='';
@unique_words=keys(%baseline_count);
######      FINISHED  ----- PRINT RESULTS     ##########
# print G "$total_words words from " . $#unique_words+1 ." unique words from baseline text were used and " . int($total_words*$oversize) ." words from authors files.\n\n";
$rank='';
if ($run == 1) {print "\nSingle X with punctuation\n";print G "Single X with punctuation\n";}
elsif ($run == 2) {print  "Pair X Y with punctuation\n";print G "Pair X Y with punctuation\n";}
elsif ($run == 3) {print  "Missing middle X o Y with punctuation\n";print G "Missing middle X o Y with punctuation\n";}
elsif ($run == 4) {print  "Missing pair in middle X o o Y with punctuation\n";print G "Missing pair in middle X o o Y with punctuation\n";}
elsif ($run == 5) {print  "Triple X Y Z with punctuation\n";print G "Triple X Y Z with punctuation\n";}
elsif ($run == 6) {print  "Missing left X o Y Z with punctuation\n";print G "Missing left X o Y Z with punctuation\n";}
elsif ($run == 7) {print  "Missing right X Y o Z with punctuation\n";print G "Missing right X Y o Z with punctuation\n";}
elsif ($run == 8) {print "Quad with punctuation\n"; print G "Quad with punctuation\n";}
elsif ($run == 9) {print "Missing middle pair\n"; print G "Missing middle pair with punctuation\n";}
elsif ($run == 10) {print "Single\n"; print G "Single\n";}
elsif ($run == 11) {print "Pair\n";print G "Pair\n";}
elsif ($run == 12) {print "Missing middle\n";print G "Missing middle\n";}
elsif ($run == 13) {print "Missing pair middle\n";print G "Missing pair middle\n";}
elsif ($run == 14) {print "Triple\n";print G "Triple\n";}
elsif ($run == 15) {print "Missing left\n";print G "Missing left\n";}
elsif ($run == 16) {print "Missing right\n";print G "Missing right\n";}
elsif ($run == 17) {print "Quad\n";print G "Quad\n";}

@ranked = sort {$scores{$a} <=> $scores{$b} } keys %scores;
foreach $file (@ranked) { $rank++;
	$stars=int(50*($scores{$ranked[4]}-$scores{$file})/($scores{$ranked[4]}-$scores{$ranked[0]}));
 	if ($stars > 0) {$stars="O"x$stars;}  else {$stars=''; }
	print G "$rank = " .  int($scores{$file}*10000/$count_count/log(2)+0.5)/10000 . " $file \n";
	print  "$stars\n$rank = " .  int($scores{$file}*10000/$count_count/log(2)+0.5)/10000 . " $file \n";

#	print G " ". int($scores{$file}*10000/$total_words+0.5)/10000 . "\n";
}
# $distinction=int(($scores{$ranked[1]}-$scores{$ranked[0]})/($scores{$ranked[2]}-$scores{$ranked[1]})*1000+0.5)/1000;
#print G "(2-1)/(3-2) = difference between 1 & 2 compared to 2 & 3 = $distinction\n\n";
$percent = int(($scores{$ranked[1]}-$scores{$ranked[0]})/$scores{$ranked[0]}*10000+0.5)/100;
 print "(2-1)/2 %: " . $percent . "\n\n";
 print G "(2-1)/2 %: " . $percent . "\n\n";

undef %scores;
 }  # next run
close G;
exit;
########       SUBROUTINE        #########
sub get_word_counts {  $c=$_[0];
$c=~s/\r|\n/ /gs;
$c=~s/[^ ]{16}[^ ]*/ /gs;  # that must have been some coding noun. It was not very comon english
$c=lc $c;
$c=~s/_*[a-z]+_+[a-z]+_*/ /gsi;  $c=~s/_+[a-z]+_+/ /gs; # get rid of satoshi and finney codes
$c=~s/http[^ ]+ / /gs;
$c=~s/www\.[^ ] / /gs;  # remove links
 $c=~s/”|“/"/gs;
 $c=~s/[^a-z \-,\.:;'"\?\!\(\|\)]/ /gs;  # hyphen replaced by space due to how I handled ignore words.
 $c=~s/([a-z])'([a-z]*) /\1\2 ' /gs; # treat apostrophe special
 $c=~s/(,|\.|:|;|"|\?|\!|\(|\||\)|\[|\])/ \1 /gs; # displace punct away from words, except for apostrophe
$c=~s/ +/ /gs;
# Files may already have selected words replaced with 'x'.  Keep this, except use 'y' in baseline.
# This prevents even the context of ignored words from matching.
 if ($first eq '') { $x='y'; $c=~s/ x / y /gs;  } else {$x='x';}
foreach $ignore (@ignore_words) { chomp($ignore);	$ignore.='s?';$c=~s/ $ignore / $x /gs;    }
$c=~s/ +/ /gs;
 if ($run > 9) {  $c=~s/[^a-z \-]/ /gs;  $c=~s/ +/ /gs; }  # seems to work better only for single words
 @c=split(" ", $c);
if ($first eq '')  { $total_words=$#c; }
else {
	if ($#c<$total_words*$oversize) {print G "WARNING: $file was only $#c words but was supposed to have at least $total_words x $oversize. Try taking out useless charactors in baseline file to reduce its waste or seeing why this file has such long words.\n";  }
	$#c=int($total_words*$oversize);
}
undef %count;
 if ($run == 1) {   foreach $c (@c) { $count{$c}++;}  } # single word counts
 if ($run == 2) {    foreach $c (@c) { $y=$z; $z=$c; $count{"$y $z"}++;}  } # pairs
  if ($run ==3) {   foreach $c (@c) { $x=$y; $y=$z; $z=$c; $count{"$x O $z"}++; }  }  # missing middle
  if ($run == 4) {    foreach $c (@c) { $w=$x; $x=$y; $y=$z; $z=$c;  $count{"$w O O $z"}++; } }
 if ($run == 5) {   foreach $c (@c) { $w=$x; $x=$y; $y=$z; $z=$c;  $count{"$w $x O $z"}++; }  }
 if ($run == 6) {   foreach $c (@c) { $w=$x; $x=$y; $y=$z; $z=$c;  $count{"$w O $y $z"}++; }  }
 if ($run == 7) {   foreach $c (@c) { $x=$y; $y=$z; $z=$c;   $count{"$x $y $z"}++; }  }  # triples
 if ($run == 8) {    foreach $c (@c) { $w=$x; $x=$y; $y=$z; $z=$c;  $count{"$w $x $y $z"}++; }  }
  if ($run == 9) {    foreach $c (@c) { $u=$v; $v=$w; $w=$x; $x=$y; $y=$z; $z=$c;  $count{"$u $v O O $y $z"}++; }  }
  if ($run == 10) {   foreach $c (@c) { $count{$c}++;}   } # single word counts
 if ($run == 11) {    foreach $c (@c) { $y=$z; $z=$c; $count{"$y $z"}++;} ; } # pairs
  if ($run ==12) {   foreach $c (@c) { $x=$y; $y=$z; $z=$c; $count{"$x O $z"}++; }  }
  if ($run == 13) {    foreach $c (@c) { $w=$x; $x=$y; $y=$z; $z=$c;  $count{"$w O O $z"}++; }   }
 if ($run == 14) {   foreach $c (@c) { $w=$x; $x=$y; $y=$z; $z=$c;  $count{"$w $x O $z"}++; }   }
 if ($run == 15) {   foreach $c (@c) { $w=$x; $x=$y; $y=$z; $z=$c;  $count{"$w O $y $z"}++; }  }
 if ($run == 16) {   foreach $c (@c) { $x=$y; $y=$z; $z=$c;   $count{"$x $y $z"}++; } }
 if ($run == 17) {    foreach $c (@c) { $w=$x; $x=$y; $y=$z; $z=$c;  $count{"$w $x $y $z"}++; }  }

# $first will be reset before each run
if ($first eq '')  { @count_count=keys %count; $count_count=$#count_count; }
$first='nope';
undef @c;
return;  }
	#!/usr/bin/perl
	BEGIN { use CGI::Carp qw(carpout); open(LOG, ">>_error2.txt") or die("Unable to open mycgi-log: $!\n"); carpout(LOG); }
	BEGIN { $SIG{"__DIE__"} = $SIG{"__WARN__"} = sub { my $error = shift; chomp $error; $error =~ s/[<&>]/"&#".ord($&).";"/ge; print "Content-type: text/html\n\n$error\n"; exit 0; } }
	$\|=1;
	$baselinefile='author_baseline.txt'; # unknown author. Stays in directory with this program
	$baselinesize=-s $baselinefile; # get size of file in bytes
	$buffer=1.2; # helps assure enough words are pulled in from known files
	$dir='authors'; # all files > 30% bigger than baseline file to make sure enough words are retireved.
	$firstrun=1;
	$lastrun=17;
	print "== Output and instructions are printed to author__out.txt ==";
	######## PRINT HTML HEADER #######
	if (-e "author_ignore_words.txt") { open(J,"<author_ignore_words.txt"); @ignore_words=<J>; close J; }
	open(G,">author__out.txt") or die $!;
	print G "=== Author Comparison ===
	This takes the text of 'author_baseline.txt' located in same directory as the executable and calculates the word-entropy difference (in bits per word single, pair, triple, or quad) between it and all files with a txt
	extension located in sub-directory 'authors'. The output ranks the most similar texts first and is displayed and sent to author__out.txt in this programs directory. The equation is: for each word in baseline divide its count by the word count from the current file and then take the log base 2 of the ratio. If the word was not found in current file, assign its count value 0.25. Sum for all words and divide by words. Words in this context can be word singles, pairs, triples, or quads, and punctuation maybe counted as a word. The final numbers giving the rank are in bits per 'word'. Apostrophes are moved to the right of word outside of it. All letters are made lowercase. Numbers are removed. The reverse test on suspect authors should be done, but a true author writing in a different mode can rank much lower on the reverse test if the baseline text is on a specific topic.

	The detailed output files give results in this format
	log base 10 result, baseline count, file count, word name

	Words to ignore are placed in authors_ignore_words.txt, one word per line.

	The smallest txt file in authors directory determines the number of words pulled from the beginning of all the other
	files. It should be at least 20\% greater than the author_baseline.txt file and will give unstable results if it is not. This makes the comparisons fair without a size bias. You need at least 10kb files to get an idea and preferably > 100kb to get good results that are hard to improve upon.


	The summary of the results are below.
	=============
	";
	####### RUN PROGRAM ######
	opendir(DIR, $dir) or die $!;
	while ($file = readdir(DIR)) {
	next unless ($file =~ m/\.txt$/);
	push(@files,$file);
	}
	closedir(DIR);
	$smallest =10000000000;
	foreach $file (@files) { if (-s ".\\$dir\\$file" < $smallest) { $smallest=-s ".\\$dir\\$file"; } }
	$oversize=$smallest/$baselinesize/$buffer;
	# $oversize=1; $smallest=$baselinesize*$buffer; # THIS IS FOR TESTING. It came out size. Check.
	print G "baseline text: " . int($baselinesize/1000+.5) ."KB\nUsing first " . int($smallest/1000+.5) . " KB of known files\nOversize value is $oversize\nBuffer is $buffer\n\n";

	print "\n\nDoing the following tests. \n\n";

	for ($run=$firstrun;$run<$lastrun+1;$run++) {
	$first=''; # needed at bottom
	open(F,"<$baselinefile") or die $!; read(F,$c,$baselinesize); close F;
	get_word_counts($c); # stores count (value) of each word (key).
	%baseline_count=%count; undef %count;
	opendir(DIR, $dir) or die $!;
	while ($file = readdir(DIR)) {
	next unless ($file =~ m/\.txt$/);
	open(F,"<.\\$dir\\$file") or die $!; read(F,$c,$smallest) or die $!; close F;
	# print G "$c"; close G; exit;
	get_word_counts($c);
	%known_count=%count; undef %count;
	foreach $word (keys %baseline_count) { $r++;
	$m=$baseline_count{$word};
	if ($known_count{$word} < 1 ) { $k=.25/$oversize; }
	else { $k =$known_count{$word}/$oversize; }
	$score=abs(log($m/$k));
	$scores{$file}+=$score;
	$data.= int($score*10000+0.5)/1000 ." = $m = " . int($k+0.5) . " = $word\n";
	} # next word
	open(H,">authors_${run}_$file") or die $!; print H "$data"; close H;
	$data=''; undef %known_count;
	} # next file
	closedir(DIR);
	$firsttime='';
	@unique_words=keys(%baseline_count);
	###### FINISHED ----- PRINT RESULTS ##########
	# print G "$total_words words from " . $#unique_words+1 ." unique words from baseline text were used and " . int($total_words*$oversize) ." words from authors files.\n\n";
	$rank='';
	if ($run == 1) {print "\nSingle X with punctuation\n";print G "Single X with punctuation\n";}
	elsif ($run == 2) {print "Pair X Y with punctuation\n";print G "Pair X Y with punctuation\n";}
	elsif ($run == 3) {print "Missing middle X o Y with punctuation\n";print G "Missing middle X o Y with punctuation\n";}
	elsif ($run == 4) {print "Missing pair in middle X o o Y with punctuation\n";print G "Missing pair in middle X o o Y with punctuation\n";}
	elsif ($run == 5) {print "Triple X Y Z with punctuation\n";print G "Triple X Y Z with punctuation\n";}
	elsif ($run == 6) {print "Missing left X o Y Z with punctuation\n";print G "Missing left X o Y Z with punctuation\n";}
	elsif ($run == 7) {print "Missing right X Y o Z with punctuation\n";print G "Missing right X Y o Z with punctuation\n";}
	elsif ($run == 8) {print "Quad with punctuation\n"; print G "Quad with punctuation\n";}
	elsif ($run == 9) {print "Missing middle pair\n"; print G "Missing middle pair with punctuation\n";}
	elsif ($run == 10) {print "Single\n"; print G "Single\n";}
	elsif ($run == 11) {print "Pair\n";print G "Pair\n";}
	elsif ($run == 12) {print "Missing middle\n";print G "Missing middle\n";}
	elsif ($run == 13) {print "Missing pair middle\n";print G "Missing pair middle\n";}
	elsif ($run == 14) {print "Triple\n";print G "Triple\n";}
	elsif ($run == 15) {print "Missing left\n";print G "Missing left\n";}
	elsif ($run == 16) {print "Missing right\n";print G "Missing right\n";}
	elsif ($run == 17) {print "Quad\n";print G "Quad\n";}

	@ranked = sort {$scores{$a} <=> $scores{$b} } keys %scores;
	foreach $file (@ranked) { $rank++;
	$stars=int(50*($scores{$ranked[4]}-$scores{$file})/($scores{$ranked[4]}-$scores{$ranked[0]}));
	if ($stars > 0) {$stars="O"x$stars;} else {$stars=''; }
	print G "$rank = " . int($scores{$file}*10000/$count_count/log(2)+0.5)/10000 . " $file \n";
	print "$stars\n$rank = " . int($scores{$file}*10000/$count_count/log(2)+0.5)/10000 . " $file \n";

	# print G " ". int($scores{$file}*10000/$total_words+0.5)/10000 . "\n";
	}
	# $distinction=int(($scores{$ranked[1]}-$scores{$ranked[0]})/($scores{$ranked[2]}-$scores{$ranked[1]})*1000+0.5)/1000;
	#print G "(2-1)/(3-2) = difference between 1 & 2 compared to 2 & 3 = $distinction\n\n";
	$percent = int(($scores{$ranked[1]}-$scores{$ranked[0]})/$scores{$ranked[0]}*10000+0.5)/100;
	print "(2-1)/2 %: " . $percent . "\n\n";
	print G "(2-1)/2 %: " . $percent . "\n\n";

	undef %scores;
	} # next run
	close G;
	exit;
	######## SUBROUTINE #########
	sub get_word_counts { $c=$_[0];
	$c=~s/\r\|\n/ /gs;
	$c=~s/[^ ]{16}[^ ]*/ /gs; # that must have been some coding noun. It was not very comon english
	$c=lc $c;
	$c=~s/_[a-z]+_+[a-z]+_/ /gsi; $c=~s/_+[a-z]+_+/ /gs; # get rid of satoshi and finney codes
	$c=~s/http[^ ]+ / /gs;
	$c=~s/www\.[^ ] / /gs; # remove links
	$c=~s/”\|“/"/gs;
	$c=~s/[^a-z \-,\.:;'"\?\!\(\\|\)]/ /gs; # hyphen replaced by space due to how I handled ignore words.
	$c=~s/([a-z])'([a-z]*) /\1\2 ' /gs; # treat apostrophe special
	$c=~s/(,\|\.\|:\|;\|"\|\?\|\!\|\(\|\\|\|\)\|\[\|\])/ \1 /gs; # displace punct away from words, except for apostrophe
	$c=~s/ +/ /gs;
	# Files may already have selected words replaced with 'x'. Keep this, except use 'y' in baseline.
	# This prevents even the context of ignored words from matching.
	if ($first eq '') { $x='y'; $c=~s/ x / y /gs; } else {$x='x';}
	foreach $ignore (@ignore_words) { chomp($ignore); $ignore.='s?';$c=~s/ $ignore / $x /gs; }
	$c=~s/ +/ /gs;
	if ($run > 9) { $c=~s/[^a-z \-]/ /gs; $c=~s/ +/ /gs; } # seems to work better only for single words
	@c=split(" ", $c);
	if ($first eq '') { $total_words=$#c; }
	else {
	if ($#c<$total_words*$oversize) {print G "WARNING: $file was only $#c words but was supposed to have at least $total_words x $oversize. Try taking out useless charactors in baseline file to reduce its waste or seeing why this file has such long words.\n"; }
	$#c=int($total_words*$oversize);
	}
	undef %count;
	if ($run == 1) { foreach $c (@c) { $count{$c}++;} } # single word counts
	if ($run == 2) { foreach $c (@c) { $y=$z; $z=$c; $count{"$y $z"}++;} } # pairs
	if ($run ==3) { foreach $c (@c) { $x=$y; $y=$z; $z=$c; $count{"$x O $z"}++; } } # missing middle
	if ($run == 4) { foreach $c (@c) { $w=$x; $x=$y; $y=$z; $z=$c; $count{"$w O O $z"}++; } }
	if ($run == 5) { foreach $c (@c) { $w=$x; $x=$y; $y=$z; $z=$c; $count{"$w $x O $z"}++; } }
	if ($run == 6) { foreach $c (@c) { $w=$x; $x=$y; $y=$z; $z=$c; $count{"$w O $y $z"}++; } }
	if ($run == 7) { foreach $c (@c) { $x=$y; $y=$z; $z=$c; $count{"$x $y $z"}++; } } # triples
	if ($run == 8) { foreach $c (@c) { $w=$x; $x=$y; $y=$z; $z=$c; $count{"$w $x $y $z"}++; } }
	if ($run == 9) { foreach $c (@c) { $u=$v; $v=$w; $w=$x; $x=$y; $y=$z; $z=$c; $count{"$u $v O O $y $z"}++; } }
	if ($run == 10) { foreach $c (@c) { $count{$c}++;} } # single word counts
	if ($run == 11) { foreach $c (@c) { $y=$z; $z=$c; $count{"$y $z"}++;} ; } # pairs
	if ($run ==12) { foreach $c (@c) { $x=$y; $y=$z; $z=$c; $count{"$x O $z"}++; } }
	if ($run == 13) { foreach $c (@c) { $w=$x; $x=$y; $y=$z; $z=$c; $count{"$w O O $z"}++; } }
	if ($run == 14) { foreach $c (@c) { $w=$x; $x=$y; $y=$z; $z=$c; $count{"$w $x O $z"}++; } }
	if ($run == 15) { foreach $c (@c) { $w=$x; $x=$y; $y=$z; $z=$c; $count{"$w O $y $z"}++; } }
	if ($run == 16) { foreach $c (@c) { $x=$y; $y=$z; $z=$c; $count{"$x $y $z"}++; } }
	if ($run == 17) { foreach $c (@c) { $w=$x; $x=$y; $y=$z; $z=$c; $count{"$w $x $y $z"}++; } }

	# $first will be reset before each run
	if ($first eq '') { @count_count=keys %count; $count_count=$#count_count; }
	$first='nope';
	undef @c;
	return; }