@zawy12
Last active May 20, 2016 16:12
Author Identification comparison de-anonymizing stylometry
#!/usr/bin/perl
BEGIN { use CGI::Carp qw(carpout); open(LOG, ">>_error2.txt") or die("Unable to open mycgi-log: $!\n"); carpout(LOG); } # log warnings and errors to _error2.txt
BEGIN { $SIG{"__DIE__"} = $SIG{"__WARN__"} = sub { my $error = shift; chomp $error; $error =~ s/[<&>]/"&#".ord($&).";"/ge; print "Content-type: text/html\n\n$error\n"; exit 0; } } # also echo fatal errors, HTML-escaped
$|=1; # unbuffered output
$baselinefile='author_baseline.txt'; # unknown author. Stays in directory with this program
$baselinesize=-s $baselinefile; # get size of file in bytes
$buffer=1.2; # helps assure enough words are pulled in from known files (currently unused)
$dir='authors'; # directory of known-author files; each should be >30% bigger than the baseline file so enough words are retrieved.
print "== Output and instructions are printed to authors__out.txt ==";
######## PRINT HTML HEADER #######
open(G,">authors__out.txt") or die $!;
print G "=== Author Comparison ===
This takes the text of 'author_baseline.txt', located in the same directory as the executable, and calculates the word-entropy difference between it and all files with a .txt extension in the sub-directory 'authors'.
The output, ranking the most similar texts first, is sent to this file. The equation: for each word in the baseline, divide its count by the word count from the current file, then take the log of the ratio. If the word was not found in the current file, assign it a count of 0.25. Sum the absolute values for all words. 'Words' are not single words in this version, but word triples whose middle word is treated as a wildcard. This makes authors more distinct. Apostrophes are moved to the right of the word, outside of it. All letters are made lowercase. All other common punctuation is treated like a word. All of this is crucial for good results, and slight changes can have substantial effects. The reverse test on suspect authors should also be done, but a true author writing in a different mode can rank much lower on the reverse test. I call it the Huckleberry Finn effect after seeing it happen when matching Mark Twain: Huckleberry was identified as Mark Twain, but not vice versa, except on large data with longer word sequences.
The smallest txt file in 'authors' determines the number of words pulled from the beginning of all the other
files. It should be larger than the author_baseline.txt file. This keeps the comparisons fair, without a size bias, but it means you should collect only large files and remove the small ones. I recommend at least 50 KB; 500 KB is not overkill.
It prints a detailed report to authors_<filename> in this format:
log result = baseline count = file count = word
A summary of the results is below.
=============
";
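The scoring described in the instructions above can be sketched in isolation. This is a minimal illustration, not part of the script: the counts and the triple keys here are hypothetical stand-ins for %baseline_count and %known_count, and $oversize is fixed at 1 for simplicity.

```perl
use strict; use warnings;

my $oversize = 1.0;   # size-normalization factor ($smallest/$baselinesize/1.3 in the script)
my %baseline = ('the x of' => 12, 'and x he' => 3);  # hypothetical baseline counts
my %known    = ('the x of' => 10);                   # 'and x he' is absent from the known file

my $score = 0;
for my $word (keys %baseline) {
    my $m = $baseline{$word};
    my $k = exists $known{$word} ? $known{$word} / $oversize
                                 : 0.25 / $oversize;  # missing-word fallback count
    $score += abs(log($m / $k));  # Perl's log is the natural log
}
printf "total score: %.4f\n", $score;  # prints "total score: 2.6672"
```

A lower total means the unknown text is closer to that author; the missing-word fallback of 0.25 heavily penalizes triples the known author never uses.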
####### RUN PROGRAM ######
open(F,"<$baselinefile") or die $!; read(F,$c,$baselinesize); close F;
get_word_counts($c); # stores count (value) of each word (key).
%baseline_count=%count;
opendir(DIR, $dir) or die $!;
while ($file = readdir(DIR)) {
next unless ($file =~ m/\.txt$/);
push(@files,$file);
}
closedir(DIR);
$smallest = 10000000000; # sentinel larger than any expected file
foreach $file (@files) { if (-s "$dir/$file" < $smallest) { $smallest = -s "$dir/$file"; } } # forward slashes also work on Windows Perl
$oversize=$smallest/$baselinesize/1.3; # ratio of known-file size to baseline size, with a safety margin
print G "baseline text: " . int($baselinesize/1000+.5) ."KB\nUsing first " . int($smallest/1000+.5) . " KB of known files\n\n";
opendir(DIR, $dir) or die $!;
while ($file = readdir(DIR)) {
next unless ($file =~ m/\.txt$/);
open(F,"<$dir/$file") or die $!; read(F,$c,$smallest) or die $!; close F;
# print G "$c"; close G; exit;
get_word_counts($c);
%known_count=%count;
foreach $word (keys %baseline_count) {
$m=$baseline_count{$word};
if ($known_count{$word} < 1 ) { $k=.25/$oversize; }
else { $k =$known_count{$word}/$oversize; }
$score=abs(log($m/$k));
$data.=int($score*10000+0.5)/10000 . " = $m = " . int($k) ." = $word\n"; # round score to 4 decimals
$scores{$file}+=$score;
} # next word
open(H,">authors_$file") or die $!; print H $data; close H;
$data=''; undef %known_count;
} # next file
closedir(DIR);
@unique_words=keys(%baseline_count);
###### FINISHED ----- PRINT RESULTS ##########
print G "$total_words words from " . ($#unique_words+1) . " unique words from baseline text were used and " . int($total_words*$oversize) ." words from authors files.\n\n";
@ranked = sort {$scores{$a} <=> $scores{$b} } keys %scores;
foreach $file (@ranked) { $rank++;
print G "$rank = " . int($scores{$file}*10000/$total_words+0.5)/10000 . " $file \n";
} close G; exit;
######## SUBROUTINE #########
sub get_word_counts { $c=$_[0];
$c=lc $c; # the following lines are key to it working well
$c=~s/”|“/"/gs; $c=~s/\r|\n/ /gs;
$c=~s/[^a-z ,.:;'"?!\(|\)]/ /gs; # leaving in punctuation
$c=~s/([a-z])'([a-z]*)/$1$2 '/gs; # move the apostrophe outside the word ($1$2, not the deprecated \1\2 in the replacement)
$c=~s/ +/ /gs;
@c=split(" ", $c);
# $d=$c; # this is stuff I might add in future
# $d=~s/'//gs;
# $d=~s/[^a-z ]//gs;
# $d=~s/ +/ /gs;
if ($firsttime eq '') { $total_words=$#c; $firsttime='nope';} # the baseline file sets the word budget
else { $#c=int($total_words*$oversize); } # truncate known files to the same budget, scaled by $oversize
# @d=split(" ", $d);
# if ($firsttime eq '') { $total_words=$#d; $firsttime='nope';}
# else { $#d=int($total_words*$oversize); }
undef %count;
# foreach $d (@d) { $count{$d}++;} # single word counts
#### the following needs punctuation retained.
# foreach $c (@c) { $y=$z; $z=$c; $count{"$y $z"}++;} # pairs
foreach $c (@c) { $x=$y; $y=$z; $z=$c; $count{"$x x $z"}++; } # triples with the middle word replaced by a literal 'x' -- BEST for >100k, maybe >50k
# foreach $c (@c) { $x=$y; $y=$z; $z=$c; $count{"$x $y $z"}++; } # triples
# foreach $c (@c) { $w=$x; $x=$y; $y=$z; $z=$c; $count{"$w $x $y $z"}++; } # quads ( >150k)
undef @c; undef @d;
return %count; }
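To make the "$x x $z" triple keys built in get_word_counts concrete, here is a small standalone sketch (separate from the script, with a made-up sentence): a 3-word window slides over the text and the middle word is replaced by the literal placeholder 'x', so triples still match when only the middle word varies.

```perl
use strict; use warnings;

# Slide a 3-word window over lowercased text and count middle-word-wildcarded triples.
my @w = split ' ', lc "The quick brown fox";
my %count;
my ($x, $y, $z) = ('', '', '');
for my $w (@w) {
    ($x, $y, $z) = ($y, $z, $w);  # shift the window by one word
    $count{"$x x $z"}++;          # middle word becomes the literal 'x'
}
# keys now include "the x brown" and "quick x fox"
```

The first two iterations produce partial keys with empty slots (" x the", " x quick"), just as in the script, where $x, $y, $z start out undefined.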