#!/usr/bin/perl
# CGI debugging aids: log fatal errors to _error2.txt and echo any die/warn message back to the browser.
BEGIN { use CGI::Carp qw(carpout); open(LOG, ">>_error2.txt") or die("Unable to open mycgi-log: $!\n"); carpout(LOG); }
BEGIN { $SIG{"__DIE__"} = $SIG{"__WARN__"} = sub { my $error = shift; chomp $error; $error =~ s/[<&>]/"&#".ord($&).";"/ge; print "Content-type: text/html\n\n$error\n"; exit 0; } }
# This program ranks how similar the author of a base text file is to the authors of the target files. Intended for English text.
# Works on base files as small as 7 KB (about 1,000 words) if the target files are around 50 KB; 50 KB for both gives very good results.
# Approximate accuracy: the correct author ranks 1st about 50% of the time, given 20 same-genre authors with 50 KB files.
# For professional work, use the ranking capability of the SVMLink open software instead.
# Author: Scott Roberts, 2016.
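# Usage sketch (file names here are hypothetical, shown only for illustration):
# put this script in your CGI directory next to the base file, place one .txt file per
# candidate author in the target directory (e.g. books/author1.txt, books/author2.txt),
# then request the script through the web server. The output is an HTML list with one
# line per target file, "rank = score filename", where rank 1 (lowest score) is the
# author whose word usage is closest to the base text.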
######## TELL PROGRAM WHAT IT NEEDS TO KNOW #########
$words_to_ignore=''; # for example bitcoin|blockchain|node
$basefile='satoshi_all.txt'; # unknown author; stays in the same directory as this program
$basesize=-s $basefile; # size of the base file in bytes
$basesize=2000000; # override of the value above, in case you need a smaller size for small target files; comment out to use the real file size
$oversize=1; # set to a factor > 1 if all target files are much bigger than the unknown-author file
$buffer=1.25; # pulls in a bit more than needed to make sure enough words are obtained
$targetfilesdir='books'; # all target files should be at least 30% bigger than the base file so enough words are retrieved
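# Example of an alternative configuration (hypothetical values, for illustration only):
# $words_to_ignore = 'bitcoin|blockchain|node'; # strip topic words so only writing style is compared
# $basefile        = 'unknown_author.txt';
# $oversize        = 2;        # if every target file is roughly twice the size of the base file
# $targetfilesdir  = 'samples';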
######## PRINT HTML HEADER #######
print "Content-type: text/html\n\n<html><H3>Author Comparison</H3>
Base text: $basefile ($basesize bytes). Target texts directory: $targetfilesdir<BR>
Words to ignore: $words_to_ignore<BR>
Using only the first $basesize x $oversize bytes of each target file<BR><BR>";
####### RUN PROGRAM ######
open(F,"<$basefile") or die $!; read(F,$c,$basesize); close F;
%base_count=get_words($c); # stores count (value) of each word (key).
chdir "c:\\_all\\programs\\indigo-perl-new\\apache-2.2.11\\cgi-bin\\$targetfilesdir";
@files=glob('*.txt');
foreach $file (@files) {
open(F,"<$file") or die $!; read(F,$c,$basesize*$buffer*$oversize); close F; # 1.3= a buffer
%target_count=get_words($c); get_score(); undef %target_count;
}
###### FINISHED ----- PRINT RESULTS ##########
print "First $total_words words from base text above and target texts below were compared.<BR><BR>";
@ranked = sort {$scores{$a} <=> $scores{$b} } keys %scores;
foreach $file (@ranked) { $rank++; print "$rank = " . int($scores{$file}*10/$total_words) . " $file <br>"; }
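# Note on the printed number: $scores{$file} is a sum of per-word penalties over the distinct
# words of the base text; dividing by $total_words and multiplying by 10 just scales it to a
# compact integer via int(). Lower numbers mean a closer stylistic match.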
exit;
######## BEGIN SUBROUTINES #########
sub get_words { $c=$_[0];
if ($words_to_ignore ne '') {$c=~s/\b(?:$words_to_ignore)\b/ /gsi;} # wrap the alternation so every listed word is removed, not just the last one
$c=~s/\r/\n/gs; $c=~s/[^a-zA-Z ]//gs; # normalize Windows line endings, then keep only letters and spaces
$c=~s/\n//gs; $c=~s/ +/ /gs; # get rid of newlines and excess spaces.
@c=split(" ", $c); if ($firsttime eq '') { $total_words=$#c; $firsttime='nope';}
else { $#c=$total_words*$oversize; }
undef %count; foreach $c (@c) { $count{$c}++;} return %count;
}
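# get_score compares word frequencies: for every distinct word in the base text it takes the
# ratio of the larger count to the smaller one (base vs. target, with the target count scaled
# down by $oversize and a small floor value substituted when the word is absent from the target),
# raises that ratio to the 0.6 power to dampen outliers, and adds it to the running score for
# the current $file. A perfectly matching vocabulary therefore gives the smallest possible score.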
sub get_score {
foreach $word (keys %base_count) {
$b=$base_count{$word};
if ($target_count{$word} < 1 ){ $t=0.5/$total_words/$oversize; }
else { $t =$target_count{$word}/$oversize; }
if ($t > $b) { $scores{$file}+=($t/$b)**0.6; }
else { $scores{$file}+=($b/$t)**0.6; }
}}