#!/usr/bin/perl
# CGI debugging aids: log fatal errors to _error2.txt and echo any die/warn message back to the browser.
BEGIN { use CGI::Carp qw(carpout); open(LOG, ">>_error2.txt") or die("Unable to open mycgi-log: $!\n"); carpout(LOG); }
BEGIN { $SIG{"__DIE__"} = $SIG{"__WARN__"} = sub { my $error = shift; chomp $error; $error =~ s/[<&>]/"&#".ord($&).";"/ge; print "Content-type: text/html\n\n$error\n"; exit 0; } }
# This program ranks how similar the author of a base text file is to the authors of the target files. Intended for English text.
# Works on base files as small as 7 KB (about 1,000 words) if the target files are around 50 KB; 50 KB for both gives very good results.
# Approximate accuracy: the correct author ranks 1st about 50% of the time, given 20 same-genre authors with 50 KB files.
# For professional work, use the ranking capability of the SVMLink open software instead.
# Author: Scott Roberts, 2016.
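# Usage sketch (file names here are hypothetical, shown only for illustration):
# put this script in your CGI directory next to the base file, place one .txt file per
# candidate author in the target directory (e.g. books/author1.txt, books/author2.txt),
# then request the script through the web server. The output is an HTML list with one
# line per target file, "rank = score filename", where rank 1 (lowest score) is the
# author whose word usage is closest to the base text.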
######## TELL PROGRAM WHAT IT NEEDS TO KNOW #########
$words_to_ignore=''; # for example bitcoin|blockchain|node
$basefile='satoshi_all.txt'; # unknown author; stays in the same directory as this program
$basesize=-s $basefile; # size of the base file in bytes
$basesize=2000000; # override of the value above, in case you need a smaller size for small target files; comment out to use the real file size
$oversize=1; # set to a factor > 1 if all target files are much bigger than the unknown-author file
$buffer=1.25; # pulls in a bit more than needed to make sure enough words are obtained
$targetfilesdir='books'; # all target files should be at least 30% bigger than the base file so enough words are retrieved
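# Example of an alternative configuration (hypothetical values, for illustration only):
# $words_to_ignore = 'bitcoin|blockchain|node'; # strip topic words so only writing style is compared
# $basefile        = 'unknown_author.txt';
# $oversize        = 2;        # if every target file is roughly twice the size of the base file
# $targetfilesdir  = 'samples';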
######## PRINT HTML HEADER #######
print "Content-type: text/html\n\n<html><H3>Author Comparison</H3>
Base text: $basefile ($basesize bytes). Target texts directory: $targetfilesdir<BR>
Words to ignore: $words_to_ignore<BR>
Using only the first $basesize x $oversize bytes of each target file<BR><BR>";
####### RUN PROGRAM ######
open(F,"<$basefile") or die $!; read(F,$c,$basesize); close F;
%base_count=get_words($c); # stores count (value) of each word (key).
chdir "c:\\_all\\programs\\indigo-perl-new\\apache-2.2.11\\cgi-bin\\$targetfilesdir";
@files=glob('*.txt');
foreach $file (@files) {
open(F,"<$file") or die $!; read(F,$c,$basesize*$buffer*$oversize); close F; # 1.3= a buffer
%target_count=get_words($c); get_score(); undef %target_count;
}
###### FINISHED ----- PRINT RESULTS ##########
print "First $total_words words from base text above and target texts below were compared.<BR><BR>";
@ranked = sort {$scores{$a} <=> $scores{$b} } keys %scores;
foreach $file (@ranked) { $rank++; print "$rank = " . int($scores{$file}*10/$total_words) . " $file <br>"; }
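# Note on the printed number: $scores{$file} is a sum of per-word penalties over the distinct
# words of the base text; dividing by $total_words and multiplying by 10 just scales it to a
# compact integer via int(). Lower numbers mean a closer stylistic match.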
exit;
######## BEGIN SUBROUTINES #########
sub get_words { $c=$_[0];
if ($words_to_ignore ne '') {$c=~s/\b(?:$words_to_ignore)\b/ /gsi;} # wrap the alternation so every listed word is removed, not just the last one
$c=~s/\r/\n/gs; $c=~s/[^a-zA-Z ]//gs; # normalize Windows line endings, then keep only letters and spaces
$c=~s/\n//gs; $c=~s/ +/ /gs; # get rid of newlines and excess spaces.
@c=split(" ", $c); if ($firsttime eq '') { $total_words=$#c; $firsttime='nope';}
else { $#c=$total_words*$oversize; }
undef %count; foreach $c (@c) { $count{$c}++;} return %count;
}
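# get_score compares word frequencies: for every distinct word in the base text it takes the
# ratio of the larger count to the smaller one (base vs. target, with the target count scaled
# down by $oversize and a small floor value substituted when the word is absent from the target),
# raises that ratio to the 0.6 power to dampen outliers, and adds it to the running score for
# the current $file. A perfectly matching vocabulary therefore gives the smallest possible score.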
sub get_score {
foreach $word (keys %base_count) {
$b=$base_count{$word};
if ($target_count{$word} < 1 ){ $t=0.5/$total_words/$oversize; }
else { $t =$target_count{$word}/$oversize; }
if ($t > $b) { $scores{$file}+=($t/$b)**0.6; }
else { $scores{$file}+=($b/$t)**0.6; }
}}