Skip to content

Instantly share code, notes, and snippets.

@jbarth-ubhd
Last active February 14, 2024 04:12
Show Gist options
  • Save jbarth-ubhd/8d5ceb4035bf2d89700117a311209f20 to your computer and use it in GitHub Desktop.
Save jbarth-ubhd/8d5ceb4035bf2d89700117a311209f20 to your computer and use it in GitHub Desktop.
Inspect the word list of a Tesseract traineddata file
#!/usr/bin/perl
use strict;
use utf8;
use warnings; no warnings "uninitialized";
use autodie qw(:all);
use Getopt::Long;
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";
use File::Temp qw/tempdir/;
use Cwd;
use Digest::MD5 qw(md5_hex);
use Unicode::Normalize;
use Encode;
use File::Basename;
# Insert "," as a thousands separator into the integer part of a number.
# Digits after a decimal point are left ungrouped.
# Technique from
# https://stackoverflow.com/questions/33442240/perl-printf-to-use-commas-as-thousands-separator
#   $_[0]   - the number (or numeric string) to format
#   returns - the formatted string
sub commify {
    my ($number) = @_;
    # Work on the reversed string so that groups of three are counted from
    # the right-hand end of the integer part.
    my $backwards = scalar reverse $number;
    # The negative look-ahead skips digit runs that are followed by "." in the
    # reversed string, i.e. the fractional digits of the original number.
    $backwards =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g;
    my $grouped = scalar reverse $backwards;
    return $grouped;
}
# Backslash-escape characters that would otherwise be interpreted as
# GitHub-flavoured-Markdown markup.
#   $s      - raw text
#   returns - text with a backslash placed before each of:
#             \  *  _  ~  <  |  `  [  @  :
# Escaped for: emphasis (* _ ~), HTML (<), table cells (|), code (`),
# links ([), mentions (@) and emoji shortcodes (:).
# Not handled: image syntax "![...".
sub gfm_esc {
    my ($s) = @_;
    # One pass is enough: every listed character simply gains a leading
    # backslash, and backslashes added here are never revisited.
    $s =~ s/([\\*_~<|`\[\@:])/\\$1/g;
    return $s;
}
# --- Command line handling and extraction of the word list -------------------
# Usage: SCRIPT [--gfm] FILE.traineddata
#   --gfm  print a single GitHub-flavoured-Markdown table row instead of the
#          plain-text report, and silence the external tools.
my $gfm = 0;
GetOptions("gfm" => \$gfm) or die("Error in command line arguments\n");

# Scratch directory for the unpacked components; removed automatically on exit.
my $tmp = tempdir( CLEANUP => 1 );
my $cwd = getcwd;

my $td = $ARGV[0];
# Fail early with a clear message instead of handing "$cwd/" to the tools.
if (!defined $td || $td eq "") {
    die("Usage: $0 [--gfm] FILE.traineddata\n");
}
# Make the path absolute, because we chdir into the scratch directory below.
if (substr($td, 0, 1) ne "/") { $td = "$cwd/$td"; }

# The commands below go through the shell (they need redirection and "&&"),
# so single-quote the user-supplied path: spaces or shell metacharacters in it
# must not split the argument or be executed.
(my $td_sh = $td) =~ s/'/'\\''/g;
$td_sh = "'$td_sh'";

# In --gfm mode the tools' own output would corrupt the table row.
my $null = $gfm ? ">/dev/null 2>&1" : "";
chdir $tmp;

# Unpack the traineddata file; the components are expected as a.* files in
# the scratch directory. autodie (:all) aborts the script if the command fails.
system("combine_tessdata -u $td_sh a $null");

# Prefer the LSTM unicharset, fall back to the plain one.
my $u = "a.lstm-unicharset";
if (! -f $u) { $u = "a.unicharset"; }

# Without a unicharset or an LSTM word dawg there is no word list to inspect.
if (! -f $u || ! -f "a.lstm-word-dawg") {
    if ($gfm) {
        print "| ".basename($td, ".traineddata")." | 0 | | | | |\n";
        exit;
    }
    print "0 lines\n";
    exit;
}

# Dump the dawg to w.txt and sort it in place (via a temporary file).
system("dawg2wordlist $u a.lstm-word-dawg w.txt $null"
    ." && sort w.txt > wsort.txt"
    ." && mv wsort.txt w.txt");
# Make a word safe to show on one line by making invisible characters visible.
#   $w      - the word as read from the word list
#   returns - the word with
#               backslash              doubled
#               BEL BS ESC FF LF CR    written as \a \b \e \f \n \r
#               TAB                    shown as "→"
#               space                  shown as "␣"
#               any other U+0000..1F   written as \xNN (upper-case hex)
sub esc_word {
    my ($w) = @_;
    my %visible_for = (
        "\\"   => "\\\\",
        "\a"   => "\\a",
        "\x08" => "\\b",
        "\e"   => "\\e",
        "\f"   => "\\f",
        "\n"   => "\\n",
        "\r"   => "\\r",
        "\t"   => "→",
        " "    => "␣",
    );
    # Single pass over backslash, space and the C0 control range; characters
    # without a dedicated spelling fall back to the generic hex form.
    $w =~ s{([\\ \x00-\x1f])}{
        exists $visible_for{$1} ? $visible_for{$1}
                                : sprintf("\\x%02X", ord($1))
    }ge;
    return $w;
}
# Tell whether a string is already in Unicode Normalization Form C.
#   $s      - the string to check
#   returns - true if normalising to NFC leaves the string unchanged,
#             false otherwise
sub is_nfc {
    my ($s) = @_;
    my $normalized = NFC($s);
    return $normalized eq $s;
}
# --- Scan the sorted word list and print the report --------------------------
# Every line is counted and checked for a long s ("ſ"). The more expensive
# classification (all-uppercase / "strange") is done on a sample only: a word
# is kept when the first 32 bits of its MD5 hash, scaled to [0,1), are at most
# $keep. Hashing the word itself makes the sample reproducible between runs.
# Counters that only saw the sample are divided by $keep when reported.
open my $f, "<:utf8", "$tmp/w.txt";
my @strange;             # escaped sampled words that look unusual
my $long_s = 0;          # lines containing "ſ"            (all lines)
my $upper = 0;           # all-uppercase words             (sample only)
my $lines = 0;           # total number of lines           (all lines)
my $strange_count = 0;   # words flagged as strange        (sample only)
my $keep = 0.01;         # sampling rate

# Shapes of words that are considered ordinary. A sampled word matching none
# of them (and not all-uppercase) is reported as strange.
my @ordinary_word = (
    # Plain or hyphenated word; for Fraktur: \x{0364} = Combining Latin Small Letter E
    qr/^[[:alpha:]](?:[[:lower:]]|[aou]\x{0364})*
       (?:-[[:alpha:]](?:[[:lower:]]|[aou]\x{0364})*)*$/x,
    # english, french, italian: I've O'Neil l'âge d'un s'est n'a c'est
    qr/^[IOldsnLcCjJDmSN]'[[:alpha:]][[:lower:]]*$/,
    # Don't Taylor's I'd qu'à
    qr/^[[:alpha:]][[:lower:]]*'[stdàa]$/,
    qr/^[[:upper:]]+'[STDÀA]$/,
    # english, french
    qr/^(?:[wW]e|[tT]hey|[yY]ou|[hH]e|[sS]he|[iI]t|[[:lower:]]*qu|Qu|[aA]ujourd)'[[:lower:]]+$/,
    # french
    qr/^[[:lower:]]+'(il|elle|ils|un|on|une|en|au|elles|aux|ici|est|avec|après|aucun|alors|aucune|ont|avant|aujourd|entre)$/,
    # italian
    qr/^(dell|all|un|dall|nell|sull|Un|All|Nell|Sull|quest|Dall|Quest|Dell|tutt|Sant|quell|mezz)'[[:lower:]]+$/,
);

if (!$gfm) { print "ALL (EXCERPT): "; }
while (my $word = <$f>) {
    chomp $word;
    $lines++;
    if ($word =~ /ſ/) { $long_s++; }

    # Everything below runs on the sample only.
    if (hex(substr(md5_hex(encode_utf8($word)), 0, 8)) / 2**32 > $keep) { next; }

    my $nfc = is_nfc($word);
    my $is_upper = 0;
    # All-uppercase, optionally with an elided prefix such as L' or D'.
    if ($word =~ /^([ILDNSC]')?[[:upper:]][[:upper:]-]*$/) { $is_upper = 1; $upper++; }

    # grep in scalar context yields the number of matching patterns.
    my $ordinary = $is_upper || scalar(grep { $word =~ $_ } @ordinary_word);
    if (!$ordinary || !$nfc) {
        # One list element per word, so the "(not NFC)" marker (which contains
        # a space) can never be split off from its word.
        push @strange, esc_word($word).($nfc ? "" : "(not NFC)");
        $strange_count++;
    }
    if (!$gfm) { print "$word "; }
}
close $f;
if (!$gfm) { print "\n"; }

# Percentage relative to the total line count. $scale is 1 for counters taken
# over all lines and $keep for counters taken over the sample. An empty word
# list yields 0 instead of dying with "Illegal division by zero".
my $percent = sub {
    my ($count, $scale) = @_;
    return 0 if !$lines;
    return $count * 100 / $scale / $lines;
};

if ($gfm) {
    # Keep the table row readable: show at most 100 strange words.
    my @shown = @strange > 100 ? @strange[0..99] : @strange;
    my $strange = gfm_esc(join(" ", @shown));
    printf "| %s | %s | %.2f %% | %.2f %% | %.2f %% | %s |\n",
        basename($td, ".traineddata"), commify($lines),
        $percent->($long_s, 1), $percent->($upper, $keep),
        $percent->($strange_count, $keep), $strange;
    exit;
}

# Plain-text report: each strange word is followed by a single space.
my $strange_str = join("", map { "$_ " } @strange);
print "\nAMBIGIOUS (EXCERPT): $strange_str\n";
print "$lines lines\n";
# system("xz -9 < w.txt > w.txt.xz");
# printf "compressible to %.0f %%\n", (-s "w.txt.xz")*100/(-s "w.txt");
printf "%.2f %% lines with »ſ«\n", $percent->($long_s, 1);
printf "%.2f %% lines all-UPPERCASE\n", $percent->($upper, $keep);
printf "%.2f %% lines strange\n", $percent->($strange_count, $keep);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment