Last active
August 29, 2015 14:15
-
-
Save jimregan/21be931d94dfa0d67684 to your computer and use it in GitHub Desktop.
Tesseract gle_uncial bits
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
# Count character bigrams (pairs of adjacent characters) in the input and
# print one "<bigram> <count>" line per distinct bigram.
#
# Input is read line by line from the files named on the command line, or
# from STDIN when none are given. Tabs and spaces are deleted from each line
# before pairing, so the characters on either side of a space become
# adjacent. Bigrams never span a line boundary.
use warnings;
use strict;
use utf8;
# binmode on STDIN alone does not apply to files named on the command line,
# which `<>` reads through the separate ARGV handle. The open pragma sets the
# default layer for those files as well; :std applies it to the standard
# handles.
use open qw(:std :utf8);

my %bigrams;

while (my $line = <>) {
    chomp $line;
    # NOTE(review): the original carried two visually identical space
    # substitutions; if one of them targeted another space-like character
    # (for example U+00A0), add it to this list -- TODO confirm.
    $line =~ tr/\t //d;

    my @chars = split //, $line;
    # Lines with fewer than two characters yield an empty range here.
    for my $i (1 .. $#chars) {
        $bigrams{ $chars[$i - 1] . $chars[$i] }++;
    }
}

# Sorted so that repeated runs over the same input give identical output.
for my $k (sort keys %bigrams) {
    print "$k $bigrams{$k}\n";
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
# For each file named on the command line, print the file name if no line in
# it has a first space-separated field longer than one character.
#
# Lines are not chomped, so on a line containing no space the trailing
# newline counts towards the length of the first field.
use warnings;
use strict;
use utf8;
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";

FILE:
for my $file (@ARGV) {
    my $in;
    if (!open($in, '<:utf8', $file)) {
        # An unreadable file must not be reported as qualifying.
        warn "Cannot open $file: $!\n";
        next FILE;
    }

    my $single_only = 1;
    while (my $line = <$in>) {
        my ($first) = split / /, $line;
        # A line made only of spaces splits to an empty list.
        next if !defined $first;
        if (length($first) > 1) {
            $single_only = 0;
            last;    # the verdict cannot change after the first long entry
        }
    }
    close $in;

    print STDOUT "$file\n" if $single_only;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
# Rewrite the first space-separated field of every line in the files named on
# the command line: when that field is a key of %lig it is replaced by the
# mapped single ligature character. All other lines are printed unchanged.
use warnings;
use strict;
use utf8;
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";

# Letter sequence => single code point that replaces it.
my %lig = (
    'ff'  => "\x{FB00}",
    # 'fi' and 'fl' previously mapped to the plain two-letter strings, which
    # made them no-ops; they now use the ligature code points that sit between
    # the 'ff' and 'ffi' entries.
    'fi'  => "\x{FB01}",
    'fl'  => "\x{FB02}",
    'ffi' => "\x{FB03}",
    'ffl' => "\x{FB04}",
    'si'  => "\x{EBA2}",    # MUFI
    'sp'  => "\x{EBA5}",    # MUFI
    'st'  => "\x{FB05}",
    'ui'  => "\x{AB50}",
);

FILE:
for my $file (@ARGV) {
    my $in;
    if (!open($in, '<:utf8', $file)) {
        warn "Cannot open $file: $!\n";
        next FILE;
    }

    while (my $line = <$in>) {
        chomp $line;
        my ($char, @rest) = split / /, $line;
        # $char is undefined for an empty line; such lines pass through as-is.
        if (defined $char && exists $lig{$char}) {
            print "$lig{$char} " . join(" ", @rest) . "\n";
        }
        else {
            print "$line\n";
        }
    }
    close $in;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
MO SGÉAL FÉIN I ṠÍNSEAR ḃliain d’aoís an Tiġearna míle sé ċéad briseaḋ caṫ | |
Ġaeḋlaiḃ Aoḋ, ua Néill Ruaḋ Dóṁnaill, i n-aice Ċionntsáile. Ḃí h-Éirean troid | |
go rugadar Gallaiḃ eatarṫa loit Éire fé ġluais ’á ṫaḃairt ó ḟéaċaint ḋéanaṁ éis | |
“Bliain Ba Ṫáinig Eoġan Ulaḋ oilte áiteanaiḃ ṡaġas Áṫa ṁuíntir Ḋein Cóṁairle xi. | |
Mar í. Lúnduin pé Ṁuṁain, ṗós Peadar Ois, Ṗeadar, úd you know,” when (1801) 'n-a | |
1874. V .i. ⁊c. Ḟrainncis Hutchinson quod.” ‘quod’?” very – X Ó’n Whose ÚIRD —— | |
Ínse 2 = Journal Quixote” Kuno 6 GIRALDUS 3Agus 4Agus 5Agus 7Agus 8Agus 9Agus Zach. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
# Build short sample lines out of the input words.
#
# Words are the space-separated fields of each input line. The very first
# word starts the output. After that a word is kept only when it has not
# been seen before and its first character has not yet started a kept word.
# Kept words are joined with single spaces and a line is emitted once adding
# the next word would bring it to 80 characters or more.
use warnings;
use strict;
use utf8;
# binmode on STDIN alone does not apply to files named on the command line,
# which `<>` reads through the separate ARGV handle. The open pragma sets the
# default layer for those files as well; :std applies it to the standard
# handles.
use open qw(:std :utf8);

# First characters already used; whitespace is pre-marked so that it never
# selects a word.
my %seen = (
    ' '  => 1,
    "\t" => 1,
    "\n" => 1,
);
my %wseen;            # words already examined
my $outsent = "";     # output line under construction

while (my $line = <>) {
    chomp $line;
    WORD:
    for my $w (split / /, $line) {
        # Consecutive spaces produce empty fields.
        next WORD if $w eq '' || exists $wseen{$w};
        $wseen{$w} = 1;

        if ($outsent eq "") {
            # The very first word seeds the line. Its first character is not
            # recorded in %seen, which matches the earlier behaviour.
            $outsent = $w;
            next WORD;
        }

        my $c = substr($w, 0, 1);
        next WORD if exists $seen{$c};
        $seen{$c} = 1;

        if (length("$outsent $w") < 80) {
            $outsent .= " $w";
        }
        else {
            print "$outsent\n";
            $outsent = $w;
        }
    }
}

# Emit the last, partially filled line; previously it was silently dropped.
print "$outsent\n" if $outsent ne "";
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
# Replace consonant + h digraphs with the corresponding dotted consonant and
# strip bracketed numbers such as "[12]" or "[ 3 ]" (empty brackets "[]" are
# removed as well), then print each line.
#
# Lowercase consonants are converted only before a lowercase "h"; uppercase
# consonants are converted before either "h" or "H".
use warnings;
use strict;
use utf8;
# binmode on STDIN alone does not apply to files named on the command line,
# which `<>` reads through the separate ARGV handle. Undecoded input would be
# double-encoded on output as soon as a dotted character is inserted. The
# open pragma sets the default layer for those files as well; :std applies
# it to the standard handles.
use open qw(:std :utf8);

# Plain consonant => dotted consonant.
my %dotted = (
    b => 'ḃ', c => 'ċ', d => 'ḋ', f => 'ḟ', g => 'ġ',
    m => 'ṁ', p => 'ṗ', s => 'ṡ', t => 'ṫ',
    B => 'Ḃ', C => 'Ċ', D => 'Ḋ', F => 'Ḟ', G => 'Ġ',
    M => 'Ṁ', P => 'Ṗ', S => 'Ṡ', T => 'Ṫ',
);

while (my $line = <>) {
    # Each match consumes the following "h", so doing all consonants in one
    # pass gives the same result as one substitution per consonant.
    $line =~ s/([bcdfgmpst])h/$dotted{$1}/g;
    $line =~ s/([BCDFGMPST])[hH]/$dotted{$1}/g;
    $line =~ s/\[[ ]?[0-9]*[ ]?\]//g;
    print $line;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment