Skip to content

Instantly share code, notes, and snippets.

@jimregan
Last active August 29, 2015 14:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jimregan/21be931d94dfa0d67684 to your computer and use it in GitHub Desktop.
Save jimregan/21be931d94dfa0d67684 to your computer and use it in GitHub Desktop.
Tesseract gle_uncial bits
#!/usr/bin/perl
use warnings;
use strict;
use utf8;
# Count character bigrams in the input and print one "<bigram> <count>" line
# for every distinct bigram.
#
# Input:  lines read via <> (files named on the command line, else STDIN).
# Output: "<two characters> <count>" lines on STDOUT, sorted by bigram so
#         that repeated runs produce identical output.
#
# Whitespace is stripped from each line before counting, so a bigram may span
# a word boundary but never a line boundary.

# Treat every handle as UTF-8.  binmode on STDIN alone is not enough: files
# named on the command line are read through ARGV, which would otherwise
# deliver raw bytes, so split // would cut multi-byte characters apart.
use open qw(:std :encoding(UTF-8));

my %bigrams;

while (my $line = <>) {
    chomp $line;

    # NOTE(review): the original ran two visually identical "remove spaces"
    # substitutions in a row; the second was presumably a no-break space that
    # got normalised to a plain space somewhere along the way -- confirm.
    $line =~ tr/\t \x{A0}//d;

    my @chars = split //, $line;

    # Pair every character with its predecessor; lines with fewer than two
    # characters yield an empty range and contribute nothing.
    for my $i (1 .. $#chars) {
        $bigrams{ $chars[ $i - 1 ] . $chars[$i] }++;
    }
}

for my $bigram (sort keys %bigrams) {
    print "$bigram $bigrams{$bigram}\n";
}
#!/usr/bin/perl
use warnings;
use strict;
use utf8;
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";
# Print the name of every file given on the command line in which the first
# space-separated field of every line is at most one character long.
#
# A file containing any longer first field is skipped without comment.
# A file that cannot be opened is reported on STDERR and skipped.
for my $file (@ARGV) {
    open(my $in, '<:encoding(UTF-8)', $file) or do {
        warn "Cannot open $file: $!\n";
        next;
    };

    my $single_chars_only = 1;
    while (my $line = <$in>) {
        # Without the chomp, a line holding one character and no space would
        # be measured together with its newline and wrongly count as two.
        chomp $line;

        my ($first) = split / /, $line;

        # A blank line has no first field at all.
        next if !defined $first;

        if (length($first) > 1) {
            $single_chars_only = 0;
            last;    # one long field is enough to reject the file
        }
    }
    close $in;

    print "$file\n" if $single_chars_only;
}
#!/usr/bin/perl
use warnings;
use strict;
use utf8;
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";
# Letter sequence => single ligature code point.
# FB00-FB05 are from Unicode's Alphabetic Presentation Forms block; the
# entries marked MUFI are Private Use Area code points.
my %lig = (
    'ff'  => "\x{FB00}",    # LATIN SMALL LIGATURE FF
    'fi'  => "\x{FB01}",    # LATIN SMALL LIGATURE FI  (was the plain letters "fi")
    'fl'  => "\x{FB02}",    # LATIN SMALL LIGATURE FL  (was the plain letters "fl")
    'ffi' => "\x{FB03}",    # LATIN SMALL LIGATURE FFI
    'ffl' => "\x{FB04}",    # LATIN SMALL LIGATURE FFL
    'si'  => "\x{EBA2}",    # MUFI
    'sp'  => "\x{EBA5}",    # MUFI
    # NOTE(review): U+FB05 is LATIN SMALL LIGATURE LONG S T; the round-s
    # ligature is U+FB06 -- confirm the long-s form is the one wanted.
    'st'  => "\x{FB05}",
    'ui'  => "\x{AB50}",    # LATIN SMALL LETTER UI
);

# For every line of every file named on the command line: if the first
# space-separated field is a key of %lig, print the line with that field
# replaced by its ligature; otherwise print the line unchanged.
# A file that cannot be opened is reported on STDERR and skipped.
for my $file (@ARGV) {
    open(my $in, '<:encoding(UTF-8)', $file) or do {
        warn "Cannot open $file: $!\n";
        next;
    };

    while (my $line = <$in>) {
        chomp $line;
        my ($char, @rest) = split / /, $line;

        # $char is undefined for a blank line; guard so that exists() is
        # never handed an undefined key.
        if (defined $char && exists $lig{$char}) {
            print "$lig{$char} ", join(' ', @rest), "\n";
        }
        else {
            print "$line\n";
        }
    }
    close $in;
}
MO SGÉAL FÉIN I ṠÍNSEAR ḃliain d’aoís an Tiġearna míle sé ċéad briseaḋ caṫ
Ġaeḋlaiḃ Aoḋ, ua Néill Ruaḋ Dóṁnaill, i n-aice Ċionntsáile. Ḃí h-Éirean troid
go rugadar Gallaiḃ eatarṫa loit Éire fé ġluais ’á ṫaḃairt ó ḟéaċaint ḋéanaṁ éis
“Bliain Ba Ṫáinig Eoġan Ulaḋ oilte áiteanaiḃ ṡaġas Áṫa ṁuíntir Ḋein Cóṁairle xi.
Mar í. Lúnduin pé Ṁuṁain, ṗós Peadar Ois, Ṗeadar, úd you know,” when (1801) 'n-a
1874. V .i. ⁊c. Ḟrainncis Hutchinson quod.” ‘quod’?” very – X Ó’n Whose ÚIRD ——
Ínse 2 = Journal Quixote” Kuno 6 GIRALDUS 3Agus 4Agus 5Agus 7Agus 8Agus 9Agus Zach.
#!/usr/bin/perl
use warnings;
use strict;
use utf8;
# Build a compact sample text from the input: emit each word whose first
# character has not yet been emitted as a first character, wrapping the
# output into lines shorter than 80 characters.
#
# Input:  lines read via <> (files named on the command line, else STDIN),
#         split into words on single spaces.
# Output: the selected words, space-separated, on STDOUT.
#
# NOTE(review): only the FIRST character of each word decides whether the
# word is kept.  The original looped over every character but marked the
# word as seen after the first iteration, so later characters were never
# examined; that selection rule is preserved here.  If full character
# coverage was intended, every character of the word needs checking --
# confirm before changing, since it alters the output substantially.

# Treat every handle as UTF-8.  binmode on STDIN alone does not cover files
# named on the command line, which <> reads through ARGV as raw bytes.
use open qw(:std :encoding(UTF-8));

# Characters already covered; whitespace never needs covering.
my %seen = (
    ' '  => 1,
    "\t" => 1,
    "\n" => 1,
);
my %wseen;            # words already considered
my $outsent = '';     # output line being assembled

while (my $line = <>) {
    chomp $line;

    WORD:
    for my $w (split / /, $line) {
        # Consecutive spaces yield empty fields; each word is considered once.
        next WORD if $w eq '' || exists $wseen{$w};
        $wseen{$w} = 1;

        # The very first word always opens the output; as in the original,
        # its first character is not recorded in %seen.
        if ($outsent eq '') {
            $outsent = $w;
            next WORD;
        }

        my $c = substr $w, 0, 1;
        next WORD if exists $seen{$c};
        $seen{$c} = 1;

        if (length("$outsent $w") < 80) {
            $outsent .= " $w";
        }
        else {
            print "$outsent\n";
            $outsent = $w;
        }
    }
}

# Flush the last, partially filled line; previously it was silently lost.
print "$outsent\n" if $outsent ne '';
#!/usr/bin/perl
use warnings;
use strict;
use utf8;
# Rewrite consonant+h digraphs as the corresponding dotted consonant and
# strip bracketed numbers such as "[12]" from the text.
#
# Input:  lines read via <> (files named on the command line, else STDIN).
# Output: the converted lines on STDOUT.
#
# A lowercase consonant is converted only before a lowercase "h"; an
# uppercase consonant is converted before either "h" or "H".

# Treat every handle as UTF-8.  binmode on STDIN alone does not cover files
# named on the command line: <> would hand back raw bytes, and any accented
# letters already in the text would be double-encoded on output.
use open qw(:std :encoding(UTF-8));

my %dotted_for = (
    b => 'ḃ', c => 'ċ', d => 'ḋ', f => 'ḟ', g => 'ġ',
    m => 'ṁ', p => 'ṗ', s => 'ṡ', t => 'ṫ',
    B => 'Ḃ', C => 'Ċ', D => 'Ḋ', F => 'Ḟ', G => 'Ġ',
    M => 'Ṁ', P => 'Ṗ', S => 'Ṡ', T => 'Ṫ',
);

while (my $line = <>) {
    # Neither "h" nor "H" can start a digraph, so matches never overlap and
    # one left-to-right pass is equivalent to a substitution per letter.
    $line =~ s{
        (   [bcdfgmpst] (?=h)
          | [BCDFGMPST] (?=[hH])
        )
        [hH]
    }{$dotted_for{$1}}gx;

    # Drop bracketed numbers, with optional padding inside the brackets.
    # NOTE(review): the original class held two visually identical spaces;
    # the second was presumably a no-break space lost in transit -- confirm.
    $line =~ s/\[[ \x{A0}]?[0-9]*[ \x{A0}]?\]//g;

    print $line;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment