Skip to content

Instantly share code, notes, and snippets.

@hoehrmann
Created May 26, 2013 10:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hoehrmann/5652431 to your computer and use it in GitHub Desktop.
Save hoehrmann/5652431 to your computer and use it in GitHub Desktop.
The attached script contains a simple Perl implementation of the NFC and NFD normalization form using data directly derived from the UCD. It includes testing against the NormalizationTest.txt file. Originally http://lists.w3.org/Archives/Public/www-archive/2009Feb/0015.html
#!perl -w
use strict;
use warnings;
use DBI;
use Data::Dumper;
use Storable qw(retrieve nstore);
use IO::File;
# Look up the canonical combining class as in the UCD
our %CCC;
# Look up the canonical decomposition as in the UCD.
our %DEC;
# Look up primary composites based on their NFD expansion
our %CAN;
if (-f 'ccc.sto' and -f 'dec.sto' and -f 'can.sto') {
%CCC = %{retrieve 'ccc.sto'};
%DEC = %{retrieve 'dec.sto'};
%CAN = %{retrieve 'can.sto'};
} else {
# This obviously requires that the whole UCD is available as SQLite
# database in the file 'ucd'. It can easily be derived from the com-
# plete XML dump of the database in the file ucd.all.flat.xml.zip.
# Importantly it contains the Hangul decompositions, so we do not've
# to compute them algorithmically.
my $dbh = DBI->connect("dbi:SQLite:dbname=ucd","","");
my $db = $dbh->selectall_hashref(q{
SELECT
cp, ccc, dm, dt, Comp_Ex
FROM
ucd
}, 'cp');
foreach my $key (keys %$db) {
my $entry = $db->{$key};
my $cp = chr(hex $entry->{cp});
my $dm = join '', map { chr(hex) } split/\s+/, $entry->{dm};
$CCC{$cp} = $entry->{ccc};
$DEC{$cp} = $dm if
$entry->{dt} eq 'can';
}
foreach my $key (keys %$db) {
my $entry = $db->{$key};
my $cp = chr(hex $entry->{cp});
next unless $entry->{Comp_Ex} eq 'N';
$CAN{ NFD($cp) } = $cp;
}
nstore \%CCC, 'ccc.sto';
nstore \%DEC, 'dec.sto';
nstore \%CAN, 'can.sto';
}
sub reorder {
my $s = shift; $s = "$s";
my $i = 1;
while ($i < length $s) {
my $x = substr $s, $i - 1, 1;
my $y = substr $s, $i + 0, 1;
if (combClass($x) > combClass($y) and combClass($y) != 0) {
substr $s, $i - 1, 1, $y;
substr $s, $i + 0, 1, $x;
$i-- if $i > 1;
next;
}
$i++;
}
return $s;
}
sub combClass {
my $c = shift;
return $CCC{$c} || 0;
}
sub decombine {
my $c = shift;
my $d = $DEC{$c};
return $c unless defined $d;
# Recursively decombine
join '', map { decombine($_) } split//, $d
}
sub NFD {
my $s = shift; $s = "$s";
my $d = join '', map { decombine($_) } split//, $s;
return reorder($d);
}
sub combine {
my $starter = shift;
my $combiner = shift;
my $d = NFD $starter . $combiner;
return $CAN{$d};
}
sub NFC {
my $s = shift;
$s = NFD $s;
my $starterpos = 0;
# advance to the first starter
$starterpos++ while combClass(substr $s, $starterpos, 1) != 0;
my $pos = $starterpos + 1;
my $prev_ccc = 0;
while ($pos < length $s) {
my $current = substr $s, $pos, 1;
my $here_ccc = combClass($current);
my $combo = combine(substr($s, $starterpos, 1), $current);
my $blocked = ($starterpos < $pos - 1) && ($prev_ccc >= $here_ccc);
if (defined $combo and not $blocked) {
substr $s, $starterpos, 1, $combo;
substr $s, $pos, 1, '';
next;
}
if ($here_ccc == 0) {
$starterpos = $pos;
}
$prev_ccc = $here_ccc;
$pos++;
}
return $s;
}
my $f = IO::File->new('<' . 'NormalizationTest.txt');
while (<$f>) {
chomp;
s/#.*//;
next if /^@/;
next unless /\S/;
my ($c1, $c2, $c3, $c4, $c5) = split /;/;
$c1 = join '', map { chr(hex) } $c1 =~ m/(\S+)/g;
$c2 = join '', map { chr(hex) } $c2 =~ m/(\S+)/g;
$c3 = join '', map { chr(hex) } $c3 =~ m/(\S+)/g;
$c4 = join '', map { chr(hex) } $c4 =~ m/(\S+)/g;
$c5 = join '', map { chr(hex) } $c5 =~ m/(\S+)/g;
my $fail = 0;
$fail++ unless $c2 eq NFC($c1) and
$c2 eq NFC($c2) and
$c2 eq NFC($c3) and
$c4 eq NFC($c4) and
$c4 eq NFC($c5) ;
$fail++ unless $c3 eq NFD($c1) and
$c3 eq NFD($c2) and
$c3 eq NFD($c3) and
$c5 eq NFD($c4) and
$c5 eq NFD($c5) ;
next unless $fail;
warn "bad";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment