Created
September 14, 2017 18:06
-
-
Save lifthrasiir/2b0ea6c773fcb82e72d5d9ee21c2928c to your computer and use it in GitHub Desktop.
Homebrew implementation of Unicode Normalization algorithm, used for experiments (public domain)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf-8 | |
import sys | |
import os.path | |
from collections import defaultdict | |
try: | |
ucd = sys.argv[1] | |
except IndexError: | |
print >>sys.stderr, 'Usage: %s <ucd path>' % sys.argv[0] | |
raise SystemExit(1) | |
def get_ccc_and_decomp(): | |
ccc = defaultdict(lambda: 0) | |
decomp = defaultdict(lambda: None) | |
kdecomp = defaultdict(lambda: None) | |
last = None | |
for l in open(os.path.join(ucd, 'UnicodeData.txt')): | |
l = l.strip().split(';') | |
if not l: continue | |
a = int(l[0], 16) | |
if l[1].endswith(', First>'): | |
assert last is None | |
last = a | |
continue | |
elif l[1].endswith(', Last>'): | |
assert last is not None | |
b = a | |
a = last | |
last = None | |
else: | |
assert last is None | |
b = a | |
if l[3] != '0': | |
c = int(l[3], 10) | |
for i in xrange(a, b + 1): | |
ccc[i] = c | |
if l[5] != '': | |
x, _, y = l[5].rpartition('> ') | |
x = x.strip('<') or None | |
y = [int(i, 16) for i in y.split()] | |
for i in xrange(a, b + 1): | |
kdecomp[i] = y | |
if not x: decomp[i] = y | |
assert last is None | |
return ccc, decomp, kdecomp | |
def get_qc_and_compex(): | |
nfcqc = defaultdict(lambda: 'Y') | |
nfkcqc = defaultdict(lambda: 'Y') | |
compex = defaultdict(lambda: False) | |
for l in open(os.path.join(ucd, 'DerivedNormalizationProps.txt')): | |
l, _, _ = l.partition('#') | |
l = l.strip() | |
if not l: continue | |
l = l.split(';') | |
a, _, b = l[0].partition('..') | |
b = b or a | |
prop = l[1].strip() | |
for i in xrange(int(a, 16), int(b, 16) + 1): | |
if prop == 'NFC_QC': | |
nfcqc[i] = l[2].strip() | |
elif prop == 'NFKC_QC': | |
nfkcqc[i] = l[2].strip() | |
elif prop == 'Full_Composition_Exclusion': | |
compex[i] = True | |
return nfcqc, nfkcqc, compex | |
def decomp_base(s, uccc, udecomp): | |
s = list(reversed(s)) | |
r = [] | |
rr = [] | |
while s: | |
c = s.pop() | |
if 0xac00 <= c < 0xac00 + 11172: # hangul | |
cc = c - 0xac00 | |
d = [0x1100 + cc // 588, 0x1161 + cc // 28 % 21] | |
if cc % 28 > 0: d.append(0x11a7 + cc % 28) | |
else: | |
d = udecomp[c] | |
if d: | |
s += reversed(d) | |
elif uccc[c] == 0: # starter | |
rr.sort(key=uccc.__getitem__) | |
r += rr | |
rr = [c] | |
else: | |
rr.append(c) | |
rr.sort(key=uccc.__getitem__) | |
r += rr | |
return r | |
def calculate_primary_composites(udecomp, ucompex): | |
pc = {} | |
for c, d in udecomp.items(): | |
if len(d) == 2 and not ucompex[c]: | |
pc[tuple(d)] = c | |
return pc # still does not include hangul! | |
def recomp_base(s, uccc, upc): | |
s = s[:] | |
i = 0 | |
while i < len(s): | |
a = s[i] | |
if a is not None or uccc[a] == 0: # a is a starter | |
j = i + 1 | |
pccc = None | |
while j < len(s): | |
b = s[j] | |
if b is not None: | |
bccc = uccc[b] | |
# b and subsequent characters are blocked by previous character | |
if pccc is not None and (pccc == 0 or pccc > bccc): | |
break | |
# when pccc = bccc, b is blocked but the next character can be still checked | |
if pccc is None or pccc != bccc: | |
pccc = bccc | |
if 0xac00 <= a < 0xac00 + 11172 and (a - 0xac00) % 28 == 0 and 0x11a7 <= b < 0x11a7 + 28: | |
c = a + b - 0x11a7 # hangul lv + t | |
elif 0x1100 <= a < 0x1100 + 19 and 0x1161 <= b < 0x1161 + 21: | |
c = 0xac00 + (a - 0x1100) * 588 + (b - 0x1161) * 28 # hangul l + v | |
else: | |
c = upc.get((a, b)) | |
if c is not None: | |
a = c | |
s[j] = None | |
pccc = None | |
j = i + 1 # should be reset | |
continue | |
j += 1 | |
s[i] = a | |
i += 1 | |
return [i for i in s if i is not None] | |
def qc_base(s, uccc, uqc): | |
pccc = 0 | |
ret = 'Y' | |
for c in s: | |
v = uqc[c] | |
if v == 'N': return 'N' | |
if v == 'M': ret = 'M' | |
ccc = uccc[c] | |
if pccc > ccc and ccc != 0: return 'N' | |
pccc = ccc | |
return ret | |
def list_tests(): | |
for lineno, l in enumerate(open(os.path.join(ucd, 'NormalizationTest.txt')), start=1): | |
l, _, _ = l.partition('#') | |
if l.startswith('@'): continue | |
l = l.strip() | |
if not l: continue | |
l = l.split(';') | |
s, c, d, kc, kd = [[int(j, 16) for j in i.split()] for i in l][:5] | |
yield lineno, s, c, d, kc, kd | |
def fmt(s): | |
if hasattr(s, '__iter__'): | |
return '[%s]' % ', '.join('%#06x' % i for i in s) | |
if isinstance(s, (int, long)): | |
return '%#06x' % s | |
return str(s) | |
uccc, udecomp, ukdecomp = get_ccc_and_decomp() | |
unfcqc, unfkcqc, ucompex = get_qc_and_compex() | |
upc = calculate_primary_composites(udecomp, ucompex) # common to NFC and NFKC | |
def decomp(s): | |
return decomp_base(s, uccc, udecomp) | |
def kdecomp(s): | |
return decomp_base(s, uccc, ukdecomp) | |
def recomp(s): | |
s = recomp_base(s, uccc, upc) | |
return s | |
def nfc_qc(s): | |
return qc_base(s, uccc, unfcqc) | |
def nfkc_qc(s): | |
return qc_base(s, uccc, unfkcqc) | |
def expect(lineno, f, s, exp, act): | |
assert exp == act, 'line %d: %s(%s) = expected %s actual %s' % (lineno, f, fmt(s), fmt(exp), fmt(act)) | |
def expectv(lineno, f, s, exp, act): | |
assert act != 'YN'[exp], 'line %d: %s(%s) = expected %r actual %r' % (lineno, f, fmt(s), 'NY'[exp], act) | |
for lineno, s, c, d, kc, kd in list_tests(): | |
sd = decomp(s); expect(lineno, 'nfd', s, d, sd) | |
skd = kdecomp(s); expect(lineno, 'nfkd', s, kd, skd) | |
sc = recomp(sd); expect(lineno, 'nfc', s, c, sc) | |
skc = recomp(skd); expect(lineno, 'nfkc', s, kc, skc) | |
cd = decomp(c); expect(lineno, 'nfd', c, d, cd) | |
ckd = kdecomp(c); expect(lineno, 'nfkd', c, kd, ckd) | |
cc = recomp(cd); expect(lineno, 'nfc', c, c, cc) | |
ckc = recomp(ckd); expect(lineno, 'nfkc', c, kc, ckc) | |
dd = decomp(d); expect(lineno, 'nfd', d, d, dd) | |
dkd = kdecomp(d); expect(lineno, 'nfkd', d, kd, dkd) | |
dc = recomp(dd); expect(lineno, 'nfc', d, c, dc) | |
dkc = recomp(dkd); expect(lineno, 'nfkc', d, kc, dkc) | |
kcd = decomp(kc); expect(lineno, 'nfd', kc, kd, kcd) | |
kckd = kdecomp(kc); expect(lineno, 'nfkd', kc, kd, kckd) | |
kcc = recomp(kcd); expect(lineno, 'nfc', kc, kc, kcc) | |
kckc = recomp(kckd); expect(lineno, 'nfkc', kc, kc, kckc) | |
kdd = decomp(kd); expect(lineno, 'nfd', kd, kd, kdd) | |
kdkd = kdecomp(kd); expect(lineno, 'nfkd', kd, kd, kdkd) | |
kdc = recomp(kdd); expect(lineno, 'nfc', kd, kc, kdc) | |
kdkc = recomp(kdkd); expect(lineno, 'nfkc', kd, kc, kdkc) | |
expectv(lineno, 'nfc_qc', s, s == c, nfc_qc(s)) | |
expectv(lineno, 'nfc_qc', c, c == c, nfc_qc(c)) | |
expectv(lineno, 'nfc_qc', d, d == c, nfc_qc(d)) | |
expectv(lineno, 'nfkc_qc', s, s == kc, nfkc_qc(s)) | |
expectv(lineno, 'nfkc_qc', c, c == kc, nfkc_qc(c)) | |
expectv(lineno, 'nfkc_qc', d, d == kc, nfkc_qc(d)) | |
expectv(lineno, 'nfkc_qc', kc, kc == kc, nfkc_qc(kc)) | |
expectv(lineno, 'nfkc_qc', kd, kd == kc, nfkc_qc(kd)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment