Skip to content

Instantly share code, notes, and snippets.

@lifthrasiir
Created September 14, 2017 18:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lifthrasiir/2b0ea6c773fcb82e72d5d9ee21c2928c to your computer and use it in GitHub Desktop.
Save lifthrasiir/2b0ea6c773fcb82e72d5d9ee21c2928c to your computer and use it in GitHub Desktop.
Homebrew implementation of Unicode Normalization algorithm, used for experiments (public domain)
# coding=utf-8
import sys
import os.path
from collections import defaultdict
try:
ucd = sys.argv[1]
except IndexError:
print >>sys.stderr, 'Usage: %s <ucd path>' % sys.argv[0]
raise SystemExit(1)
def get_ccc_and_decomp():
ccc = defaultdict(lambda: 0)
decomp = defaultdict(lambda: None)
kdecomp = defaultdict(lambda: None)
last = None
for l in open(os.path.join(ucd, 'UnicodeData.txt')):
l = l.strip().split(';')
if not l: continue
a = int(l[0], 16)
if l[1].endswith(', First>'):
assert last is None
last = a
continue
elif l[1].endswith(', Last>'):
assert last is not None
b = a
a = last
last = None
else:
assert last is None
b = a
if l[3] != '0':
c = int(l[3], 10)
for i in xrange(a, b + 1):
ccc[i] = c
if l[5] != '':
x, _, y = l[5].rpartition('> ')
x = x.strip('<') or None
y = [int(i, 16) for i in y.split()]
for i in xrange(a, b + 1):
kdecomp[i] = y
if not x: decomp[i] = y
assert last is None
return ccc, decomp, kdecomp
def get_qc_and_compex():
nfcqc = defaultdict(lambda: 'Y')
nfkcqc = defaultdict(lambda: 'Y')
compex = defaultdict(lambda: False)
for l in open(os.path.join(ucd, 'DerivedNormalizationProps.txt')):
l, _, _ = l.partition('#')
l = l.strip()
if not l: continue
l = l.split(';')
a, _, b = l[0].partition('..')
b = b or a
prop = l[1].strip()
for i in xrange(int(a, 16), int(b, 16) + 1):
if prop == 'NFC_QC':
nfcqc[i] = l[2].strip()
elif prop == 'NFKC_QC':
nfkcqc[i] = l[2].strip()
elif prop == 'Full_Composition_Exclusion':
compex[i] = True
return nfcqc, nfkcqc, compex
def decomp_base(s, uccc, udecomp):
s = list(reversed(s))
r = []
rr = []
while s:
c = s.pop()
if 0xac00 <= c < 0xac00 + 11172: # hangul
cc = c - 0xac00
d = [0x1100 + cc // 588, 0x1161 + cc // 28 % 21]
if cc % 28 > 0: d.append(0x11a7 + cc % 28)
else:
d = udecomp[c]
if d:
s += reversed(d)
elif uccc[c] == 0: # starter
rr.sort(key=uccc.__getitem__)
r += rr
rr = [c]
else:
rr.append(c)
rr.sort(key=uccc.__getitem__)
r += rr
return r
def calculate_primary_composites(udecomp, ucompex):
pc = {}
for c, d in udecomp.items():
if len(d) == 2 and not ucompex[c]:
pc[tuple(d)] = c
return pc # still does not include hangul!
def recomp_base(s, uccc, upc):
s = s[:]
i = 0
while i < len(s):
a = s[i]
if a is not None or uccc[a] == 0: # a is a starter
j = i + 1
pccc = None
while j < len(s):
b = s[j]
if b is not None:
bccc = uccc[b]
# b and subsequent characters are blocked by previous character
if pccc is not None and (pccc == 0 or pccc > bccc):
break
# when pccc = bccc, b is blocked but the next character can be still checked
if pccc is None or pccc != bccc:
pccc = bccc
if 0xac00 <= a < 0xac00 + 11172 and (a - 0xac00) % 28 == 0 and 0x11a7 <= b < 0x11a7 + 28:
c = a + b - 0x11a7 # hangul lv + t
elif 0x1100 <= a < 0x1100 + 19 and 0x1161 <= b < 0x1161 + 21:
c = 0xac00 + (a - 0x1100) * 588 + (b - 0x1161) * 28 # hangul l + v
else:
c = upc.get((a, b))
if c is not None:
a = c
s[j] = None
pccc = None
j = i + 1 # should be reset
continue
j += 1
s[i] = a
i += 1
return [i for i in s if i is not None]
def qc_base(s, uccc, uqc):
pccc = 0
ret = 'Y'
for c in s:
v = uqc[c]
if v == 'N': return 'N'
if v == 'M': ret = 'M'
ccc = uccc[c]
if pccc > ccc and ccc != 0: return 'N'
pccc = ccc
return ret
def list_tests():
for lineno, l in enumerate(open(os.path.join(ucd, 'NormalizationTest.txt')), start=1):
l, _, _ = l.partition('#')
if l.startswith('@'): continue
l = l.strip()
if not l: continue
l = l.split(';')
s, c, d, kc, kd = [[int(j, 16) for j in i.split()] for i in l][:5]
yield lineno, s, c, d, kc, kd
def fmt(s):
if hasattr(s, '__iter__'):
return '[%s]' % ', '.join('%#06x' % i for i in s)
if isinstance(s, (int, long)):
return '%#06x' % s
return str(s)
uccc, udecomp, ukdecomp = get_ccc_and_decomp()
unfcqc, unfkcqc, ucompex = get_qc_and_compex()
upc = calculate_primary_composites(udecomp, ucompex) # common to NFC and NFKC
def decomp(s):
return decomp_base(s, uccc, udecomp)
def kdecomp(s):
return decomp_base(s, uccc, ukdecomp)
def recomp(s):
s = recomp_base(s, uccc, upc)
return s
def nfc_qc(s):
return qc_base(s, uccc, unfcqc)
def nfkc_qc(s):
return qc_base(s, uccc, unfkcqc)
def expect(lineno, f, s, exp, act):
assert exp == act, 'line %d: %s(%s) = expected %s actual %s' % (lineno, f, fmt(s), fmt(exp), fmt(act))
def expectv(lineno, f, s, exp, act):
assert act != 'YN'[exp], 'line %d: %s(%s) = expected %r actual %r' % (lineno, f, fmt(s), 'NY'[exp], act)
for lineno, s, c, d, kc, kd in list_tests():
sd = decomp(s); expect(lineno, 'nfd', s, d, sd)
skd = kdecomp(s); expect(lineno, 'nfkd', s, kd, skd)
sc = recomp(sd); expect(lineno, 'nfc', s, c, sc)
skc = recomp(skd); expect(lineno, 'nfkc', s, kc, skc)
cd = decomp(c); expect(lineno, 'nfd', c, d, cd)
ckd = kdecomp(c); expect(lineno, 'nfkd', c, kd, ckd)
cc = recomp(cd); expect(lineno, 'nfc', c, c, cc)
ckc = recomp(ckd); expect(lineno, 'nfkc', c, kc, ckc)
dd = decomp(d); expect(lineno, 'nfd', d, d, dd)
dkd = kdecomp(d); expect(lineno, 'nfkd', d, kd, dkd)
dc = recomp(dd); expect(lineno, 'nfc', d, c, dc)
dkc = recomp(dkd); expect(lineno, 'nfkc', d, kc, dkc)
kcd = decomp(kc); expect(lineno, 'nfd', kc, kd, kcd)
kckd = kdecomp(kc); expect(lineno, 'nfkd', kc, kd, kckd)
kcc = recomp(kcd); expect(lineno, 'nfc', kc, kc, kcc)
kckc = recomp(kckd); expect(lineno, 'nfkc', kc, kc, kckc)
kdd = decomp(kd); expect(lineno, 'nfd', kd, kd, kdd)
kdkd = kdecomp(kd); expect(lineno, 'nfkd', kd, kd, kdkd)
kdc = recomp(kdd); expect(lineno, 'nfc', kd, kc, kdc)
kdkc = recomp(kdkd); expect(lineno, 'nfkc', kd, kc, kdkc)
expectv(lineno, 'nfc_qc', s, s == c, nfc_qc(s))
expectv(lineno, 'nfc_qc', c, c == c, nfc_qc(c))
expectv(lineno, 'nfc_qc', d, d == c, nfc_qc(d))
expectv(lineno, 'nfkc_qc', s, s == kc, nfkc_qc(s))
expectv(lineno, 'nfkc_qc', c, c == kc, nfkc_qc(c))
expectv(lineno, 'nfkc_qc', d, d == kc, nfkc_qc(d))
expectv(lineno, 'nfkc_qc', kc, kc == kc, nfkc_qc(kc))
expectv(lineno, 'nfkc_qc', kd, kd == kc, nfkc_qc(kd))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment