lifthrasiir/normal.py

## normal.py
# coding=utf-8

import sys
import os.path
from collections import defaultdict

try:
    ucd = sys.argv[1]
except IndexError:
    print >>sys.stderr, 'Usage: %s <ucd path>' % sys.argv[0]
    raise SystemExit(1)

def get_ccc_and_decomp():
    ccc = defaultdict(lambda: 0)
    decomp = defaultdict(lambda: None)
    kdecomp = defaultdict(lambda: None)
    last = None
    for l in open(os.path.join(ucd, 'UnicodeData.txt')):
        l = l.strip().split(';')
        if not l: continue
        a = int(l[0], 16)
        if l[1].endswith(', First>'):
            assert last is None
            last = a
            continue
        elif l[1].endswith(', Last>'):
            assert last is not None
            b = a
            a = last
            last = None
        else:
            assert last is None
            b = a
        if l[3] != '0':
            c = int(l[3], 10)
            for i in xrange(a, b + 1):
                ccc[i] = c
        if l[5] != '':
            x, _, y = l[5].rpartition('> ')
            x = x.strip('<') or None
            y = [int(i, 16) for i in y.split()]
            for i in xrange(a, b + 1):
                kdecomp[i] = y
                if not x: decomp[i] = y
    assert last is None
    return ccc, decomp, kdecomp

def get_qc_and_compex():
    nfcqc = defaultdict(lambda: 'Y')
    nfkcqc = defaultdict(lambda: 'Y')
    compex = defaultdict(lambda: False)
    for l in open(os.path.join(ucd, 'DerivedNormalizationProps.txt')):
        l, _, _ = l.partition('#')
        l = l.strip()
        if not l: continue
        l = l.split(';')
        a, _, b = l[0].partition('..')
        b = b or a
        prop = l[1].strip()
        for i in xrange(int(a, 16), int(b, 16) + 1):
            if prop == 'NFC_QC':
                nfcqc[i] = l[2].strip()
            elif prop == 'NFKC_QC':
                nfkcqc[i] = l[2].strip()
            elif prop == 'Full_Composition_Exclusion':
                compex[i] = True
    return nfcqc, nfkcqc, compex

def decomp_base(s, uccc, udecomp):
    s = list(reversed(s))
    r = []
    rr = []
    while s:
        c = s.pop()
        if 0xac00 <= c < 0xac00 + 11172: # hangul
            cc = c - 0xac00
            d = [0x1100 + cc // 588, 0x1161 + cc // 28 % 21]
            if cc % 28 > 0: d.append(0x11a7 + cc % 28)
        else:
            d = udecomp[c]
        if d:
            s += reversed(d)
        elif uccc[c] == 0: # starter
            rr.sort(key=uccc.__getitem__)
            r += rr
            rr = [c]
        else:
            rr.append(c)
    rr.sort(key=uccc.__getitem__)
    r += rr
    return r

def calculate_primary_composites(udecomp, ucompex):
    pc = {}
    for c, d in udecomp.items():
        if len(d) == 2 and not ucompex[c]:
            pc[tuple(d)] = c
    return pc # still does not include hangul!

def recomp_base(s, uccc, upc):
    s = s[:]
    i = 0
    while i < len(s):
        a = s[i]
        if a is not None or uccc[a] == 0: # a is a starter
            j = i + 1
            pccc = None
            while j < len(s):
                b = s[j]
                if b is not None:
                    bccc = uccc[b]
                    # b and subsequent characters are blocked by previous character
                    if pccc is not None and (pccc == 0 or pccc > bccc):
                        break
                    # when pccc = bccc, b is blocked but the next character can be still checked
                    if pccc is None or pccc != bccc:
                        pccc = bccc
                        if 0xac00 <= a < 0xac00 + 11172 and (a - 0xac00) % 28 == 0 and 0x11a7 <= b < 0x11a7 + 28:
                            c = a + b - 0x11a7 # hangul lv + t
                        elif 0x1100 <= a < 0x1100 + 19 and 0x1161 <= b < 0x1161 + 21:
                            c = 0xac00 + (a - 0x1100) * 588 + (b - 0x1161) * 28 # hangul l + v
                        else:
                            c = upc.get((a, b))
                        if c is not None:
                            a = c
                            s[j] = None
                            pccc = None
                            j = i + 1 # should be reset
                            continue
                j += 1
            s[i] = a
        i += 1
    return [i for i in s if i is not None]

def qc_base(s, uccc, uqc):
    pccc = 0
    ret = 'Y'
    for c in s:
        v = uqc[c]
        if v == 'N': return 'N'
        if v == 'M': ret = 'M'
        ccc = uccc[c]
        if pccc > ccc and ccc != 0: return 'N'
        pccc = ccc
    return ret

def list_tests():
    for lineno, l in enumerate(open(os.path.join(ucd, 'NormalizationTest.txt')), start=1):
        l, _, _ = l.partition('#')
        if l.startswith('@'): continue
        l = l.strip()
        if not l: continue
        l = l.split(';')
        s, c, d, kc, kd = [[int(j, 16) for j in i.split()] for i in l][:5]
        yield lineno, s, c, d, kc, kd

def fmt(s):
    if hasattr(s, '__iter__'):
        return '[%s]' % ', '.join('%#06x' % i for i in s)
    if isinstance(s, (int, long)):
        return '%#06x' % s
    return str(s)

uccc, udecomp, ukdecomp = get_ccc_and_decomp()
unfcqc, unfkcqc, ucompex = get_qc_and_compex()
upc = calculate_primary_composites(udecomp, ucompex) # common to NFC and NFKC

def decomp(s):
    return decomp_base(s, uccc, udecomp)

def kdecomp(s):
    return decomp_base(s, uccc, ukdecomp)

def recomp(s):
    s = recomp_base(s, uccc, upc)
    return s

def nfc_qc(s):
    return qc_base(s, uccc, unfcqc)

def nfkc_qc(s):
    return qc_base(s, uccc, unfkcqc)

def expect(lineno, f, s, exp, act):
    assert exp == act, 'line %d: %s(%s) = expected %s actual %s' % (lineno, f, fmt(s), fmt(exp), fmt(act))

def expectv(lineno, f, s, exp, act):
    assert act != 'YN'[exp], 'line %d: %s(%s) = expected %r actual %r' % (lineno, f, fmt(s), 'NY'[exp], act)

for lineno, s, c, d, kc, kd in list_tests():
    sd = decomp(s); expect(lineno, 'nfd', s, d, sd)
    skd = kdecomp(s); expect(lineno, 'nfkd', s, kd, skd)
    sc = recomp(sd); expect(lineno, 'nfc', s, c, sc)
    skc = recomp(skd); expect(lineno, 'nfkc', s, kc, skc)

    cd = decomp(c); expect(lineno, 'nfd', c, d, cd)
    ckd = kdecomp(c); expect(lineno, 'nfkd', c, kd, ckd)
    cc = recomp(cd); expect(lineno, 'nfc', c, c, cc)
    ckc = recomp(ckd); expect(lineno, 'nfkc', c, kc, ckc)

    dd = decomp(d); expect(lineno, 'nfd', d, d, dd)
    dkd = kdecomp(d); expect(lineno, 'nfkd', d, kd, dkd)
    dc = recomp(dd); expect(lineno, 'nfc', d, c, dc)
    dkc = recomp(dkd); expect(lineno, 'nfkc', d, kc, dkc)

    kcd = decomp(kc); expect(lineno, 'nfd', kc, kd, kcd)
    kckd = kdecomp(kc); expect(lineno, 'nfkd', kc, kd, kckd)
    kcc = recomp(kcd); expect(lineno, 'nfc', kc, kc, kcc)
    kckc = recomp(kckd); expect(lineno, 'nfkc', kc, kc, kckc)

    kdd = decomp(kd); expect(lineno, 'nfd', kd, kd, kdd)
    kdkd = kdecomp(kd); expect(lineno, 'nfkd', kd, kd, kdkd)
    kdc = recomp(kdd); expect(lineno, 'nfc', kd, kc, kdc)
    kdkc = recomp(kdkd); expect(lineno, 'nfkc', kd, kc, kdkc)

    expectv(lineno, 'nfc_qc', s, s == c, nfc_qc(s))
    expectv(lineno, 'nfc_qc', c, c == c, nfc_qc(c))
    expectv(lineno, 'nfc_qc', d, d == c, nfc_qc(d))

    expectv(lineno, 'nfkc_qc', s, s == kc, nfkc_qc(s))
    expectv(lineno, 'nfkc_qc', c, c == kc, nfkc_qc(c))
    expectv(lineno, 'nfkc_qc', d, d == kc, nfkc_qc(d))
    expectv(lineno, 'nfkc_qc', kc, kc == kc, nfkc_qc(kc))
    expectv(lineno, 'nfkc_qc', kd, kd == kc, nfkc_qc(kd))
	# coding=utf-8

	import sys
	import os.path
	from collections import defaultdict

	try:
	ucd = sys.argv[1]
	except IndexError:
	print >>sys.stderr, 'Usage: %s <ucd path>' % sys.argv[0]
	raise SystemExit(1)

	def get_ccc_and_decomp():
	ccc = defaultdict(lambda: 0)
	decomp = defaultdict(lambda: None)
	kdecomp = defaultdict(lambda: None)
	last = None
	for l in open(os.path.join(ucd, 'UnicodeData.txt')):
	l = l.strip().split(';')
	if not l: continue
	a = int(l[0], 16)
	if l[1].endswith(', First>'):
	assert last is None
	last = a
	continue
	elif l[1].endswith(', Last>'):
	assert last is not None
	b = a
	a = last
	last = None
	else:
	assert last is None
	b = a
	if l[3] != '0':
	c = int(l[3], 10)
	for i in xrange(a, b + 1):
	ccc[i] = c
	if l[5] != '':
	x, _, y = l[5].rpartition('> ')
	x = x.strip('<') or None
	y = [int(i, 16) for i in y.split()]
	for i in xrange(a, b + 1):
	kdecomp[i] = y
	if not x: decomp[i] = y
	assert last is None
	return ccc, decomp, kdecomp

	def get_qc_and_compex():
	nfcqc = defaultdict(lambda: 'Y')
	nfkcqc = defaultdict(lambda: 'Y')
	compex = defaultdict(lambda: False)
	for l in open(os.path.join(ucd, 'DerivedNormalizationProps.txt')):
	l, _, _ = l.partition('#')
	l = l.strip()
	if not l: continue
	l = l.split(';')
	a, _, b = l[0].partition('..')
	b = b or a
	prop = l[1].strip()
	for i in xrange(int(a, 16), int(b, 16) + 1):
	if prop == 'NFC_QC':
	nfcqc[i] = l[2].strip()
	elif prop == 'NFKC_QC':
	nfkcqc[i] = l[2].strip()
	elif prop == 'Full_Composition_Exclusion':
	compex[i] = True
	return nfcqc, nfkcqc, compex

	def decomp_base(s, uccc, udecomp):
	s = list(reversed(s))
	r = []
	rr = []
	while s:
	c = s.pop()
	if 0xac00 <= c < 0xac00 + 11172: # hangul
	cc = c - 0xac00
	d = [0x1100 + cc // 588, 0x1161 + cc // 28 % 21]
	if cc % 28 > 0: d.append(0x11a7 + cc % 28)
	else:
	d = udecomp[c]
	if d:
	s += reversed(d)
	elif uccc[c] == 0: # starter
	rr.sort(key=uccc.__getitem__)
	r += rr
	rr = [c]
	else:
	rr.append(c)
	rr.sort(key=uccc.__getitem__)
	r += rr
	return r

	def calculate_primary_composites(udecomp, ucompex):
	pc = {}
	for c, d in udecomp.items():
	if len(d) == 2 and not ucompex[c]:
	pc[tuple(d)] = c
	return pc # still does not include hangul!

	def recomp_base(s, uccc, upc):
	s = s[:]
	i = 0
	while i < len(s):
	a = s[i]
	if a is not None or uccc[a] == 0: # a is a starter
	j = i + 1
	pccc = None
	while j < len(s):
	b = s[j]
	if b is not None:
	bccc = uccc[b]
	# b and subsequent characters are blocked by previous character
	if pccc is not None and (pccc == 0 or pccc > bccc):
	break
	# when pccc = bccc, b is blocked but the next character can be still checked
	if pccc is None or pccc != bccc:
	pccc = bccc
	if 0xac00 <= a < 0xac00 + 11172 and (a - 0xac00) % 28 == 0 and 0x11a7 <= b < 0x11a7 + 28:
	c = a + b - 0x11a7 # hangul lv + t
	elif 0x1100 <= a < 0x1100 + 19 and 0x1161 <= b < 0x1161 + 21:
	c = 0xac00 + (a - 0x1100) * 588 + (b - 0x1161) * 28 # hangul l + v
	else:
	c = upc.get((a, b))
	if c is not None:
	a = c
	s[j] = None
	pccc = None
	j = i + 1 # should be reset
	continue
	j += 1
	s[i] = a
	i += 1
	return [i for i in s if i is not None]

	def qc_base(s, uccc, uqc):
	pccc = 0
	ret = 'Y'
	for c in s:
	v = uqc[c]
	if v == 'N': return 'N'
	if v == 'M': ret = 'M'
	ccc = uccc[c]
	if pccc > ccc and ccc != 0: return 'N'
	pccc = ccc
	return ret

	def list_tests():
	for lineno, l in enumerate(open(os.path.join(ucd, 'NormalizationTest.txt')), start=1):
	l, _, _ = l.partition('#')
	if l.startswith('@'): continue
	l = l.strip()
	if not l: continue
	l = l.split(';')
	s, c, d, kc, kd = [[int(j, 16) for j in i.split()] for i in l][:5]
	yield lineno, s, c, d, kc, kd

	def fmt(s):
	if hasattr(s, '__iter__'):
	return '[%s]' % ', '.join('%#06x' % i for i in s)
	if isinstance(s, (int, long)):
	return '%#06x' % s
	return str(s)

	uccc, udecomp, ukdecomp = get_ccc_and_decomp()
	unfcqc, unfkcqc, ucompex = get_qc_and_compex()
	upc = calculate_primary_composites(udecomp, ucompex) # common to NFC and NFKC

	def decomp(s):
	return decomp_base(s, uccc, udecomp)

	def kdecomp(s):
	return decomp_base(s, uccc, ukdecomp)

	def recomp(s):
	s = recomp_base(s, uccc, upc)
	return s

	def nfc_qc(s):
	return qc_base(s, uccc, unfcqc)

	def nfkc_qc(s):
	return qc_base(s, uccc, unfkcqc)

	def expect(lineno, f, s, exp, act):
	assert exp == act, 'line %d: %s(%s) = expected %s actual %s' % (lineno, f, fmt(s), fmt(exp), fmt(act))

	def expectv(lineno, f, s, exp, act):
	assert act != 'YN'[exp], 'line %d: %s(%s) = expected %r actual %r' % (lineno, f, fmt(s), 'NY'[exp], act)

	for lineno, s, c, d, kc, kd in list_tests():
	sd = decomp(s); expect(lineno, 'nfd', s, d, sd)
	skd = kdecomp(s); expect(lineno, 'nfkd', s, kd, skd)
	sc = recomp(sd); expect(lineno, 'nfc', s, c, sc)
	skc = recomp(skd); expect(lineno, 'nfkc', s, kc, skc)

	cd = decomp(c); expect(lineno, 'nfd', c, d, cd)
	ckd = kdecomp(c); expect(lineno, 'nfkd', c, kd, ckd)
	cc = recomp(cd); expect(lineno, 'nfc', c, c, cc)
	ckc = recomp(ckd); expect(lineno, 'nfkc', c, kc, ckc)

	dd = decomp(d); expect(lineno, 'nfd', d, d, dd)
	dkd = kdecomp(d); expect(lineno, 'nfkd', d, kd, dkd)
	dc = recomp(dd); expect(lineno, 'nfc', d, c, dc)
	dkc = recomp(dkd); expect(lineno, 'nfkc', d, kc, dkc)

	kcd = decomp(kc); expect(lineno, 'nfd', kc, kd, kcd)
	kckd = kdecomp(kc); expect(lineno, 'nfkd', kc, kd, kckd)
	kcc = recomp(kcd); expect(lineno, 'nfc', kc, kc, kcc)
	kckc = recomp(kckd); expect(lineno, 'nfkc', kc, kc, kckc)

	kdd = decomp(kd); expect(lineno, 'nfd', kd, kd, kdd)
	kdkd = kdecomp(kd); expect(lineno, 'nfkd', kd, kd, kdkd)
	kdc = recomp(kdd); expect(lineno, 'nfc', kd, kc, kdc)
	kdkc = recomp(kdkd); expect(lineno, 'nfkc', kd, kc, kdkc)

	expectv(lineno, 'nfc_qc', s, s == c, nfc_qc(s))
	expectv(lineno, 'nfc_qc', c, c == c, nfc_qc(c))
	expectv(lineno, 'nfc_qc', d, d == c, nfc_qc(d))

	expectv(lineno, 'nfkc_qc', s, s == kc, nfkc_qc(s))
	expectv(lineno, 'nfkc_qc', c, c == kc, nfkc_qc(c))
	expectv(lineno, 'nfkc_qc', d, d == kc, nfkc_qc(d))
	expectv(lineno, 'nfkc_qc', kc, kc == kc, nfkc_qc(kc))
	expectv(lineno, 'nfkc_qc', kd, kd == kc, nfkc_qc(kd))