#!/usr/bin/env python
# encoding: utf-8
"""Generate ``normalization_tables.js`` from ``UnicodeData.txt``.

Reads ``UnicodeData.txt`` from the current working directory and writes two
JavaScript object literals to ``normalization_tables.js``:

* ``Normalization.COMBINING_CHARS`` maps every character whose combining
  class (field 3 of a record) is non-zero to that class.
* ``Normalization.DECOMPOSABLE_CHARS`` maps every character that has a
  canonical decomposition (field 5 not starting with a ``<tag>``) to the
  decomposed string.

A character with a non-zero combining class is recorded only in the first
table, even when its record also carries a decomposition.

The script runs on import as well as when executed directly.
"""


def _js_escape(codepoint):
    """Return the JavaScript unicode escape(s) spelling *codepoint*.

    A JavaScript u-escape holds exactly four hex digits, so a code point
    above U+FFFF is written as a UTF-16 surrogate pair (two escapes).
    """
    if codepoint > 0xFFFF:
        offset = codepoint - 0x10000
        high = 0xD800 + (offset >> 10)
        low = 0xDC00 + (offset & 0x3FF)
        return "\\u%04X\\u%04X" % (high, low)
    return "\\u%04X" % codepoint


def prettyprint_decomposition(list):
    """Return the code points in *list* as a quoted JavaScript string.

    The parameter keeps its historical name (which shadows the builtin)
    so that existing keyword callers continue to work.
    """
    return '"%s"' % "".join(_js_escape(item) for item in list)


def _parse_unicode_data(lines):
    """Split UnicodeData records into (combining, decomposable) dicts.

    *lines* is any iterable of semicolon-separated records.  The first
    dict maps code point -> non-zero combining class; the second maps
    code point -> list of code points of its canonical decomposition.
    """
    combining = {}
    decomposable = {}
    for line in lines:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing one) instead of crashing.
            continue
        fields = line.split(";")
        codepoint = int(fields[0], 16)
        combining_class = int(fields[3])
        if combining_class:
            combining[codepoint] = combining_class
            # Combining marks are never added to the decomposition table.
            continue
        decomposition = fields[5].split()
        # Only canonical decompositions: compatibility ones start with a
        # "<tag>" token.
        if decomposition and not decomposition[0].startswith("<"):
            decomposable[codepoint] = [int(cp, 16) for cp in decomposition]
    return combining, decomposable


def _format_table(name, entries):
    """Return the JS statement assigning *entries* to ``Normalization.<name>``.

    *entries* is an iterable of (key_escape, value_source) string pairs.
    Entries are joined with commas rather than each being followed by one,
    so the object literal has no trailing comma.
    """
    body = ",\n".join(' "%s" : %s' % (key, value) for key, value in entries)
    if body:
        body += "\n"
    return "Normalization.%s = {\n%s};\n\n" % (name, body)


with open("UnicodeData.txt") as data:
    combining_chars, decomposable_chars = _parse_unicode_data(data)

# Prettyprinting as JS data structures.  Entries are sorted by code point so
# that the generated file is identical from run to run.
combining_chars_out = [
    (_js_escape(codepoint), "%i" % combining_class)
    for codepoint, combining_class in sorted(combining_chars.items())
]
decomposable_chars_out = [
    (_js_escape(codepoint), prettyprint_decomposition(decomposition))
    for codepoint, decomposition in sorted(decomposable_chars.items())
]

with open("normalization_tables.js", "w") as out:
    out.write("var Normalization = Normalization || {};\n\n")
    out.write(_format_table("COMBINING_CHARS", combining_chars_out))
    out.write(_format_table("DECOMPOSABLE_CHARS", decomposable_chars_out))