g-andrade/unicode_ranger.py

## unicode_ranger.py
#!/usr/bin/env python3
import json
import multiprocessing
import sys
import unicodedata
from collections import namedtuple

def run():
    """Generate all string variants for the known ranges and dump them as JSON.

    Progress messages are written to stderr. The JSON document written to
    stdout maps each range name to a mapping from a "<major>, <minor>"
    category label to the list of unique strings produced for that bundle.
    """
    print('Preparing work', file=sys.stderr)
    groups = generate_groups()

    # Ranges whose name mentions "diacritic" supply the combining marks;
    # every other range supplies the base characters.
    diacritic_groups = {}
    nondiacritic_groups = {}
    for name, subgroups in groups.items():
        if 'diacritic' in name.lower():
            diacritic_groups[name] = subgroups
        else:
            nondiacritic_groups[name] = subgroups

    diacritic_ustrings = generate_diacritic_ustrings(diacritic_groups)
    # One work unit per (range, character class) bundle, in sorted order.
    work_units = [
        (range_name, char_class, codepoints, diacritic_ustrings)
        for range_name, subgroups in sorted_dict_items(nondiacritic_groups)
        for char_class, codepoints in sorted_dict_items(subgroups)
    ]

    print('Generating strings...', file=sys.stderr)
    with multiprocessing.Pool(multiprocessing.cpu_count()) as work_pool:
        raw_results = work_pool.map(handle_bundled_codepoints, work_units)
    # Workers return None for bundles they skip (surrogates).
    work_results = [result for result in raw_results if result is not None]

    json_groups = {}
    total_ustring_count = 0
    for range_name, char_class, ustrings in work_results:
        json_class_category = '%s, %s' % (char_class.major_category, char_class.minor_category)
        json_groups.setdefault(range_name, {})[json_class_category] = ustrings
        ustring_count = len(ustrings)
        total_ustring_count += ustring_count
        print('%s / %s: %d unique strings' % (range_name, json_class_category, ustring_count), file=sys.stderr)

    print('Generated a total of %d unique strings' % total_ustring_count, file=sys.stderr)
    print('Writing to stdout...', file=sys.stderr)
    json.dump(json_groups, sys.stdout, separators=(',', ':'))

def handle_bundled_codepoints(work_args):
    """Expand one bundle of code points into its generated strings.

    Args:
        work_args: a (range_name, char_class, codepoints, diacritic_ustrings)
            tuple, packed so the function can be used with Pool.map.

    Returns:
        None when the bundle's class is ('Other', 'surrogate'); otherwise a
        (range_name, char_class, strings) tuple where strings is the
        concatenation of the per-code-point results, code points ascending.
    """
    range_name, char_class, codepoints, diacritic_ustrings = work_args
    category = (char_class.major_category, char_class.minor_category)
    if category == ('Other', 'surrogate'):
        return None

    ustrings = []
    for codepoint in sorted(codepoints):
        ustrings.extend(
            handle_bundled_codepoint(range_name, char_class, codepoint, diacritic_ustrings)
        )
    return range_name, char_class, ustrings

def handle_bundled_codepoint(range_name, char_class, codepoint, diacritic_ustrings):
    """Return the sorted unique string variants derived from one code point.

    The candidates are the bare character plus the character followed by each
    diacritic; every candidate is then expanded with its normalization forms.
    range_name and char_class are accepted but not used here.
    """
    base = chr(codepoint)
    candidates = [base]
    candidates.extend(generate_diacritic_combos(base, diacritic_ustrings))
    # A set removes the duplicates that normalization commonly produces.
    unique_variants = set(generate_all_normalization_variants(candidates))
    return sorted(unique_variants)

def generate_diacritic_combos(ustring, diacritic_ustrings):
    """Return ustring followed by each diacritic, one string per diacritic, in input order."""
    # TODO more than 1 diacritic?
    combos = []
    for mark in diacritic_ustrings:
        combos.append(ustring + mark)
    return combos

def generate_all_normalization_variants(ustrings):
    """Lazily yield each input string followed by its NFC, NFKC, NFD and NFKD forms.

    Duplicates are not removed; callers that need unique values must
    deduplicate the output themselves.
    """
    forms = ('NFC', 'NFKC', 'NFD', 'NFKD')
    normalize = unicodedata.normalize
    for original in ustrings:
        yield original
        yield from (normalize(form, original) for form in forms)

###
def generate_groups():
    """Bucket every code point of VALID_CODEPOINT_RANGES by range and class.

    Returns:
        A dict mapping each range name to a dict that maps a character class
        (a value of KNOWN_UNICODE_CLASSES) to the list of code points of that
        class found in the range, in ascending order.
    """
    groups = {}
    for min_codepoint, max_codepoint, range_name in VALID_CODEPOINT_RANGES:
        # Unicode block bounds are inclusive (e.g. 0x00A0-0x00FF), so the upper
        # bound needs + 1; without it the last code point of every block was
        # silently skipped.
        for codepoint in range(min_codepoint, max_codepoint + 1):
            group_by_range(range_name, codepoint, groups)
    return groups

def group_by_range(range_name, codepoint, groups_acc):
    """File codepoint under range_name in groups_acc, creating the range's bucket on first use.

    groups_acc is mutated in place; classification inside the bucket is
    delegated to group_by_char_class.
    """
    bucket = groups_acc.get(range_name)
    if bucket is None:
        bucket = groups_acc[range_name] = {}
    group_by_char_class(codepoint, bucket)

def group_by_char_class(codepoint, subgroups_acc):
    """Append codepoint to the subgroups_acc list keyed by its Unicode class.

    The two-letter general category reported by unicodedata is translated to
    its KNOWN_UNICODE_CLASSES entry, which serves as the dictionary key.
    """
    category_code = unicodedata.category(chr(codepoint))
    dict_append(subgroups_acc, KNOWN_UNICODE_CLASSES[category_code], codepoint)

###
def generate_diacritic_ustrings(diacritic_groups):
    """Return one single-character string per 'Mark' code point in diacritic_groups.

    diacritic_groups maps range names to {char_class: [codepoints]} dicts;
    classes whose major_category is not 'Mark' are ignored. Iteration order
    of the dicts and lists is preserved.
    """
    return [
        chr(codepoint)
        for subgroups in diacritic_groups.values()
        for char_class, codepoints in subgroups.items()
        if char_class.major_category == 'Mark'
        for codepoint in codepoints
    ]


###
def sorted_dict_items(dic):
    """Return dic's (key, value) pairs as a list ordered by key only."""
    def key_of(pair):
        return pair[0]

    pairs = list(dic.items())
    pairs.sort(key=key_of)
    return pairs

def dict_append(dic, key, value):
    """Append value to the list stored at dic[key], starting a new list if there is none."""
    existing = dic.get(key)
    if existing is not None:
        existing.append(value)
        return
    dic[key] = [value]

# from: http://jrgraphix.net/research/unicode.php
# Each entry is (first code point, last code point, block name).
# The upper bounds of the supplementary-plane blocks (U+10000 and above) had
# lost their leading hex digit (e.g. 0x007F instead of 0x1007F), leaving them
# smaller than the lower bounds; they are restored here.
VALID_CODEPOINT_RANGES = [
    (0x0020, 0x007F, "Basic Latin"),
    (0x2580, 0x259F, "Block Elements"),
    (0x00A0, 0x00FF, "Latin-1 Supplement"),
    (0x25A0, 0x25FF, "Geometric Shapes"),
    (0x0100, 0x017F, "Latin Extended-A"),
    (0x2600, 0x26FF, "Miscellaneous Symbols"),
    (0x0180, 0x024F, "Latin Extended-B"),
    (0x2700, 0x27BF, "Dingbats"),
    (0x0250, 0x02AF, "IPA Extensions"),
    (0x27C0, 0x27EF, "Miscellaneous Mathematical Symbols-A"),
    (0x02B0, 0x02FF, "Spacing Modifier Letters"),
    (0x27F0, 0x27FF, "Supplemental Arrows-A"),
    (0x0300, 0x036F, "Combining Diacritical Marks"),
    (0x2800, 0x28FF, "Braille Patterns"),
    (0x0370, 0x03FF, "Greek and Coptic"),
    (0x2900, 0x297F, "Supplemental Arrows-B"),
    (0x0400, 0x04FF, "Cyrillic"),
    (0x2980, 0x29FF, "Miscellaneous Mathematical Symbols-B"),
    (0x0500, 0x052F, "Cyrillic Supplementary"),
    (0x2A00, 0x2AFF, "Supplemental Mathematical Operators"),
    (0x0530, 0x058F, "Armenian"),
    (0x2B00, 0x2BFF, "Miscellaneous Symbols and Arrows"),
    (0x0590, 0x05FF, "Hebrew"),
    (0x2E80, 0x2EFF, "CJK Radicals Supplement"),
    (0x0600, 0x06FF, "Arabic"),
    (0x2F00, 0x2FDF, "Kangxi Radicals"),
    (0x0700, 0x074F, "Syriac"),
    (0x2FF0, 0x2FFF, "Ideographic Description Characters"),
    (0x0780, 0x07BF, "Thaana"),
    (0x3000, 0x303F, "CJK Symbols and Punctuation"),
    (0x0900, 0x097F, "Devanagari"),
    (0x3040, 0x309F, "Hiragana"),
    (0x0980, 0x09FF, "Bengali"),
    (0x30A0, 0x30FF, "Katakana"),
    (0x0A00, 0x0A7F, "Gurmukhi"),
    (0x3100, 0x312F, "Bopomofo"),
    (0x0A80, 0x0AFF, "Gujarati"),
    (0x3130, 0x318F, "Hangul Compatibility Jamo"),
    (0x0B00, 0x0B7F, "Oriya"),
    (0x3190, 0x319F, "Kanbun"),
    (0x0B80, 0x0BFF, "Tamil"),
    (0x31A0, 0x31BF, "Bopomofo Extended"),
    (0x0C00, 0x0C7F, "Telugu"),
    (0x31F0, 0x31FF, "Katakana Phonetic Extensions"),
    (0x0C80, 0x0CFF, "Kannada"),
    (0x3200, 0x32FF, "Enclosed CJK Letters and Months"),
    (0x0D00, 0x0D7F, "Malayalam"),
    (0x3300, 0x33FF, "CJK Compatibility"),
    (0x0D80, 0x0DFF, "Sinhala"),
    (0x3400, 0x4DBF, "CJK Unified Ideographs Extension A"),
    (0x0E00, 0x0E7F, "Thai"),
    (0x4DC0, 0x4DFF, "Yijing Hexagram Symbols"),
    (0x0E80, 0x0EFF, "Lao"),
    (0x4E00, 0x9FFF, "CJK Unified Ideographs"),
    (0x0F00, 0x0FFF, "Tibetan"),
    (0xA000, 0xA48F, "Yi Syllables"),
    (0x1000, 0x109F, "Myanmar"),
    (0xA490, 0xA4CF, "Yi Radicals"),
    (0x10A0, 0x10FF, "Georgian"),
    (0xAC00, 0xD7AF, "Hangul Syllables"),
    (0x1100, 0x11FF, "Hangul Jamo"),
    (0xD800, 0xDB7F, "High Surrogates"),
    (0x1200, 0x137F, "Ethiopic"),
    (0xDB80, 0xDBFF, "High Private Use Surrogates"),
    (0x13A0, 0x13FF, "Cherokee"),
    (0xDC00, 0xDFFF, "Low Surrogates"),
    (0x1400, 0x167F, "Unified Canadian Aboriginal Syllabics"),
    (0xE000, 0xF8FF, "Private Use Area"),
    (0x1680, 0x169F, "Ogham"),
    (0xF900, 0xFAFF, "CJK Compatibility Ideographs"),
    (0x16A0, 0x16FF, "Runic"),
    (0xFB00, 0xFB4F, "Alphabetic Presentation Forms"),
    (0x1700, 0x171F, "Tagalog"),
    (0xFB50, 0xFDFF, "Arabic Presentation Forms-A"),
    (0x1720, 0x173F, "Hanunoo"),
    (0xFE00, 0xFE0F, "Variation Selectors"),
    (0x1740, 0x175F, "Buhid"),
    (0xFE20, 0xFE2F, "Combining Half Marks"),
    (0x1760, 0x177F, "Tagbanwa"),
    (0xFE30, 0xFE4F, "CJK Compatibility Forms"),
    (0x1780, 0x17FF, "Khmer"),
    (0xFE50, 0xFE6F, "Small Form Variants"),
    (0x1800, 0x18AF, "Mongolian"),
    (0xFE70, 0xFEFF, "Arabic Presentation Forms-B"),
    (0x1900, 0x194F, "Limbu"),
    (0xFF00, 0xFFEF, "Halfwidth and Fullwidth Forms"),
    (0x1950, 0x197F, "Tai Le"),
    (0xFFF0, 0xFFFF, "Specials"),
    (0x19E0, 0x19FF, "Khmer Symbols"),
    (0x10000, 0x1007F, "Linear B Syllabary"),
    (0x1D00, 0x1D7F, "Phonetic Extensions"),
    (0x10080, 0x100FF, "Linear B Ideograms"),
    (0x1E00, 0x1EFF, "Latin Extended Additional"),
    (0x10100, 0x1013F, "Aegean Numbers"),
    (0x1F00, 0x1FFF, "Greek Extended"),
    (0x10300, 0x1032F, "Old Italic"),
    (0x2000, 0x206F, "General Punctuation"),
    (0x10330, 0x1034F, "Gothic"),
    (0x2070, 0x209F, "Superscripts and Subscripts"),
    (0x10380, 0x1039F, "Ugaritic"),
    (0x20A0, 0x20CF, "Currency Symbols"),
    (0x10400, 0x1044F, "Deseret"),
    (0x20D0, 0x20FF, "Combining Diacritical Marks for Symbols"),
    (0x10450, 0x1047F, "Shavian"),
    (0x2100, 0x214F, "Letterlike Symbols"),
    (0x10480, 0x104AF, "Osmanya"),
    (0x2150, 0x218F, "Number Forms"),
    (0x10800, 0x1083F, "Cypriot Syllabary"),
    (0x2190, 0x21FF, "Arrows"),
    (0x1D000, 0x1D0FF, "Byzantine Musical Symbols"),
    (0x2200, 0x22FF, "Mathematical Operators"),
    (0x1D100, 0x1D1FF, "Musical Symbols"),
    (0x2300, 0x23FF, "Miscellaneous Technical"),
    (0x1D300, 0x1D35F, "Tai Xuan Jing Symbols"),
    (0x2400, 0x243F, "Control Pictures"),
    (0x1D400, 0x1D7FF, "Mathematical Alphanumeric Symbols"),
    (0x2440, 0x245F, "Optical Character Recognition"),
    (0x20000, 0x2A6DF, "CJK Unified Ideographs Extension B"),
    (0x2460, 0x24FF, "Enclosed Alphanumerics"),
    (0x2F800, 0x2FA1F, "CJK Compatibility Ideographs Supplement"),
    (0x2500, 0x257F, "Box Drawing"),
    (0xE0000, 0xE007F, "Tags"),
]

# Descriptive record for one Unicode general category. Instances are hashable
# and are used as dictionary keys when grouping code points.
UnicodeClass = namedtuple('UnicodeClass', ['major_category', 'minor_category', 'basic_type', 'character_assigned', 'fixed', 'remarks'])

# from: https://en.wikipedia.org/wiki/Template:General_Category_(Unicode)
# Keyed by the two-letter general category code returned by
# unicodedata.category(). Remarks that contain double quotes are written as
# single-quoted literals: the CSV-style "" escaping used previously is
# adjacent-literal concatenation in Python and silently dropped the quotes.
KNOWN_UNICODE_CLASSES = {
    # Letter
    "Lu": UnicodeClass("Letter", "uppercase", "Graphic", "Character", '', ''),
    "Ll": UnicodeClass("Letter", "lowercase", "Graphic", "Character", '', ''),
    "Lt": UnicodeClass("Letter", "titlecase", "Graphic", "Character", '', ''),
    "Lm": UnicodeClass("Letter", "modifier", "Graphic", "Character", '', ''),
    "Lo": UnicodeClass("Letter", "other", "Graphic", "Character", '', ''),
    # Mark
    "Mn": UnicodeClass("Mark", "nonspacing", "Graphic", "Character", '', ''),
    "Mc": UnicodeClass("Mark", "spacing combining", "Graphic", "Character", '', ''),
    "Me": UnicodeClass("Mark", "enclosing", "Graphic", "Character", '', ''),
    # Number
    "Nd": UnicodeClass("Number", "decimal digit", "Graphic", "Character", '', "All these, and only these, have Numeric Type = De"),
    "Nl": UnicodeClass("Number", "letter", "Graphic", "Character", '', "Numerals composed of letters or letterlike symbols (e.g., Roman numerals)"),
    "No": UnicodeClass("Number", "other", "Graphic", "Character", '', "E.g., vulgar fractions, superscript and subscript digits"),
    # Punctuation
    "Pc": UnicodeClass("Punctuation", "connector", "Graphic", "Character", '', 'Includes "_" underscore'),
    "Pd": UnicodeClass("Punctuation", "dash", "Graphic", "Character", '', "Includes several hyphen characters"),
    "Ps": UnicodeClass("Punctuation", "open", "Graphic", "Character", '', "Opening bracket characters"),
    "Pe": UnicodeClass("Punctuation", "close", "Graphic", "Character", '', "Closing bracket characters"),
    "Pi": UnicodeClass("Punctuation", "initial quote", "Graphic", "Character", '', 'Opening quotation mark. Does not include the ASCII "neutral" quotation mark. May behave like Ps or Pe depending on usage'),
    "Pf": UnicodeClass("Punctuation", "final quote", "Graphic", "Character", '', "Closing quotation mark. May behave like Ps or Pe depending on usage"),
    "Po": UnicodeClass("Punctuation", "other", "Graphic", "Character", '', ''),
    # Symbol
    "Sm": UnicodeClass("Symbol", "math", "Graphic", "Character", '', 'Mathematical symbols (e.g., +, =, ×, ÷, √, ∊). Does not include parentheses and brackets, which are in categories Ps and Pe. Also does not include !, *, -, or /, which despite frequent use as mathematical operators, are primarily considered to be "punctuation".'),
    "Sc": UnicodeClass("Symbol", "currency", "Graphic", "Character", '', "Currency symbols"),
    "Sk": UnicodeClass("Symbol", "modifier", "Graphic", "Character", '', ''),
    "So": UnicodeClass("Symbol", "other", "Graphic", "Character", '', ''),
    # Separator
    "Zs": UnicodeClass("Separator", "space", "Graphic", "Character", '', "Includes the space, but not TAB, CR, or LF, which are Cc"),
    "Zl": UnicodeClass("Separator", "line", "Format", "Character", '', "Only U+2028 LINE SEPARATOR (LSEP)"),
    "Zp": UnicodeClass("Separator", "paragraph", "Format", "Character", '', "Only U+2029 PARAGRAPH SEPARATOR (PSEP)"),
    # Other
    "Cc": UnicodeClass("Other", "control", "Control", "Character", "Fixed 65", "No name, <control>"),
    "Cf": UnicodeClass("Other", "format", "Format", "Character", '', "Includes the soft hyphen, control characters to support bi-directional text, and language tag characters"),
    "Cs": UnicodeClass("Other", "surrogate", "Surrogate", "Not (but abstract)", "Fixed 2,048", "No name, <surrogate>"),
    "Co": UnicodeClass("Other", "private use", "Private-use", "Not (but abstract)", "Fixed 137,468 total: 6,400 in BMP, 131,068 in Planes 15-16", "No name, <private-use>"),
    "Cn": UnicodeClass("Other", "not assigned", "Noncharacter", "Not", "Fixed 66", "No name, <noncharacter>"),
}


# Script entry point. A flattened, tab-indented duplicate of this whole module
# used to follow the call below; it could not be parsed (IndentationError) and
# only repeated definitions that already exist above, so it was removed.
if __name__ == '__main__':
    run()