Skip to content

Instantly share code, notes, and snippets.

@g-andrade
Created March 2, 2018 17:00
Show Gist options
  • Save g-andrade/6d331d4abfe13db7646946b096de302b to your computer and use it in GitHub Desktop.
Save g-andrade/6d331d4abfe13db7646946b096de302b to your computer and use it in GitHub Desktop.
Generator of single-glyph UTF-8 grapheme clusters
#!/usr/bin/env python3
import json
import multiprocessing
import sys
import unicodedata
from collections import namedtuple
def run():
    """Generate every candidate string and dump the result to stdout as JSON.

    Progress messages are written to stderr so that stdout carries only the
    JSON document, which maps range name -> "<major>, <minor>" category ->
    list of strings.
    """
    print('Preparing work', file=sys.stderr)
    groups = generate_groups()

    # Ranges whose name mentions "diacritic" supply the combining marks;
    # every other range supplies the base characters.
    diacritic_groups = {}
    nondiacritic_groups = {}
    for name, subgroups in groups.items():
        if 'diacritic' in name.lower():
            diacritic_groups[name] = subgroups
        else:
            nondiacritic_groups[name] = subgroups
    diacritic_ustrings = generate_diacritic_ustrings(diacritic_groups)

    # One work unit per (range, character class) pair, in sorted order.
    work_units = [
        (range_name, char_class, codepoints, diacritic_ustrings)
        for range_name, subgroups in sorted_dict_items(nondiacritic_groups)
        for char_class, codepoints in sorted_dict_items(subgroups)
    ]

    print('Generating strings...', file=sys.stderr)
    with multiprocessing.Pool(multiprocessing.cpu_count()) as work_pool:
        raw_results = work_pool.map(handle_bundled_codepoints, work_units)

    json_groups = {}
    total_ustring_count = 0
    for work_result in raw_results:
        if work_result is None:
            # The worker returns None for the classes it skips.
            continue
        range_name, char_class, ustrings = work_result
        json_class_category = '%s, %s' % (char_class.major_category, char_class.minor_category)
        json_groups.setdefault(range_name, {})[json_class_category] = ustrings
        ustring_count = len(ustrings)
        total_ustring_count += ustring_count
        print('%s / %s: %d unique strings' % (range_name, json_class_category, ustring_count), file=sys.stderr)

    print('Generated a total of %d unique strings' % total_ustring_count, file=sys.stderr)
    print('Writing to stdout...', file=sys.stderr)
    json.dump(json_groups, sys.stdout, separators=(',', ':'))
def handle_bundled_codepoints(work_args):
    """Expand one work unit into the strings generated for its codepoints.

    work_args is a single (range_name, char_class, codepoints,
    diacritic_ustrings) tuple, which is how the pool's map() hands it over.

    Returns None for the surrogate class; otherwise returns
    (range_name, char_class, strings), where strings concatenates the
    per-codepoint results in ascending codepoint order.
    """
    range_name, char_class, codepoints, diacritic_ustrings = work_args
    is_surrogate = (
        char_class.major_category == 'Other'
        and char_class.minor_category == 'surrogate'
    )
    if is_surrogate:
        return None
    ustrings = []
    for codepoint in sorted(codepoints):
        ustrings.extend(
            handle_bundled_codepoint(range_name, char_class, codepoint, diacritic_ustrings)
        )
    return range_name, char_class, ustrings
def handle_bundled_codepoint(range_name, char_class, codepoint, diacritic_ustrings):
    """Return the sorted, de-duplicated strings derived from one codepoint.

    The candidates are the bare character plus the character followed by each
    diacritic; every candidate is then expanded into its normalization
    variants. range_name and char_class are accepted but not used here.
    """
    base = chr(codepoint)
    candidates = [base]
    candidates.extend(generate_diacritic_combos(base, diacritic_ustrings))
    return sorted(set(generate_all_normalization_variants(candidates)))
def generate_diacritic_combos(ustring, diacritic_ustrings):
    """Return ustring followed by each diacritic, one string per diacritic.

    The result keeps the order of diacritic_ustrings.
    """
    # TODO more than 1 diacritic?
    return [
        ustring + mark
        for mark in diacritic_ustrings
    ]
def generate_all_normalization_variants(ustrings):
    """Lazily yield each string followed by its four normalized forms.

    For every input string the generator produces, in order: the string
    itself, then its NFC, NFKC, NFD and NFKD normalizations. Duplicates are
    not removed here.
    """
    forms = ('NFC', 'NFKC', 'NFD', 'NFKD')
    for original in ustrings:
        yield original
        yield from (unicodedata.normalize(form, original) for form in forms)
###
def generate_groups():
    """Bucket every codepoint of VALID_CODEPOINT_RANGES by range and class.

    Returns a dict mapping range name -> {character class -> [codepoints]}.
    """
    groups = {}
    for min_codepoint, max_codepoint, range_name in VALID_CODEPOINT_RANGES:
        # The table's upper bounds are inclusive (Latin Extended-A ends at
        # 0x017F and Latin Extended-B starts at 0x0180), so iterate one past
        # max_codepoint; otherwise the last codepoint of every block is lost.
        for codepoint in range(min_codepoint, max_codepoint + 1):
            group_by_range(range_name, codepoint, groups)
    return groups
def group_by_range(range_name, codepoint, groups_acc):
    """File codepoint under range_name in groups_acc.

    Creates the per-range dict on first use, then delegates the
    per-character-class bucketing to group_by_char_class. Mutates groups_acc
    in place and returns nothing.
    """
    if groups_acc.get(range_name) is None:
        groups_acc[range_name] = {}
    group_by_char_class(codepoint, groups_acc[range_name])
def group_by_char_class(codepoint, subgroups_acc):
    """Append codepoint to the list kept for its Unicode general category.

    The two-letter category reported by unicodedata is translated to its
    KNOWN_UNICODE_CLASSES entry, which serves as the key in subgroups_acc.
    Raises KeyError if the category has no entry in that table.
    """
    category_id = unicodedata.category(chr(codepoint))
    dict_append(subgroups_acc, KNOWN_UNICODE_CLASSES[category_id], codepoint)
###
def generate_diacritic_ustrings(diacritic_groups):
    """Collect, as one-character strings, every 'Mark' codepoint in the groups.

    diacritic_groups maps range name -> {character class -> [codepoints]}.
    Only classes whose major_category is 'Mark' contribute; the result follows
    the iteration order of the dicts and of each codepoint list.
    """
    return [
        chr(codepoint)
        for subgroups in diacritic_groups.values()
        for char_class, codepoints in subgroups.items()
        if char_class.major_category == 'Mark'
        for codepoint in codepoints
    ]
###
def sorted_dict_items(dic):
    """Return dic's (key, value) pairs as a list ordered by key.

    Only the keys are compared, so the values need not be orderable.
    """
    return [(key, dic[key]) for key in sorted(dic)]
def dict_append(dic, key, value):
    """Append value to the list stored at dic[key], creating it if needed.

    A missing key, or a key whose stored value is None, gets a fresh
    one-element list. Mutates dic in place and returns nothing.
    """
    existing = dic.get(key)
    if existing is not None:
        existing.append(value)
        return
    dic[key] = [value]
# from: http://jrgraphix.net/research/unicode.php
# Each entry is (first codepoint, last codepoint, block name), with the bounds
# as the source table lists them. The supplementary-plane blocks (U+10000 and
# above) carry their full five-digit upper bound; a truncated bound would be
# smaller than the lower bound and make the block silently empty.
VALID_CODEPOINT_RANGES = [
    (0x0020, 0x007F, "Basic Latin"),
    (0x2580, 0x259F, "Block Elements"),
    (0x00A0, 0x00FF, "Latin-1 Supplement"),
    (0x25A0, 0x25FF, "Geometric Shapes"),
    (0x0100, 0x017F, "Latin Extended-A"),
    (0x2600, 0x26FF, "Miscellaneous Symbols"),
    (0x0180, 0x024F, "Latin Extended-B"),
    (0x2700, 0x27BF, "Dingbats"),
    (0x0250, 0x02AF, "IPA Extensions"),
    (0x27C0, 0x27EF, "Miscellaneous Mathematical Symbols-A"),
    (0x02B0, 0x02FF, "Spacing Modifier Letters"),
    (0x27F0, 0x27FF, "Supplemental Arrows-A"),
    (0x0300, 0x036F, "Combining Diacritical Marks"),
    (0x2800, 0x28FF, "Braille Patterns"),
    (0x0370, 0x03FF, "Greek and Coptic"),
    (0x2900, 0x297F, "Supplemental Arrows-B"),
    (0x0400, 0x04FF, "Cyrillic"),
    (0x2980, 0x29FF, "Miscellaneous Mathematical Symbols-B"),
    (0x0500, 0x052F, "Cyrillic Supplementary"),
    (0x2A00, 0x2AFF, "Supplemental Mathematical Operators"),
    (0x0530, 0x058F, "Armenian"),
    (0x2B00, 0x2BFF, "Miscellaneous Symbols and Arrows"),
    (0x0590, 0x05FF, "Hebrew"),
    (0x2E80, 0x2EFF, "CJK Radicals Supplement"),
    (0x0600, 0x06FF, "Arabic"),
    (0x2F00, 0x2FDF, "Kangxi Radicals"),
    (0x0700, 0x074F, "Syriac"),
    (0x2FF0, 0x2FFF, "Ideographic Description Characters"),
    (0x0780, 0x07BF, "Thaana"),
    (0x3000, 0x303F, "CJK Symbols and Punctuation"),
    (0x0900, 0x097F, "Devanagari"),
    (0x3040, 0x309F, "Hiragana"),
    (0x0980, 0x09FF, "Bengali"),
    (0x30A0, 0x30FF, "Katakana"),
    (0x0A00, 0x0A7F, "Gurmukhi"),
    (0x3100, 0x312F, "Bopomofo"),
    (0x0A80, 0x0AFF, "Gujarati"),
    (0x3130, 0x318F, "Hangul Compatibility Jamo"),
    (0x0B00, 0x0B7F, "Oriya"),
    (0x3190, 0x319F, "Kanbun"),
    (0x0B80, 0x0BFF, "Tamil"),
    (0x31A0, 0x31BF, "Bopomofo Extended"),
    (0x0C00, 0x0C7F, "Telugu"),
    (0x31F0, 0x31FF, "Katakana Phonetic Extensions"),
    (0x0C80, 0x0CFF, "Kannada"),
    (0x3200, 0x32FF, "Enclosed CJK Letters and Months"),
    (0x0D00, 0x0D7F, "Malayalam"),
    (0x3300, 0x33FF, "CJK Compatibility"),
    (0x0D80, 0x0DFF, "Sinhala"),
    (0x3400, 0x4DBF, "CJK Unified Ideographs Extension A"),
    (0x0E00, 0x0E7F, "Thai"),
    (0x4DC0, 0x4DFF, "Yijing Hexagram Symbols"),
    (0x0E80, 0x0EFF, "Lao"),
    (0x4E00, 0x9FFF, "CJK Unified Ideographs"),
    (0x0F00, 0x0FFF, "Tibetan"),
    (0xA000, 0xA48F, "Yi Syllables"),
    (0x1000, 0x109F, "Myanmar"),
    (0xA490, 0xA4CF, "Yi Radicals"),
    (0x10A0, 0x10FF, "Georgian"),
    (0xAC00, 0xD7AF, "Hangul Syllables"),
    (0x1100, 0x11FF, "Hangul Jamo"),
    (0xD800, 0xDB7F, "High Surrogates"),
    (0x1200, 0x137F, "Ethiopic"),
    (0xDB80, 0xDBFF, "High Private Use Surrogates"),
    (0x13A0, 0x13FF, "Cherokee"),
    (0xDC00, 0xDFFF, "Low Surrogates"),
    (0x1400, 0x167F, "Unified Canadian Aboriginal Syllabics"),
    (0xE000, 0xF8FF, "Private Use Area"),
    (0x1680, 0x169F, "Ogham"),
    (0xF900, 0xFAFF, "CJK Compatibility Ideographs"),
    (0x16A0, 0x16FF, "Runic"),
    (0xFB00, 0xFB4F, "Alphabetic Presentation Forms"),
    (0x1700, 0x171F, "Tagalog"),
    (0xFB50, 0xFDFF, "Arabic Presentation Forms-A"),
    (0x1720, 0x173F, "Hanunoo"),
    (0xFE00, 0xFE0F, "Variation Selectors"),
    (0x1740, 0x175F, "Buhid"),
    (0xFE20, 0xFE2F, "Combining Half Marks"),
    (0x1760, 0x177F, "Tagbanwa"),
    (0xFE30, 0xFE4F, "CJK Compatibility Forms"),
    (0x1780, 0x17FF, "Khmer"),
    (0xFE50, 0xFE6F, "Small Form Variants"),
    (0x1800, 0x18AF, "Mongolian"),
    (0xFE70, 0xFEFF, "Arabic Presentation Forms-B"),
    (0x1900, 0x194F, "Limbu"),
    (0xFF00, 0xFFEF, "Halfwidth and Fullwidth Forms"),
    (0x1950, 0x197F, "Tai Le"),
    (0xFFF0, 0xFFFF, "Specials"),
    (0x19E0, 0x19FF, "Khmer Symbols"),
    (0x10000, 0x1007F, "Linear B Syllabary"),
    (0x1D00, 0x1D7F, "Phonetic Extensions"),
    (0x10080, 0x100FF, "Linear B Ideograms"),
    (0x1E00, 0x1EFF, "Latin Extended Additional"),
    (0x10100, 0x1013F, "Aegean Numbers"),
    (0x1F00, 0x1FFF, "Greek Extended"),
    (0x10300, 0x1032F, "Old Italic"),
    (0x2000, 0x206F, "General Punctuation"),
    (0x10330, 0x1034F, "Gothic"),
    (0x2070, 0x209F, "Superscripts and Subscripts"),
    (0x10380, 0x1039F, "Ugaritic"),
    (0x20A0, 0x20CF, "Currency Symbols"),
    (0x10400, 0x1044F, "Deseret"),
    (0x20D0, 0x20FF, "Combining Diacritical Marks for Symbols"),
    (0x10450, 0x1047F, "Shavian"),
    (0x2100, 0x214F, "Letterlike Symbols"),
    (0x10480, 0x104AF, "Osmanya"),
    (0x2150, 0x218F, "Number Forms"),
    (0x10800, 0x1083F, "Cypriot Syllabary"),
    (0x2190, 0x21FF, "Arrows"),
    (0x1D000, 0x1D0FF, "Byzantine Musical Symbols"),
    (0x2200, 0x22FF, "Mathematical Operators"),
    (0x1D100, 0x1D1FF, "Musical Symbols"),
    (0x2300, 0x23FF, "Miscellaneous Technical"),
    (0x1D300, 0x1D35F, "Tai Xuan Jing Symbols"),
    (0x2400, 0x243F, "Control Pictures"),
    (0x1D400, 0x1D7FF, "Mathematical Alphanumeric Symbols"),
    (0x2440, 0x245F, "Optical Character Recognition"),
    (0x20000, 0x2A6DF, "CJK Unified Ideographs Extension B"),
    (0x2460, 0x24FF, "Enclosed Alphanumerics"),
    (0x2F800, 0x2FA1F, "CJK Compatibility Ideographs Supplement"),
    (0x2500, 0x257F, "Box Drawing"),
    (0xE0000, 0xE007F, "Tags"),
]
# Description of one Unicode general category. Only major_category and
# minor_category are read elsewhere in this file; the remaining fields are
# descriptive text carried over from the source table.
UnicodeClass = namedtuple('UnicodeClass', ['major_category', 'minor_category', 'basic_type', 'character_assigned', 'fixed', 'remarks'])
# from: https://en.wikipedia.org/wiki/Template:General_Category_(Unicode)
# Keyed by the two-letter category code returned by unicodedata.category().
# Remarks containing double quotes are written as single-quoted literals:
# a CSV-style doubled quote ("") inside a double-quoted Python literal is
# implicit string concatenation and silently drops the quote characters.
KNOWN_UNICODE_CLASSES = {
    # Letter
    "Lu": UnicodeClass("Letter", "uppercase", "Graphic", "Character", '', ''),
    "Ll": UnicodeClass("Letter", "lowercase", "Graphic", "Character", '', ''),
    "Lt": UnicodeClass("Letter", "titlecase", "Graphic", "Character", '', ''),
    "Lm": UnicodeClass("Letter", "modifier", "Graphic", "Character", '', ''),
    "Lo": UnicodeClass("Letter", "other", "Graphic", "Character", '', ''),
    # Mark
    "Mn": UnicodeClass("Mark", "nonspacing", "Graphic", "Character", '', ''),
    "Mc": UnicodeClass("Mark", "spacing combining", "Graphic", "Character", '', ''),
    "Me": UnicodeClass("Mark", "enclosing", "Graphic", "Character", '', ''),
    # Number
    "Nd": UnicodeClass("Number", "decimal digit", "Graphic", "Character", '', "All these, and only these, have Numeric Type = De"),
    "Nl": UnicodeClass("Number", "letter", "Graphic", "Character", '', "Numerals composed of letters or letterlike symbols (e.g., Roman numerals)"),
    "No": UnicodeClass("Number", "other", "Graphic", "Character", '', "E.g., vulgar fractions, superscript and subscript digits"),
    # Punctuation
    "Pc": UnicodeClass("Punctuation", "connector", "Graphic", "Character", '', 'Includes "_" underscore'),
    "Pd": UnicodeClass("Punctuation", "dash", "Graphic", "Character", '', "Includes several hyphen characters"),
    "Ps": UnicodeClass("Punctuation", "open", "Graphic", "Character", '', "Opening bracket characters"),
    "Pe": UnicodeClass("Punctuation", "close", "Graphic", "Character", '', "Closing bracket characters"),
    "Pi": UnicodeClass("Punctuation", "initial quote", "Graphic", "Character", '', 'Opening quotation mark. Does not include the ASCII "neutral" quotation mark. May behave like Ps or Pe depending on usage'),
    "Pf": UnicodeClass("Punctuation", "final quote", "Graphic", "Character", '', "Closing quotation mark. May behave like Ps or Pe depending on usage"),
    "Po": UnicodeClass("Punctuation", "other", "Graphic", "Character", '', ''),
    # Symbol
    "Sm": UnicodeClass("Symbol", "math", "Graphic", "Character", '', 'Mathematical symbols (e.g., +, =, ×, ÷, √, ∊). Does not include parentheses and brackets, which are in categories Ps and Pe. Also does not include !, *, -, or /, which despite frequent use as mathematical operators, are primarily considered to be "punctuation."'),
    "Sc": UnicodeClass("Symbol", "currency", "Graphic", "Character", '', "Currency symbols"),
    "Sk": UnicodeClass("Symbol", "modifier", "Graphic", "Character", '', ''),
    "So": UnicodeClass("Symbol", "other", "Graphic", "Character", '', ''),
    # Separator
    "Zs": UnicodeClass("Separator", "space", "Graphic", "Character", '', "Includes the space, but not TAB, CR, or LF, which are Cc"),
    "Zl": UnicodeClass("Separator", "line", "Format", "Character", '', "Only U+2028 LINE SEPARATOR (LSEP)"),
    "Zp": UnicodeClass("Separator", "paragraph", "Format", "Character", '', "Only U+2029 PARAGRAPH SEPARATOR (PSEP)"),
    # Other
    "Cc": UnicodeClass("Other", "control", "Control", "Character", "Fixed 65", "No name, <control>"),
    "Cf": UnicodeClass("Other", "format", "Format", "Character", '', "Includes the soft hyphen, control characters to support bi-directional text, and language tag characters"),
    "Cs": UnicodeClass("Other", "surrogate", "Surrogate", "Not (but abstract)", "Fixed 2,048", "No name, <surrogate>"),
    "Co": UnicodeClass("Other", "private use", "Private-use", "Not (but abstract)", "Fixed 137,468 total: 6,400 in BMP, 131,068 in Planes 15-16", "No name, <private-use>"),
    "Cn": UnicodeClass("Other", "not assigned", "Noncharacter", "Not", "Fixed 66", "No name, <noncharacter>"),
}
# Script entry point: generate the strings only when this file is executed
# directly. run() starts a multiprocessing pool, so the guard also keeps a
# plain import of this module from kicking off the whole job.
if __name__ == '__main__':
    run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment