Skip to content

Instantly share code, notes, and snippets.

@MarkMendell
Last active February 17, 2024 06:19
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MarkMendell/e854207bedcf34145197cd12fd0003c3 to your computer and use it in GitHub Desktop.
Save MarkMendell/e854207bedcf34145197cd12fd0003c3 to your computer and use it in GitHub Desktop.
Code for extended grapheme cluster breaks
// Public domain
#include <stddef.h>
#include <stdint.h>
#if 0
// Example program
int
getlenextendedgrapheme(size_t nbuf, uint8_t *buf, size_t i)
{
struct cp c = utf8(nbuf, buf, i);
int ngraph = c.n;
for (struct cp a=c,b;; a=b,ngraph+=b.n) {
// Break at the start and end of text,
// unless the text is empty.
if (i+ngraph == nbuf) break;
// Break based on UAX29§3.1.1
b = utf8(nbuf, buf, i+ngraph);
if (isbreakgraph(a, b, buf, i+ngraph)) break;
}
return ngraph;
}
#include <stdio.h>
#include <string.h>
/* Example driver: print the byte length of the first extended grapheme
   cluster of a sample string. */
int
main(void)
{
    const char *text = "👍🏽abc";
    int len = getlenextendedgrapheme(strlen(text), (uint8_t*)text, 0);
    printf("%d\n", len);
    return 0;
}
#endif
// Unicode groups
const uint32_t Extended_Pictographic[] =
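// Layout: inclusive first,last code point pairs. contains() binary-searches this table, so the pairs must stay in ascending order.
// Generated: curl unicode.org/Public/emoji/12.1/emoji-data.txt | grep 'Extended_Pictographic#' | cut -d ' ' -f 1 | sed 's/[^.]\{1,\}/0x&/g' | sed 's/\.\./,/' | sed '/,/!s/.*/&,&/' | sed -n 'H;${x;s/\n//;s/\n/, /gp;}' | sed 's/.*/{&};/'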
{0x00A9,0x00A9, 0x00AE,0x00AE, 0x203C,0x203C, 0x2049,0x2049, 0x2122,0x2122, 0x2139,0x2139, 0x2194,0x2199, 0x21A9,0x21AA, 0x231A,0x231B, 0x2328,0x2328, 0x2388,0x2388, 0x23CF,0x23CF, 0x23E9,0x23F3, 0x23F8,0x23FA, 0x24C2,0x24C2, 0x25AA,0x25AB, 0x25B6,0x25B6, 0x25C0,0x25C0, 0x25FB,0x25FE, 0x2600,0x2604, 0x2605,0x2605, 0x2607,0x260D, 0x260E,0x260E, 0x260F,0x2610, 0x2611,0x2611, 0x2612,0x2612, 0x2614,0x2615, 0x2616,0x2617, 0x2618,0x2618, 0x2619,0x261C, 0x261D,0x261D, 0x261E,0x261F, 0x2620,0x2620, 0x2621,0x2621, 0x2622,0x2623, 0x2624,0x2625, 0x2626,0x2626, 0x2627,0x2629, 0x262A,0x262A, 0x262B,0x262D, 0x262E,0x262F, 0x2630,0x2637, 0x2638,0x263A, 0x263B,0x263F, 0x2640,0x2640, 0x2641,0x2641, 0x2642,0x2642, 0x2643,0x2647, 0x2648,0x2653, 0x2654,0x265E, 0x265F,0x265F, 0x2660,0x2660, 0x2661,0x2662, 0x2663,0x2663, 0x2664,0x2664, 0x2665,0x2666, 0x2667,0x2667, 0x2668,0x2668, 0x2669,0x267A, 0x267B,0x267B, 0x267C,0x267D, 0x267E,0x267E, 0x267F,0x267F, 0x2680,0x2685, 0x2690,0x2691, 0x2692,0x2694, 0x2695,0x2695, 0x2696,0x2697, 0x2698,0x2698, 0x2699,0x2699, 0x269A,0x269A, 0x269B,0x269C, 0x269D,0x269F, 0x26A0,0x26A1, 0x26A2,0x26A9, 0x26AA,0x26AB, 0x26AC,0x26AF, 0x26B0,0x26B1, 0x26B2,0x26BC, 0x26BD,0x26BE, 0x26BF,0x26C3, 0x26C4,0x26C5, 0x26C6,0x26C7, 0x26C8,0x26C8, 0x26C9,0x26CD, 0x26CE,0x26CF, 0x26D0,0x26D0, 0x26D1,0x26D1, 0x26D2,0x26D2, 0x26D3,0x26D4, 0x26D5,0x26E8, 0x26E9,0x26EA, 0x26EB,0x26EF, 0x26F0,0x26F5, 0x26F6,0x26F6, 0x26F7,0x26FA, 0x26FB,0x26FC, 0x26FD,0x26FD, 0x26FE,0x2701, 0x2702,0x2702, 0x2703,0x2704, 0x2705,0x2705, 0x2708,0x270D, 0x270E,0x270E, 0x270F,0x270F, 0x2710,0x2711, 0x2712,0x2712, 0x2714,0x2714, 0x2716,0x2716, 0x271D,0x271D, 0x2721,0x2721, 0x2728,0x2728, 0x2733,0x2734, 0x2744,0x2744, 0x2747,0x2747, 0x274C,0x274C, 0x274E,0x274E, 0x2753,0x2755, 0x2757,0x2757, 0x2763,0x2764, 0x2765,0x2767, 0x2795,0x2797, 0x27A1,0x27A1, 0x27B0,0x27B0, 0x27BF,0x27BF, 0x2934,0x2935, 0x2B05,0x2B07, 0x2B1B,0x2B1C, 0x2B50,0x2B50, 0x2B55,0x2B55, 0x3030,0x3030, 0x303D,0x303D, 0x3297,0x3297, 
0x3299,0x3299, 0x1F000,0x1F003, 0x1F004,0x1F004, 0x1F005,0x1F0CE, 0x1F0CF,0x1F0CF, 0x1F0D0,0x1F0FF, 0x1F10D,0x1F10F, 0x1F12F,0x1F12F, 0x1F16C,0x1F16F, 0x1F170,0x1F171, 0x1F17E,0x1F17F, 0x1F18E,0x1F18E, 0x1F191,0x1F19A, 0x1F1AD,0x1F1E5, 0x1F201,0x1F202, 0x1F203,0x1F20F, 0x1F21A,0x1F21A, 0x1F22F,0x1F22F, 0x1F232,0x1F23A, 0x1F23C,0x1F23F, 0x1F249,0x1F24F, 0x1F250,0x1F251, 0x1F252,0x1F2FF, 0x1F300,0x1F321, 0x1F322,0x1F323, 0x1F324,0x1F393, 0x1F394,0x1F395, 0x1F396,0x1F397, 0x1F398,0x1F398, 0x1F399,0x1F39B, 0x1F39C,0x1F39D, 0x1F39E,0x1F3F0, 0x1F3F1,0x1F3F2, 0x1F3F3,0x1F3F5, 0x1F3F6,0x1F3F6, 0x1F3F7,0x1F3FA, 0x1F400,0x1F4FD, 0x1F4FE,0x1F4FE, 0x1F4FF,0x1F53D, 0x1F546,0x1F548, 0x1F549,0x1F54E, 0x1F54F,0x1F54F, 0x1F550,0x1F567, 0x1F568,0x1F56E, 0x1F56F,0x1F570, 0x1F571,0x1F572, 0x1F573,0x1F579, 0x1F57A,0x1F57A, 0x1F57B,0x1F586, 0x1F587,0x1F587, 0x1F588,0x1F589, 0x1F58A,0x1F58D, 0x1F58E,0x1F58F, 0x1F590,0x1F590, 0x1F591,0x1F594, 0x1F595,0x1F596, 0x1F597,0x1F5A3, 0x1F5A4,0x1F5A4, 0x1F5A5,0x1F5A5, 0x1F5A6,0x1F5A7, 0x1F5A8,0x1F5A8, 0x1F5A9,0x1F5B0, 0x1F5B1,0x1F5B2, 0x1F5B3,0x1F5BB, 0x1F5BC,0x1F5BC, 0x1F5BD,0x1F5C1, 0x1F5C2,0x1F5C4, 0x1F5C5,0x1F5D0, 0x1F5D1,0x1F5D3, 0x1F5D4,0x1F5DB, 0x1F5DC,0x1F5DE, 0x1F5DF,0x1F5E0, 0x1F5E1,0x1F5E1, 0x1F5E2,0x1F5E2, 0x1F5E3,0x1F5E3, 0x1F5E4,0x1F5E7, 0x1F5E8,0x1F5E8, 0x1F5E9,0x1F5EE, 0x1F5EF,0x1F5EF, 0x1F5F0,0x1F5F2, 0x1F5F3,0x1F5F3, 0x1F5F4,0x1F5F9, 0x1F5FA,0x1F64F, 0x1F680,0x1F6C5, 0x1F6C6,0x1F6CA, 0x1F6CB,0x1F6D0, 0x1F6D1,0x1F6D2, 0x1F6D3,0x1F6D4, 0x1F6D5,0x1F6D5, 0x1F6D6,0x1F6DF, 0x1F6E0,0x1F6E5, 0x1F6E6,0x1F6E8, 0x1F6E9,0x1F6E9, 0x1F6EA,0x1F6EA, 0x1F6EB,0x1F6EC, 0x1F6ED,0x1F6EF, 0x1F6F0,0x1F6F0, 0x1F6F1,0x1F6F2, 0x1F6F3,0x1F6F3, 0x1F6F4,0x1F6F6, 0x1F6F7,0x1F6F8, 0x1F6F9,0x1F6F9, 0x1F6FA,0x1F6FA, 0x1F6FB,0x1F6FF, 0x1F774,0x1F77F, 0x1F7D5,0x1F7DF, 0x1F7E0,0x1F7EB, 0x1F7EC,0x1F7FF, 0x1F80C,0x1F80F, 0x1F848,0x1F84F, 0x1F85A,0x1F85F, 0x1F888,0x1F88F, 0x1F8AE,0x1F8FF, 0x1F90C,0x1F90C, 0x1F90D,0x1F90F, 0x1F910,0x1F918, 0x1F919,0x1F91E, 
0x1F91F,0x1F91F, 0x1F920,0x1F927, 0x1F928,0x1F92F, 0x1F930,0x1F930, 0x1F931,0x1F932, 0x1F933,0x1F93A, 0x1F93C,0x1F93E, 0x1F93F,0x1F93F, 0x1F940,0x1F945, 0x1F947,0x1F94B, 0x1F94C,0x1F94C, 0x1F94D,0x1F94F, 0x1F950,0x1F95E, 0x1F95F,0x1F96B, 0x1F96C,0x1F970, 0x1F971,0x1F971, 0x1F972,0x1F972, 0x1F973,0x1F976, 0x1F977,0x1F979, 0x1F97A,0x1F97A, 0x1F97B,0x1F97B, 0x1F97C,0x1F97F, 0x1F980,0x1F984, 0x1F985,0x1F991, 0x1F992,0x1F997, 0x1F998,0x1F9A2, 0x1F9A3,0x1F9A4, 0x1F9A5,0x1F9AA, 0x1F9AB,0x1F9AD, 0x1F9AE,0x1F9AF, 0x1F9B0,0x1F9B9, 0x1F9BA,0x1F9BF, 0x1F9C0,0x1F9C0, 0x1F9C1,0x1F9C2, 0x1F9C3,0x1F9CA, 0x1F9CB,0x1F9CC, 0x1F9CD,0x1F9CF, 0x1F9D0,0x1F9E6, 0x1F9E7,0x1F9FF, 0x1FA00,0x1FA6F, 0x1FA70,0x1FA73, 0x1FA74,0x1FA77, 0x1FA78,0x1FA7A, 0x1FA7B,0x1FA7F, 0x1FA80,0x1FA82, 0x1FA83,0x1FA8F, 0x1FA90,0x1FA95, 0x1FA96,0x1FFFD};
// Regional indicator symbols, as one inclusive first,last code point pair.
// isbreakgraph() uses it for the emoji flag sequence rule.
const uint32_t Regional_Indicator[] =
{0x1f1e6,0x1f1ff};
// Grapheme_Cluster_Break=Prepend code points, stored as inclusive
// first,last pairs. contains() binary-searches this table, so the pairs
// must stay in ascending order.
const uint32_t gbpPrepend[] =
// Generated: curl unicode.org/Public/12.1.0/ucd/auxiliary/GraphemeBreakProperty.txt | grep Prepend | cut -d ' ' -f 1 | sed 's/[^.]\{1,\}/0x&/g' | sed 's/\.\./,/' | sed '/,/!s/.*/&,&/' | sed -n 'H;${x;s/\n//;s/\n/, /gp;}' | sed 's/.*/{&};/'
{0x0600,0x0605, 0x06DD,0x06DD, 0x070F,0x070F, 0x08E2,0x08E2, 0x0D4E,0x0D4E, 0x110BD,0x110BD, 0x110CD,0x110CD, 0x111C2,0x111C3, 0x11A3A,0x11A3A, 0x11A84,0x11A89, 0x11D46,0x11D46};
// Grapheme_Cluster_Break=Control code points, stored as inclusive
// first,last pairs. contains() binary-searches this table, so the pairs
// must stay in ascending order.
// CR (0x000D) and LF (0x000A) are not in this table; isbreakgraph()
// tests for them explicitly.
const uint32_t gbpControl[] =
// Generated: curl unicode.org/Public/12.1.0/ucd/auxiliary/GraphemeBreakProperty.txt | grep Control | cut -d ' ' -f 1 | sed 's/[^.]\{1,\}/0x&/g' | sed 's/\.\./,/' | sed '/,/!s/.*/&,&/' | sed -n 'H;${x;s/\n//;s/\n/, /gp;}' | sed 's/.*/{&};/'
{0x0000,0x0009, 0x000B,0x000C, 0x000E,0x001F, 0x007F,0x009F, 0x00AD,0x00AD, 0x061C,0x061C, 0x180E,0x180E, 0x200B,0x200B, 0x200E,0x200F, 0x2028,0x2028, 0x2029,0x2029, 0x202A,0x202E, 0x2060,0x2064, 0x2065,0x2065, 0x2066,0x206F, 0xFEFF,0xFEFF, 0xFFF0,0xFFF8, 0xFFF9,0xFFFB, 0x13430,0x13438, 0x1BCA0,0x1BCA3, 0x1D173,0x1D17A, 0xE0000,0xE0000, 0xE0001,0xE0001, 0xE0002,0xE001F, 0xE0080,0xE00FF, 0xE01F0,0xE0FFF};
const uint32_t gbpExtend[] =
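// Layout: inclusive first,last code point pairs. contains() binary-searches this table, so the pairs must stay in ascending order.
// Generated: curl unicode.org/Public/12.1.0/ucd/auxiliary/GraphemeBreakProperty.txt | grep Extend | cut -d ' ' -f 1 | sed 's/[^.]\{1,\}/0x&/g' | sed 's/\.\./,/' | sed '/,/!s/.*/&,&/' | sed -n 'H;${x;s/\n//;s/\n/, /gp;}' | sed 's/.*/{&};/'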
{0x0300,0x036F, 0x0483,0x0487, 0x0488,0x0489, 0x0591,0x05BD, 0x05BF,0x05BF, 0x05C1,0x05C2, 0x05C4,0x05C5, 0x05C7,0x05C7, 0x0610,0x061A, 0x064B,0x065F, 0x0670,0x0670, 0x06D6,0x06DC, 0x06DF,0x06E4, 0x06E7,0x06E8, 0x06EA,0x06ED, 0x0711,0x0711, 0x0730,0x074A, 0x07A6,0x07B0, 0x07EB,0x07F3, 0x07FD,0x07FD, 0x0816,0x0819, 0x081B,0x0823, 0x0825,0x0827, 0x0829,0x082D, 0x0859,0x085B, 0x08D3,0x08E1, 0x08E3,0x0902, 0x093A,0x093A, 0x093C,0x093C, 0x0941,0x0948, 0x094D,0x094D, 0x0951,0x0957, 0x0962,0x0963, 0x0981,0x0981, 0x09BC,0x09BC, 0x09BE,0x09BE, 0x09C1,0x09C4, 0x09CD,0x09CD, 0x09D7,0x09D7, 0x09E2,0x09E3, 0x09FE,0x09FE, 0x0A01,0x0A02, 0x0A3C,0x0A3C, 0x0A41,0x0A42, 0x0A47,0x0A48, 0x0A4B,0x0A4D, 0x0A51,0x0A51, 0x0A70,0x0A71, 0x0A75,0x0A75, 0x0A81,0x0A82, 0x0ABC,0x0ABC, 0x0AC1,0x0AC5, 0x0AC7,0x0AC8, 0x0ACD,0x0ACD, 0x0AE2,0x0AE3, 0x0AFA,0x0AFF, 0x0B01,0x0B01, 0x0B3C,0x0B3C, 0x0B3E,0x0B3E, 0x0B3F,0x0B3F, 0x0B41,0x0B44, 0x0B4D,0x0B4D, 0x0B56,0x0B56, 0x0B57,0x0B57, 0x0B62,0x0B63, 0x0B82,0x0B82, 0x0BBE,0x0BBE, 0x0BC0,0x0BC0, 0x0BCD,0x0BCD, 0x0BD7,0x0BD7, 0x0C00,0x0C00, 0x0C04,0x0C04, 0x0C3E,0x0C40, 0x0C46,0x0C48, 0x0C4A,0x0C4D, 0x0C55,0x0C56, 0x0C62,0x0C63, 0x0C81,0x0C81, 0x0CBC,0x0CBC, 0x0CBF,0x0CBF, 0x0CC2,0x0CC2, 0x0CC6,0x0CC6, 0x0CCC,0x0CCD, 0x0CD5,0x0CD6, 0x0CE2,0x0CE3, 0x0D00,0x0D01, 0x0D3B,0x0D3C, 0x0D3E,0x0D3E, 0x0D41,0x0D44, 0x0D4D,0x0D4D, 0x0D57,0x0D57, 0x0D62,0x0D63, 0x0DCA,0x0DCA, 0x0DCF,0x0DCF, 0x0DD2,0x0DD4, 0x0DD6,0x0DD6, 0x0DDF,0x0DDF, 0x0E31,0x0E31, 0x0E34,0x0E3A, 0x0E47,0x0E4E, 0x0EB1,0x0EB1, 0x0EB4,0x0EBC, 0x0EC8,0x0ECD, 0x0F18,0x0F19, 0x0F35,0x0F35, 0x0F37,0x0F37, 0x0F39,0x0F39, 0x0F71,0x0F7E, 0x0F80,0x0F84, 0x0F86,0x0F87, 0x0F8D,0x0F97, 0x0F99,0x0FBC, 0x0FC6,0x0FC6, 0x102D,0x1030, 0x1032,0x1037, 0x1039,0x103A, 0x103D,0x103E, 0x1058,0x1059, 0x105E,0x1060, 0x1071,0x1074, 0x1082,0x1082, 0x1085,0x1086, 0x108D,0x108D, 0x109D,0x109D, 0x135D,0x135F, 0x1712,0x1714, 0x1732,0x1734, 0x1752,0x1753, 0x1772,0x1773, 0x17B4,0x17B5, 0x17B7,0x17BD, 0x17C6,0x17C6, 0x17C9,0x17D3, 
0x17DD,0x17DD, 0x180B,0x180D, 0x1885,0x1886, 0x18A9,0x18A9, 0x1920,0x1922, 0x1927,0x1928, 0x1932,0x1932, 0x1939,0x193B, 0x1A17,0x1A18, 0x1A1B,0x1A1B, 0x1A56,0x1A56, 0x1A58,0x1A5E, 0x1A60,0x1A60, 0x1A62,0x1A62, 0x1A65,0x1A6C, 0x1A73,0x1A7C, 0x1A7F,0x1A7F, 0x1AB0,0x1ABD, 0x1ABE,0x1ABE, 0x1B00,0x1B03, 0x1B34,0x1B34, 0x1B35,0x1B35, 0x1B36,0x1B3A, 0x1B3C,0x1B3C, 0x1B42,0x1B42, 0x1B6B,0x1B73, 0x1B80,0x1B81, 0x1BA2,0x1BA5, 0x1BA8,0x1BA9, 0x1BAB,0x1BAD, 0x1BE6,0x1BE6, 0x1BE8,0x1BE9, 0x1BED,0x1BED, 0x1BEF,0x1BF1, 0x1C2C,0x1C33, 0x1C36,0x1C37, 0x1CD0,0x1CD2, 0x1CD4,0x1CE0, 0x1CE2,0x1CE8, 0x1CED,0x1CED, 0x1CF4,0x1CF4, 0x1CF8,0x1CF9, 0x1DC0,0x1DF9, 0x1DFB,0x1DFF, 0x200C,0x200C, 0x20D0,0x20DC, 0x20DD,0x20E0, 0x20E1,0x20E1, 0x20E2,0x20E4, 0x20E5,0x20F0, 0x2CEF,0x2CF1, 0x2D7F,0x2D7F, 0x2DE0,0x2DFF, 0x302A,0x302D, 0x302E,0x302F, 0x3099,0x309A, 0xA66F,0xA66F, 0xA670,0xA672, 0xA674,0xA67D, 0xA69E,0xA69F, 0xA6F0,0xA6F1, 0xA802,0xA802, 0xA806,0xA806, 0xA80B,0xA80B, 0xA825,0xA826, 0xA8C4,0xA8C5, 0xA8E0,0xA8F1, 0xA8FF,0xA8FF, 0xA926,0xA92D, 0xA947,0xA951, 0xA980,0xA982, 0xA9B3,0xA9B3, 0xA9B6,0xA9B9, 0xA9BC,0xA9BD, 0xA9E5,0xA9E5, 0xAA29,0xAA2E, 0xAA31,0xAA32, 0xAA35,0xAA36, 0xAA43,0xAA43, 0xAA4C,0xAA4C, 0xAA7C,0xAA7C, 0xAAB0,0xAAB0, 0xAAB2,0xAAB4, 0xAAB7,0xAAB8, 0xAABE,0xAABF, 0xAAC1,0xAAC1, 0xAAEC,0xAAED, 0xAAF6,0xAAF6, 0xABE5,0xABE5, 0xABE8,0xABE8, 0xABED,0xABED, 0xFB1E,0xFB1E, 0xFE00,0xFE0F, 0xFE20,0xFE2F, 0xFF9E,0xFF9F, 0x101FD,0x101FD, 0x102E0,0x102E0, 0x10376,0x1037A, 0x10A01,0x10A03, 0x10A05,0x10A06, 0x10A0C,0x10A0F, 0x10A38,0x10A3A, 0x10A3F,0x10A3F, 0x10AE5,0x10AE6, 0x10D24,0x10D27, 0x10F46,0x10F50, 0x11001,0x11001, 0x11038,0x11046, 0x1107F,0x11081, 0x110B3,0x110B6, 0x110B9,0x110BA, 0x11100,0x11102, 0x11127,0x1112B, 0x1112D,0x11134, 0x11173,0x11173, 0x11180,0x11181, 0x111B6,0x111BE, 0x111C9,0x111CC, 0x1122F,0x11231, 0x11234,0x11234, 0x11236,0x11237, 0x1123E,0x1123E, 0x112DF,0x112DF, 0x112E3,0x112EA, 0x11300,0x11301, 0x1133B,0x1133C, 0x1133E,0x1133E, 0x11340,0x11340, 
0x11357,0x11357, 0x11366,0x1136C, 0x11370,0x11374, 0x11438,0x1143F, 0x11442,0x11444, 0x11446,0x11446, 0x1145E,0x1145E, 0x114B0,0x114B0, 0x114B3,0x114B8, 0x114BA,0x114BA, 0x114BD,0x114BD, 0x114BF,0x114C0, 0x114C2,0x114C3, 0x115AF,0x115AF, 0x115B2,0x115B5, 0x115BC,0x115BD, 0x115BF,0x115C0, 0x115DC,0x115DD, 0x11633,0x1163A, 0x1163D,0x1163D, 0x1163F,0x11640, 0x116AB,0x116AB, 0x116AD,0x116AD, 0x116B0,0x116B5, 0x116B7,0x116B7, 0x1171D,0x1171F, 0x11722,0x11725, 0x11727,0x1172B, 0x1182F,0x11837, 0x11839,0x1183A, 0x119D4,0x119D7, 0x119DA,0x119DB, 0x119E0,0x119E0, 0x11A01,0x11A0A, 0x11A33,0x11A38, 0x11A3B,0x11A3E, 0x11A47,0x11A47, 0x11A51,0x11A56, 0x11A59,0x11A5B, 0x11A8A,0x11A96, 0x11A98,0x11A99, 0x11C30,0x11C36, 0x11C38,0x11C3D, 0x11C3F,0x11C3F, 0x11C92,0x11CA7, 0x11CAA,0x11CB0, 0x11CB2,0x11CB3, 0x11CB5,0x11CB6, 0x11D31,0x11D36, 0x11D3A,0x11D3A, 0x11D3C,0x11D3D, 0x11D3F,0x11D45, 0x11D47,0x11D47, 0x11D90,0x11D91, 0x11D95,0x11D95, 0x11D97,0x11D97, 0x11EF3,0x11EF4, 0x16AF0,0x16AF4, 0x16B30,0x16B36, 0x16F4F,0x16F4F, 0x16F8F,0x16F92, 0x1BC9D,0x1BC9E, 0x1D165,0x1D165, 0x1D167,0x1D169, 0x1D16E,0x1D172, 0x1D17B,0x1D182, 0x1D185,0x1D18B, 0x1D1AA,0x1D1AD, 0x1D242,0x1D244, 0x1DA00,0x1DA36, 0x1DA3B,0x1DA6C, 0x1DA75,0x1DA75, 0x1DA84,0x1DA84, 0x1DA9B,0x1DA9F, 0x1DAA1,0x1DAAF, 0x1E000,0x1E006, 0x1E008,0x1E018, 0x1E01B,0x1E021, 0x1E023,0x1E024, 0x1E026,0x1E02A, 0x1E130,0x1E136, 0x1E2EC,0x1E2EF, 0x1E8D0,0x1E8D6, 0x1E944,0x1E94A, 0x1F3FB,0x1F3FF, 0xE0020,0xE007F, 0xE0100,0xE01EF};
const uint32_t gbpSpacingMark[] =
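// Layout: inclusive first,last code point pairs. contains() binary-searches this table, so the pairs must stay in ascending order.
// Generated: curl unicode.org/Public/12.1.0/ucd/auxiliary/GraphemeBreakProperty.txt | grep SpacingMark | cut -d ' ' -f 1 | sed 's/[^.]\{1,\}/0x&/g' | sed 's/\.\./,/' | sed '/,/!s/.*/&,&/' | sed -n 'H;${x;s/\n//;s/\n/, /gp;}' | sed 's/.*/{&};/'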
{0x0903,0x0903, 0x093B,0x093B, 0x093E,0x0940, 0x0949,0x094C, 0x094E,0x094F, 0x0982,0x0983, 0x09BF,0x09C0, 0x09C7,0x09C8, 0x09CB,0x09CC, 0x0A03,0x0A03, 0x0A3E,0x0A40, 0x0A83,0x0A83, 0x0ABE,0x0AC0, 0x0AC9,0x0AC9, 0x0ACB,0x0ACC, 0x0B02,0x0B03, 0x0B40,0x0B40, 0x0B47,0x0B48, 0x0B4B,0x0B4C, 0x0BBF,0x0BBF, 0x0BC1,0x0BC2, 0x0BC6,0x0BC8, 0x0BCA,0x0BCC, 0x0C01,0x0C03, 0x0C41,0x0C44, 0x0C82,0x0C83, 0x0CBE,0x0CBE, 0x0CC0,0x0CC1, 0x0CC3,0x0CC4, 0x0CC7,0x0CC8, 0x0CCA,0x0CCB, 0x0D02,0x0D03, 0x0D3F,0x0D40, 0x0D46,0x0D48, 0x0D4A,0x0D4C, 0x0D82,0x0D83, 0x0DD0,0x0DD1, 0x0DD8,0x0DDE, 0x0DF2,0x0DF3, 0x0E33,0x0E33, 0x0EB3,0x0EB3, 0x0F3E,0x0F3F, 0x0F7F,0x0F7F, 0x1031,0x1031, 0x103B,0x103C, 0x1056,0x1057, 0x1084,0x1084, 0x17B6,0x17B6, 0x17BE,0x17C5, 0x17C7,0x17C8, 0x1923,0x1926, 0x1929,0x192B, 0x1930,0x1931, 0x1933,0x1938, 0x1A19,0x1A1A, 0x1A55,0x1A55, 0x1A57,0x1A57, 0x1A6D,0x1A72, 0x1B04,0x1B04, 0x1B3B,0x1B3B, 0x1B3D,0x1B41, 0x1B43,0x1B44, 0x1B82,0x1B82, 0x1BA1,0x1BA1, 0x1BA6,0x1BA7, 0x1BAA,0x1BAA, 0x1BE7,0x1BE7, 0x1BEA,0x1BEC, 0x1BEE,0x1BEE, 0x1BF2,0x1BF3, 0x1C24,0x1C2B, 0x1C34,0x1C35, 0x1CE1,0x1CE1, 0x1CF7,0x1CF7, 0xA823,0xA824, 0xA827,0xA827, 0xA880,0xA881, 0xA8B4,0xA8C3, 0xA952,0xA953, 0xA983,0xA983, 0xA9B4,0xA9B5, 0xA9BA,0xA9BB, 0xA9BE,0xA9C0, 0xAA2F,0xAA30, 0xAA33,0xAA34, 0xAA4D,0xAA4D, 0xAAEB,0xAAEB, 0xAAEE,0xAAEF, 0xAAF5,0xAAF5, 0xABE3,0xABE4, 0xABE6,0xABE7, 0xABE9,0xABEA, 0xABEC,0xABEC, 0x11000,0x11000, 0x11002,0x11002, 0x11082,0x11082, 0x110B0,0x110B2, 0x110B7,0x110B8, 0x1112C,0x1112C, 0x11145,0x11146, 0x11182,0x11182, 0x111B3,0x111B5, 0x111BF,0x111C0, 0x1122C,0x1122E, 0x11232,0x11233, 0x11235,0x11235, 0x112E0,0x112E2, 0x11302,0x11303, 0x1133F,0x1133F, 0x11341,0x11344, 0x11347,0x11348, 0x1134B,0x1134D, 0x11362,0x11363, 0x11435,0x11437, 0x11440,0x11441, 0x11445,0x11445, 0x114B1,0x114B2, 0x114B9,0x114B9, 0x114BB,0x114BC, 0x114BE,0x114BE, 0x114C1,0x114C1, 0x115B0,0x115B1, 0x115B8,0x115BB, 0x115BE,0x115BE, 0x11630,0x11632, 0x1163B,0x1163C, 0x1163E,0x1163E, 0x116AC,0x116AC, 
0x116AE,0x116AF, 0x116B6,0x116B6, 0x11720,0x11721, 0x11726,0x11726, 0x1182C,0x1182E, 0x11838,0x11838, 0x119D1,0x119D3, 0x119DC,0x119DF, 0x119E4,0x119E4, 0x11A39,0x11A39, 0x11A57,0x11A58, 0x11A97,0x11A97, 0x11C2F,0x11C2F, 0x11C3E,0x11C3E, 0x11CA9,0x11CA9, 0x11CB1,0x11CB1, 0x11CB4,0x11CB4, 0x11D8A,0x11D8E, 0x11D93,0x11D94, 0x11D96,0x11D96, 0x11EF5,0x11EF6, 0x16F51,0x16F87, 0x1D166,0x1D166, 0x1D16D,0x1D16D};
// Grapheme_Cluster_Break=L code points (used by the Hangul syllable rules
// in isbreakgraph()), stored as inclusive first,last pairs in ascending order.
const uint32_t gbpL[] =
// Generated: curl unicode.org/Public/12.1.0/ucd/auxiliary/GraphemeBreakProperty.txt | grep '; L ' | cut -d ' ' -f 1 | sed 's/[^.]\{1,\}/0x&/g' | sed 's/\.\./,/' | sed '/,/!s/.*/&,&/' | sed -n 'H;${x;s/\n//;s/\n/, /gp;}' | sed 's/.*/{&};/'
{0x1100,0x115F, 0xA960,0xA97C};
// Grapheme_Cluster_Break=V code points (used by the Hangul syllable rules
// in isbreakgraph()), stored as inclusive first,last pairs in ascending order.
const uint32_t gbpV[] =
// Generated: curl unicode.org/Public/12.1.0/ucd/auxiliary/GraphemeBreakProperty.txt | grep '; V ' | cut -d ' ' -f 1 | sed 's/[^.]\{1,\}/0x&/g' | sed 's/\.\./,/' | sed '/,/!s/.*/&,&/' | sed -n 'H;${x;s/\n//;s/\n/, /gp;}' | sed 's/.*/{&};/'
{0x1160,0x11A7, 0xD7B0,0xD7C6};
// Grapheme_Cluster_Break=T code points (used by the Hangul syllable rules
// in isbreakgraph()), stored as inclusive first,last pairs in ascending order.
const uint32_t gbpT[] =
// Generated: curl unicode.org/Public/12.1.0/ucd/auxiliary/GraphemeBreakProperty.txt | grep '; T ' | cut -d ' ' -f 1 | sed 's/[^.]\{1,\}/0x&/g' | sed 's/\.\./,/' | sed '/,/!s/.*/&,&/' | sed -n 'H;${x;s/\n//;s/\n/, /gp;}' | sed 's/.*/{&};/'
{0x11A8,0x11FF, 0xD7CB,0xD7FB};
const uint32_t gbpLV[] =
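// Layout: inclusive first,last code point pairs. contains() binary-searches this table, so the pairs must stay in ascending order.
// Generated: curl unicode.org/Public/12.1.0/ucd/auxiliary/GraphemeBreakProperty.txt | grep '; LV ' | cut -d ' ' -f 1 | sed 's/[^.]\{1,\}/0x&/g' | sed 's/\.\./,/' | sed '/,/!s/.*/&,&/' | sed -n 'H;${x;s/\n//;s/\n/, /gp;}' | sed 's/.*/{&};/'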
{0xAC00,0xAC00, 0xAC1C,0xAC1C, 0xAC38,0xAC38, 0xAC54,0xAC54, 0xAC70,0xAC70, 0xAC8C,0xAC8C, 0xACA8,0xACA8, 0xACC4,0xACC4, 0xACE0,0xACE0, 0xACFC,0xACFC, 0xAD18,0xAD18, 0xAD34,0xAD34, 0xAD50,0xAD50, 0xAD6C,0xAD6C, 0xAD88,0xAD88, 0xADA4,0xADA4, 0xADC0,0xADC0, 0xADDC,0xADDC, 0xADF8,0xADF8, 0xAE14,0xAE14, 0xAE30,0xAE30, 0xAE4C,0xAE4C, 0xAE68,0xAE68, 0xAE84,0xAE84, 0xAEA0,0xAEA0, 0xAEBC,0xAEBC, 0xAED8,0xAED8, 0xAEF4,0xAEF4, 0xAF10,0xAF10, 0xAF2C,0xAF2C, 0xAF48,0xAF48, 0xAF64,0xAF64, 0xAF80,0xAF80, 0xAF9C,0xAF9C, 0xAFB8,0xAFB8, 0xAFD4,0xAFD4, 0xAFF0,0xAFF0, 0xB00C,0xB00C, 0xB028,0xB028, 0xB044,0xB044, 0xB060,0xB060, 0xB07C,0xB07C, 0xB098,0xB098, 0xB0B4,0xB0B4, 0xB0D0,0xB0D0, 0xB0EC,0xB0EC, 0xB108,0xB108, 0xB124,0xB124, 0xB140,0xB140, 0xB15C,0xB15C, 0xB178,0xB178, 0xB194,0xB194, 0xB1B0,0xB1B0, 0xB1CC,0xB1CC, 0xB1E8,0xB1E8, 0xB204,0xB204, 0xB220,0xB220, 0xB23C,0xB23C, 0xB258,0xB258, 0xB274,0xB274, 0xB290,0xB290, 0xB2AC,0xB2AC, 0xB2C8,0xB2C8, 0xB2E4,0xB2E4, 0xB300,0xB300, 0xB31C,0xB31C, 0xB338,0xB338, 0xB354,0xB354, 0xB370,0xB370, 0xB38C,0xB38C, 0xB3A8,0xB3A8, 0xB3C4,0xB3C4, 0xB3E0,0xB3E0, 0xB3FC,0xB3FC, 0xB418,0xB418, 0xB434,0xB434, 0xB450,0xB450, 0xB46C,0xB46C, 0xB488,0xB488, 0xB4A4,0xB4A4, 0xB4C0,0xB4C0, 0xB4DC,0xB4DC, 0xB4F8,0xB4F8, 0xB514,0xB514, 0xB530,0xB530, 0xB54C,0xB54C, 0xB568,0xB568, 0xB584,0xB584, 0xB5A0,0xB5A0, 0xB5BC,0xB5BC, 0xB5D8,0xB5D8, 0xB5F4,0xB5F4, 0xB610,0xB610, 0xB62C,0xB62C, 0xB648,0xB648, 0xB664,0xB664, 0xB680,0xB680, 0xB69C,0xB69C, 0xB6B8,0xB6B8, 0xB6D4,0xB6D4, 0xB6F0,0xB6F0, 0xB70C,0xB70C, 0xB728,0xB728, 0xB744,0xB744, 0xB760,0xB760, 0xB77C,0xB77C, 0xB798,0xB798, 0xB7B4,0xB7B4, 0xB7D0,0xB7D0, 0xB7EC,0xB7EC, 0xB808,0xB808, 0xB824,0xB824, 0xB840,0xB840, 0xB85C,0xB85C, 0xB878,0xB878, 0xB894,0xB894, 0xB8B0,0xB8B0, 0xB8CC,0xB8CC, 0xB8E8,0xB8E8, 0xB904,0xB904, 0xB920,0xB920, 0xB93C,0xB93C, 0xB958,0xB958, 0xB974,0xB974, 0xB990,0xB990, 0xB9AC,0xB9AC, 0xB9C8,0xB9C8, 0xB9E4,0xB9E4, 0xBA00,0xBA00, 0xBA1C,0xBA1C, 0xBA38,0xBA38, 0xBA54,0xBA54, 0xBA70,0xBA70, 
0xBA8C,0xBA8C, 0xBAA8,0xBAA8, 0xBAC4,0xBAC4, 0xBAE0,0xBAE0, 0xBAFC,0xBAFC, 0xBB18,0xBB18, 0xBB34,0xBB34, 0xBB50,0xBB50, 0xBB6C,0xBB6C, 0xBB88,0xBB88, 0xBBA4,0xBBA4, 0xBBC0,0xBBC0, 0xBBDC,0xBBDC, 0xBBF8,0xBBF8, 0xBC14,0xBC14, 0xBC30,0xBC30, 0xBC4C,0xBC4C, 0xBC68,0xBC68, 0xBC84,0xBC84, 0xBCA0,0xBCA0, 0xBCBC,0xBCBC, 0xBCD8,0xBCD8, 0xBCF4,0xBCF4, 0xBD10,0xBD10, 0xBD2C,0xBD2C, 0xBD48,0xBD48, 0xBD64,0xBD64, 0xBD80,0xBD80, 0xBD9C,0xBD9C, 0xBDB8,0xBDB8, 0xBDD4,0xBDD4, 0xBDF0,0xBDF0, 0xBE0C,0xBE0C, 0xBE28,0xBE28, 0xBE44,0xBE44, 0xBE60,0xBE60, 0xBE7C,0xBE7C, 0xBE98,0xBE98, 0xBEB4,0xBEB4, 0xBED0,0xBED0, 0xBEEC,0xBEEC, 0xBF08,0xBF08, 0xBF24,0xBF24, 0xBF40,0xBF40, 0xBF5C,0xBF5C, 0xBF78,0xBF78, 0xBF94,0xBF94, 0xBFB0,0xBFB0, 0xBFCC,0xBFCC, 0xBFE8,0xBFE8, 0xC004,0xC004, 0xC020,0xC020, 0xC03C,0xC03C, 0xC058,0xC058, 0xC074,0xC074, 0xC090,0xC090, 0xC0AC,0xC0AC, 0xC0C8,0xC0C8, 0xC0E4,0xC0E4, 0xC100,0xC100, 0xC11C,0xC11C, 0xC138,0xC138, 0xC154,0xC154, 0xC170,0xC170, 0xC18C,0xC18C, 0xC1A8,0xC1A8, 0xC1C4,0xC1C4, 0xC1E0,0xC1E0, 0xC1FC,0xC1FC, 0xC218,0xC218, 0xC234,0xC234, 0xC250,0xC250, 0xC26C,0xC26C, 0xC288,0xC288, 0xC2A4,0xC2A4, 0xC2C0,0xC2C0, 0xC2DC,0xC2DC, 0xC2F8,0xC2F8, 0xC314,0xC314, 0xC330,0xC330, 0xC34C,0xC34C, 0xC368,0xC368, 0xC384,0xC384, 0xC3A0,0xC3A0, 0xC3BC,0xC3BC, 0xC3D8,0xC3D8, 0xC3F4,0xC3F4, 0xC410,0xC410, 0xC42C,0xC42C, 0xC448,0xC448, 0xC464,0xC464, 0xC480,0xC480, 0xC49C,0xC49C, 0xC4B8,0xC4B8, 0xC4D4,0xC4D4, 0xC4F0,0xC4F0, 0xC50C,0xC50C, 0xC528,0xC528, 0xC544,0xC544, 0xC560,0xC560, 0xC57C,0xC57C, 0xC598,0xC598, 0xC5B4,0xC5B4, 0xC5D0,0xC5D0, 0xC5EC,0xC5EC, 0xC608,0xC608, 0xC624,0xC624, 0xC640,0xC640, 0xC65C,0xC65C, 0xC678,0xC678, 0xC694,0xC694, 0xC6B0,0xC6B0, 0xC6CC,0xC6CC, 0xC6E8,0xC6E8, 0xC704,0xC704, 0xC720,0xC720, 0xC73C,0xC73C, 0xC758,0xC758, 0xC774,0xC774, 0xC790,0xC790, 0xC7AC,0xC7AC, 0xC7C8,0xC7C8, 0xC7E4,0xC7E4, 0xC800,0xC800, 0xC81C,0xC81C, 0xC838,0xC838, 0xC854,0xC854, 0xC870,0xC870, 0xC88C,0xC88C, 0xC8A8,0xC8A8, 0xC8C4,0xC8C4, 0xC8E0,0xC8E0, 0xC8FC,0xC8FC, 
0xC918,0xC918, 0xC934,0xC934, 0xC950,0xC950, 0xC96C,0xC96C, 0xC988,0xC988, 0xC9A4,0xC9A4, 0xC9C0,0xC9C0, 0xC9DC,0xC9DC, 0xC9F8,0xC9F8, 0xCA14,0xCA14, 0xCA30,0xCA30, 0xCA4C,0xCA4C, 0xCA68,0xCA68, 0xCA84,0xCA84, 0xCAA0,0xCAA0, 0xCABC,0xCABC, 0xCAD8,0xCAD8, 0xCAF4,0xCAF4, 0xCB10,0xCB10, 0xCB2C,0xCB2C, 0xCB48,0xCB48, 0xCB64,0xCB64, 0xCB80,0xCB80, 0xCB9C,0xCB9C, 0xCBB8,0xCBB8, 0xCBD4,0xCBD4, 0xCBF0,0xCBF0, 0xCC0C,0xCC0C, 0xCC28,0xCC28, 0xCC44,0xCC44, 0xCC60,0xCC60, 0xCC7C,0xCC7C, 0xCC98,0xCC98, 0xCCB4,0xCCB4, 0xCCD0,0xCCD0, 0xCCEC,0xCCEC, 0xCD08,0xCD08, 0xCD24,0xCD24, 0xCD40,0xCD40, 0xCD5C,0xCD5C, 0xCD78,0xCD78, 0xCD94,0xCD94, 0xCDB0,0xCDB0, 0xCDCC,0xCDCC, 0xCDE8,0xCDE8, 0xCE04,0xCE04, 0xCE20,0xCE20, 0xCE3C,0xCE3C, 0xCE58,0xCE58, 0xCE74,0xCE74, 0xCE90,0xCE90, 0xCEAC,0xCEAC, 0xCEC8,0xCEC8, 0xCEE4,0xCEE4, 0xCF00,0xCF00, 0xCF1C,0xCF1C, 0xCF38,0xCF38, 0xCF54,0xCF54, 0xCF70,0xCF70, 0xCF8C,0xCF8C, 0xCFA8,0xCFA8, 0xCFC4,0xCFC4, 0xCFE0,0xCFE0, 0xCFFC,0xCFFC, 0xD018,0xD018, 0xD034,0xD034, 0xD050,0xD050, 0xD06C,0xD06C, 0xD088,0xD088, 0xD0A4,0xD0A4, 0xD0C0,0xD0C0, 0xD0DC,0xD0DC, 0xD0F8,0xD0F8, 0xD114,0xD114, 0xD130,0xD130, 0xD14C,0xD14C, 0xD168,0xD168, 0xD184,0xD184, 0xD1A0,0xD1A0, 0xD1BC,0xD1BC, 0xD1D8,0xD1D8, 0xD1F4,0xD1F4, 0xD210,0xD210, 0xD22C,0xD22C, 0xD248,0xD248, 0xD264,0xD264, 0xD280,0xD280, 0xD29C,0xD29C, 0xD2B8,0xD2B8, 0xD2D4,0xD2D4, 0xD2F0,0xD2F0, 0xD30C,0xD30C, 0xD328,0xD328, 0xD344,0xD344, 0xD360,0xD360, 0xD37C,0xD37C, 0xD398,0xD398, 0xD3B4,0xD3B4, 0xD3D0,0xD3D0, 0xD3EC,0xD3EC, 0xD408,0xD408, 0xD424,0xD424, 0xD440,0xD440, 0xD45C,0xD45C, 0xD478,0xD478, 0xD494,0xD494, 0xD4B0,0xD4B0, 0xD4CC,0xD4CC, 0xD4E8,0xD4E8, 0xD504,0xD504, 0xD520,0xD520, 0xD53C,0xD53C, 0xD558,0xD558, 0xD574,0xD574, 0xD590,0xD590, 0xD5AC,0xD5AC, 0xD5C8,0xD5C8, 0xD5E4,0xD5E4, 0xD600,0xD600, 0xD61C,0xD61C, 0xD638,0xD638, 0xD654,0xD654, 0xD670,0xD670, 0xD68C,0xD68C, 0xD6A8,0xD6A8, 0xD6C4,0xD6C4, 0xD6E0,0xD6E0, 0xD6FC,0xD6FC, 0xD718,0xD718, 0xD734,0xD734, 0xD750,0xD750, 0xD76C,0xD76C, 0xD788,0xD788};
const uint32_t gbpLVT[] =
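// Layout: inclusive first,last code point pairs. contains() binary-searches this table, so the pairs must stay in ascending order.
// Generated: curl unicode.org/Public/12.1.0/ucd/auxiliary/GraphemeBreakProperty.txt | grep '; LVT ' | cut -d ' ' -f 1 | sed 's/[^.]\{1,\}/0x&/g' | sed 's/\.\./,/' | sed '/,/!s/.*/&,&/' | sed -n 'H;${x;s/\n//;s/\n/, /gp;}' | sed 's/.*/{&};/'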
{0xAC01,0xAC1B, 0xAC1D,0xAC37, 0xAC39,0xAC53, 0xAC55,0xAC6F, 0xAC71,0xAC8B, 0xAC8D,0xACA7, 0xACA9,0xACC3, 0xACC5,0xACDF, 0xACE1,0xACFB, 0xACFD,0xAD17, 0xAD19,0xAD33, 0xAD35,0xAD4F, 0xAD51,0xAD6B, 0xAD6D,0xAD87, 0xAD89,0xADA3, 0xADA5,0xADBF, 0xADC1,0xADDB, 0xADDD,0xADF7, 0xADF9,0xAE13, 0xAE15,0xAE2F, 0xAE31,0xAE4B, 0xAE4D,0xAE67, 0xAE69,0xAE83, 0xAE85,0xAE9F, 0xAEA1,0xAEBB, 0xAEBD,0xAED7, 0xAED9,0xAEF3, 0xAEF5,0xAF0F, 0xAF11,0xAF2B, 0xAF2D,0xAF47, 0xAF49,0xAF63, 0xAF65,0xAF7F, 0xAF81,0xAF9B, 0xAF9D,0xAFB7, 0xAFB9,0xAFD3, 0xAFD5,0xAFEF, 0xAFF1,0xB00B, 0xB00D,0xB027, 0xB029,0xB043, 0xB045,0xB05F, 0xB061,0xB07B, 0xB07D,0xB097, 0xB099,0xB0B3, 0xB0B5,0xB0CF, 0xB0D1,0xB0EB, 0xB0ED,0xB107, 0xB109,0xB123, 0xB125,0xB13F, 0xB141,0xB15B, 0xB15D,0xB177, 0xB179,0xB193, 0xB195,0xB1AF, 0xB1B1,0xB1CB, 0xB1CD,0xB1E7, 0xB1E9,0xB203, 0xB205,0xB21F, 0xB221,0xB23B, 0xB23D,0xB257, 0xB259,0xB273, 0xB275,0xB28F, 0xB291,0xB2AB, 0xB2AD,0xB2C7, 0xB2C9,0xB2E3, 0xB2E5,0xB2FF, 0xB301,0xB31B, 0xB31D,0xB337, 0xB339,0xB353, 0xB355,0xB36F, 0xB371,0xB38B, 0xB38D,0xB3A7, 0xB3A9,0xB3C3, 0xB3C5,0xB3DF, 0xB3E1,0xB3FB, 0xB3FD,0xB417, 0xB419,0xB433, 0xB435,0xB44F, 0xB451,0xB46B, 0xB46D,0xB487, 0xB489,0xB4A3, 0xB4A5,0xB4BF, 0xB4C1,0xB4DB, 0xB4DD,0xB4F7, 0xB4F9,0xB513, 0xB515,0xB52F, 0xB531,0xB54B, 0xB54D,0xB567, 0xB569,0xB583, 0xB585,0xB59F, 0xB5A1,0xB5BB, 0xB5BD,0xB5D7, 0xB5D9,0xB5F3, 0xB5F5,0xB60F, 0xB611,0xB62B, 0xB62D,0xB647, 0xB649,0xB663, 0xB665,0xB67F, 0xB681,0xB69B, 0xB69D,0xB6B7, 0xB6B9,0xB6D3, 0xB6D5,0xB6EF, 0xB6F1,0xB70B, 0xB70D,0xB727, 0xB729,0xB743, 0xB745,0xB75F, 0xB761,0xB77B, 0xB77D,0xB797, 0xB799,0xB7B3, 0xB7B5,0xB7CF, 0xB7D1,0xB7EB, 0xB7ED,0xB807, 0xB809,0xB823, 0xB825,0xB83F, 0xB841,0xB85B, 0xB85D,0xB877, 0xB879,0xB893, 0xB895,0xB8AF, 0xB8B1,0xB8CB, 0xB8CD,0xB8E7, 0xB8E9,0xB903, 0xB905,0xB91F, 0xB921,0xB93B, 0xB93D,0xB957, 0xB959,0xB973, 0xB975,0xB98F, 0xB991,0xB9AB, 0xB9AD,0xB9C7, 0xB9C9,0xB9E3, 0xB9E5,0xB9FF, 0xBA01,0xBA1B, 0xBA1D,0xBA37, 0xBA39,0xBA53, 0xBA55,0xBA6F, 0xBA71,0xBA8B, 
0xBA8D,0xBAA7, 0xBAA9,0xBAC3, 0xBAC5,0xBADF, 0xBAE1,0xBAFB, 0xBAFD,0xBB17, 0xBB19,0xBB33, 0xBB35,0xBB4F, 0xBB51,0xBB6B, 0xBB6D,0xBB87, 0xBB89,0xBBA3, 0xBBA5,0xBBBF, 0xBBC1,0xBBDB, 0xBBDD,0xBBF7, 0xBBF9,0xBC13, 0xBC15,0xBC2F, 0xBC31,0xBC4B, 0xBC4D,0xBC67, 0xBC69,0xBC83, 0xBC85,0xBC9F, 0xBCA1,0xBCBB, 0xBCBD,0xBCD7, 0xBCD9,0xBCF3, 0xBCF5,0xBD0F, 0xBD11,0xBD2B, 0xBD2D,0xBD47, 0xBD49,0xBD63, 0xBD65,0xBD7F, 0xBD81,0xBD9B, 0xBD9D,0xBDB7, 0xBDB9,0xBDD3, 0xBDD5,0xBDEF, 0xBDF1,0xBE0B, 0xBE0D,0xBE27, 0xBE29,0xBE43, 0xBE45,0xBE5F, 0xBE61,0xBE7B, 0xBE7D,0xBE97, 0xBE99,0xBEB3, 0xBEB5,0xBECF, 0xBED1,0xBEEB, 0xBEED,0xBF07, 0xBF09,0xBF23, 0xBF25,0xBF3F, 0xBF41,0xBF5B, 0xBF5D,0xBF77, 0xBF79,0xBF93, 0xBF95,0xBFAF, 0xBFB1,0xBFCB, 0xBFCD,0xBFE7, 0xBFE9,0xC003, 0xC005,0xC01F, 0xC021,0xC03B, 0xC03D,0xC057, 0xC059,0xC073, 0xC075,0xC08F, 0xC091,0xC0AB, 0xC0AD,0xC0C7, 0xC0C9,0xC0E3, 0xC0E5,0xC0FF, 0xC101,0xC11B, 0xC11D,0xC137, 0xC139,0xC153, 0xC155,0xC16F, 0xC171,0xC18B, 0xC18D,0xC1A7, 0xC1A9,0xC1C3, 0xC1C5,0xC1DF, 0xC1E1,0xC1FB, 0xC1FD,0xC217, 0xC219,0xC233, 0xC235,0xC24F, 0xC251,0xC26B, 0xC26D,0xC287, 0xC289,0xC2A3, 0xC2A5,0xC2BF, 0xC2C1,0xC2DB, 0xC2DD,0xC2F7, 0xC2F9,0xC313, 0xC315,0xC32F, 0xC331,0xC34B, 0xC34D,0xC367, 0xC369,0xC383, 0xC385,0xC39F, 0xC3A1,0xC3BB, 0xC3BD,0xC3D7, 0xC3D9,0xC3F3, 0xC3F5,0xC40F, 0xC411,0xC42B, 0xC42D,0xC447, 0xC449,0xC463, 0xC465,0xC47F, 0xC481,0xC49B, 0xC49D,0xC4B7, 0xC4B9,0xC4D3, 0xC4D5,0xC4EF, 0xC4F1,0xC50B, 0xC50D,0xC527, 0xC529,0xC543, 0xC545,0xC55F, 0xC561,0xC57B, 0xC57D,0xC597, 0xC599,0xC5B3, 0xC5B5,0xC5CF, 0xC5D1,0xC5EB, 0xC5ED,0xC607, 0xC609,0xC623, 0xC625,0xC63F, 0xC641,0xC65B, 0xC65D,0xC677, 0xC679,0xC693, 0xC695,0xC6AF, 0xC6B1,0xC6CB, 0xC6CD,0xC6E7, 0xC6E9,0xC703, 0xC705,0xC71F, 0xC721,0xC73B, 0xC73D,0xC757, 0xC759,0xC773, 0xC775,0xC78F, 0xC791,0xC7AB, 0xC7AD,0xC7C7, 0xC7C9,0xC7E3, 0xC7E5,0xC7FF, 0xC801,0xC81B, 0xC81D,0xC837, 0xC839,0xC853, 0xC855,0xC86F, 0xC871,0xC88B, 0xC88D,0xC8A7, 0xC8A9,0xC8C3, 0xC8C5,0xC8DF, 0xC8E1,0xC8FB, 0xC8FD,0xC917, 
0xC919,0xC933, 0xC935,0xC94F, 0xC951,0xC96B, 0xC96D,0xC987, 0xC989,0xC9A3, 0xC9A5,0xC9BF, 0xC9C1,0xC9DB, 0xC9DD,0xC9F7, 0xC9F9,0xCA13, 0xCA15,0xCA2F, 0xCA31,0xCA4B, 0xCA4D,0xCA67, 0xCA69,0xCA83, 0xCA85,0xCA9F, 0xCAA1,0xCABB, 0xCABD,0xCAD7, 0xCAD9,0xCAF3, 0xCAF5,0xCB0F, 0xCB11,0xCB2B, 0xCB2D,0xCB47, 0xCB49,0xCB63, 0xCB65,0xCB7F, 0xCB81,0xCB9B, 0xCB9D,0xCBB7, 0xCBB9,0xCBD3, 0xCBD5,0xCBEF, 0xCBF1,0xCC0B, 0xCC0D,0xCC27, 0xCC29,0xCC43, 0xCC45,0xCC5F, 0xCC61,0xCC7B, 0xCC7D,0xCC97, 0xCC99,0xCCB3, 0xCCB5,0xCCCF, 0xCCD1,0xCCEB, 0xCCED,0xCD07, 0xCD09,0xCD23, 0xCD25,0xCD3F, 0xCD41,0xCD5B, 0xCD5D,0xCD77, 0xCD79,0xCD93, 0xCD95,0xCDAF, 0xCDB1,0xCDCB, 0xCDCD,0xCDE7, 0xCDE9,0xCE03, 0xCE05,0xCE1F, 0xCE21,0xCE3B, 0xCE3D,0xCE57, 0xCE59,0xCE73, 0xCE75,0xCE8F, 0xCE91,0xCEAB, 0xCEAD,0xCEC7, 0xCEC9,0xCEE3, 0xCEE5,0xCEFF, 0xCF01,0xCF1B, 0xCF1D,0xCF37, 0xCF39,0xCF53, 0xCF55,0xCF6F, 0xCF71,0xCF8B, 0xCF8D,0xCFA7, 0xCFA9,0xCFC3, 0xCFC5,0xCFDF, 0xCFE1,0xCFFB, 0xCFFD,0xD017, 0xD019,0xD033, 0xD035,0xD04F, 0xD051,0xD06B, 0xD06D,0xD087, 0xD089,0xD0A3, 0xD0A5,0xD0BF, 0xD0C1,0xD0DB, 0xD0DD,0xD0F7, 0xD0F9,0xD113, 0xD115,0xD12F, 0xD131,0xD14B, 0xD14D,0xD167, 0xD169,0xD183, 0xD185,0xD19F, 0xD1A1,0xD1BB, 0xD1BD,0xD1D7, 0xD1D9,0xD1F3, 0xD1F5,0xD20F, 0xD211,0xD22B, 0xD22D,0xD247, 0xD249,0xD263, 0xD265,0xD27F, 0xD281,0xD29B, 0xD29D,0xD2B7, 0xD2B9,0xD2D3, 0xD2D5,0xD2EF, 0xD2F1,0xD30B, 0xD30D,0xD327, 0xD329,0xD343, 0xD345,0xD35F, 0xD361,0xD37B, 0xD37D,0xD397, 0xD399,0xD3B3, 0xD3B5,0xD3CF, 0xD3D1,0xD3EB, 0xD3ED,0xD407, 0xD409,0xD423, 0xD425,0xD43F, 0xD441,0xD45B, 0xD45D,0xD477, 0xD479,0xD493, 0xD495,0xD4AF, 0xD4B1,0xD4CB, 0xD4CD,0xD4E7, 0xD4E9,0xD503, 0xD505,0xD51F, 0xD521,0xD53B, 0xD53D,0xD557, 0xD559,0xD573, 0xD575,0xD58F, 0xD591,0xD5AB, 0xD5AD,0xD5C7, 0xD5C9,0xD5E3, 0xD5E5,0xD5FF, 0xD601,0xD61B, 0xD61D,0xD637, 0xD639,0xD653, 0xD655,0xD66F, 0xD671,0xD68B, 0xD68D,0xD6A7, 0xD6A9,0xD6C3, 0xD6C5,0xD6DF, 0xD6E1,0xD6FB, 0xD6FD,0xD717, 0xD719,0xD733, 0xD735,0xD74F, 0xD751,0xD76B, 0xD76D,0xD787, 0xD789,0xD7A3};
/* One decoded code point.
   c     - the code point, or UINT32_MAX when nothing valid was decoded
   n     - bytes consumed: 0 at the end of the buffer, 1 for an invalid
           byte, otherwise the length of the UTF-8 sequence (1-4)
   valid - 1 when c holds a well-formed code point, 0 otherwise */
struct cp {
    uint32_t c;
    char n;
    char valid;
};

/* Decode the UTF-8 sequence that starts at byte index i of buf.
   nbuf - number of bytes in buf
   buf  - UTF-8 text
   i    - byte index to decode at
   Returns the decoded code point (see struct cp). Malformed, truncated,
   overlong, surrogate and out-of-range sequences yield c=UINT32_MAX,
   n=1, valid=0 so the caller can step over the offending byte. At or
   past the end of buf the result is c=UINT32_MAX, n=0, valid=0. */
struct cp
utf8(size_t nbuf, uint8_t *buf, size_t i)
{
    const struct cp invalid = {.c=UINT32_MAX, .n=1};
    // Handle end of buf. Using >= (not ==) keeps an index past the end
    // from underflowing the remaining-length computation below.
    if (i >= nbuf)
        return (struct cp){.c=UINT32_MAX};
    // ASCII shortcut
    uint8_t c0 = buf[i];
    if (c0 <= 0x7f)
        return (struct cp){.c=c0, .n=1, .valid=1};
    // Get UTF-8 octet length from the lead byte
    int n;
    if ((c0&0xE0) == 0xC0) n = 2;
    else if ((c0&0xF0) == 0xE0) n = 3;
    else if ((c0&0xF8) == 0xF0) n = 4;
    else return invalid;  // continuation byte or 0xF8..0xFF
    // Sequence is cut off by the end of buf
    if (nbuf - i < (size_t)n)
        return invalid;
    // Payload bits of the lead byte, then six bits per continuation octet
    uint32_t c = c0 & (0x7Fu >> n);
    for (int k = 1; k < n; k++) {
        if ((buf[i+k]&0xC0) != 0x80)
            return invalid;
        c = (c << 6) | (buf[i+k] & 0x3Fu);
    }
    // Smallest code point that genuinely needs n octets; anything
    // below it is an overlong encoding.
    static const uint32_t mincp[] = {0, 0, 0x80, 0x800, 0x10000};
    if (c < mincp[n]
            // UTF-16 surrogate
            || (c >= 0xD800 && c <= 0xDFFF)
            // Unencodable by UTF-16
            || c > 0x10FFFF)
        return invalid;
    // Valid
    return (struct cp){.c=c, .n=(char)n, .valid=1};
}
/* Report whether code point c lies inside any range of a table.
   nranges - number of first,last pairs in ranges (half its length)
   ranges  - inclusive first,last pairs, in ascending order and not
             overlapping; not read when nranges is 0
   c       - the code point to look up
   Returns 1 when some pair satisfies first <= c <= last, else 0. */
char
contains(size_t nranges, const uint32_t *ranges, uint32_t c)
{
    // Binary search over the half-open window [lo, hi) of pair indices.
    // An empty table gives an empty window, so nothing is read.
    size_t lo = 0, hi = nranges;
    while (lo < hi) {
        size_t mid = lo + (hi-lo)/2;
        // Too high
        if (c < ranges[mid*2])
            hi = mid;
        // Too low
        else if (c > ranges[mid*2+1])
            lo = mid+1;
        // Match
        else
            return 1;
    }
    return 0;
}
struct cp
utf8prev(size_t nbuf, uint8_t *buf, size_t i)
{
struct cp c;
if (i>=4 && (c=utf8(nbuf,buf,i-4)).n==4);
else if (i>=3 && (c=utf8(nbuf,buf,i-3)).n==3);
else if (i>=2 && (c=utf8(nbuf,buf,i-2)).n==2);
else if (i>=1 && (c=utf8(nbuf,buf,i-1)).n==1);
else c=(struct cp){.c=UINT32_MAX};
return c;
}
// based on UAX29§3.1.1 © 2019 Unicode, Inc.
/* Decide whether an extended grapheme cluster boundary falls between two
   adjacent code points.
   a   - the code point immediately before the candidate boundary
   b   - the code point immediately after it
   buf - the text both were decoded from; only bytes before i are read,
         when looking back for the emoji ZWJ and flag rules
   i   - the index of b in buf
   Returns 1 to break between a and b, 0 to keep them in one cluster. */
int
isbreakgraph(struct cp a, struct cp b, uint8_t *buf, size_t i)
{
    // ZERO WIDTH JOINER. struct cp carries decoded code points, so ZWJ
    // has to be matched as U+200D, not as its UTF-8 bytes E2 80 8D.
    enum { ZWJ = 0x200D };
    // Break if either invalid
    if (!a.valid || !b.valid) return 1;
    // Do not break between a CR and LF.
    if (a.c==0xd && b.c==0xa) return 0;
    // ASCII shortcut: always break
    if (a.c<=0x7f && b.c<=0x7f) return 1;
    // Otherwise, break before and after controls.
#define IN(cu, ranges) contains(sizeof(ranges)/(sizeof(uint32_t)*2), ranges, (cu).c)
    if (IN(a, gbpControl)
            || a.c == 0xd
            || a.c == 0xa
            || IN(b, gbpControl)
            || b.c == 0xd
            || b.c == 0xa)
        return 1;
    // Do not break Hangul syllable sequences.
    if (IN(a, gbpL)) {
        if (IN(b, gbpL)
                || IN(b, gbpV)
                || IN(b, gbpLV)
                || IN(b, gbpLVT))
            return 0;
    } else if (IN(a,gbpLV) || IN(a,gbpV)) {
        if (IN(b,gbpV) || IN(b,gbpT)) return 0;
    } else if ((IN(a,gbpLVT) || IN(a,gbpT))
            && IN(b, gbpT))
        return 0;
    // Do not break before extending characters
    // or ZWJ.
    if (IN(b,gbpExtend) || b.c==ZWJ)
        return 0;
    // Do not break before SpacingMarks, or
    // after Prepend characters.
    if (IN(b,gbpSpacingMark) || IN(a,gbpPrepend))
        return 0;
    // Do not break within emoji modifier
    // sequences or emoji zwj sequences.
    // Walk back from the ZWJ over any Extend characters; the join holds
    // only if an Extended_Pictographic character precedes them.
    if (a.c == ZWJ
            && IN(b, Extended_Pictographic))
        for (size_t j=i-a.n;;) {
            if (!j) break;
            struct cp c = utf8prev(i, buf, j);
            if (IN(c, Extended_Pictographic)) return 0;
            if (!IN(c, gbpExtend)) break;
            j -= c.n;
        }
    // Do not break within emoji flag sequences.
    // That is, do not break between regional
    // indicator (RI) symbols if there is an odd
    // number of RI characters before the break point.
    // j starts at a and steps back two RIs per iteration.
    if (IN(a, Regional_Indicator)
            && IN(b, Regional_Indicator))
        for (size_t j=i-a.n;;) {
            // An even number of RIs precede this point, so a opens a
            // pair that b completes.
            if (!j) return 0;
            struct cp c = utf8prev(i, buf, j);
            if (!IN(c, Regional_Indicator)) return 0;
            // One more RI found; if the run stops here, a closes a pair
            // and b starts the next flag.
            j -= c.n;
            if (!j) break;
            c = utf8prev(i, buf, j);
            if (!IN(c, Regional_Indicator)) break;
            j -= c.n;
        }
#undef IN
    // Otherwise, break everywhere.
    return 1;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment