-
-
Save tylerneylon/9773800 to your computer and use it in GitHub Desktop.
// Tests whether a byte is a UTF-8 continuation byte (bit pattern 10xxxxxx).
// Only bits 6 and 7 of `x` are inspected, so a sign-extended `char` works too.
#define IS_CONT(x) (((x) & 0xc0) == 0x80)

// Returns the code point encoded at **s and advances *s to point to the next
// character, so it can be called repeatedly in a loop.
//
// No validation is performed: overlong forms, surrogates and stray
// continuation bytes are decoded as-is. A sequence cut short by a
// non-continuation byte stops there and returns the bits gathered so far.
// *s always advances by at least one byte, including when **s is the
// terminating NUL (in which case 0 is returned).
int decode_code_point(char **s) {
  // Read the lead byte as unsigned: plain `char` may be signed, and shifting
  // or bit-counting a negative value is undefined behaviour.
  unsigned char lead = (unsigned char)**s;

  // Count the leading 1 bits of the lead byte.
  // k = 0 for one-byte code points; otherwise, k = #total bytes.
  int k = 0;
  for (unsigned bit = 0x80; lead & bit; bit >>= 1) {
    ++k;
  }

  int mask = (1 << (8 - k)) - 1;  // The payload bits of the lead byte.
  int value = lead & mask;

  // Step past the lead byte, then fold in up to k - 1 continuation bytes,
  // each contributing its low 6 bits.
  for (++(*s), --k; k > 0 && IS_CONT(**s); --k, ++(*s)) {
    value <<= 6;
    value += (**s & 0x3F);
  }
  return value;
}
// Writes the UTF-8 encoding of `code` at *s and advances *s past the bytes
// written, so it can be called repeatedly in a loop.
//
// Nothing is ever written at or beyond `end`. If the encoding does not fit in
// the remaining space, only its leading bytes are written and *s stops at
// `end`. A `code` outside 0..0x10FFFF is not encodable: nothing is written
// and *s is left unchanged.
void encode_code_point(char **s, char *end, int code) {
  // Reject values that would need more than 4 bytes (and so overrun `val`)
  // or that are negative.
  if (code < 0 || code > 0x10FFFF) {
    return;
  }

  unsigned char val[4];      // Encoded bytes, stored last byte first.
  int lead_byte_max = 0x7F;  // Largest payload the lead byte can still hold.
  int val_index = 0;

  // Peel off 6-bit continuation payloads from the low end until what remains
  // fits in the lead byte. The lead byte's capacity drops from 7 bits to 5
  // for a two-byte sequence, then by one bit per extra byte.
  while (code > lead_byte_max) {
    val[val_index++] = (unsigned char)((code & 0x3F) | 0x80);
    code >>= 6;
    lead_byte_max >>= (val_index == 1 ? 2 : 1);
  }

  // Lead byte: payload bits plus the length prefix (0, 110, 1110 or 11110).
  // The prefix is built in unsigned arithmetic because left-shifting a
  // negative int is undefined behaviour.
  val[val_index++] = (unsigned char)((unsigned)(code & lead_byte_max) |
                                     (~(unsigned)lead_byte_max << 1));

  // Emit in reverse storage order (lead byte first), stopping at `end`.
  while (val_index-- && *s < end) {
    **s = (char)val[val_index];
    (*s)++;
  }
}
// Splits `code` into a UTF-16 surrogate pair when it lies above the Basic
// Multilingual Plane.
//
// Returns 0 if no split was needed (code <= 0xFFFF); *surr1 and *surr2 are
// then left untouched. Otherwise returns 1 with the high surrogate in *surr1
// and the low surrogate in *surr2. `code` is expected to be <= 0x10FFFF.
int split_into_surrogates(int code, int *surr1, int *surr2) {
  if (code <= 0xFFFF) {
    return 0;
  }

  *surr2 = 0xDC00 | (code & 0x3FF);  // Low surrogate: the low 10 bits.
  code >>= 10;                       // Drop the low 10 bits.

  // If `code` now has low bits "uuu uuxx xxxx", then the bits of *surr1 are
  // "1101 10ww wwxx xxxx" where wwww = (uuuuu - 1). Subtracting 0x40 is what
  // takes 1 off the uuuuu field.
  *surr1 = 0xD800 | ((code & 0x7FF) - 0x40);
  return 1;
}
// Reassembles code points from a stream of UTF-16 code units. Call it in a
// loop, passing every unit through *code in order. Start *old at 0; this
// function updates *old for you - don't change it after initialization.
//
// Returns 0 when *code is the 1st (high) half of a surrogate pair, meaning
// there is nothing to use yet; otherwise returns 1 and *code holds the final
// code point.
//
// A high surrogate that is not followed by a low surrogate is dropped, and the
// unit that follows it is processed on its own. A low surrogate with no
// pending high surrogate is passed through unchanged.
int join_from_surrogates(int *old, int *code) {
  if (*old) {
    int high = *old;
    *old = 0;
    if ((*code & 0xFC00) == 0xDC00) {
      // Undo the split: restore the 0x40 offset taken off the high half and
      // append the low 10 bits. The joined value is final; it must not be
      // re-tested as a surrogate (e.g. U+1D800 has the 0xD800 bits set).
      *code = (((high & 0x3FF) + 0x40) << 10) + (*code & 0x3FF);
      return 1;
    }
    // Unpaired high surrogate: fall through and treat *code by itself.
  }

  // High surrogates are exactly 0xD800..0xDBFF. Masking with 0xFC00 (rather
  // than 0xD800) keeps code points such as U+FB01 or U+FFFD from matching.
  if ((*code & 0xFC00) == 0xD800) {
    *old = *code;
    return 0;
  }
  return 1;
}
@tylerneylon There is a reason I don't check the continuation byte with something like IS_CONT(). A naïve approach is to use a bitwise AND and a compare, as you did, but when you are checking the marker bits and extracting the remaining bits for the value at the same time, a subtraction and a compare can yield smaller assembly code.
It also makes things simpler.
@Explorer09 thanks for the snapshot of your code and the notes. Of course many people will appreciate error-checking code! 👍
@tylerneylon
The interesting thing is that neither ChatGPT nor Google Gemini, both of which I tried, generated code as compact as mine.
My version compiles to 144 bytes with x86-64 GCC.
ChatGPT
https://chatgpt.com/share/0c457058-2e4a-44bb-93cd-cbd06334039a
Google Gemini
https://g.co/gemini/share/d3c4ed05bbcf
@tylerneylon I wrote my own version of a UTF-8 decode function before making suggestions here.
Your code can be quite compact if the only goal is to decode valid UTF-8 sequences (as there is no error checking at all). My version performs error checking while still being compact enough to be included in applications.
I'm not ready to open a Gist page for this, but here is a sneak peek: