Created
November 8, 2015 07:09
-
-
Save mikeash/1991612528d706f7cc4a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Result of decoding one UTF-8 sequence.
 */
struct DecodeResult {
    uint32_t codepoint;     /* decoded code point, or U+FFFD for malformed input */
    uint32_t decodedBytes;  /* number of input bytes consumed; always >= 1 */
};

/*
 * Decodes the single UTF-8 sequence that starts at utf8[0].
 *
 * Only the bytes belonging to the sequence are read, and reading stops at the
 * first byte that is not a continuation byte (10xxxxxx). Because a NUL byte is
 * never a continuation byte, a truncated sequence at the end of a
 * NUL-terminated string is neither read past nor reported as longer than the
 * bytes actually present.
 *
 * Malformed input yields U+FFFD (REPLACEMENT CHARACTER):
 *   - a byte that cannot start a sequence (10xxxxxx or 11111xxx) consumes
 *     one byte;
 *   - a sequence cut short by a non-continuation byte consumes the lead byte
 *     plus the continuation bytes that were valid, leaving the offending byte
 *     for the next call.
 *
 * Overlong encodings, surrogates and values above U+10FFFF are not rejected.
 */
static inline struct DecodeResult DecodeOneUTF8(const uint8_t *utf8) {
    /* Sequence length indexed by the lead byte's high nibble.
     * Entries 0x8..0xb (10xx, continuation bytes) stay 0 = invalid lead. */
    static const uint8_t byteCountTable[16] = {
        // 0xxx
        [0x0] = 1,
        [0x1] = 1,
        [0x2] = 1,
        [0x3] = 1,
        [0x4] = 1,
        [0x5] = 1,
        [0x6] = 1,
        [0x7] = 1,
        // 110x
        [0xc] = 2,
        [0xd] = 2,
        // 1110
        [0xe] = 3,
        // 1111
        [0xf] = 4
    };
    /* Payload bits carried by the lead byte, indexed by sequence length.
     * A 4-byte lead is 11110xxx, i.e. three payload bits (0x07). */
    static const uint8_t leadPayloadMask[5] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07 };

    uint8_t byteOne = utf8[0];
    uint8_t byteCount = byteCountTable[byteOne >> 4];

    /* 0xf8..0xff share the 0xf nibble but are not valid UTF-8 lead bytes. */
    if (byteCount == 0 || byteOne >= 0xf8) {
        return (struct DecodeResult){ 0xFFFDu, 1 };
    }

    uint32_t codepoint = (uint32_t)(byteOne & leadPayloadMask[byteCount]);
    for (uint8_t i = 1; i < byteCount; i++) {
        uint8_t next = utf8[i];
        if ((next & 0xc0) != 0x80) {
            /* Truncated sequence: consume only what was valid so far. */
            return (struct DecodeResult){ 0xFFFDu, i };
        }
        codepoint = (codepoint << 6) | (uint32_t)(next & 0x3f);
    }
    return (struct DecodeResult){ codepoint, byteCount };
}
size_t DecodeUTF8(const uint8_t *utf8, uint32_t *destination) { | |
size_t length = 0; | |
while(*utf8 != '\0') { | |
struct DecodeResult result = DecodeOneUTF8(utf8); | |
*destination++ = result.codepoint; | |
utf8 += result.decodedBytes; | |
length++; | |
} | |
return length; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment