// Created August 1, 2023 18:44
// Save syoyo/42fe2874cc6f86d0f281f1f523cc98ab to your computer and use it in GitHub Desktop.
// This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
// Learn more about bidirectional Unicode characters.
#include <cstdint>
#include <iostream>
#include <string>
// Decode a single UTF-8 encoded code point starting at `cp`.
//
// `cp` must point into a NUL-terminated buffer. Continuation bytes are
// examined one at a time and decoding stops at the first byte that is not a
// continuation byte (which includes the terminator), so the function never
// reads past the end of the string.
//
// On success the decoded code point is returned and `cp` is advanced past the
// bytes that were consumed. A NUL byte decodes as code point 0 and is
// consumed like any other single-byte character.
//
// On failure -1 is returned, `cp` is left unchanged and, when `errMsg` is not
// null, a description of the problem is stored in `*errMsg`. Sequences that
// are structurally valid but ill-formed (overlong encodings, UTF-16
// surrogates, values above U+10FFFF) are rejected as well.
static int _ReadUTF8(char const *&cp, std::string *errMsg)
{
    // Record `msg` for the caller (if diagnostics were requested) and yield
    // the error return value.
    auto fail = [errMsg](const std::string &msg) {
        if (errMsg) {
            *errMsg = msg;
        }
        return -1;
    };

    // Return a byte with the high `n` bits set, rest clear.
    auto highBits = [](int n) {
        return static_cast<unsigned char>(((1 << n) - 1) << (8 - n));
    };

    // Return true if `ch` is a continuation byte (bit pattern 10xxxxxx).
    auto isContinuation = [&highBits](unsigned char ch) {
        return (ch & highBits(2)) == highBits(1);
    };

    // Work on the unsigned value so the result does not depend on whether
    // plain `char` is signed on this platform.
    const unsigned char lead = static_cast<unsigned char>(*cp);

    // Check for single-character code.
    if ((lead & highBits(1)) == 0) {
        ++cp;
        return lead;
    }

    // Smallest code point that genuinely requires an N-byte encoding,
    // indexed by N. Anything smaller is an overlong encoding.
    static const int minCodePoint[] = {0, 0, 0x80, 0x800, 0x10000};

    // Check for 2, 3, or 4-byte code.
    for (int i = 2; i <= 4; ++i) {
        // This is an N-byte code if the high-order N+1 bits are N 1s
        // followed by a single 0.
        if ((lead & highBits(i + 1)) != highBits(i)) {
            continue;
        }

        // Payload bits of the lead byte.
        int ret = lead & ~highBits(i + 1);

        // The following N-1 bytes must be "continuation bytes".
        for (int j = 1; j != i; ++j) {
            const unsigned char next = static_cast<unsigned char>(cp[j]);
            if (!isContinuation(next)) {
                static char const *const ordinalWords[] = {
                    "first", "second", "third", "fourth"
                };
                return fail(std::to_string(i) + "-byte UTF-8 code point lacks "
                            + std::string(ordinalWords[j - 1])
                            + " continuation byte");
            }
            // Each continuation byte contributes its low six bits.
            ret = (ret << 6) | (next & ~highBits(2));
        }

        if (ret < minCodePoint[i]) {
            return fail("overlong " + std::to_string(i)
                        + "-byte UTF-8 encoding");
        }
        if (ret >= 0xD800 && ret <= 0xDFFF) {
            return fail("UTF-8 encoded surrogate code point");
        }
        if (ret > 0x10FFFF) {
            return fail("UTF-8 code point out of range");
        }

        cp += i;
        return ret;
    }

    // Stray continuation byte, or a lead byte announcing more than 4 bytes.
    return fail("invalid UTF-8 code point byte");
}
// Convert a string holding exactly one UTF-8 encoded character into its
// Unicode code point.
//
// Returns ~0u ("invalid") when `s` is not a single well-formed UTF-8
// sequence: empty or longer than four bytes, a lead byte that does not match
// the string length, a trailing byte that is not a continuation byte, an
// overlong encoding, a UTF-16 surrogate, or a value above U+10FFFF.
inline uint32_t utf8_code(const std::string &s) {
    constexpr uint32_t kInvalid = ~0u;

    if (s.empty() || (s.size() > 4)) {
        return kInvalid;
    }

    const unsigned char s0 = static_cast<unsigned char>(s[0]);

    uint32_t code = 0;     // payload bits accumulated so far
    uint32_t minCode = 0;  // smallest value that needs this many bytes

    switch (s.size()) {
    case 1:
        // 7 bit: 0xxx-xxxx
        if ((s0 & 0x80) != 0) {
            return kInvalid;
        }
        return s0;
    case 2:
        // 11 bit: 110x-xxxx 10xx-xxxx
        if ((s0 & 0xe0) != 0xc0) {
            return kInvalid;
        }
        code = s0 & 0x1f;
        minCode = 0x80;
        break;
    case 3:
        // 16 bit: 1110-xxxx 10xx-xxxx 10xx-xxxx
        if ((s0 & 0xf0) != 0xe0) {
            return kInvalid;
        }
        code = s0 & 0x0f;
        minCode = 0x800;
        break;
    default:
        // 21 bit: 1111-0xxx 10xx-xxxx 10xx-xxxx 10xx-xxxx
        if ((s0 & 0xf8) != 0xf0) {
            return kInvalid;
        }
        code = s0 & 0x07;
        minCode = 0x10000;
        break;
    }

    // Every remaining byte must be a continuation byte (10xx-xxxx) and
    // contributes its low six bits.
    for (std::string::size_type i = 1; i < s.size(); ++i) {
        const unsigned char b = static_cast<unsigned char>(s[i]);
        if ((b & 0xc0) != 0x80) {
            return kInvalid;
        }
        code = (code << 6) | static_cast<uint32_t>(b & 0x3f);
    }

    // Reject overlong encodings, surrogates and out-of-range values.
    const bool overlong = code < minCode;
    const bool surrogate = (code >= 0xd800) && (code <= 0xdfff);
    const bool outOfRange = code > 0x10ffff;
    if (overlong || surrogate || outOfRange) {
        return kInvalid;
    }

    return code;
}
int main() { | |
std::string s = u8"響"; | |
char const *sptr = s.c_str(); | |
std::string err; | |
int code = _ReadUTF8(sptr, &err); | |
std::cout << code << "\n"; // 38911 | |
std::cout << err << "\n"; | |
uint32_t code1 = utf8_code(s); // 38911 | |
std::cout << code1 << "\n"; | |
return 0; | |
} |
// Sign up for free to join this conversation on GitHub.
// Already have an account? Sign in to comment.