Skip to content

Instantly share code, notes, and snippets.

@syoyo
Created August 1, 2023 18:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save syoyo/42fe2874cc6f86d0f281f1f523cc98ab to your computer and use it in GitHub Desktop.
Save syoyo/42fe2874cc6f86d0f281f1f523cc98ab to your computer and use it in GitHub Desktop.
#include <cstdint>
#include <iostream>
#include <string>
/// Decode a single UTF-8 encoded code point starting at `cp`.
///
/// The bytes are read through `cp[0]` .. `cp[3]` at most; reading stops at the
/// first byte that is not a continuation byte, so a NUL-terminated buffer is
/// never read past its terminator.
///
/// @param cp      Pointer to the first byte of the sequence. On success it is
///                advanced past the bytes that were consumed (a NUL byte is
///                decoded as code point 0 and is consumed like any other
///                single-byte code). On failure it is left unchanged.
/// @param errMsg  Optional. When non-null it receives a description of the
///                failure; it is not touched on success.
/// @return The decoded code point, or -1 if the bytes are not well-formed
///         UTF-8 (bad lead byte, missing continuation byte, overlong
///         encoding, encoded surrogate, or value above U+10FFFF).
static int _ReadUTF8(char const *&cp, std::string *errMsg)
{
    // Record the failure reason (when the caller asked for it) and yield the
    // error return value.
    auto fail = [errMsg](const std::string &msg) {
        if (errMsg) {
            *errMsg = msg;
        }
        return -1;
    };

    // Return a byte with the high `n` bits set, rest clear.
    auto highBits = [](int n) {
        return static_cast<unsigned char>(((1 << n) - 1) << (8 - n));
    };

    // Return true if `ch` is a continuation byte (bit pattern 10xxxxxx).
    auto isContinuation = [&highBits](unsigned char ch) {
        return (ch & highBits(2)) == highBits(1);
    };

    // Work on the unsigned value so the bit tests do not depend on whether
    // plain `char` is signed on this platform.
    const unsigned char lead = static_cast<unsigned char>(*cp);

    // Single-byte (ASCII) code: high bit clear.
    if ((lead & highBits(1)) == 0) {
        ++cp;
        return lead;
    }

    // Smallest code point that legitimately needs an N-byte encoding, indexed
    // by N. Anything smaller is an overlong encoding and must be rejected.
    static const int minCodePoint[] = {0, 0, 0x80, 0x800, 0x10000};

    // Check for 2, 3, or 4-byte code.
    for (int i = 2; i <= 4; ++i) {
        // This is an N-byte code if the high-order N+1 bits of the lead byte
        // are N 1s followed by a single 0.
        if ((lead & highBits(i + 1)) != highBits(i)) {
            continue;
        }

        // Payload bits of the lead byte are the low 8-(N+1) bits.
        int ret = lead & (0xFF >> (i + 1));

        // The following N-1 bytes must be continuation bytes, each carrying
        // six payload bits.
        for (int j = 1; j != i; ++j) {
            const unsigned char next = static_cast<unsigned char>(cp[j]);
            if (!isContinuation(next)) {
                static char const *const ordinalWords[] = {
                    "first", "second", "third", "fourth"
                };
                return fail(std::to_string(i) + "-byte UTF-8 code point lacks "
                            + std::string(ordinalWords[j - 1])
                            + " continuation byte");
            }
            ret = (ret << 6) | (next & 0x3F);
        }

        // Well-formedness checks beyond the bit patterns.
        if (ret < minCodePoint[i]) {
            return fail("overlong " + std::to_string(i)
                        + "-byte UTF-8 encoding");
        }
        if (ret >= 0xD800 && ret <= 0xDFFF) {
            return fail("UTF-8 encoded surrogate code point");
        }
        if (ret > 0x10FFFF) {
            return fail("UTF-8 code point out of range");
        }

        cp += i;
        return ret;
    }

    // Lead byte is a stray continuation byte (10xxxxxx) or 11111xxx.
    return fail("invalid UTF-8 code point byte");
}
/// Decode a string that holds exactly one UTF-8 encoded code point.
///
/// The encoding form is selected by the string length (1 to 4 bytes), and the
/// bytes are then checked against that form: the lead byte must carry the
/// matching length prefix, every following byte must be a continuation byte
/// (10xxxxxx), and the decoded value must not be an overlong encoding, a
/// surrogate (U+D800..U+DFFF), or larger than U+10FFFF.
///
/// @param s  The encoded bytes of a single code point.
/// @return The decoded code point, or ~0u if `s` is empty, longer than four
///         bytes, or not a well-formed encoding of one code point.
inline uint32_t utf8_code(const std::string &s) {
    const uint32_t kInvalid = ~0u;

    const std::string::size_type len = s.size();
    if (len == 0 || len > 4) {
        return kInvalid;
    }

    // Per-length description of the lead byte, indexed by (length - 1):
    //   1 byte : 0xxx-xxxx
    //   2 bytes: 110y-yyyx 10xx-xxxx
    //   3 bytes: 1110-yyyy 10yx-xxxx 10xx-xxxx
    //   4 bytes: 1111-0yyy 10yy-xxxx 10xx-xxxx 10xx-xxxx
    static const unsigned char kLeadMask[] = {0x80, 0xe0, 0xf0, 0xf8};
    static const unsigned char kLeadBits[] = {0x00, 0xc0, 0xe0, 0xf0};
    static const unsigned char kPayloadMask[] = {0x7f, 0x1f, 0x0f, 0x07};
    // Smallest code point that legitimately needs this many bytes.
    static const uint32_t kMinCode[] = {0x0, 0x80, 0x800, 0x10000};

    const std::string::size_type form = len - 1;

    const unsigned char lead = static_cast<unsigned char>(s[0]);
    if ((lead & kLeadMask[form]) != kLeadBits[form]) {
        return kInvalid;  // lead byte does not announce a `len`-byte sequence
    }

    uint32_t code = uint32_t(lead & kPayloadMask[form]);
    for (std::string::size_type i = 1; i < len; ++i) {
        const unsigned char b = static_cast<unsigned char>(s[i]);
        if ((b & 0xc0) != 0x80) {
            return kInvalid;  // not a continuation byte
        }
        code = (code << 6) | uint32_t(b & 0x3f);
    }

    if (code < kMinCode[form]) {
        return kInvalid;  // overlong encoding
    }
    if (code >= 0xd800 && code <= 0xdfff) {
        return kInvalid;  // surrogates are not valid scalar values
    }
    if (code > 0x10ffff) {
        return kInvalid;  // beyond the Unicode code space
    }
    return code;
}
/// Demo driver: decodes U+97FF with both decoders and prints the results.
/// Each decoder is expected to print 38911; the line between them is the
/// error message from _ReadUTF8, which stays empty on success.
int main() {
    // UTF-8 bytes of U+97FF, written as escapes rather than a u8"" literal:
    // from C++20 on, u8"" has type const char8_t[] and can no longer
    // initialize a std::string, and escapes also make the bytes independent
    // of the source file's encoding.
    const std::string s = "\xE9\x9F\xBF";
    char const *sptr = s.c_str();
    std::string err;
    const int code = _ReadUTF8(sptr, &err);
    std::cout << code << "\n"; // 38911
    std::cout << err << "\n";  // empty line when decoding succeeded
    const uint32_t code1 = utf8_code(s); // 38911
    std::cout << code1 << "\n";
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment