Skip to content

Instantly share code, notes, and snippets.

@dkosmari
Created May 25, 2024 03:32
Show Gist options
  • Save dkosmari/2bcb33a47cf2b137bd9549c9d89dfba2 to your computer and use it in GitHub Desktop.
Save dkosmari/2bcb33a47cf2b137bd9549c9d89dfba2 to your computer and use it in GitHub Desktop.
C++ code to convert UTF-16 encoding to UTF-8 encoding.
// Converts a UTF-16 encoded string to UTF-8.
//
// Parameters:
//   input - sequence of UTF-16 code units (native byte order).
//
// Returns:
//   The UTF-8 encoding of `input`. Conversion stops early, returning whatever
//   was converted up to that point, when it encounters:
//     - a U+0000 code unit (treated as a null terminator),
//     - a low surrogate that is not preceded by a high surrogate,
//     - a high surrogate that is not followed by a low surrogate
//       (including a high surrogate at the very end of the input).
//
// TODO: try using std::c16rtomb()
std::string
to_utf8(const std::u16string& input)
{
    std::string output;
    // Every code unit yields at least one byte, so this is a cheap lower bound.
    output.reserve(input.size());

    for (auto i = input.begin(); i != input.end(); ++i) {
        char32_t codepoint = *i;

        // Only 0xD800..0xDFFF are surrogates; everything else in the BMP
        // (including 0xE000..0xFFFF) is a complete code point on its own.
        if ((codepoint & 0xf800) == 0xd800) {
            const char32_t high_surrogate = codepoint;
            if ((high_surrogate & 0xfc00) != 0xd800)
                break; // error: low surrogate without a preceding high surrogate
            if (++i == input.end())
                break; // error: reached the end too soon.
            const char32_t low_surrogate = *i;
            if ((low_surrogate & 0xfc00) != 0xdc00)
                break; // error: invalid low surrogate
            // A surrogate pair carries a 20-bit offset into the supplementary
            // planes, which start at U+10000.
            codepoint = 0x10000
                + (((high_surrogate & 0x3ff) << 10)
                   | (low_surrogate & 0x3ff));
        }

        if (!codepoint)
            break; // null terminator

        // now encode as UTF-8
        if (codepoint <= 0x7f) {
            output.push_back(static_cast<char>(codepoint));
        } else if (codepoint <= 0x7ff) {
            output.push_back(static_cast<char>(0xc0 | (codepoint >> 6)));
            output.push_back(static_cast<char>(0x80 | (codepoint & 0x3f)));
        } else if (codepoint <= 0xffff) {
            output.push_back(static_cast<char>(0xe0 | (codepoint >> 12)));
            output.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
            output.push_back(static_cast<char>(0x80 | (codepoint & 0x3f)));
        } else {
            output.push_back(static_cast<char>(0xf0 | (codepoint >> 18)));
            output.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f)));
            output.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
            output.push_back(static_cast<char>(0x80 | (codepoint & 0x3f)));
        }
    }
    return output;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment