Skip to content

Instantly share code, notes, and snippets.

@ryutorion
Created May 26, 2020 14:31
Show Gist options
  • Save ryutorion/c6c6e3e3f45de5261de55a3407ec5d21 to your computer and use it in GitHub Desktop.
Save ryutorion/c6c6e3e3f45de5261de55a3407ec5d21 to your computer and use it in GitHub Desktop.
#include <cstdint>
#include <cstdio>
// Reports whether a byte is a UTF-8 continuation ("tail") byte, i.e. one whose
// value lies in 0x80..0xBF (bit pattern 10xxxxxx).
inline bool isUTF8Tail(uint8_t c)
{
    // Masking the top two bits and comparing against 10 is the same test as
    // the inclusive range check 0x80 <= c <= 0xBF.
    return (c & 0xC0) == 0x80;
}
// Decodes the single UTF-8 encoded code point at the start of a NUL-terminated
// string into a UTF-32 value.
//
// Validation follows the Unicode "well-formed UTF-8 byte sequences" table:
// stray continuation bytes, overlong encodings, UTF-16 surrogates
// (U+D800..U+DFFF) and values above U+10FFFF are all rejected.
//
// Parameters:
//   p_utf8_str  NUL-terminated UTF-8 text; must not be null.
//   c           Receives the decoded code point on success. It is left
//               untouched when decoding fails.
//
// Returns the number of bytes consumed (1 to 4), or -1 when the bytes at
// p_utf8_str do not form a well-formed sequence. A leading NUL byte decodes
// as U+0000 with a length of 1.
int convertUTF8CharToUTF32Char(const char * p_utf8_str, char32_t & c)
{
    // Inclusive range test; every rule of the well-formed table is a byte range.
    const auto inRange = [](unsigned value, unsigned low, unsigned high)
    {
        return low <= value && value <= high;
    };

    const uint8_t c0 = static_cast<uint8_t>(p_utf8_str[0]);
    if(c0 <= 0x7F)
    {
        c = c0;
        return 1;
    }

    // 0x80..0xBF: continuation byte without a lead byte.
    // 0xC0, 0xC1: would only produce overlong encodings of ASCII.
    // 0xF5..0xFF: would encode values above U+10FFFF.
    if(c0 < 0xC2 || 0xF4 < c0)
    {
        return -1;
    }

    // The lead byte fixes the sequence length and, for four special lead
    // bytes, narrows the range permitted for the second byte.
    int length = 0;
    unsigned c1_min = 0x80;
    unsigned c1_max = 0xBF;
    if(c0 <= 0xDF)
    {
        length = 2;
    }
    else if(c0 <= 0xEF)
    {
        length = 3;
        if(c0 == 0xE0)
        {
            c1_min = 0xA0; // below this the encoding would be overlong
        }
        else if(c0 == 0xED)
        {
            c1_max = 0x9F; // above this lies the surrogate range
        }
    }
    else
    {
        length = 4;
        if(c0 == 0xF0)
        {
            c1_min = 0x90; // below this the encoding would be overlong
        }
        else if(c0 == 0xF4)
        {
            c1_max = 0x8F; // above this the value exceeds U+10FFFF
        }
    }

    // Each byte is validated before the next one is read. A valid byte is
    // never NUL, so the following index is still inside the string (at worst
    // it is the terminator) and truncated input is never read past its end.
    const uint8_t c1 = static_cast<uint8_t>(p_utf8_str[1]);
    if(!inRange(c1, c1_min, c1_max))
    {
        return -1;
    }
    if(length == 2)
    {
        c = (static_cast<char32_t>(c0 & 0x1F) << 6)
          | static_cast<char32_t>(c1 & 0x3F);
        return 2;
    }

    const uint8_t c2 = static_cast<uint8_t>(p_utf8_str[2]);
    if(!inRange(c2, 0x80, 0xBF))
    {
        return -1;
    }
    if(length == 3)
    {
        c = (static_cast<char32_t>(c0 & 0x0F) << 12)
          | (static_cast<char32_t>(c1 & 0x3F) << 6)
          | static_cast<char32_t>(c2 & 0x3F);
        return 3;
    }

    const uint8_t c3 = static_cast<uint8_t>(p_utf8_str[3]);
    if(!inRange(c3, 0x80, 0xBF))
    {
        return -1;
    }
    c = (static_cast<char32_t>(c0 & 0x07) << 18)
      | (static_cast<char32_t>(c1 & 0x3F) << 12)
      | (static_cast<char32_t>(c2 & 0x3F) << 6)
      | static_cast<char32_t>(c3 & 0x3F);
    return 4;
}
// Self-check: decodes a UTF-8 literal one code point at a time and compares
// each result with the matching element of the equivalent UTF-32 literal,
// printing "match" or "unmatch" per code point.
//
// Returns 0 once the whole string has been walked, or -1 as soon as the
// decoder reports a non-positive length.
int main(int argc, char * argv[])
{
    char s[] = u8"こんにちは";
    char32_t s32[] = U"こんにちは";

    const char * cursor = s;
    for(int index = 0; *cursor != '\0'; ++index)
    {
        char32_t decoded;
        const int consumed = convertUTF8CharToUTF32Char(cursor, decoded);
        if(consumed <= 0)
        {
            return -1;
        }

        const bool same = (decoded == s32[index]);
        printf(same ? "s32[%d] match\n" : "s32[%d] unmatch\n", index);

        // Step over exactly the bytes the decoder consumed.
        cursor += consumed;
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment