Skip to content

Instantly share code, notes, and snippets.

@Bueddl
Created May 12, 2017 07:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Bueddl/85b99bc84b5e99f7b2993af149ae07ea to your computer and use it in GitHub Desktop.
Save Bueddl/85b99bc84b5e99f7b2993af149ae07ea to your computer and use it in GitHub Desktop.
utf8 encoding test
#include <iostream>
#include <cstdint>
#include <cstdio>
#include <cassert>
// (C) 2017 by Sebastian 'bueddl' Büttner
// Asserts that `codepoint` lies in one of the character ranges this tool
// treats as the primary requirement for validity in XML:
//   U+0001  .. U+D7FF
//   U+E000  .. U+FFFD
//   U+10000 .. U+10FFFF
// Anything else (U+0000, U+D800..U+DFFF, U+FFFE, U+FFFF, values above
// U+10FFFF) triggers the assertion. The check is an `assert`, so it is
// compiled out when NDEBUG is defined.
void checkCodepoint(wchar_t codepoint)
{
    const bool inLowerRange = 0x1 <= codepoint && codepoint <= 0xD7FF;
    const bool inUpperBmpRange = 0xE000 <= codepoint && codepoint <= 0xFFFD;
    const bool inSupplementaryRange = 0x10000 <= codepoint && codepoint <= 0x10FFFF;

    const bool isAllowed = inLowerRange || inUpperBmpRange || inSupplementaryRange;
    assert(isAllowed);
    (void)isAllowed; // keeps NDEBUG builds free of unused-variable warnings
}
int main()
{
while (!std::cin.eof()) {
wchar_t codepoint = 0;
wchar_t ch = std::cin.get();
if ((ch & 0200) != 0200) {
// u+0000 - u+007f
codepoint = ch;
std::printf("U+%02x\n", codepoint);
checkCodepoint(codepoint);
continue;
}
ch <<= 6;
ch |= std::cin.get() & 077;
if ((ch & 034'000) != 034'000) {
// u+0080 - u+07ff
codepoint = ch & ~034'000;
std::printf("U+%04x\n", codepoint);
checkCodepoint(codepoint);
continue;
}
ch <<= 6;
ch |= std::cin.get() & 077;
if ((ch & 03'600'000) != 03'600'000) {
// u+0800 - u+ffff
codepoint = ch & ~03'600'000;
std::printf("U+%04x\n", codepoint);
checkCodepoint(codepoint);
continue;
}
ch <<= 6;
// u+10000 - u+10ffff
codepoint = ch |= std::cin.get() & 077 & ~0370'000'000;
std::printf("U+%06x\n", codepoint);
checkCodepoint(codepoint);
continue;
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment