Skip to content

Instantly share code, notes, and snippets.

@wcho21
Last active October 21, 2023 16:41
Show Gist options
  • Save wcho21/5bb039c1312cfe2aa0dc3c913d8dee75 to your computer and use it in GitHub Desktop.
Save wcho21/5bb039c1312cfe2aa0dc3c913d8dee75 to your computer and use it in GitHub Desktop.
Implementing UTF encodings

Implementing UTF encodings

To compile, run `make`. To execute the compiled test programs, run `make run`.

# Default target: build the three test executables.
.PHONY: all
all: utf-8.out utf-16.out utf-32.out
# Pattern rule: compile NAME.cpp into NAME.out with g++ in C++20 mode
# (-std=c++2a) with -Wall -Wextra warnings enabled.
%.out: %.cpp
g++ -std=c++2a -Wall -Wextra -o $@ $<
# Run the three executables in sequence. This target does not depend on
# `all`, so the executables must already have been built.
.PHONY: run
run:
./utf-8.out
./utf-16.out
./utf-32.out
#include <algorithm>
#include <cstdint>
#include <cassert>
#ifdef _WIN64
#include <windows.h> // for Windows
#endif
using std::begin, std::end, std::equal;
typedef uint32_t Codepoint;
/**
 * Encodes one Unicode code point as a NUL-terminated UTF-16 sequence.
 *
 * @param cp  Code point to encode.
 * @param buf Output buffer. Up to three code units are written (a surrogate
 *            pair plus the terminating NUL), so it must hold at least three.
 *
 * Values that are not Unicode scalar values -- the surrogate range
 * U+D800..U+DFFF and anything above U+10FFFF -- have no well-formed UTF-16
 * representation; U+FFFD (REPLACEMENT CHARACTER) is written for them.
 */
void encodeUtf16(const Codepoint cp, char16_t *buf) {
  const bool is_surrogate = cp >= 0xD800 && cp <= 0xDFFF;
  const bool out_of_range = cp > 0x10FFFF;
  if (is_surrogate || out_of_range) {
    // return U+FFFD in UTF-16
    buf[0] = 0xFFFD;
    buf[1] = u'\0';
    return;
  }

  if (cp <= 0xFFFF) {
    // Basic Multilingual Plane: the code unit is the code point itself.
    buf[0] = static_cast<char16_t>(cp);
    buf[1] = u'\0';
    return;
  }

  // encode         x xxxx yyyy yyzz zzzz zzzz
  // into units     1101 10ww wwyy yyyy   1101 11zz zzzz zzzz
  // with wwww = xxxxx - 1
  // cp <= 0x10FFFF guarantees xxxxx is in 1..16, so wwww fits in 4 bits.
  const Codepoint x = (cp & 0b1'1111'0000'0000'0000'0000) >> 16;
  const Codepoint y = (cp & 0b0'0000'1111'1100'0000'0000) >> 10;
  const Codepoint z = (cp & 0b0'0000'0000'0011'1111'1111);
  const Codepoint w = x - 1;
  buf[0] = static_cast<char16_t>(0b1101'1000'0000'0000 | (w << 6) | y);
  buf[1] = static_cast<char16_t>(0b1101'1100'0000'0000 | z);
  buf[2] = u'\0';
}
/**
 * Decodes the code point at the start of a UTF-16 sequence.
 *
 * @param buf Input code units. buf[1] is read only when buf[0] is a high
 *            surrogate.
 * @return buf[0] itself when it is not a surrogate; the combined code point
 *         when buf[0]/buf[1] form a high/low surrogate pair; otherwise
 *         U+FFFD (lone low surrogate, or high surrogate without a low one).
 */
Codepoint decodeUtf16(const char16_t *buf) {
  const Codepoint lead = buf[0];
  const bool lead_is_high = lead >= 0xD800 && lead <= 0xDBFF;
  const bool lead_is_low = lead >= 0xDC00 && lead <= 0xDFFF;

  if (!lead_is_high && !lead_is_low) {
    return lead;
  }

  if (lead_is_high) {
    const Codepoint trail = buf[1];
    const bool trail_is_low = trail >= 0xDC00 && trail <= 0xDFFF;
    if (trail_is_low) {
      // The 10 payload bits of each surrogate form a 20-bit offset from
      // U+10000.
      return 0x10000 + ((lead - 0xD800) << 10) + (trail - 0xDC00);
    }
  }

  return 0xFFFD;
}
// test code
// Round-trips a few code points through encodeUtf16/decodeUtf16 and checks
// the encoded code units against the compiler's own UTF-16 literals.
int main() {
#ifdef _WIN64
  SetConsoleOutputCP(CP_UTF8); // for Windows
#endif

  struct Sample {
    std::uint32_t code_point;
    const char16_t *literal;  // expected encoding
    int unit_count;           // number of code units compared
  };
  const Sample samples[] = {
      {0x41, u"A", 1},
      {0xA9, u"©", 1},
      {0xAC00, u"가", 1},
      {0x1F602, u"😂", 2},
  };

  char16_t encoded[3];
  for (const Sample &sample : samples) {
    encodeUtf16(sample.code_point, encoded);
    assert(std::equal(encoded, encoded + sample.unit_count, sample.literal));
    assert(decodeUtf16(encoded) == sample.code_point);
  }
}
#include <algorithm>
#include <cstdint>
#include <cassert>
#ifdef _WIN64
#include <windows.h> // for Windows
#endif
using std::begin, std::end, std::equal;
typedef uint32_t Codepoint;
/**
 * Encodes one Unicode code point as a NUL-terminated UTF-32 sequence.
 *
 * @param cp  Code point to encode.
 * @param buf Output buffer; exactly two code units are written (the code
 *            point and the terminating NUL).
 *
 * Values that are not Unicode scalar values -- the surrogate range
 * U+D800..U+DFFF and anything above U+10FFFF -- are replaced with U+FFFD
 * (REPLACEMENT CHARACTER).
 */
void encodeUtf32(const Codepoint cp, char32_t *buf) {
  const bool is_surrogate = cp >= 0xD800 && cp <= 0xDFFF;
  const bool out_of_range = cp > 0x10FFFF;
  buf[0] = (is_surrogate || out_of_range) ? static_cast<char32_t>(0xFFFD)
                                          : static_cast<char32_t>(cp);
  buf[1] = U'\0';
}
/**
 * Decodes the code point at the start of a UTF-32 sequence.
 *
 * @param buf Input code units; only buf[0] is read.
 * @return buf[0] when it is a Unicode scalar value; U+FFFD when it is a
 *         surrogate (U+D800..U+DFFF) or lies above U+10FFFF.
 */
Codepoint decodeUtf32(const char32_t *buf) {
  const Codepoint cp = buf[0];
  const bool is_surrogate = cp >= 0xD800 && cp <= 0xDFFF;
  const bool out_of_range = cp > 0x10FFFF;
  if (is_surrogate || out_of_range) {
    return 0xFFFD;
  }
  return cp;
}
// test code
// Round-trips a few code points through encodeUtf32/decodeUtf32 and checks
// the encoded code units against the compiler's own UTF-32 (U"...") literals.
int main() {
#ifdef _WIN64
  SetConsoleOutputCP(CP_UTF8); // for Windows
#endif
  char32_t utf32_buffer[2];

  encodeUtf32(0x41, utf32_buffer);
  // U"A" (char32_t), not u"A" (char16_t): compare like with like.
  assert(std::equal(utf32_buffer, utf32_buffer+1, U"A"));
  assert(decodeUtf32(utf32_buffer) == 0x41);

  encodeUtf32(0xA9, utf32_buffer);
  assert(std::equal(utf32_buffer, utf32_buffer+1, U"©"));
  assert(decodeUtf32(utf32_buffer) == 0xA9);

  encodeUtf32(0xAC00, utf32_buffer);
  assert(std::equal(utf32_buffer, utf32_buffer+1, U"가"));
  assert(decodeUtf32(utf32_buffer) == 0xAC00);

  encodeUtf32(0x1F602, utf32_buffer);
  // Two units compared here: the code point and the terminating NUL.
  assert(std::equal(utf32_buffer, utf32_buffer+2, U"😂"));
  assert(decodeUtf32(utf32_buffer) == 0x1F602);
}
#include <algorithm>
#include <cstdint>
#include <cassert>
#ifdef _WIN64
#include <windows.h> // for Windows
#endif
using std::begin, std::end, std::equal;
typedef uint32_t Codepoint;
/**
 * Encodes one Unicode code point as a NUL-terminated UTF-8 sequence.
 *
 * @param cp  Code point to encode.
 * @param buf Output buffer. Up to five bytes are written (four encoded bytes
 *            plus the terminating NUL), so it must hold at least five.
 *
 * Values that are not Unicode scalar values -- the surrogate range
 * U+D800..U+DFFF and anything above U+10FFFF -- have no well-formed UTF-8
 * representation; U+FFFD (REPLACEMENT CHARACTER, bytes EF BF BD) is written
 * for them.
 */
void encodeUtf8(const Codepoint cp, char8_t *buf) {
  const bool is_surrogate = cp >= 0xD800 && cp <= 0xDFFF;
  const bool out_of_range = cp > 0x10FFFF;
  if (is_surrogate || out_of_range) {
    // return U+FFFD in UTF-8
    buf[0] = 0xEF;
    buf[1] = 0xBF;
    buf[2] = 0xBD;
    buf[3] = '\0';
    return;
  }

  if (cp < 0x80) {
    // encode       xxx xxxx
    // into bytes  0xxx xxxx
    buf[0] = static_cast<char8_t>(cp);
    buf[1] = '\0';
    return;
  }

  if (cp < 0x800) {
    // encode       yyy yyxx xxxx
    // into bytes  110y yyyy  10xx xxxx
    const Codepoint y = (cp & 0b111'1100'0000) >> 6;
    const Codepoint x = (cp & 0b000'0011'1111);
    buf[0] = static_cast<char8_t>(0b1100'0000 | y);
    buf[1] = static_cast<char8_t>(0b1000'0000 | x);
    buf[2] = '\0';
    return;
  }

  if (cp < 0x10000) {
    // encode      zzzz yyyy yyxx xxxx
    // into bytes  1110 zzzz  10yy yyyy  10xx xxxx
    const Codepoint z = (cp & 0b1111'0000'0000'0000) >> 12;
    const Codepoint y = (cp & 0b0000'1111'1100'0000) >> 6;
    const Codepoint x = (cp & 0b0000'0000'0011'1111);
    buf[0] = static_cast<char8_t>(0b1110'0000 | z);
    buf[1] = static_cast<char8_t>(0b1000'0000 | y);
    buf[2] = static_cast<char8_t>(0b1000'0000 | x);
    buf[3] = '\0';
    return;
  }

  // encode      w wwzz zzzz yyyy yyxx xxxx
  // into bytes  1111 0www  10zz zzzz  10yy yyyy  10xx xxxx
  const Codepoint w = (cp & 0b1'1100'0000'0000'0000'0000) >> 18;
  const Codepoint z = (cp & 0b0'0011'1111'0000'0000'0000) >> 12;
  const Codepoint y = (cp & 0b0'0000'0000'1111'1100'0000) >> 6;
  const Codepoint x = (cp & 0b0'0000'0000'0000'0011'1111);
  buf[0] = static_cast<char8_t>(0b1111'0000 | w);
  buf[1] = static_cast<char8_t>(0b1000'0000 | z);
  buf[2] = static_cast<char8_t>(0b1000'0000 | y);
  buf[3] = static_cast<char8_t>(0b1000'0000 | x);
  buf[4] = '\0';
}
/**
 * Decodes the code point at the start of a UTF-8 sequence.
 *
 * @param buf Input bytes. Continuation bytes are inspected left to right and
 *            inspection stops at the first byte that is not a continuation
 *            byte, so a NUL terminator is never read past.
 * @return The decoded code point, or U+FFFD (REPLACEMENT CHARACTER) when the
 *         sequence is ill-formed: bad lead byte, missing continuation byte,
 *         overlong encoding, encoded surrogate (U+D800..U+DFFF), or a value
 *         above U+10FFFF.
 */
Codepoint decodeUtf8(const char8_t *buf) {
  constexpr Codepoint kReplacement = 0xFFFD;
  const auto is_continuation = [](const char8_t byte) {
    return (byte & 0b1100'0000) == 0b1000'0000;
  };

  const char8_t lead = buf[0];

  if ((lead & 0b1000'0000) == 0) {
    // get         xxx xxxx
    // from bytes 0xxx xxxx
    return static_cast<Codepoint>(lead);
  }

  if ((lead & 0b1110'0000) == 0b1100'0000) {
    // get         xxx xxyy yyyy
    // from bytes 110x xxxx  10yy yyyy
    if (!is_continuation(buf[1])) {
      return kReplacement;
    }
    const Codepoint x = lead & 0b1'1111;
    const Codepoint y = buf[1] & 0b11'1111;
    const Codepoint cp = (x << 6) | y;
    // Values below 0x80 must use the 1-byte form (overlong otherwise).
    return cp >= 0x80 ? cp : kReplacement;
  }

  if ((lead & 0b1111'0000) == 0b1110'0000) {
    // get        xxxx yyyy yyzz zzzz
    // from bytes 1110 xxxx  10yy yyyy  10zz zzzz
    if (!is_continuation(buf[1]) || !is_continuation(buf[2])) {
      return kReplacement;
    }
    const Codepoint x = lead & 0b1111;
    const Codepoint y = buf[1] & 0b11'1111;
    const Codepoint z = buf[2] & 0b11'1111;
    const Codepoint cp = (x << 12) | (y << 6) | z;
    const bool overlong = cp < 0x800;
    const bool is_surrogate = cp >= 0xD800 && cp <= 0xDFFF;
    return (overlong || is_surrogate) ? kReplacement : cp;
  }

  if ((lead & 0b1111'1000) == 0b1111'0000) {
    // get        x xxyy yyyy zzzz zzww wwww
    // from bytes 1111 0xxx  10yy yyyy  10zz zzzz  10ww wwww
    if (!is_continuation(buf[1]) || !is_continuation(buf[2]) ||
        !is_continuation(buf[3])) {
      return kReplacement;
    }
    const Codepoint x = lead & 0b111;
    const Codepoint y = buf[1] & 0b11'1111;
    const Codepoint z = buf[2] & 0b11'1111;
    const Codepoint w = buf[3] & 0b11'1111;
    const Codepoint cp = (x << 18) | (y << 12) | (z << 6) | w;
    const bool overlong = cp < 0x10000;
    const bool out_of_range = cp > 0x10FFFF;
    return (overlong || out_of_range) ? kReplacement : cp;
  }

  // Stray continuation byte or a lead byte of the form 1111 1xxx.
  return kReplacement;
}
// test code
int main() {
#ifdef _WIN64
SetConsoleOutputCP(CP_UTF8); // for Windows
#endif
char8_t utf8_buffer[5];
encodeUtf8(0x41, utf8_buffer);
assert(equal(utf8_buffer, utf8_buffer+2, u8"A"));
assert(decodeUtf8(utf8_buffer) == 0x41);
encodeUtf8(0xA9, utf8_buffer);
assert(equal(utf8_buffer, utf8_buffer+3, u8"©"));
assert(decodeUtf8(utf8_buffer) == 0xA9);
encodeUtf8(0xAC00, utf8_buffer);
assert(equal(utf8_buffer, utf8_buffer+4, u8"가"));
assert(decodeUtf8(utf8_buffer) == 0xAC00);
encodeUtf8(0x1F602, utf8_buffer);
assert(equal(utf8_buffer, utf8_buffer+5, u8"😂"));
assert(decodeUtf8(utf8_buffer) == 0x1F602);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment