To compile, run `make`.
Last active
October 21, 2023 16:41
-
-
Save wcho21/5bb039c1312cfe2aa0dc3c913d8dee75 to your computer and use it in GitHub Desktop.
Implementing UTF encodings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*.out |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Default goal: build every UTF encoder/decoder demo.
.PHONY: all
all: utf-8.out utf-16.out utf-32.out

# Compile each .cpp into a matching .out executable.
# -std=c++2a is required because utf-8.cpp uses char8_t.
%.out: %.cpp
	g++ -std=c++2a -Wall -Wextra -o $@ $<

# Run the self-checking demos. Depends on `all` so that `make run` also
# works on a fresh checkout where nothing has been built yet.
.PHONY: run
run: all
	./utf-8.out
	./utf-16.out
	./utf-32.out
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm> | |
#include <cstdint> | |
#include <cassert> | |
#ifdef _WIN64 | |
#include <windows.h> // for Windows | |
#endif | |
using std::begin, std::end, std::equal;

// A Unicode code point, stored as an unsigned 32-bit integer.
using Codepoint = std::uint32_t;
// Encodes a single code point as UTF-16 and appends a terminating zero unit.
//
// cp:  the code point to encode.
// buf: output buffer; up to three code units are written (a surrogate pair
//      plus the terminator), so it must hold at least three char16_t.
//
// Anything that is not a Unicode scalar value -- a surrogate code point
// (U+D800..U+DFFF) or a value above U+10FFFF -- is encoded as U+FFFD.
void encodeUtf16(const Codepoint cp, char16_t *buf) {
  const bool is_surrogate = cp >= 0xD800 && cp <= 0xDFFF;
  const bool beyond_unicode = cp > 0x10FFFF;
  if (is_surrogate || beyond_unicode) {
    // return U+FFFD in UTF-16
    buf[0] = 0xFFFD;
    buf[1] = u'\0';
    return;
  }

  const bool within_16bits = cp < 0x10000;
  if (within_16bits) {
    // A BMP code point is its own single code unit.
    buf[0] = static_cast<char16_t>(cp);
    buf[1] = u'\0';
    return;
  }

  // encode      x xxxx yyyy yyzz zzzz zzzz
  // into units  1101 10ww wwyy yyyy   1101 11zz zzzz zzzz
  // with wwww = xxxxx - 1
  // Subtracting 0x10000 performs the "x - 1" step and leaves a 20-bit value
  // whose top 10 bits go to the lead surrogate and low 10 bits to the trail.
  const Codepoint offset = cp - 0x10000;
  buf[0] = static_cast<char16_t>(0xD800 | (offset >> 10));
  buf[1] = static_cast<char16_t>(0xDC00 | (offset & 0x3FF));
  buf[2] = u'\0';
}
// Decodes the first code point of the UTF-16 sequence starting at buf.
//
// buf: pointer to at least one code unit; a second unit is read only when
//      the first one is a lead (high) surrogate.
//
// Returns the decoded code point, or U+FFFD when the first unit is an
// unpaired surrogate (a lead without a following trail, or a lone trail).
Codepoint decodeUtf16(const char16_t *buf) {
  // The top six bits identify a surrogate: 1101 10.. lead, 1101 11.. trail.
  const auto is_lead = [](char16_t unit) { return (unit & 0xFC00) == 0xD800; };
  const auto is_trail = [](char16_t unit) { return (unit & 0xFC00) == 0xDC00; };

  const char16_t first = buf[0];

  if (is_lead(first) && is_trail(buf[1])) {
    // get         w wwww yyyy yyzz zzzz zzzz
    // from units  1101 10xx xxyy yyyy   1101 11zz zzzz zzzz
    // with wwwww = xxxx + 1
    const Codepoint high_bits = first & 0x3FF;
    const Codepoint low_bits = buf[1] & 0x3FF;
    // Adding 0x10000 performs the "x + 1" step.
    return 0x10000 + ((high_bits << 10) | low_bits);
  }

  if (!is_lead(first) && !is_trail(first)) {
    // A non-surrogate unit is the code point itself.
    return first;
  }

  return 0xFFFD;
}
// test code
// Round-trips sample code points through encodeUtf16/decodeUtf16 and checks
// the encoded units against the compiler's own u"" literals. Produces no
// output; a failed assert aborts the program.
int main() {
#ifdef _WIN64
  SetConsoleOutputCP(CP_UTF8);  // for Windows
#endif

  char16_t utf16_buffer[3];  // surrogate pair + terminator

  encodeUtf16(0x41, utf16_buffer);
  assert(equal(utf16_buffer, utf16_buffer + 1, u"A"));
  assert(decodeUtf16(utf16_buffer) == 0x41);

  encodeUtf16(0xA9, utf16_buffer);
  assert(equal(utf16_buffer, utf16_buffer + 1, u"©"));
  assert(decodeUtf16(utf16_buffer) == 0xA9);

  encodeUtf16(0xAC00, utf16_buffer);
  assert(equal(utf16_buffer, utf16_buffer + 1, u"가"));
  assert(decodeUtf16(utf16_buffer) == 0xAC00);

  encodeUtf16(0x1F602, utf16_buffer);
  assert(equal(utf16_buffer, utf16_buffer + 2, u"😂"));
  assert(decodeUtf16(utf16_buffer) == 0x1F602);

  // A value that does not fit in 21 bits is encoded as U+FFFD.
  encodeUtf16(0x200000, utf16_buffer);
  assert(equal(utf16_buffer, utf16_buffer + 2, u"\uFFFD"));
  assert(decodeUtf16(utf16_buffer) == 0xFFFD);

  // A trail surrogate with no preceding lead decodes to U+FFFD.
  const char16_t unpaired_trail[] = {0xDC00, 0};
  assert(decodeUtf16(unpaired_trail) == 0xFFFD);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm> | |
#include <cstdint> | |
#include <cassert> | |
#ifdef _WIN64 | |
#include <windows.h> // for Windows | |
#endif | |
using std::begin, std::end, std::equal;

// A Unicode code point, stored as an unsigned 32-bit integer.
using Codepoint = std::uint32_t;
// Encodes a single code point as UTF-32 and appends a terminating zero unit.
//
// cp:  the code point to encode.
// buf: output buffer; exactly two char32_t are written.
//
// UTF-32 stores a scalar value directly in one code unit. Anything that is
// not a Unicode scalar value -- a surrogate code point (U+D800..U+DFFF) or a
// value above U+10FFFF -- is encoded as U+FFFD, matching the substitution the
// UTF-8 and UTF-16 encoders in this gist make for out-of-range input.
void encodeUtf32(const Codepoint cp, char32_t *buf) {
  const bool is_surrogate = cp >= 0xD800 && cp <= 0xDFFF;
  const bool is_scalar_value = cp <= 0x10FFFF && !is_surrogate;

  buf[0] = is_scalar_value ? static_cast<char32_t>(cp) : char32_t{0xFFFD};
  buf[1] = U'\0';
}
// Decodes the first code point of the UTF-32 sequence starting at buf.
//
// buf: pointer to at least one code unit; only buf[0] is read.
//
// Returns buf[0] when it is a Unicode scalar value. A surrogate code point
// (U+D800..U+DFFF) or a value above U+10FFFF is ill-formed UTF-32 and yields
// U+FFFD, the same result the UTF-8 and UTF-16 decoders in this gist give
// for ill-formed input.
Codepoint decodeUtf32(const char32_t *buf) {
  const Codepoint unit = buf[0];
  const bool is_surrogate = unit >= 0xD800 && unit <= 0xDFFF;
  if (is_surrogate || unit > 0x10FFFF) {
    return 0xFFFD;
  }
  return unit;
}
// test code
// Round-trips sample code points through encodeUtf32/decodeUtf32 and checks
// the encoded units against the compiler's own U"" literals. Produces no
// output; a failed assert aborts the program.
int main() {
#ifdef _WIN64
  SetConsoleOutputCP(CP_UTF8);  // for Windows
#endif

  char32_t utf32_buffer[2];  // one code unit + terminator

  encodeUtf32(0x41, utf32_buffer);
  // U"" (char32_t) literal, like the other cases; the original compared
  // against a u"" (char16_t) literal here.
  assert(equal(utf32_buffer, utf32_buffer + 1, U"A"));
  assert(decodeUtf32(utf32_buffer) == 0x41);

  encodeUtf32(0xA9, utf32_buffer);
  assert(equal(utf32_buffer, utf32_buffer + 1, U"©"));
  assert(decodeUtf32(utf32_buffer) == 0xA9);

  encodeUtf32(0xAC00, utf32_buffer);
  assert(equal(utf32_buffer, utf32_buffer + 1, U"가"));
  assert(decodeUtf32(utf32_buffer) == 0xAC00);

  encodeUtf32(0x1F602, utf32_buffer);
  assert(equal(utf32_buffer, utf32_buffer + 2, U"😂"));
  assert(decodeUtf32(utf32_buffer) == 0x1F602);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm> | |
#include <cstdint> | |
#include <cassert> | |
#ifdef _WIN64 | |
#include <windows.h> // for Windows | |
#endif | |
using std::begin, std::end, std::equal;

// A Unicode code point, stored as an unsigned 32-bit integer.
using Codepoint = std::uint32_t;
// Encodes a single code point as UTF-8 and appends a terminating zero byte.
//
// cp:  the code point to encode.
// buf: output buffer; up to five bytes are written (four encoded bytes plus
//      the terminator), so it must hold at least five char8_t.
//
// Anything that is not a Unicode scalar value -- a surrogate code point
// (U+D800..U+DFFF) or a value above U+10FFFF -- is encoded as U+FFFD
// (EF BF BD), since neither has a well-formed UTF-8 representation.
void encodeUtf8(const Codepoint cp, char8_t *buf) {
  constexpr Codepoint kContinuation = 0b1000'0000;  // 10xx xxxx prefix
  constexpr Codepoint kLow6 = 0b11'1111;            // payload of a continuation

  const bool is_surrogate = cp >= 0xD800 && cp <= 0xDFFF;
  const bool beyond_unicode = cp > 0x10FFFF;
  if (is_surrogate || beyond_unicode) {
    // return U+FFFD in UTF-8
    buf[0] = static_cast<char8_t>(0xEF);
    buf[1] = static_cast<char8_t>(0xBF);
    buf[2] = static_cast<char8_t>(0xBD);
    buf[3] = static_cast<char8_t>(0);
    return;
  }

  if (cp < (1u << 7)) {
    // encode      xxx xxxx
    // into bytes  0xxx xxxx
    buf[0] = static_cast<char8_t>(cp);
    buf[1] = static_cast<char8_t>(0);
    return;
  }

  if (cp < (1u << 11)) {
    // encode      yyy yyxx xxxx
    // into bytes  110y yyyy   10xx xxxx
    buf[0] = static_cast<char8_t>(0b1100'0000 | (cp >> 6));
    buf[1] = static_cast<char8_t>(kContinuation | (cp & kLow6));
    buf[2] = static_cast<char8_t>(0);
    return;
  }

  if (cp < (1u << 16)) {
    // encode      zzzz yyyy yyxx xxxx
    // into bytes  1110 zzzz   10yy yyyy   10xx xxxx
    buf[0] = static_cast<char8_t>(0b1110'0000 | (cp >> 12));
    buf[1] = static_cast<char8_t>(kContinuation | ((cp >> 6) & kLow6));
    buf[2] = static_cast<char8_t>(kContinuation | (cp & kLow6));
    buf[3] = static_cast<char8_t>(0);
    return;
  }

  // encode      w wwzz zzzz yyyy yyxx xxxx
  // into bytes  1111 0www   10zz zzzz   10yy yyyy   10xx xxxx
  // cp <= 0x10FFFF here, so cp >> 18 fits in the three w bits.
  buf[0] = static_cast<char8_t>(0b1111'0000 | (cp >> 18));
  buf[1] = static_cast<char8_t>(kContinuation | ((cp >> 12) & kLow6));
  buf[2] = static_cast<char8_t>(kContinuation | ((cp >> 6) & kLow6));
  buf[3] = static_cast<char8_t>(kContinuation | (cp & kLow6));
  buf[4] = static_cast<char8_t>(0);
}
// Decodes the first code point of the UTF-8 sequence starting at buf.
//
// buf: pointer to at least one byte. Each following byte is read only after
//      the bytes before it matched the expected lead/continuation pattern,
//      so a zero-terminated buffer is never read past its terminator.
//
// Returns the decoded code point, or U+FFFD when the sequence is ill-formed:
// a stray continuation or invalid lead byte, a missing continuation byte, an
// overlong form (a value encoded in more bytes than necessary), an encoded
// surrogate (U+D800..U+DFFF), or a value above U+10FFFF.
Codepoint decodeUtf8(const char8_t *buf) {
  constexpr Codepoint kReplacement = 0xFFFD;
  const auto is_continuation = [](char8_t byte) {
    return (byte & 0b1100'0000) == 0b1000'0000;
  };
  const auto payload = [](char8_t byte) {
    return static_cast<Codepoint>(byte & 0b11'1111);
  };

  const char8_t lead = buf[0];

  if ((lead & 0b1000'0000) == 0) {
    // get         xxx xxxx
    // from bytes  0xxx xxxx
    return static_cast<Codepoint>(lead);
  }

  if ((lead & 0b1110'0000) == 0b1100'0000) {
    // get         xxx xxyy yyyy
    // from bytes  110x xxxx   10yy yyyy
    if (!is_continuation(buf[1])) {
      return kReplacement;
    }
    const Codepoint cp =
        (static_cast<Codepoint>(lead & 0b1'1111) << 6) | payload(buf[1]);
    // Values below 0x80 belong in one byte; a two-byte form is overlong.
    return cp >= 0x80 ? cp : kReplacement;
  }

  if ((lead & 0b1111'0000) == 0b1110'0000) {
    // get         xxxx yyyy yyzz zzzz
    // from bytes  1110 xxxx   10yy yyyy   10zz zzzz
    if (!is_continuation(buf[1]) || !is_continuation(buf[2])) {
      return kReplacement;
    }
    const Codepoint cp = (static_cast<Codepoint>(lead & 0b1111) << 12) |
                         (payload(buf[1]) << 6) | payload(buf[2]);
    const bool overlong = cp < 0x800;
    const bool is_surrogate = cp >= 0xD800 && cp <= 0xDFFF;
    return (overlong || is_surrogate) ? kReplacement : cp;
  }

  if ((lead & 0b1111'1000) == 0b1111'0000) {
    // get         x xxyy yyyy zzzz zzww wwww
    // from bytes  1111 0xxx   10yy yyyy   10zz zzzz   10ww wwww
    if (!is_continuation(buf[1]) || !is_continuation(buf[2]) ||
        !is_continuation(buf[3])) {
      return kReplacement;
    }
    const Codepoint cp = (static_cast<Codepoint>(lead & 0b111) << 18) |
                         (payload(buf[1]) << 12) | (payload(buf[2]) << 6) |
                         payload(buf[3]);
    const bool overlong = cp < 0x10000;
    const bool beyond_unicode = cp > 0x10FFFF;
    return (overlong || beyond_unicode) ? kReplacement : cp;
  }

  // Stray continuation byte (10xx xxxx) or invalid lead (1111 1xxx).
  return kReplacement;
}
// test code | |
int main() { | |
#ifdef _WIN64 | |
SetConsoleOutputCP(CP_UTF8); // for Windows | |
#endif | |
char8_t utf8_buffer[5]; | |
encodeUtf8(0x41, utf8_buffer); | |
assert(equal(utf8_buffer, utf8_buffer+2, u8"A")); | |
assert(decodeUtf8(utf8_buffer) == 0x41); | |
encodeUtf8(0xA9, utf8_buffer); | |
assert(equal(utf8_buffer, utf8_buffer+3, u8"©")); | |
assert(decodeUtf8(utf8_buffer) == 0xA9); | |
encodeUtf8(0xAC00, utf8_buffer); | |
assert(equal(utf8_buffer, utf8_buffer+4, u8"가")); | |
assert(decodeUtf8(utf8_buffer) == 0xAC00); | |
encodeUtf8(0x1F602, utf8_buffer); | |
assert(equal(utf8_buffer, utf8_buffer+5, u8"😂")); | |
assert(decodeUtf8(utf8_buffer) == 0x1F602); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment