Last active
March 17, 2024 02:13
-
-
Save saxbophone/6a424069c3129070926dfa462ac67de0 to your computer and use it in GitHub Desktop.
Takes advantage of the fact that Unicode code points need at most 21 bits to pack three UTF-32 code units into one 64-bit word (63 bits used, with one bit spare).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <climits>
#include <clocale>
#include <string>
// Request 1-byte packing for this struct; the static_assert after it confirms
// that the three 21-bit fields really do end up in a single 8-byte object.
#pragma pack(push, 1)
/// Three UTF-32 code units stored in 21-bit bit-fields (63 bits in total).
struct Chunk {
    char32_t first : 21;
    char32_t second : 21;
    char32_t third : 21;

    /// Unpacks the chunk into a three-element UTF-32 string, in field order.
    /// All three slots are always emitted, including any that hold zero.
    operator std::u32string() const {
        const char32_t units[3] = {first, second, third};
        return std::u32string(units, 3);
    }
};
#pragma pack(pop)

static_assert(sizeof(Chunk) == 8, "Chunk must occupy exactly one 64-bit word");
#include <vector> | |
constexpr std::vector<Chunk> operator""_packed(const char32_t* str, std::size_t size) { | |
std::vector<Chunk> packed; | |
packed.reserve(size / 3 + (size % 3 != 0)); | |
int i = 0; | |
for (; i < size; i += 3) { | |
packed.push_back({str[i], str[i+1], str[i+2]}); | |
} | |
if (size % 3 != 0) { | |
packed.push_back({}); | |
switch (size % 3) { | |
case 2: | |
packed.back().second = str[i+1]; | |
case 1: | |
packed.back().first = str[i]; | |
} | |
} | |
return packed; | |
} | |
#include <cuchar> | |
#include <iostream> | |
int main() { | |
auto packed = U"Gwabberzenty, 훈민정음!"_packed; | |
std::u32string unpacked = packed[0]; | |
std::mbstate_t state{}; | |
char out[5]{}; | |
for (const auto& chunk : packed) { | |
for (char32_t c32 : (std::u32string)chunk) { | |
std::size_t rc = std::c32rtomb(out, c32, &state); | |
std::cout << out; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment