Last active
May 2, 2023 17:28
-
-
Save Miouyouyou/864130e8734afe3f806512b14022226f to your computer and use it in GitHub Desktop.
A very dull UTF32 codepoint to UTF8 sequence converter (UTF-32 to UTF-8)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* UTF-32 codepoint to UTF-8 sequence converter by Miouyouyou
 *
 * To the extent possible under law, the person who associated CC0 with
 * this content has waived all copyright and related or neighboring
 * rights to this content.
 *
 * You should have received a copy of the CC0 legalcode along with this
 * work. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
 *
 */
#include <stdint.h> // uintx_t | |
/* Based on http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf, page 56
 * (Table 3-6, "UTF-8 Bit Distribution"). */
/** Convert a UTF-32 code point (native byte order) into its UTF-8
 * sequence, packed in a 32-bit integer.
 *
 * The bytes are packed in sequence order from the most significant used
 * byte down to the least significant byte, e.g. U+20AC, whose UTF-8
 * sequence is E2 82 AC, is returned as 0x00E282AC.
 *
 * No validation is performed: surrogates (0xD800..0xDFFF) and values in
 * 0x110000..0x1FFFFF are encoded mechanically even though they are not
 * valid Unicode scalar values.
 *
 * @param code The UTF-32 code point to convert
 * @return the corresponding UTF-8 byte sequence packed in a 32-bit
 *         integer, or 0 when code >= 0x200000 (it does not fit in four
 *         UTF-8 bytes). Note that U+0000 also encodes to 0.
 */
uint32_t utf32_to_utf8(uint32_t code) {
  /* Stays 0 when the code point cannot be encoded. */
  uint32_t utf8_codepoint = 0;

  if (code < 0x80) {                    // 00000000 0xxxxxxx
    utf8_codepoint = code;              // 0xxxxxxx
  }
  else if (code < 0x800) {              // 00000yyy yyxxxxxx
    utf8_codepoint =
      ((0xC0u | (code >> 6))           << 8) |  // 110yyyyy
       (0x80u | (code & 0x3f));                 // 10xxxxxx
  }
  else if (code < 0x10000) {            // zzzzyyyy yyxxxxxx
    utf8_codepoint =
      ((0xE0u | (code >> 12))          << 16) | // 1110zzzz
      ((0x80u | ((code >> 6) & 0x3f))  << 8)  | // 10yyyyyy
       (0x80u | (code & 0x3f));                 // 10xxxxxx
  }
  else if (code < 0x200000) {           // 000uuuuu zzzzyyyy yyxxxxxx
    utf8_codepoint =
      ((0xF0u | (code >> 18))          << 24) | // 11110uuu
      ((0x80u | ((code >> 12) & 0x3f)) << 16) | // 10uuzzzz
      ((0x80u | ((code >> 6) & 0x3f))  << 8)  | // 10yyyyyy
       (0x80u | (code & 0x3f));                 // 10xxxxxx
  }

  return utf8_codepoint;
}
/** Store the UTF-8 sequence corresponding to the provided UTF-32
 * code point in the provided byte array.
 *
 * If you have a UTF-8 terminal, you can then just do:
 *   char string[5] = {0};
 *   utf32_to_utf8_string(0x771F, string);  // U+771F
 *   printf("%s\n", string);
 *
 * Between one and four bytes are written, starting at string[0]. No
 * terminating NUL is appended, so zero the buffer beforehand (as in the
 * example above) if it is going to be used as a C string.
 *
 * WARNING: This assumes that at least 4 bytes can be stored at the
 *          address identified by 'string'.
 *          Nothing is written when code >= 0x200000.
 *          No validation is performed: surrogates (0xD800..0xDFFF) and
 *          values in 0x110000..0x1FFFFF are encoded mechanically.
 *
 * The bytes are stored one at a time in sequence order, so the result
 * does not depend on the endianness of the machine.
 *
 * @param code   The UTF-32 code point to convert
 * @param string The byte array where the UTF-8 sequence will be
 *               stored
 */
void utf32_to_utf8_string(uint32_t code, char * string) {
  /* Write through unsigned char so that storing values >= 0x80 is well
   * defined even where plain char is signed. */
  unsigned char * out = (unsigned char *) string;

  if (code < 0x80) {                    // 00000000 0xxxxxxx
    out[0] = (unsigned char) code;                            // 0xxxxxxx
  }
  else if (code < 0x800) {              // 00000yyy yyxxxxxx
    out[0] = (unsigned char) (0xC0u | (code >> 6));           // 110yyyyy
    out[1] = (unsigned char) (0x80u | (code & 0x3f));         // 10xxxxxx
  }
  else if (code < 0x10000) {            // zzzzyyyy yyxxxxxx
    out[0] = (unsigned char) (0xE0u | (code >> 12));          // 1110zzzz
    out[1] = (unsigned char) (0x80u | ((code >> 6) & 0x3f));  // 10yyyyyy
    out[2] = (unsigned char) (0x80u | (code & 0x3f));         // 10xxxxxx
  }
  else if (code < 0x200000) {           // 000uuuuu zzzzyyyy yyxxxxxx
    out[0] = (unsigned char) (0xF0u | (code >> 18));          // 11110uuu
    out[1] = (unsigned char) (0x80u | ((code >> 12) & 0x3f)); // 10uuzzzz
    out[2] = (unsigned char) (0x80u | ((code >> 6) & 0x3f));  // 10yyyyyy
    out[3] = (unsigned char) (0x80u | (code & 0x3f));         // 10xxxxxx
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// gcc -o test simple-test.c | |
#include <stdint.h> | |
#include <stdio.h> | |
/* Store the UTF-8 sequence of the UTF-32 code point 'code' in 'string'.
 *
 * Writes one to four bytes starting at string[0] and appends no
 * terminating NUL; 'string' must have room for at least 4 bytes.
 * Nothing is written when code >= 0x200000. No validation is done on
 * surrogates or on values above 0x10FFFF. */
void utf32_to_utf8_string(uint32_t code, char * string) {
  /* Write through unsigned char so that storing values >= 0x80 is well
   * defined even where plain char is signed. */
  unsigned char * out = (unsigned char *) string;

  if (code < 0x80) {                    // 00000000 0xxxxxxx
    out[0] = (unsigned char) code;                            // 0xxxxxxx
  }
  else if (code < 0x800) {              // 00000yyy yyxxxxxx
    out[0] = (unsigned char) (0xC0u | (code >> 6));           // 110yyyyy
    out[1] = (unsigned char) (0x80u | (code & 0x3f));         // 10xxxxxx
  }
  else if (code < 0x10000) {            // zzzzyyyy yyxxxxxx
    out[0] = (unsigned char) (0xE0u | (code >> 12));          // 1110zzzz
    out[1] = (unsigned char) (0x80u | ((code >> 6) & 0x3f));  // 10yyyyyy
    out[2] = (unsigned char) (0x80u | (code & 0x3f));         // 10xxxxxx
  }
  else if (code < 0x200000) {           // 000uuuuu zzzzyyyy yyxxxxxx
    out[0] = (unsigned char) (0xF0u | (code >> 18));          // 11110uuu
    out[1] = (unsigned char) (0x80u | ((code >> 12) & 0x3f)); // 10uuzzzz
    out[2] = (unsigned char) (0x80u | ((code >> 6) & 0x3f));  // 10yyyyyy
    out[3] = (unsigned char) (0x80u | (code & 0x3f));         // 10xxxxxx
  }
}
/* Prints the ghost emoji (U+1F47B) followed by a newline.
   Assumes a UTF-8 terminal.
   You'll need an emoji font if you can't see the little ghost emoji.
   Potential choices: Symbola, Chromoji */
int main(void) {
  /* 4 bytes for the longest UTF-8 sequence + 1 for the terminating NUL,
     which the converter does not write itself. */
  char nya[5] = {0};

  /* The code point is spelled out numerically instead of as a wide
     character constant: wchar_t is not guaranteed to be wide enough to
     hold a code point outside the Basic Multilingual Plane. */
  utf32_to_utf8_string(0x1F47B, nya);
  printf("%s\n", nya);
  return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I think the comments in the three-byte case (`zzzzyyyy yyxxxxxx`) contain some typos.
`// 1110zzz` should be `// 1110zzzz`.
The same goes for `10yyyyy` -> `10yyyyyy` and `10xxxxx` -> `10xxxxxx`.