Skip to content

Instantly share code, notes, and snippets.

@Miouyouyou
Last active May 2, 2023 17:28
Show Gist options
  • Save Miouyouyou/864130e8734afe3f806512b14022226f to your computer and use it in GitHub Desktop.
Save Miouyouyou/864130e8734afe3f806512b14022226f to your computer and use it in GitHub Desktop.
A very dull UTF32 codepoint to UTF8 sequence converter (UTF-32 to UTF-8)
/* UTF-32 codepoint to UTF-8 sequence converter by Miouyouyou
*
* To the extent possible under law, the person who associated CC0 with
* this content has waived all copyright and related or neighboring
* rights to this content.
*
* You should have received a copy of the CC0 legalcode along with this
* work. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*
*/
#include <stdint.h> // uintx_t
/* Using http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf page 56
* as a basis */
/** Convert a UTF-32 codepoint into its UTF-8 byte sequence, packed in a
 * 32 bits integer.
 *
 * The sequence is packed as a number: its last byte sits in the least
 * significant 8 bits and its lead byte in the highest used bits, the
 * remaining high bits being zero.
 *   U+0041  -> 0x00000041
 *   U+00E9  -> 0x0000C3A9
 *   U+771F  -> 0x00E79C9F
 *   U+1F47B -> 0xF09F91BB
 *
 * Surrogates (U+D800..U+DFFF) and values above U+10FFFF have no
 * well-formed UTF-8 encoding: 0 is returned for them. Beware that 0 is
 * also the result for U+0000.
 *
 * Mostly useless.
 *
 * @param code The UTF-32 codepoint to convert
 * @return the corresponding UTF-8 byte sequence packed in a 32 bits
 *         integer, or 0 when code cannot be encoded
 */
uint32_t utf32_to_utf8(uint32_t code) {
    /* Only Unicode scalar values can be encoded. */
    if (code > 0x10FFFF || (code >= 0xD800 && code <= 0xDFFF)) {
        return 0;
    }

    if (code < 0x80) { // 00000000 0xxxxxxx
        return code;   // 0xxxxxxx
    }

    if (code < 0x800) { // 00000yyy yyxxxxxx
        return ((0xC0u | (code >> 6)) << 8) | // 110yyyyy
               (0x80u | (code & 0x3f));       // 10xxxxxx
    }

    if (code < 0x10000) { // zzzzyyyy yyxxxxxx
        return ((0xE0u | (code >> 12)) << 16) |         // 1110zzzz
               ((0x80u | ((code >> 6) & 0x3f)) << 8) |  // 10yyyyyy
               (0x80u | (code & 0x3f));                 // 10xxxxxx
    }

    // 000uuuuu zzzzyyyy yyxxxxxx
    return ((0xF0u | (code >> 18)) << 24) |          // 11110uuu
           ((0x80u | ((code >> 12) & 0x3f)) << 16) | // 10uuzzzz
           ((0x80u | ((code >> 6) & 0x3f)) << 8) |   // 10yyyyyy
           (0x80u | (code & 0x3f));                  // 10xxxxxx
}
/** Store the UTF-8 sequence corresponding to the provided UTF-32
 * codepoint in the provided string.
 *
 * If you have a UTF-8 terminal, you can then just do :
 *   char string[5] = {0};
 *   utf32_to_utf8_string(0x771F, string); // U+771F
 *   printf("%s\n", string);
 *
 * WARNING: This assumes that you can store at least 4 bytes in
 *          the address identified by 'string'.
 *          Between 1 and 4 bytes are written and NO terminating '\0'
 *          is appended: zero the buffer beforehand (as above, with a
 *          5th byte) if you want to use it as a C string.
 *
 * The bytes are stored one by one, in sequence order, so the result
 * does not depend on the endianness of the system.
 *
 * Surrogates (U+D800..U+DFFF) and values above U+10FFFF have no
 * well-formed UTF-8 encoding: nothing is written for them.
 *
 * @param code   The UTF-32 codepoint to convert
 * @param string The byte array where the UTF-8 sequence will be
 *               stored
 */
void utf32_to_utf8_string(uint32_t code, char * string) {
    /* Only Unicode scalar values can be encoded. */
    if (code > 0x10FFFF || (code >= 0xD800 && code <= 0xDFFF)) {
        return;
    }

    if (code < 0x80) { // 00000000 0xxxxxxx
        string[0] = (char) code;                            // 0xxxxxxx
    }
    else if (code < 0x800) { // 00000yyy yyxxxxxx
        string[0] = (char) (0xC0u | (code >> 6));           // 110yyyyy
        string[1] = (char) (0x80u | (code & 0x3f));         // 10xxxxxx
    }
    else if (code < 0x10000) { // zzzzyyyy yyxxxxxx
        string[0] = (char) (0xE0u | (code >> 12));          // 1110zzzz
        string[1] = (char) (0x80u | ((code >> 6) & 0x3f));  // 10yyyyyy
        string[2] = (char) (0x80u | (code & 0x3f));         // 10xxxxxx
    }
    else { // 000uuuuu zzzzyyyy yyxxxxxx
        string[0] = (char) (0xF0u | (code >> 18));          // 11110uuu
        string[1] = (char) (0x80u | ((code >> 12) & 0x3f)); // 10uuzzzz
        string[2] = (char) (0x80u | ((code >> 6) & 0x3f));  // 10yyyyyy
        string[3] = (char) (0x80u | (code & 0x3f));         // 10xxxxxx
    }
}
// gcc -o test simple-test.c
#include <stdint.h>
#include <stdio.h>
/** Store the UTF-8 sequence corresponding to the provided UTF-32
 * codepoint in the provided string.
 *
 * Between 1 and 4 bytes are written, in sequence order, and no
 * terminating '\0' is appended: 'string' must have room for 4 bytes
 * and should be zeroed beforehand (with a 5th byte) when it is meant
 * to be used as a C string.
 *
 * Surrogates (U+D800..U+DFFF) and values above U+10FFFF have no
 * well-formed UTF-8 encoding: nothing is written for them.
 *
 * @param code   The UTF-32 codepoint to convert
 * @param string The byte array where the UTF-8 sequence will be
 *               stored
 */
void utf32_to_utf8_string(uint32_t code, char * string) {
    /* Only Unicode scalar values can be encoded. */
    if (code > 0x10FFFF || (code >= 0xD800 && code <= 0xDFFF)) {
        return;
    }

    if (code < 0x80) { // 00000000 0xxxxxxx
        string[0] = (char) code;                            // 0xxxxxxx
    }
    else if (code < 0x800) { // 00000yyy yyxxxxxx
        string[0] = (char) (0xC0u | (code >> 6));           // 110yyyyy
        string[1] = (char) (0x80u | (code & 0x3f));         // 10xxxxxx
    }
    else if (code < 0x10000) { // zzzzyyyy yyxxxxxx
        string[0] = (char) (0xE0u | (code >> 12));          // 1110zzzz
        string[1] = (char) (0x80u | ((code >> 6) & 0x3f));  // 10yyyyyy
        string[2] = (char) (0x80u | (code & 0x3f));         // 10xxxxxx
    }
    else { // 000uuuuu zzzzyyyy yyxxxxxx
        string[0] = (char) (0xF0u | (code >> 18));          // 11110uuu
        string[1] = (char) (0x80u | ((code >> 12) & 0x3f)); // 10uuzzzz
        string[2] = (char) (0x80u | ((code >> 6) & 0x3f));  // 10yyyyyy
        string[3] = (char) (0x80u | (code & 0x3f));         // 10xxxxxx
    }
}
/* Print the ghost emoji (U+1F47B) followed by a newline.
   Assumes a UTF-8 terminal.
   You'll need an Emoji font if you can't see the little ghost emoji
   Potential choices : Symbola, Chromoji */
int main(void) {
    /* 4 bytes for the longest UTF-8 sequence + 1 zeroed byte acting as
       the string terminator, since the converter does not write one. */
    char nya[5] = {0};
    /* The codepoint is spelled numerically: a L'...' constant is cut
       down to 16 bits on platforms where wchar_t is that narrow. */
    utf32_to_utf8_string(0x1F47B, nya);
    printf("%s\n", nya);
    return 0;
}
@xoorath
Copy link

xoorath commented Dec 3, 2022

I think the comments for the UTF-32 zzzzyyyy yyxxxxxx case contain some typos.

I think // 1110zzz should be // 1110zzzz. Same with 10yyyyy -> 10yyyyyy and 10xxxxx -> 10xxxxxx.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment