Miouyouyou/codepoint_to_utf8_seq.c

## codepoint_to_utf8_seq.c
/* UTF-32 codepoint to UTF-8 sequence converter by Miouyouyou
 *
 * To the extent possible under law, the person who associated CC0 with
 * this content has waived all copyright and related or neighboring
 * rights to this content.
 *
 * You should have received a copy of the CC0 legalcode along with this
 * work.  If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
 *
 */

#include <stdint.h> // uintx_t

/* Using http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf page 56
 * as a basis */

/** Convert a UTF-32 (architecture endianness) code point and return
 *  its UTF-8 sequence packed in a 32-bit integer.
 *
 * The sequence is packed so that its last byte sits in bits 0-7 and
 * each preceding byte sits 8 bits higher; unused high bytes are zero.
 * For example U+00E9 (UTF-8: C3 A9) yields 0x0000C3A9.
 *
 * Mostly useless.
 *
 * @param code The UTF-32 code point to convert
 * @return the corresponding UTF-8 byte sequence packed in a 32-bit
 *         integer, or 0 when 'code' is a surrogate (U+D800..U+DFFF)
 *         or lies above U+10FFFF, since neither has a well-formed
 *         UTF-8 encoding. Note that U+0000 also encodes to 0.
 */
uint32_t utf32_to_utf8(uint32_t code) {
	/* Only Unicode scalar values can be encoded as UTF-8. */
	if ((code >= 0xD800 && code <= 0xDFFF) || code > 0x10FFFF) {
		return 0;
	}
	if (code < 0x80) {    // 00000000 0xxxxxxx
		return code;
	}
	if (code < 0x800) {   // 00000yyy yyxxxxxx
		return ((0xC0 | (code >> 6)) << 8) |            // 110yyyyy
		       (0x80 | (code & 0x3f));                  // 10xxxxxx
	}
	if (code < 0x10000) { // zzzzyyyy yyxxxxxx
		return ((0xE0 | (code >> 12)) << 16) |          // 1110zzzz
		       ((0x80 | ((code >> 6) & 0x3f)) << 8) |   // 10yyyyyy
		       (0x80 | (code & 0x3f));                  // 10xxxxxx
	}
	/* Remaining range U+10000..U+10FFFF: 000uuuuu zzzzyyyy yyxxxxxx */
	return ((0xF0 | (code >> 18)) << 24) |              // 11110uuu
	       ((0x80 | ((code >> 12) & 0x3f)) << 16) |     // 10uuzzzz
	       ((0x80 | ((code >> 6) & 0x3f)) << 8) |       // 10yyyyyy
	       (0x80 | (code & 0x3f));                      // 10xxxxxx
}

/** Store the UTF-8 sequence corresponding to the provided UTF-32
 *  code point in the provided byte array.
 *
 * Between 1 and 4 bytes are written, starting at string[0]. No NUL
 * terminator is appended, so zero the buffer first if it is going to
 * be used as a C string.
 *
 * If you have a UTF-8 terminal, you can then just do :
 *   char string[5] = {0};
 *   utf32_to_utf8_string(L'真', string);
 *   printf("%s\n", string);
 *
 * Surrogates (U+D800..U+DFFF) and values above U+10FFFF have no
 * well-formed UTF-8 encoding: for those, nothing is written.
 *
 * WARNING: This assumes that you can store at least 4 bytes in
 *          the address identified by 'string'.
 *
 * The bytes are stored one at a time, in sequence order, so the
 * result does not depend on the byte order of the host.
 *
 * @param code   The UTF-32 code point to convert
 * @param string The byte array where the UTF-8 sequence will be
 *               stored
 */
void utf32_to_utf8_string(uint32_t code, char * string) {
	/* Only Unicode scalar values can be encoded as UTF-8. */
	if ((code >= 0xD800 && code <= 0xDFFF) || code > 0x10FFFF) {
		return;
	}
	if (code < 0x80) {          // 00000000 0xxxxxxx
		string[0] = (char) code;
	}
	else if (code < 0x800) {    // 00000yyy yyxxxxxx
		string[0] = (char) (0xC0 | (code >> 6));           // 110yyyyy
		string[1] = (char) (0x80 | (code & 0x3f));         // 10xxxxxx
	}
	else if (code < 0x10000) {  // zzzzyyyy yyxxxxxx
		string[0] = (char) (0xE0 | (code >> 12));          // 1110zzzz
		string[1] = (char) (0x80 | ((code >> 6) & 0x3f));  // 10yyyyyy
		string[2] = (char) (0x80 | (code & 0x3f));         // 10xxxxxx
	}
	else {                      // 000uuuuu zzzzyyyy yyxxxxxx
		string[0] = (char) (0xF0 | (code >> 18));          // 11110uuu
		string[1] = (char) (0x80 | ((code >> 12) & 0x3f)); // 10uuzzzz
		string[2] = (char) (0x80 | ((code >> 6) & 0x3f));  // 10yyyyyy
		string[3] = (char) (0x80 | (code & 0x3f));         // 10xxxxxx
	}
}

## simple-test.c
// gcc -o test simple-test.c

#include <stdint.h>
#include <stdio.h>

/** Store the UTF-8 sequence of a UTF-32 code point in 'string'.
 *
 * Writes 1 to 4 bytes starting at string[0] and appends no NUL
 * terminator. Nothing is written when 'code' is a surrogate
 * (U+D800..U+DFFF) or lies above U+10FFFF, because such values have
 * no well-formed UTF-8 encoding.
 *
 * @param code   The UTF-32 code point to convert
 * @param string Destination with room for at least 4 bytes
 */
void utf32_to_utf8_string(uint32_t code, char * string) {
	/* Only Unicode scalar values can be encoded as UTF-8. */
	if ((code >= 0xD800 && code <= 0xDFFF) || code > 0x10FFFF) {
		return;
	}
	if (code < 0x80) {          // 00000000 0xxxxxxx
		string[0] = (char) code;
	}
	else if (code < 0x800) {    // 00000yyy yyxxxxxx
		string[0] = (char) (0xC0 | (code >> 6));           // 110yyyyy
		string[1] = (char) (0x80 | (code & 0x3f));         // 10xxxxxx
	}
	else if (code < 0x10000) {  // zzzzyyyy yyxxxxxx
		string[0] = (char) (0xE0 | (code >> 12));          // 1110zzzz
		string[1] = (char) (0x80 | ((code >> 6) & 0x3f));  // 10yyyyyy
		string[2] = (char) (0x80 | (code & 0x3f));         // 10xxxxxx
	}
	else {                      // 000uuuuu zzzzyyyy yyxxxxxx
		string[0] = (char) (0xF0 | (code >> 18));          // 11110uuu
		string[1] = (char) (0x80 | ((code >> 12) & 0x3f)); // 10uuzzzz
		string[2] = (char) (0x80 | ((code >> 6) & 0x3f));  // 10yyyyyy
		string[3] = (char) (0x80 | (code & 0x3f));         // 10xxxxxx
	}
}

/* Prints the ghost emoji (U+1F47B) as UTF-8 followed by a newline.
   Assumes a UTF-8 terminal.
   You'll need an Emoji font if you can't see the little ghost emoji
   Potential choices : Symbola, Chromoji */
int main(void) {
  /* Up to 4 bytes of UTF-8 plus the NUL terminator, which the
     converter does not write itself: hence the zero-initialisation. */
  char nya[5] = {0};
  /* The code point is given numerically: a wide character constant
     (L'...') has type wchar_t, which is only 16 bits wide on some
     platforms and then cannot hold U+1F47B. */
  utf32_to_utf8_string(0x1F47B, nya);
  printf("%s\n", nya);
  return 0;
}
	/* UTF-32 codepoint to UTF-8 sequence converter by Miouyouyou
	*
	* To the extent possible under law, the person who associated CC0 with
	* this content has waived all copyright and related or neighboring
	* rights to this content.
	*
	* You should have received a copy of the CC0 legalcode along with this
	* work. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
	*
	*/

	#include <stdint.h> // uintx_t

	/* Using http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf page 56
	* as a basis */

	/** Convert a UTF-32 (architecture endianness) code point and return
	 *  its UTF-8 sequence packed in a 32-bit integer.
	 *
	 * The sequence is packed so that its last byte sits in bits 0-7 and
	 * each preceding byte sits 8 bits higher; unused high bytes are zero.
	 * For example U+00E9 (UTF-8: C3 A9) yields 0x0000C3A9.
	 *
	 * Mostly useless.
	 *
	 * @param code The UTF-32 code point to convert
	 * @return the corresponding UTF-8 byte sequence packed in a 32-bit
	 *         integer, or 0 when 'code' is a surrogate (U+D800..U+DFFF)
	 *         or lies above U+10FFFF, since neither has a well-formed
	 *         UTF-8 encoding. Note that U+0000 also encodes to 0.
	 */
	uint32_t utf32_to_utf8(uint32_t code) {
		/* Only Unicode scalar values can be encoded as UTF-8. */
		if ((code >= 0xD800 && code <= 0xDFFF) || code > 0x10FFFF) {
			return 0;
		}
		if (code < 0x80) {    // 00000000 0xxxxxxx
			return code;
		}
		if (code < 0x800) {   // 00000yyy yyxxxxxx
			return ((0xC0 | (code >> 6)) << 8) |            // 110yyyyy
			       (0x80 | (code & 0x3f));                  // 10xxxxxx
		}
		if (code < 0x10000) { // zzzzyyyy yyxxxxxx
			return ((0xE0 | (code >> 12)) << 16) |          // 1110zzzz
			       ((0x80 | ((code >> 6) & 0x3f)) << 8) |   // 10yyyyyy
			       (0x80 | (code & 0x3f));                  // 10xxxxxx
		}
		/* Remaining range U+10000..U+10FFFF: 000uuuuu zzzzyyyy yyxxxxxx */
		return ((0xF0 | (code >> 18)) << 24) |              // 11110uuu
		       ((0x80 | ((code >> 12) & 0x3f)) << 16) |     // 10uuzzzz
		       ((0x80 | ((code >> 6) & 0x3f)) << 8) |       // 10yyyyyy
		       (0x80 | (code & 0x3f));                      // 10xxxxxx
	}

	/** Store the UTF-8 sequence corresponding to the provided UTF-32
	 *  code point in the provided byte array.
	 *
	 * Between 1 and 4 bytes are written, starting at string[0]. No NUL
	 * terminator is appended, so zero the buffer first if it is going to
	 * be used as a C string.
	 *
	 * If you have a UTF-8 terminal, you can then just do :
	 *   char string[5] = {0};
	 *   utf32_to_utf8_string(L'真', string);
	 *   printf("%s\n", string);
	 *
	 * Surrogates (U+D800..U+DFFF) and values above U+10FFFF have no
	 * well-formed UTF-8 encoding: for those, nothing is written.
	 *
	 * WARNING: This assumes that you can store at least 4 bytes in
	 *          the address identified by 'string'.
	 *
	 * The bytes are stored one at a time, in sequence order, so the
	 * result does not depend on the byte order of the host.
	 *
	 * @param code   The UTF-32 code point to convert
	 * @param string The byte array where the UTF-8 sequence will be
	 *               stored
	 */
	void utf32_to_utf8_string(uint32_t code, char * string) {
		/* Only Unicode scalar values can be encoded as UTF-8. */
		if ((code >= 0xD800 && code <= 0xDFFF) || code > 0x10FFFF) {
			return;
		}
		if (code < 0x80) {          // 00000000 0xxxxxxx
			string[0] = (char) code;
		}
		else if (code < 0x800) {    // 00000yyy yyxxxxxx
			string[0] = (char) (0xC0 | (code >> 6));           // 110yyyyy
			string[1] = (char) (0x80 | (code & 0x3f));         // 10xxxxxx
		}
		else if (code < 0x10000) {  // zzzzyyyy yyxxxxxx
			string[0] = (char) (0xE0 | (code >> 12));          // 1110zzzz
			string[1] = (char) (0x80 | ((code >> 6) & 0x3f));  // 10yyyyyy
			string[2] = (char) (0x80 | (code & 0x3f));         // 10xxxxxx
		}
		else {                      // 000uuuuu zzzzyyyy yyxxxxxx
			string[0] = (char) (0xF0 | (code >> 18));          // 11110uuu
			string[1] = (char) (0x80 | ((code >> 12) & 0x3f)); // 10uuzzzz
			string[2] = (char) (0x80 | ((code >> 6) & 0x3f));  // 10yyyyyy
			string[3] = (char) (0x80 | (code & 0x3f));         // 10xxxxxx
		}
	}
	// gcc -o test simple-test.c

	#include <stdint.h>
	#include <stdio.h>

	/** Store the UTF-8 sequence of a UTF-32 code point in 'string'.
	 *
	 * Writes 1 to 4 bytes starting at string[0] and appends no NUL
	 * terminator. Nothing is written when 'code' is a surrogate
	 * (U+D800..U+DFFF) or lies above U+10FFFF, because such values have
	 * no well-formed UTF-8 encoding.
	 *
	 * @param code   The UTF-32 code point to convert
	 * @param string Destination with room for at least 4 bytes
	 */
	void utf32_to_utf8_string(uint32_t code, char * string) {
		/* Only Unicode scalar values can be encoded as UTF-8. */
		if ((code >= 0xD800 && code <= 0xDFFF) || code > 0x10FFFF) {
			return;
		}
		if (code < 0x80) {          // 00000000 0xxxxxxx
			string[0] = (char) code;
		}
		else if (code < 0x800) {    // 00000yyy yyxxxxxx
			string[0] = (char) (0xC0 | (code >> 6));           // 110yyyyy
			string[1] = (char) (0x80 | (code & 0x3f));         // 10xxxxxx
		}
		else if (code < 0x10000) {  // zzzzyyyy yyxxxxxx
			string[0] = (char) (0xE0 | (code >> 12));          // 1110zzzz
			string[1] = (char) (0x80 | ((code >> 6) & 0x3f));  // 10yyyyyy
			string[2] = (char) (0x80 | (code & 0x3f));         // 10xxxxxx
		}
		else {                      // 000uuuuu zzzzyyyy yyxxxxxx
			string[0] = (char) (0xF0 | (code >> 18));          // 11110uuu
			string[1] = (char) (0x80 | ((code >> 12) & 0x3f)); // 10uuzzzz
			string[2] = (char) (0x80 | ((code >> 6) & 0x3f));  // 10yyyyyy
			string[3] = (char) (0x80 | (code & 0x3f));         // 10xxxxxx
		}
	}

	/* Prints the ghost emoji (U+1F47B) as UTF-8 followed by a newline.
	   Assumes a UTF-8 terminal.
	   You'll need an Emoji font if you can't see the little ghost emoji
	   Potential choices : Symbola, Chromoji */
	int main(void) {
		/* Up to 4 bytes of UTF-8 plus the NUL terminator, which the
		   converter does not write itself: hence the zero-initialisation. */
		char nya[5] = {0};
		/* The code point is given numerically: a wide character constant
		   (L'...') has type wchar_t, which is only 16 bits wide on some
		   platforms and then cannot hold U+1F47B. */
		utf32_to_utf8_string(0x1F47B, nya);
		printf("%s\n", nya);
		return 0;
	}