mietek/convert_utf8.c

## convert_utf8.c
#include <stdbool.h>
#include <stdint.h>

#include "convert_utf8.h"


// Control bits of a standalone UTF8 byte: the top bit is clear (0xxxxxxx).
#define UTF8_SINGLE_BYTE_CONTROL_MASK  0x80
#define UTF8_SINGLE_BYTE_CONTROL_VALUE 0x00

// Tells whether `byte` stands alone in UTF8, i.e. is not part of any sequence.
static inline bool is_utf8_single_byte(uint8_t byte) {
	const uint8_t control_bits = byte & UTF8_SINGLE_BYTE_CONTROL_MASK;
	return control_bits == UTF8_SINGLE_BYTE_CONTROL_VALUE;
}


// Control bits of a UTF8 sequence byte are 10xxxxxx; the low six bits carry data.
#define UTF8_SEQUENCE_BYTE_CONTROL_MASK  0xC0
#define UTF8_SEQUENCE_BYTE_CONTROL_VALUE 0x80
#define UTF8_SEQUENCE_BYTE_DATA_MASK     0x3F

// Number of data bits contributed to a word by each sequence byte.
#define UTF8_SEQUENCE_WORD_DATA_SHIFT 6

// Tells whether `byte` is a non-leading part of a UTF8 sequence.
static inline bool is_utf8_sequence_byte(uint8_t byte) {
	const uint8_t control_bits = byte & UTF8_SEQUENCE_BYTE_CONTROL_MASK;
	return control_bits == UTF8_SEQUENCE_BYTE_CONTROL_VALUE;
}

// Extracts the six data bits carried by a UTF8 sequence byte.
static inline uint32_t decode_utf8_sequence_byte(uint8_t byte) {
	const uint32_t data_bits = byte & UTF8_SEQUENCE_BYTE_DATA_MASK;
	return data_bits;
}

// Builds a UTF8 sequence byte from the low six data bits of a UTF32 word.
static inline uint8_t encode_utf8_sequence_byte(uint32_t word) {
	const uint32_t data_bits = word & UTF8_SEQUENCE_BYTE_DATA_MASK;
	return (uint8_t)(UTF8_SEQUENCE_BYTE_CONTROL_VALUE | data_bits);
}


// Control bits of a two-byte sequence start byte are 110xxxxx; the low five bits carry data.
#define UTF8_TWO_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK  0xE0
#define UTF8_TWO_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE 0xC0
#define UTF8_TWO_BYTE_SEQUENCE_START_BYTE_DATA_MASK     0x1F

// Word bits 7..10; at least one of them is set in any word that needs two bytes.
#define UTF8_TWO_BYTE_SEQUENCE_WORD_NON_ZERO_MASK 0x00000780

// Tells whether `byte` begins a UTF8 two-byte sequence.
static inline bool is_utf8_two_byte_sequence_start_byte(uint8_t byte) {
	const uint8_t control_bits = byte & UTF8_TWO_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK;
	return control_bits == UTF8_TWO_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE;
}

// Extracts the five data bits carried by a UTF8 two-byte sequence start byte.
static inline uint32_t decode_utf8_two_byte_sequence_start_byte(uint8_t byte) {
	const uint32_t data_bits = byte & UTF8_TWO_BYTE_SEQUENCE_START_BYTE_DATA_MASK;
	return data_bits;
}

// Tells whether `word`, decoded from a UTF8 two-byte sequence, actually needed two bytes.
static inline bool is_non_overlong_utf8_two_byte_sequence_word(uint32_t word) {
	return (word & UTF8_TWO_BYTE_SEQUENCE_WORD_NON_ZERO_MASK) != 0;
}

// Builds a UTF8 two-byte sequence start byte from the low five data bits of `word`.
static inline uint8_t encode_utf8_two_byte_sequence_start_byte(uint32_t word) {
	const uint32_t data_bits = word & UTF8_TWO_BYTE_SEQUENCE_START_BYTE_DATA_MASK;
	return (uint8_t)(UTF8_TWO_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE | data_bits);
}


// Control bits of a three-byte sequence start byte are 1110xxxx; the low four bits carry data.
#define UTF8_THREE_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK  0xF0
#define UTF8_THREE_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE 0xE0
#define UTF8_THREE_BYTE_SEQUENCE_START_BYTE_DATA_MASK     0x0F

// Word bits 11..15; at least one of them is set in any word that needs three bytes.
#define UTF8_THREE_BYTE_SEQUENCE_WORD_NON_ZERO_MASK 0x0000F800

// Tells whether `byte` begins a UTF8 three-byte sequence.
static inline bool is_utf8_three_byte_sequence_start_byte(uint8_t byte) {
	const uint8_t control_bits = byte & UTF8_THREE_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK;
	return control_bits == UTF8_THREE_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE;
}

// Extracts the four data bits carried by a UTF8 three-byte sequence start byte.
static inline uint32_t decode_utf8_three_byte_sequence_start_byte(uint8_t byte) {
	const uint32_t data_bits = byte & UTF8_THREE_BYTE_SEQUENCE_START_BYTE_DATA_MASK;
	return data_bits;
}

// Tells whether `word`, decoded from a UTF8 three-byte sequence, actually needed three bytes.
static inline bool is_non_overlong_utf8_three_byte_sequence_word(uint32_t word) {
	return (word & UTF8_THREE_BYTE_SEQUENCE_WORD_NON_ZERO_MASK) != 0;
}

// Builds a UTF8 three-byte sequence start byte from the low four data bits of `word`.
static inline uint8_t encode_utf8_three_byte_sequence_start_byte(uint32_t word) {
	const uint32_t data_bits = word & UTF8_THREE_BYTE_SEQUENCE_START_BYTE_DATA_MASK;
	return (uint8_t)(UTF8_THREE_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE | data_bits);
}


// Control bits of a four-byte sequence start byte are 11110xxx; the low three bits carry data.
#define UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK  0xF8
#define UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE 0xF0
#define UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_DATA_MASK     0x07

// Word bits 16..20; at least one of them is set in any word that needs four bytes.
#define UTF8_FOUR_BYTE_SEQUENCE_WORD_NON_ZERO_MASK 0x001F0000

// Tells whether `byte` begins a UTF8 four-byte sequence.
static inline bool is_utf8_four_byte_sequence_start_byte(uint8_t byte) {
	const uint8_t control_bits = byte & UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK;
	return control_bits == UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE;
}

// Extracts the three data bits carried by a UTF8 four-byte sequence start byte.
static inline uint32_t decode_utf8_four_byte_sequence_start_byte(uint8_t byte) {
	const uint32_t data_bits = byte & UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_DATA_MASK;
	return data_bits;
}

// Tells whether `word`, decoded from a UTF8 four-byte sequence, actually needed four bytes.
static inline bool is_non_overlong_utf8_four_byte_sequence_word(uint32_t word) {
	return (word & UTF8_FOUR_BYTE_SEQUENCE_WORD_NON_ZERO_MASK) != 0;
}

// Builds a UTF8 four-byte sequence start byte from the low three data bits of `word`.
static inline uint8_t encode_utf8_four_byte_sequence_start_byte(uint32_t word) {
	const uint32_t data_bits = word & UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_DATA_MASK;
	return (uint8_t)(UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE | data_bits);
}


// Decodes the UTF8 two-byte sequence at `src` into a UTF32 word stored in `*dst`.
// Stores '?' instead if `src[1]` is not a sequence byte or the encoding is overlong.
// Returns true if successful.
static inline bool decode_utf8_two_byte_sequence(uint32_t *dst, const uint8_t *src) {
	if (!is_utf8_sequence_byte(src[1])) {
		*dst = '?';
		return false;
	}

	const uint32_t start_bits = decode_utf8_two_byte_sequence_start_byte(src[0]);
	const uint32_t decoded = (start_bits << UTF8_SEQUENCE_WORD_DATA_SHIFT) | decode_utf8_sequence_byte(src[1]);
	if (!is_non_overlong_utf8_two_byte_sequence_word(decoded)) {
		*dst = '?';
		return false;
	}

	*dst = decoded;
	return true;
}

// Range of UTF16 surrogate code points; these must never be encoded in UTF8.
#define UTF8_SURROGATE_WORD_MIN 0x0000D800
#define UTF8_SURROGATE_WORD_MAX 0x0000DFFF

// Decodes a UTF32 word from a UTF8 three-byte sequence, or '?' if unsuccessful.
// `src` must point at three readable bytes, the first being the start byte.
// Fails if `src[1]` or `src[2]` is not a sequence byte, if the encoding is
// overlong, or if the decoded word is a surrogate (U+D800..U+DFFF).
// Returns true if successful.
inline static bool decode_utf8_three_byte_sequence(uint32_t *dst, const uint8_t *src) {
	uint32_t word;
	if (is_utf8_sequence_byte(src[1]) &&
			is_utf8_sequence_byte(src[2])) {
		word = decode_utf8_three_byte_sequence_start_byte(src[0]);
		word <<= UTF8_SEQUENCE_WORD_DATA_SHIFT;
		word |= decode_utf8_sequence_byte(src[1]);
		word <<= UTF8_SEQUENCE_WORD_DATA_SHIFT;
		word |= decode_utf8_sequence_byte(src[2]);
		// Surrogates fit the three-byte bit layout but are not valid characters.
		if (is_non_overlong_utf8_three_byte_sequence_word(word) &&
				!(word >= UTF8_SURROGATE_WORD_MIN && word <= UTF8_SURROGATE_WORD_MAX)) {
			*dst = word;
			return true;
		}
	}
	*dst = '?';
	return false;
}

// Largest valid code point. The four-byte bit layout can hold values up to
// 0x1FFFFF, so anything above this limit has to be rejected explicitly.
// (Same value as MAX_UTF8_FOUR_BYTE_SEQUENCE_WORD, which is defined further
// down in this file and is therefore not yet visible here.)
#define UTF8_FOUR_BYTE_SEQUENCE_WORD_MAX_VALUE 0x0010FFFF

// Decodes a UTF32 word from a UTF8 four-byte sequence, or '?' if unsuccessful.
// `src` must point at four readable bytes, the first being the start byte.
// Fails if any of `src[1]`..`src[3]` is not a sequence byte, if the encoding
// is overlong, or if the decoded word is greater than U+10FFFF.
// Returns true if successful.
inline static bool decode_utf8_four_byte_sequence(uint32_t *dst, const uint8_t *src) {
	uint32_t word;
	if (is_utf8_sequence_byte(src[1]) &&
			is_utf8_sequence_byte(src[2]) &&
			is_utf8_sequence_byte(src[3])) {
		word = decode_utf8_four_byte_sequence_start_byte(src[0]);
		word <<= UTF8_SEQUENCE_WORD_DATA_SHIFT;
		word |= decode_utf8_sequence_byte(src[1]);
		word <<= UTF8_SEQUENCE_WORD_DATA_SHIFT;
		word |= decode_utf8_sequence_byte(src[2]);
		word <<= UTF8_SEQUENCE_WORD_DATA_SHIFT;
		word |= decode_utf8_sequence_byte(src[3]);
		if (is_non_overlong_utf8_four_byte_sequence_word(word) &&
				word <= UTF8_FOUR_BYTE_SEQUENCE_WORD_MAX_VALUE) {
			*dst = word;
			return true;
		}
	}
	*dst = '?';
	return false;
}

// Converts text from UTF8 to UTF32, outputting at most `dst_size - 1` words, 0-terminated.
// Returns the number of words actually output, not counting the terminator.
// Converts malformed bytes to '?', and leaves truncated sequences unprocessed.
// A malformed sequence consumes only its start byte; the bytes after it are
// then examined on their own.
// If `dst_size` is 0, nothing is written to `dst` (not even the terminator),
// no input is consumed, and 0 is returned.
// `src_unused` may be NULL; otherwise it receives the number of trailing `src`
// bytes that were not consumed.
uint32_t convert_from_utf8_to_utf32(uint32_t *dst, uint32_t dst_size, const uint8_t *src, uint32_t src_size, uint32_t *src_unused) {
	uint32_t i = 0, j = 0;
	if (dst_size == 0) {
		// There is no room for the terminator, so `dst` must not be touched.
		if (src_unused)
			*src_unused = src_size;
		return 0;
	}
	while (i < src_size && j + 1 < dst_size) {
		// At least 1 here, so the subtraction cannot wrap (unlike `i + k`).
		const uint32_t available = src_size - i;
		// A failed decode consumes just the start byte.
		uint32_t consumed = 1;
		if (is_utf8_single_byte(src[i]))
			dst[j] = src[i];
		else if (is_utf8_two_byte_sequence_start_byte(src[i])) {
			if (available < 2)
				break;
			if (decode_utf8_two_byte_sequence(&dst[j], &src[i]))
				consumed = 2;
		}
		else if (is_utf8_three_byte_sequence_start_byte(src[i])) {
			if (available < 3)
				break;
			if (decode_utf8_three_byte_sequence(&dst[j], &src[i]))
				consumed = 3;
		}
		else if (is_utf8_four_byte_sequence_start_byte(src[i])) {
			if (available < 4)
				break;
			if (decode_utf8_four_byte_sequence(&dst[j], &src[i]))
				consumed = 4;
		}
		else
			dst[j] = '?';
		i += consumed;
		j++;
	}
	if (src_unused)
		*src_unused = src_size - i;
	dst[j] = 0;
	return j;
}


// Writes the UTF8 two-byte sequence for a UTF32 word into `dst[0]` and `dst[1]`.
static inline void encode_utf8_two_byte_sequence(uint8_t *dst, uint32_t word) {
	const uint32_t start_bits = word >> UTF8_SEQUENCE_WORD_DATA_SHIFT;
	dst[0] = encode_utf8_two_byte_sequence_start_byte(start_bits);
	dst[1] = encode_utf8_sequence_byte(word);
}

// Writes the UTF8 three-byte sequence for a UTF32 word into `dst[0]`..`dst[2]`.
static inline void encode_utf8_three_byte_sequence(uint8_t *dst, uint32_t word) {
	const uint32_t middle_bits = word >> UTF8_SEQUENCE_WORD_DATA_SHIFT;
	const uint32_t start_bits = middle_bits >> UTF8_SEQUENCE_WORD_DATA_SHIFT;
	dst[0] = encode_utf8_three_byte_sequence_start_byte(start_bits);
	dst[1] = encode_utf8_sequence_byte(middle_bits);
	dst[2] = encode_utf8_sequence_byte(word);
}

// Writes the UTF8 four-byte sequence for a UTF32 word into `dst[0]`..`dst[3]`.
static inline void encode_utf8_four_byte_sequence(uint8_t *dst, uint32_t word) {
	const uint32_t third_bits = word >> UTF8_SEQUENCE_WORD_DATA_SHIFT;
	const uint32_t second_bits = third_bits >> UTF8_SEQUENCE_WORD_DATA_SHIFT;
	const uint32_t start_bits = second_bits >> UTF8_SEQUENCE_WORD_DATA_SHIFT;
	dst[0] = encode_utf8_four_byte_sequence_start_byte(start_bits);
	dst[1] = encode_utf8_sequence_byte(second_bits);
	dst[2] = encode_utf8_sequence_byte(third_bits);
	dst[3] = encode_utf8_sequence_byte(word);
}

// Largest UTF32 word that fits each UTF8 encoding length.
#define MAX_UTF8_SINGLE_BYTE_WORD         0x0000007F
#define MAX_UTF8_TWO_BYTE_SEQUENCE_WORD   0x000007FF
#define MAX_UTF8_THREE_BYTE_SEQUENCE_WORD 0x0000FFFF
#define MAX_UTF8_FOUR_BYTE_SEQUENCE_WORD  0x0010FFFF

// Range of UTF16 surrogate code points; these are not valid UTF32 words.
#define MIN_UTF32_SURROGATE_WORD 0x0000D800
#define MAX_UTF32_SURROGATE_WORD 0x0000DFFF

// Converts text from UTF32 to UTF8, outputting at most `dst_size - 1` bytes, 0-terminated.
// Returns the number of bytes actually output, not counting the terminator.
// Converts malformed words (surrogates and anything above U+10FFFF) to '?'.
// Stops early, without emitting a partial sequence, when the next word's
// encoding does not fit in the remaining space.
// If `dst_size` is 0, nothing is written to `dst` (not even the terminator),
// no input is consumed, and 0 is returned.
// `src_unused` may be NULL; otherwise it receives the number of trailing `src`
// words that were not consumed.
uint32_t convert_from_utf32_to_utf8(uint8_t *dst, uint32_t dst_size, const uint32_t *src, uint32_t src_size, uint32_t *src_unused) {
	uint32_t i = 0, j = 0;
	if (dst_size == 0) {
		// There is no room for the terminator, so `dst` must not be touched.
		if (src_unused)
			*src_unused = src_size;
		return 0;
	}
	// Bytes available for text once the terminator is accounted for.
	const uint32_t capacity = dst_size - 1;
	while (i < src_size && j < capacity) {
		const uint32_t word = src[i];
		const uint32_t room = capacity - j;
		if (word <= MAX_UTF8_SINGLE_BYTE_WORD)
			dst[j++] = (uint8_t)word;
		else if (word <= MAX_UTF8_TWO_BYTE_SEQUENCE_WORD) {
			if (room < 2)
				break;
			encode_utf8_two_byte_sequence(&dst[j], word);
			j += 2;
		}
		else if (word >= MIN_UTF32_SURROGATE_WORD && word <= MAX_UTF32_SURROGATE_WORD)
			// Surrogates would otherwise be emitted as ill-formed three-byte sequences.
			dst[j++] = '?';
		else if (word <= MAX_UTF8_THREE_BYTE_SEQUENCE_WORD) {
			if (room < 3)
				break;
			encode_utf8_three_byte_sequence(&dst[j], word);
			j += 3;
		}
		else if (word <= MAX_UTF8_FOUR_BYTE_SEQUENCE_WORD) {
			if (room < 4)
				break;
			encode_utf8_four_byte_sequence(&dst[j], word);
			j += 4;
		}
		else
			dst[j++] = '?';
		i++;
	}
	if (src_unused)
		*src_unused = src_size - i;
	dst[j] = 0;
	return j;
}

## convert_utf8.h
// Converts text from UTF8 to UTF32, outputting at most `dst_size - 1` words, 0-terminated.
//
// `dst`        - output buffer of `dst_size` words; `dst_size` should be at least 1,
//                since room for the terminator is assumed.
// `src`        - input buffer of `src_size` bytes; it need not be 0-terminated.
// `src_unused` - may be NULL; otherwise receives the number of trailing `src` bytes
//                left unprocessed (output full, or input ends in a truncated sequence).
//
// Returns the number of words actually output, not counting the terminator.
// Converts malformed bytes to '?', and leaves truncated sequences unprocessed.
uint32_t convert_from_utf8_to_utf32(uint32_t *dst, uint32_t dst_size, const uint8_t *src, uint32_t src_size, uint32_t *src_unused);


// Converts text from UTF32 to UTF8, outputting at most `dst_size - 1` bytes, 0-terminated.
//
// `dst`        - output buffer of `dst_size` bytes; `dst_size` should be at least 1,
//                since room for the terminator is assumed.
// `src`        - input buffer of `src_size` words; it need not be 0-terminated.
// `src_unused` - may be NULL; otherwise receives the number of trailing `src` words
//                left unprocessed (the next word's encoding did not fit in `dst`).
//
// Returns the number of bytes actually output, not counting the terminator.
// Converts malformed words to '?'.
uint32_t convert_from_utf32_to_utf8(uint8_t *dst, uint32_t dst_size, const uint32_t *src, uint32_t src_size, uint32_t *src_unused);
	#include <stdbool.h>
	#include <stdint.h>

	#include "convert_utf8.h"


	#define UTF8_SINGLE_BYTE_CONTROL_MASK 0x80
	#define UTF8_SINGLE_BYTE_CONTROL_VALUE 0x00

	// Returns true if `byte` is standalone in UTF8.
	inline static bool is_utf8_single_byte(uint8_t byte) {
	return (byte & UTF8_SINGLE_BYTE_CONTROL_MASK) == UTF8_SINGLE_BYTE_CONTROL_VALUE;
	}


	#define UTF8_SEQUENCE_BYTE_CONTROL_MASK 0xC0
	#define UTF8_SEQUENCE_BYTE_CONTROL_VALUE 0x80
	#define UTF8_SEQUENCE_BYTE_DATA_MASK 0x3F

	#define UTF8_SEQUENCE_WORD_DATA_SHIFT 6

	// Returns true if `byte` is a part of a UTF8 sequence.
	inline static bool is_utf8_sequence_byte(uint8_t byte) {
	return (byte & UTF8_SEQUENCE_BYTE_CONTROL_MASK) == UTF8_SEQUENCE_BYTE_CONTROL_VALUE;
	}

	// Returns the data bits of a UTF8 sequence byte.
	inline static uint32_t decode_utf8_sequence_byte(uint8_t byte) {
	return byte & UTF8_SEQUENCE_BYTE_DATA_MASK;
	}

	// Returns a UTF8 sequence byte encoding the data bits of a UTF32 word.
	inline static uint8_t encode_utf8_sequence_byte(uint32_t word) {
	return UTF8_SEQUENCE_BYTE_CONTROL_VALUE | (word & UTF8_SEQUENCE_BYTE_DATA_MASK);
	}


	#define UTF8_TWO_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK 0xE0
	#define UTF8_TWO_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE 0xC0
	#define UTF8_TWO_BYTE_SEQUENCE_START_BYTE_DATA_MASK 0x1F

	#define UTF8_TWO_BYTE_SEQUENCE_WORD_NON_ZERO_MASK 0x00000780

	// Returns true if `byte` is the start of a UTF8 two-byte sequence.
	inline static bool is_utf8_two_byte_sequence_start_byte(uint8_t byte) {
	return (byte & UTF8_TWO_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK) == UTF8_TWO_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE;
	}

	// Returns the data bits of a UTF8 two-byte sequence start byte.
	inline static uint32_t decode_utf8_two_byte_sequence_start_byte(uint8_t byte) {
	return byte & UTF8_TWO_BYTE_SEQUENCE_START_BYTE_DATA_MASK;
	}

	// Returns true if `word` is a non-overlong UTF8 two-byte sequence.
	inline static bool is_non_overlong_utf8_two_byte_sequence_word(uint32_t word) {
	return word & UTF8_TWO_BYTE_SEQUENCE_WORD_NON_ZERO_MASK;
	}

	// Returns a UTF8 two-byte sequence start byte encoding the data bits of `word`.
	inline static uint8_t encode_utf8_two_byte_sequence_start_byte(uint32_t word) {
	return UTF8_TWO_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE | (word & UTF8_TWO_BYTE_SEQUENCE_START_BYTE_DATA_MASK);
	}


	#define UTF8_THREE_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK 0xF0
	#define UTF8_THREE_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE 0xE0
	#define UTF8_THREE_BYTE_SEQUENCE_START_BYTE_DATA_MASK 0x0F

	#define UTF8_THREE_BYTE_SEQUENCE_WORD_NON_ZERO_MASK 0x0000F800

	// Returns true if `byte` is the start of a UTF8 three-byte sequence.
	inline static bool is_utf8_three_byte_sequence_start_byte(uint8_t byte) {
	return (byte & UTF8_THREE_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK) == UTF8_THREE_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE;
	}

	// Returns the data bits of a UTF8 three-byte sequence start byte.
	inline static uint32_t decode_utf8_three_byte_sequence_start_byte(uint8_t byte) {
	return byte & UTF8_THREE_BYTE_SEQUENCE_START_BYTE_DATA_MASK;
	}

	// Returns true if `word` is a non-overlong UTF8 three-byte sequence.
	inline static bool is_non_overlong_utf8_three_byte_sequence_word(uint32_t word) {
	return word & UTF8_THREE_BYTE_SEQUENCE_WORD_NON_ZERO_MASK;
	}

	// Returns a UTF8 three-byte sequence start byte encoding the data bits of `word`.
	inline static uint8_t encode_utf8_three_byte_sequence_start_byte(uint32_t word) {
	return UTF8_THREE_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE | (word & UTF8_THREE_BYTE_SEQUENCE_START_BYTE_DATA_MASK);
	}


	#define UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK 0xF8
	#define UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE 0xF0
	#define UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_DATA_MASK 0x07

	#define UTF8_FOUR_BYTE_SEQUENCE_WORD_NON_ZERO_MASK 0x001F0000

	// Returns true if `byte` is the start of a UTF8 four-byte sequence.
	inline static bool is_utf8_four_byte_sequence_start_byte(uint8_t byte) {
	return (byte & UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK) == UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE;
	}

	// Returns the data bits of a UTF8 four-byte sequence start byte.
	inline static uint32_t decode_utf8_four_byte_sequence_start_byte(uint8_t byte) {
	return byte & UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_DATA_MASK;
	}

	// Returns true if `word` is a non-overlong UTF8 four-byte sequence.
	inline static bool is_non_overlong_utf8_four_byte_sequence_word(uint32_t word) {
	return word & UTF8_FOUR_BYTE_SEQUENCE_WORD_NON_ZERO_MASK;
	}

	// Returns a UTF8 four-byte sequence start byte encoding the data bits of `word`.
	inline static uint8_t encode_utf8_four_byte_sequence_start_byte(uint32_t word) {
	return UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE | (word & UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_DATA_MASK);
	}


	// Decodes a UTF32 word from a UTF8 two-byte sequence, or '?' if unsuccessful.
	// Returns true if successful.
	inline static bool decode_utf8_two_byte_sequence(uint32_t *dst, const uint8_t *src) {
	uint32_t word;
	if (is_utf8_sequence_byte(src[1])) {
	word = decode_utf8_two_byte_sequence_start_byte(src[0]);
	word <<= UTF8_SEQUENCE_WORD_DATA_SHIFT;
	word |= decode_utf8_sequence_byte(src[1]);
	if (is_non_overlong_utf8_two_byte_sequence_word(word)) {
	*dst = word;
	return true;
	}
	}
	*dst = '?';
	return false;
	}

	// Decodes a UTF32 word from a UTF8 three-byte sequence, or '?' if unsuccessful.
	// Returns true if successful.
	inline static bool decode_utf8_three_byte_sequence(uint32_t *dst, const uint8_t *src) {
	uint32_t word;
	if (is_utf8_sequence_byte(src[1]) &&
	is_utf8_sequence_byte(src[2])) {
	word = decode_utf8_three_byte_sequence_start_byte(src[0]);
	word <<= UTF8_SEQUENCE_WORD_DATA_SHIFT;
	word |= decode_utf8_sequence_byte(src[1]);
	word <<= UTF8_SEQUENCE_WORD_DATA_SHIFT;
	word |= decode_utf8_sequence_byte(src[2]);
	if (is_non_overlong_utf8_three_byte_sequence_word(word)) {
	*dst = word;
	return true;
	}
	}
	*dst = '?';
	return false;
	}

	// Decodes a UTF32 word from a UTF8 four-byte sequence, or '?' if unsuccessful.
	// Returns true if successful.
	inline static bool decode_utf8_four_byte_sequence(uint32_t *dst, const uint8_t *src) {
	uint32_t word;
	if (is_utf8_sequence_byte(src[1]) &&
	is_utf8_sequence_byte(src[2]) &&
	is_utf8_sequence_byte(src[3])) {
	word = decode_utf8_four_byte_sequence_start_byte(src[0]);
	word <<= UTF8_SEQUENCE_WORD_DATA_SHIFT;
	word |= decode_utf8_sequence_byte(src[1]);
	word <<= UTF8_SEQUENCE_WORD_DATA_SHIFT;
	word |= decode_utf8_sequence_byte(src[2]);
	word <<= UTF8_SEQUENCE_WORD_DATA_SHIFT;
	word |= decode_utf8_sequence_byte(src[3]);
	if (is_non_overlong_utf8_four_byte_sequence_word(word)) {
	*dst = word;
	return true;
	}
	}
	*dst = '?';
	return false;
	}

	// Converts text from UTF8 to UTF32, outputting at most `dst_size - 1` words, 0-terminated.
	// Returns the number of words actually output.
	// Converts malformed bytes to '?', and leaves truncated sequences unprocessed.
	// `src_unused` may be NULL.
	uint32_t convert_from_utf8_to_utf32(uint32_t *dst, uint32_t dst_size, const uint8_t *src, uint32_t src_size, uint32_t *src_unused) {
	uint32_t i, j;
	for (i = 0, j = 0; i < src_size && j + 1 < dst_size; i++, j++) {
	if (is_utf8_single_byte(src[i]))
	dst[j] = src[i];
	else if (is_utf8_two_byte_sequence_start_byte(src[i])) {
	if (i + 1 >= src_size)
	break;
	if (decode_utf8_two_byte_sequence(&dst[j], &src[i]))
	i++;
	}
	else if (is_utf8_three_byte_sequence_start_byte(src[i])) {
	if (i + 2 >= src_size)
	break;
	if (decode_utf8_three_byte_sequence(&dst[j], &src[i]))
	i += 2;
	}
	else if (is_utf8_four_byte_sequence_start_byte(src[i])) {
	if (i + 3 >= src_size)
	break;
	if (decode_utf8_four_byte_sequence(&dst[j], &src[i]))
	i += 3;
	}
	else
	dst[j] = '?';
	}
	if (src_unused)
	*src_unused = src_size - i;
	dst[j] = 0;
	return j;
	}


	// Encodes a UTF32 word to a UTF8 two-byte sequence.
	inline static void encode_utf8_two_byte_sequence(uint8_t *dst, uint32_t word) {
	dst[1] = encode_utf8_sequence_byte(word);
	word >>= UTF8_SEQUENCE_WORD_DATA_SHIFT;
	dst[0] = encode_utf8_two_byte_sequence_start_byte(word);
	}

	// Encodes a UTF32 word to a UTF8 three-byte sequence.
	inline static void encode_utf8_three_byte_sequence(uint8_t *dst, uint32_t word) {
	dst[2] = encode_utf8_sequence_byte(word);
	word >>= UTF8_SEQUENCE_WORD_DATA_SHIFT;
	dst[1] = encode_utf8_sequence_byte(word);
	word >>= UTF8_SEQUENCE_WORD_DATA_SHIFT;
	dst[0] = encode_utf8_three_byte_sequence_start_byte(word);
	}

	// Encodes a UTF32 word to a UTF8 four-byte sequence.
	inline static void encode_utf8_four_byte_sequence(uint8_t *dst, uint32_t word) {
	dst[3] = encode_utf8_sequence_byte(word);
	word >>= UTF8_SEQUENCE_WORD_DATA_SHIFT;
	dst[2] = encode_utf8_sequence_byte(word);
	word >>= UTF8_SEQUENCE_WORD_DATA_SHIFT;
	dst[1] = encode_utf8_sequence_byte(word);
	word >>= UTF8_SEQUENCE_WORD_DATA_SHIFT;
	dst[0] = encode_utf8_four_byte_sequence_start_byte(word);
	}

	#define MAX_UTF8_SINGLE_BYTE_WORD 0x0000007F
	#define MAX_UTF8_TWO_BYTE_SEQUENCE_WORD 0x000007FF
	#define MAX_UTF8_THREE_BYTE_SEQUENCE_WORD 0x0000FFFF
	#define MAX_UTF8_FOUR_BYTE_SEQUENCE_WORD 0x0010FFFF

	// Converts text from UTF32 to UTF8, outputting at most `dst_size - 1` bytes, 0-terminated.
	// Returns the number of bytes actually output.
	// Converts malformed words to '?'.
	// `src_unused` may be NULL.
	uint32_t convert_from_utf32_to_utf8(uint8_t *dst, uint32_t dst_size, const uint32_t *src, uint32_t src_size, uint32_t *src_unused) {
	uint32_t i, j;
	for (i = 0, j = 0; i < src_size && j + 1 < dst_size; i++, j++) {
	if (src[i] <= MAX_UTF8_SINGLE_BYTE_WORD)
	dst[j] = src[i];
	else if (src[i] <= MAX_UTF8_TWO_BYTE_SEQUENCE_WORD) {
	if (j + 2 >= dst_size)
	break;
	encode_utf8_two_byte_sequence(&dst[j], src[i]);
	j++;
	}
	else if (src[i] <= MAX_UTF8_THREE_BYTE_SEQUENCE_WORD) {
	if (j + 3 >= dst_size)
	break;
	encode_utf8_three_byte_sequence(&dst[j], src[i]);
	j += 2;
	}
	else if (src[i] <= MAX_UTF8_FOUR_BYTE_SEQUENCE_WORD) {
	if (j + 4 >= dst_size)
	break;
	encode_utf8_four_byte_sequence(&dst[j], src[i]);
	j += 3;
	}
	else
	dst[j] = '?';
	}
	if (src_unused)
	*src_unused = src_size - i;
	dst[j] = 0;
	return j;
	}