Skip to content

Instantly share code, notes, and snippets.

@mietek
Created July 27, 2010 21:09
Show Gist options
  • Save mietek/492864 to your computer and use it in GitHub Desktop.
Save mietek/492864 to your computer and use it in GitHub Desktop.
#include <stdbool.h>
#include <stdint.h>
#include "convert_utf8.h"
// A standalone UTF8 byte has its top bit clear: 0xxxxxxx.
#define UTF8_SINGLE_BYTE_CONTROL_MASK 0x80
#define UTF8_SINGLE_BYTE_CONTROL_VALUE 0x00
// Tells whether `byte` is a complete UTF8 character on its own,
// i.e. whether its control bit matches the single-byte pattern.
inline static bool is_utf8_single_byte(uint8_t byte) {
    const int control_bits = byte & UTF8_SINGLE_BYTE_CONTROL_MASK;
    return control_bits == UTF8_SINGLE_BYTE_CONTROL_VALUE;
}
// A UTF8 sequence byte has the form 10xxxxxx: the top two bits are control
// bits and the low six bits carry data. Each sequence byte therefore
// contributes UTF8_SEQUENCE_WORD_DATA_SHIFT bits to the decoded word.
#define UTF8_SEQUENCE_BYTE_CONTROL_MASK 0xC0
#define UTF8_SEQUENCE_BYTE_CONTROL_VALUE 0x80
#define UTF8_SEQUENCE_BYTE_DATA_MASK 0x3F
#define UTF8_SEQUENCE_WORD_DATA_SHIFT 6
// Tells whether `byte` carries the control bits of a UTF8 sequence byte.
inline static bool is_utf8_sequence_byte(uint8_t byte) {
    const int control_bits = byte & UTF8_SEQUENCE_BYTE_CONTROL_MASK;
    return control_bits == UTF8_SEQUENCE_BYTE_CONTROL_VALUE;
}
// Extracts the six data bits of a UTF8 sequence byte; the control bits are dropped.
inline static uint32_t decode_utf8_sequence_byte(uint8_t byte) {
    const uint32_t data_bits = byte & UTF8_SEQUENCE_BYTE_DATA_MASK;
    return data_bits;
}
// Builds a UTF8 sequence byte from the low six bits of a UTF32 word.
// Higher bits of `word` are ignored.
inline static uint8_t encode_utf8_sequence_byte(uint32_t word) {
    const uint32_t data_bits = word & UTF8_SEQUENCE_BYTE_DATA_MASK;
    return (uint8_t)(UTF8_SEQUENCE_BYTE_CONTROL_VALUE | data_bits);
}
// A two-byte sequence start byte has the form 110xxxxx (five data bits).
// A decoded two-byte word is overlong unless at least one of bits 7..10 is set.
#define UTF8_TWO_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK 0xE0
#define UTF8_TWO_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE 0xC0
#define UTF8_TWO_BYTE_SEQUENCE_START_BYTE_DATA_MASK 0x1F
#define UTF8_TWO_BYTE_SEQUENCE_WORD_NON_ZERO_MASK 0x00000780
// Tells whether `byte` carries the control bits of a two-byte sequence start byte.
inline static bool is_utf8_two_byte_sequence_start_byte(uint8_t byte) {
    const int control_bits = byte & UTF8_TWO_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK;
    return control_bits == UTF8_TWO_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE;
}
// Extracts the five data bits of a two-byte sequence start byte.
inline static uint32_t decode_utf8_two_byte_sequence_start_byte(uint8_t byte) {
    const uint32_t data_bits = byte & UTF8_TWO_BYTE_SEQUENCE_START_BYTE_DATA_MASK;
    return data_bits;
}
// Tells whether `word`, as decoded from a two-byte sequence, actually needed
// two bytes, i.e. has a bit set that would not fit in a single byte.
inline static bool is_non_overlong_utf8_two_byte_sequence_word(uint32_t word) {
    const uint32_t high_bits = word & UTF8_TWO_BYTE_SEQUENCE_WORD_NON_ZERO_MASK;
    return high_bits != 0;
}
// Builds a two-byte sequence start byte from the low five bits of `word`.
// The caller is expected to have shifted the word down beforehand.
inline static uint8_t encode_utf8_two_byte_sequence_start_byte(uint32_t word) {
    const uint32_t data_bits = word & UTF8_TWO_BYTE_SEQUENCE_START_BYTE_DATA_MASK;
    return (uint8_t)(UTF8_TWO_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE | data_bits);
}
// A three-byte sequence start byte has the form 1110xxxx (four data bits).
// A decoded three-byte word is overlong unless at least one of bits 11..15 is set.
#define UTF8_THREE_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK 0xF0
#define UTF8_THREE_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE 0xE0
#define UTF8_THREE_BYTE_SEQUENCE_START_BYTE_DATA_MASK 0x0F
#define UTF8_THREE_BYTE_SEQUENCE_WORD_NON_ZERO_MASK 0x0000F800
// Tells whether `byte` carries the control bits of a three-byte sequence start byte.
inline static bool is_utf8_three_byte_sequence_start_byte(uint8_t byte) {
    const int control_bits = byte & UTF8_THREE_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK;
    return control_bits == UTF8_THREE_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE;
}
// Extracts the four data bits of a three-byte sequence start byte.
inline static uint32_t decode_utf8_three_byte_sequence_start_byte(uint8_t byte) {
    const uint32_t data_bits = byte & UTF8_THREE_BYTE_SEQUENCE_START_BYTE_DATA_MASK;
    return data_bits;
}
// Tells whether `word`, as decoded from a three-byte sequence, actually needed
// three bytes, i.e. has a bit set that would not fit in a two-byte sequence.
inline static bool is_non_overlong_utf8_three_byte_sequence_word(uint32_t word) {
    const uint32_t high_bits = word & UTF8_THREE_BYTE_SEQUENCE_WORD_NON_ZERO_MASK;
    return high_bits != 0;
}
// Builds a three-byte sequence start byte from the low four bits of `word`.
// The caller is expected to have shifted the word down beforehand.
inline static uint8_t encode_utf8_three_byte_sequence_start_byte(uint32_t word) {
    const uint32_t data_bits = word & UTF8_THREE_BYTE_SEQUENCE_START_BYTE_DATA_MASK;
    return (uint8_t)(UTF8_THREE_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE | data_bits);
}
// A four-byte sequence start byte has the form 11110xxx (three data bits).
// A decoded four-byte word is overlong unless at least one of bits 16..20 is set.
#define UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK 0xF8
#define UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE 0xF0
#define UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_DATA_MASK 0x07
#define UTF8_FOUR_BYTE_SEQUENCE_WORD_NON_ZERO_MASK 0x001F0000
// Tells whether `byte` carries the control bits of a four-byte sequence start byte.
inline static bool is_utf8_four_byte_sequence_start_byte(uint8_t byte) {
    const int control_bits = byte & UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_CONTROL_MASK;
    return control_bits == UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE;
}
// Extracts the three data bits of a four-byte sequence start byte.
inline static uint32_t decode_utf8_four_byte_sequence_start_byte(uint8_t byte) {
    const uint32_t data_bits = byte & UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_DATA_MASK;
    return data_bits;
}
// Tells whether `word`, as decoded from a four-byte sequence, actually needed
// four bytes, i.e. has a bit set that would not fit in a three-byte sequence.
inline static bool is_non_overlong_utf8_four_byte_sequence_word(uint32_t word) {
    const uint32_t high_bits = word & UTF8_FOUR_BYTE_SEQUENCE_WORD_NON_ZERO_MASK;
    return high_bits != 0;
}
// Builds a four-byte sequence start byte from the low three bits of `word`.
// The caller is expected to have shifted the word down beforehand.
inline static uint8_t encode_utf8_four_byte_sequence_start_byte(uint32_t word) {
    const uint32_t data_bits = word & UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_DATA_MASK;
    return (uint8_t)(UTF8_FOUR_BYTE_SEQUENCE_START_BYTE_CONTROL_VALUE | data_bits);
}
// Decodes the UTF8 two-byte sequence at `src[0..1]` into a UTF32 word stored in `*dst`.
// Returns true if successful; otherwise stores '?' in `*dst` and returns false.
// Decoding fails when `src[1]` is not a sequence byte or when the encoding is overlong.
inline static bool decode_utf8_two_byte_sequence(uint32_t *dst, const uint8_t *src) {
    uint32_t result = '?';
    bool ok = false;
    if (is_utf8_sequence_byte(src[1])) {
        // Start-byte data forms the high bits, the sequence byte the low six bits.
        const uint32_t high_bits = decode_utf8_two_byte_sequence_start_byte(src[0]);
        const uint32_t candidate = (high_bits << UTF8_SEQUENCE_WORD_DATA_SHIFT) | decode_utf8_sequence_byte(src[1]);
        if (is_non_overlong_utf8_two_byte_sequence_word(candidate)) {
            result = candidate;
            ok = true;
        }
    }
    *dst = result;
    return ok;
}
// Bit pattern shared by every UTF16 surrogate (U+D800..U+DFFF): the top five
// of the sixteen low bits are 11011. Surrogates are not valid in UTF8.
#define UTF8_THREE_BYTE_SEQUENCE_WORD_SURROGATE_MASK 0x0000F800
#define UTF8_THREE_BYTE_SEQUENCE_WORD_SURROGATE_VALUE 0x0000D800
// Decodes a UTF32 word from the UTF8 three-byte sequence at `src[0..2]` into `*dst`.
// Returns true if successful; otherwise stores '?' in `*dst` and returns false.
// Decoding fails when `src[1]` or `src[2]` is not a sequence byte, when the
// encoding is overlong, or when the decoded word is a UTF16 surrogate.
inline static bool decode_utf8_three_byte_sequence(uint32_t *dst, const uint8_t *src) {
    if (is_utf8_sequence_byte(src[1]) &&
        is_utf8_sequence_byte(src[2])) {
        // Assemble the word from most significant to least significant bits.
        uint32_t word = decode_utf8_three_byte_sequence_start_byte(src[0]);
        word <<= UTF8_SEQUENCE_WORD_DATA_SHIFT;
        word |= decode_utf8_sequence_byte(src[1]);
        word <<= UTF8_SEQUENCE_WORD_DATA_SHIFT;
        word |= decode_utf8_sequence_byte(src[2]);
        const bool is_surrogate =
            (word & UTF8_THREE_BYTE_SEQUENCE_WORD_SURROGATE_MASK) == UTF8_THREE_BYTE_SEQUENCE_WORD_SURROGATE_VALUE;
        if (is_non_overlong_utf8_three_byte_sequence_word(word) && !is_surrogate) {
            *dst = word;
            return true;
        }
    }
    *dst = '?';
    return false;
}
// Largest word a well-formed UTF8 four-byte sequence may decode to (U+10FFFF).
#define UTF8_FOUR_BYTE_SEQUENCE_WORD_MAX_VALUE 0x0010FFFF
// Decodes a UTF32 word from the UTF8 four-byte sequence at `src[0..3]` into `*dst`.
// Returns true if successful; otherwise stores '?' in `*dst` and returns false.
// Decoding fails when any of `src[1..3]` is not a sequence byte, when the
// encoding is overlong, or when the decoded word lies above U+10FFFF
// (the start byte's three data bits can express values up to 0x1FFFFF).
inline static bool decode_utf8_four_byte_sequence(uint32_t *dst, const uint8_t *src) {
    if (is_utf8_sequence_byte(src[1]) &&
        is_utf8_sequence_byte(src[2]) &&
        is_utf8_sequence_byte(src[3])) {
        // Assemble the word from most significant to least significant bits.
        uint32_t word = decode_utf8_four_byte_sequence_start_byte(src[0]);
        word <<= UTF8_SEQUENCE_WORD_DATA_SHIFT;
        word |= decode_utf8_sequence_byte(src[1]);
        word <<= UTF8_SEQUENCE_WORD_DATA_SHIFT;
        word |= decode_utf8_sequence_byte(src[2]);
        word <<= UTF8_SEQUENCE_WORD_DATA_SHIFT;
        word |= decode_utf8_sequence_byte(src[3]);
        if (is_non_overlong_utf8_four_byte_sequence_word(word) &&
            word <= UTF8_FOUR_BYTE_SEQUENCE_WORD_MAX_VALUE) {
            *dst = word;
            return true;
        }
    }
    *dst = '?';
    return false;
}
// Converts text from UTF8 to UTF32, outputting at most `dst_size - 1` words, 0-terminated.
// Returns the number of words actually output, not counting the terminator.
// Converts malformed bytes to '?', and leaves truncated sequences unprocessed.
// If `dst_size` is 0 there is no room for the terminator, so `dst` is not
// written at all and 0 is returned.
// `src_unused` may be NULL; otherwise it receives the number of trailing
// `src` bytes that were not consumed.
uint32_t convert_from_utf8_to_utf32(uint32_t *dst, uint32_t dst_size, const uint8_t *src, uint32_t src_size, uint32_t *src_unused) {
    uint32_t i = 0, j = 0;
    if (dst_size == 0) {
        if (src_unused)
            *src_unused = src_size;
        return 0;
    }
    // `j + 1 < dst_size` always keeps one slot free for the terminator.
    for (; i < src_size && j + 1 < dst_size; i++, j++) {
        // `i < src_size` holds here, so this is at least 1 and cannot wrap.
        const uint32_t remaining = src_size - i;
        if (is_utf8_single_byte(src[i])) {
            dst[j] = src[i];
        } else if (is_utf8_two_byte_sequence_start_byte(src[i])) {
            if (remaining < 2)
                break; // truncated sequence: leave it unprocessed
            // On failure the decoder stores '?' and only the start byte is consumed.
            if (decode_utf8_two_byte_sequence(&dst[j], &src[i]))
                i++;
        } else if (is_utf8_three_byte_sequence_start_byte(src[i])) {
            if (remaining < 3)
                break;
            if (decode_utf8_three_byte_sequence(&dst[j], &src[i]))
                i += 2;
        } else if (is_utf8_four_byte_sequence_start_byte(src[i])) {
            if (remaining < 4)
                break;
            if (decode_utf8_four_byte_sequence(&dst[j], &src[i]))
                i += 3;
        } else {
            // Stray sequence byte or a byte that can never start a sequence.
            dst[j] = '?';
        }
    }
    if (src_unused)
        *src_unused = src_size - i;
    dst[j] = 0;
    return j;
}
// Writes `word` to `dst[0..1]` as a UTF8 two-byte sequence.
// The low six bits go into the sequence byte, the next bits into the start byte.
inline static void encode_utf8_two_byte_sequence(uint8_t *dst, uint32_t word) {
    const uint32_t start_bits = word >> UTF8_SEQUENCE_WORD_DATA_SHIFT;
    dst[0] = encode_utf8_two_byte_sequence_start_byte(start_bits);
    dst[1] = encode_utf8_sequence_byte(word);
}
// Writes `word` to `dst[0..2]` as a UTF8 three-byte sequence.
// Each sequence byte takes six bits, lowest bits last; the start byte takes the rest.
inline static void encode_utf8_three_byte_sequence(uint8_t *dst, uint32_t word) {
    const uint32_t middle_bits = word >> UTF8_SEQUENCE_WORD_DATA_SHIFT;
    const uint32_t start_bits = middle_bits >> UTF8_SEQUENCE_WORD_DATA_SHIFT;
    dst[0] = encode_utf8_three_byte_sequence_start_byte(start_bits);
    dst[1] = encode_utf8_sequence_byte(middle_bits);
    dst[2] = encode_utf8_sequence_byte(word);
}
// Writes `word` to `dst[0..3]` as a UTF8 four-byte sequence.
// Each sequence byte takes six bits, lowest bits last; the start byte takes the rest.
inline static void encode_utf8_four_byte_sequence(uint8_t *dst, uint32_t word) {
    const uint32_t third_bits = word >> UTF8_SEQUENCE_WORD_DATA_SHIFT;
    const uint32_t second_bits = third_bits >> UTF8_SEQUENCE_WORD_DATA_SHIFT;
    const uint32_t start_bits = second_bits >> UTF8_SEQUENCE_WORD_DATA_SHIFT;
    dst[0] = encode_utf8_four_byte_sequence_start_byte(start_bits);
    dst[1] = encode_utf8_sequence_byte(second_bits);
    dst[2] = encode_utf8_sequence_byte(third_bits);
    dst[3] = encode_utf8_sequence_byte(word);
}
// Largest UTF32 word that fits each UTF8 encoding length.
#define MAX_UTF8_SINGLE_BYTE_WORD 0x0000007F
#define MAX_UTF8_TWO_BYTE_SEQUENCE_WORD 0x000007FF
#define MAX_UTF8_THREE_BYTE_SEQUENCE_WORD 0x0000FFFF
#define MAX_UTF8_FOUR_BYTE_SEQUENCE_WORD 0x0010FFFF
// UTF16 surrogate range; these words are not characters and must not be encoded.
#define MIN_UTF16_SURROGATE_WORD 0x0000D800
#define MAX_UTF16_SURROGATE_WORD 0x0000DFFF
// Converts text from UTF32 to UTF8, outputting at most `dst_size - 1` bytes, 0-terminated.
// Returns the number of bytes actually output, not counting the terminator.
// Converts malformed words (UTF16 surrogates and words above U+10FFFF) to '?'.
// Stops before a word whose whole encoding plus the terminator would not fit.
// If `dst_size` is 0 there is no room for the terminator, so `dst` is not
// written at all and 0 is returned.
// `src_unused` may be NULL; otherwise it receives the number of trailing
// `src` words that were not consumed.
uint32_t convert_from_utf32_to_utf8(uint8_t *dst, uint32_t dst_size, const uint32_t *src, uint32_t src_size, uint32_t *src_unused) {
    uint32_t i = 0, j = 0;
    if (dst_size == 0) {
        if (src_unused)
            *src_unused = src_size;
        return 0;
    }
    // `j + 1 < dst_size` always keeps one slot free for the terminator.
    for (; i < src_size && j + 1 < dst_size; i++, j++) {
        const uint32_t word = src[i];
        // Bytes left in `dst` including the terminator slot; at least 2 here,
        // and computed by subtraction so it cannot wrap.
        const uint32_t room = dst_size - j;
        const bool is_surrogate = word >= MIN_UTF16_SURROGATE_WORD && word <= MAX_UTF16_SURROGATE_WORD;
        if (word <= MAX_UTF8_SINGLE_BYTE_WORD) {
            dst[j] = (uint8_t)word;
        } else if (word <= MAX_UTF8_TWO_BYTE_SEQUENCE_WORD) {
            if (room <= 2)
                break; // need two bytes plus the terminator
            encode_utf8_two_byte_sequence(&dst[j], word);
            j++;
        } else if (is_surrogate || word > MAX_UTF8_FOUR_BYTE_SEQUENCE_WORD) {
            dst[j] = '?';
        } else if (word <= MAX_UTF8_THREE_BYTE_SEQUENCE_WORD) {
            if (room <= 3)
                break; // need three bytes plus the terminator
            encode_utf8_three_byte_sequence(&dst[j], word);
            j += 2;
        } else {
            if (room <= 4)
                break; // need four bytes plus the terminator
            encode_utf8_four_byte_sequence(&dst[j], word);
            j += 3;
        }
    }
    if (src_unused)
        *src_unused = src_size - i;
    dst[j] = 0;
    return j;
}
// Converts text from UTF8 to UTF32, outputting at most `dst_size - 1` words, 0-terminated.
// `dst` receives the UTF32 words; `dst_size` is its capacity in words, including
// the terminator, so pass a `dst_size` of at least 1.
// `src` holds `src_size` bytes of UTF8 input.
// Returns the number of words actually output, not counting the terminator.
// Converts malformed bytes to '?', and leaves truncated sequences unprocessed.
// `src_unused` may be NULL; otherwise it receives the number of `src` bytes left unconsumed.
uint32_t convert_from_utf8_to_utf32(uint32_t *dst, uint32_t dst_size, const uint8_t *src, uint32_t src_size, uint32_t *src_unused);
// Converts text from UTF32 to UTF8, outputting at most `dst_size - 1` bytes, 0-terminated.
// `dst` receives the UTF8 bytes; `dst_size` is its capacity in bytes, including
// the terminator, so pass a `dst_size` of at least 1.
// `src` holds `src_size` UTF32 words of input.
// Returns the number of bytes actually output, not counting the terminator.
// Converts malformed words to '?'.
// Stops early when the next word's full encoding plus the terminator would not fit in `dst`.
// `src_unused` may be NULL; otherwise it receives the number of `src` words left unconsumed.
uint32_t convert_from_utf32_to_utf8(uint8_t *dst, uint32_t dst_size, const uint32_t *src, uint32_t src_size, uint32_t *src_unused);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment