Skip to content

Instantly share code, notes, and snippets.

@fami-com
Last active May 22, 2021 13:49
Show Gist options
  • Save fami-com/79f6f93220fba0759df80ad012c1240d to your computer and use it in GitHub Desktop.
Save fami-com/79f6f93220fba0759df80ad012c1240d to your computer and use it in GitHub Desktop.
A simple UTF-8/16/32 encoder/decoder
/* Licensed under the CC0 license */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
/*
 * Negative status codes the converters return in place of a length or a
 * code point. The values are parenthesized so the macros expand safely
 * inside any surrounding expression.
 */
#define INVALID_CODEPOINT (-1)  /* input is outside the range the converters accept */
#define UNPAIRED_SURROGATE (-2) /* value lies in the surrogate range U+D800..U+DFFF */
#ifndef ALLOCATION_FAILED
#define ALLOCATION_FAILED (-3) /* malloc() could not provide the output buffer */
#endif

/*
 * Encode a single Unicode code point as a NUL-terminated UTF-8 sequence.
 *
 * ch   - code point to encode.
 * out  - on success receives a malloc()ed buffer holding the encoded bytes
 *        followed by a 0 byte; the caller owns it and must free() it.
 *        Left untouched when an error is returned.
 *
 * Returns the number of bytes stored in *out, counting the terminating 0
 * (so 2..5), or a negative code: UNPAIRED_SURROGATE for U+D800..U+DFFF,
 * INVALID_CODEPOINT for values above U+10FFFF, ALLOCATION_FAILED when
 * malloc() fails.
 */
int utf32_to_utf8(uint32_t ch, uint8_t **out)
{
    uint8_t obytes[5];
    int len;

    if (ch >= 0xD800 && ch <= 0xDFFF) {
        return UNPAIRED_SURROGATE;
    }

    if (ch <= 0x7F) {
        obytes[0] = (uint8_t)ch;
        len = 1;
    } else if (ch <= 0x7FF) {
        obytes[0] = (uint8_t)(0xC0 | (ch >> 6));
        obytes[1] = (uint8_t)(0x80 | (ch & 0x3F));
        len = 2;
    } else if (ch <= 0xFFFF) {
        obytes[0] = (uint8_t)(0xE0 | (ch >> 12));
        obytes[1] = (uint8_t)(0x80 | ((ch >> 6) & 0x3F));
        obytes[2] = (uint8_t)(0x80 | (ch & 0x3F));
        len = 3;
    } else if (ch <= 0x10FFFF) {
        /* The lead byte carries bits 18..20 (mask 0x07); the range check
         * above already guarantees nothing higher is set. */
        obytes[0] = (uint8_t)(0xF0 | ((ch >> 18) & 0x07));
        obytes[1] = (uint8_t)(0x80 | ((ch >> 12) & 0x3F));
        obytes[2] = (uint8_t)(0x80 | ((ch >> 6) & 0x3F));
        obytes[3] = (uint8_t)(0x80 | (ch & 0x3F));
        len = 4;
    } else {
        return INVALID_CODEPOINT;
    }

    /* The terminator is part of the returned length. */
    obytes[len++] = 0;

    uint8_t *buf = malloc((size_t)len);
    if (buf == NULL) {
        return ALLOCATION_FAILED;
    }
    memcpy(buf, obytes, (size_t)len);
    *out = buf;
    return len;
}
/*
 * Decode the first code point of a UTF-8 byte sequence.
 *
 * in - pointer to the lead byte. Continuation bytes are examined one at a
 *      time and decoding stops at the first byte that is not of the form
 *      10xxxxxx, so a 0 terminator is never read past.
 *
 * Returns the decoded code point (0..0x10FFFF), UNPAIRED_SURROGATE when the
 * sequence encodes U+D800..U+DFFF, or INVALID_CODEPOINT when the lead byte
 * is not a valid lead, a continuation byte is missing, the encoding is
 * overlong, or the value exceeds U+10FFFF.
 */
int utf8_to_utf32(uint8_t *in)
{
    /* Smallest value each sequence length may encode; anything lower is an
     * overlong form. Indexed by sequence length. */
    static const uint32_t min_value[5] = { 0, 0, 0x80, 0x800, 0x10000 };
    uint32_t ch;
    int len;

    if (in[0] <= 0x7F) {
        return in[0];
    }

    /* The sequence length is determined by the lead byte's high bits. */
    if ((in[0] & 0xE0) == 0xC0) {
        len = 2;
        ch = in[0] & 0x1Fu;
    } else if ((in[0] & 0xF0) == 0xE0) {
        len = 3;
        ch = in[0] & 0x0Fu;
    } else if ((in[0] & 0xF8) == 0xF0) {
        len = 4;
        ch = in[0] & 0x07u;
    } else {
        /* Stray continuation byte (0x80..0xBF) or 0xF8..0xFF. */
        return INVALID_CODEPOINT;
    }

    for (int i = 1; i < len; i++) {
        if ((in[i] & 0xC0) != 0x80) {
            return INVALID_CODEPOINT;
        }
        ch = (ch << 6) | (uint32_t)(in[i] & 0x3F);
    }

    if (ch < min_value[len] || ch > 0x10FFFF) {
        return INVALID_CODEPOINT;
    }
    if (ch >= 0xD800 && ch <= 0xDFFF) {
        return UNPAIRED_SURROGATE;
    }
    return (int)ch;
}
#ifndef ALLOCATION_FAILED
#define ALLOCATION_FAILED (-3) /* malloc() could not provide the output buffer */
#endif

/*
 * Encode a single Unicode code point as a 0-terminated UTF-16 sequence.
 *
 * ch   - code point to encode.
 * out  - on success receives a malloc()ed array holding one unit (BMP) or a
 *        surrogate pair, followed by a 0 unit; the caller owns it and must
 *        free() it. Left untouched when an error is returned.
 *
 * Returns the number of 16-bit units stored in *out, counting the
 * terminating 0 (2 or 3), or a negative code: UNPAIRED_SURROGATE for
 * U+D800..U+DFFF, INVALID_CODEPOINT for values above U+10FFFF,
 * ALLOCATION_FAILED when malloc() fails.
 */
int utf32_to_utf16(uint32_t ch, uint16_t **out)
{
    uint16_t ounits[3];
    int len;

    if (ch >= 0xD800 && ch <= 0xDFFF) {
        return UNPAIRED_SURROGATE;
    }

    if (ch <= 0xFFFF) {
        ounits[0] = (uint16_t)ch;
        len = 1;
    } else if (ch <= 0x10FFFF) {
        /* A surrogate pair carries the 20-bit offset from U+10000, not the
         * low 16 bits of the code point. */
        uint32_t offset = ch - 0x10000;

        ounits[0] = (uint16_t)(0xD800 | (offset >> 10));
        ounits[1] = (uint16_t)(0xDC00 | (offset & 0x3FF));
        len = 2;
    } else {
        return INVALID_CODEPOINT;
    }

    /* The terminator is part of the returned length. */
    ounits[len++] = 0;

    uint16_t *buf = malloc((size_t)len * sizeof *buf);
    if (buf == NULL) {
        return ALLOCATION_FAILED;
    }
    memcpy(buf, ounits, (size_t)len * sizeof *buf);
    *out = buf;
    return len;
}
/*
 * Decode the first code point of a UTF-16 sequence.
 *
 * in - pointer to the first 16-bit unit. The second unit is read only when
 *      the first one is a high surrogate (0xD800..0xDBFF).
 *
 * Returns the decoded code point (0..0x10FFFF), or UNPAIRED_SURROGATE when
 * the first unit is a low surrogate or a high surrogate is not followed by
 * a low surrogate (0xDC00..0xDFFF).
 */
int utf16_to_utf32(uint16_t *in)
{
    uint16_t hi = in[0];

    /* Anything outside the surrogate range is the code point itself. */
    if (hi < 0xD800 || hi > 0xDFFF) {
        return hi;
    }

    /* A pair must start with a high surrogate... */
    if (hi > 0xDBFF) {
        return UNPAIRED_SURROGATE;
    }

    /* ...and continue with a low surrogate. */
    uint16_t lo = in[1];
    if (lo < 0xDC00 || lo > 0xDFFF) {
        return UNPAIRED_SURROGATE;
    }

    /* The pair stores a 20-bit offset from U+10000; it has to be added, as
     * OR-ing would lose the carry into bit 16 and above. */
    uint32_t offset = ((uint32_t)(hi & 0x3FF) << 10) | (uint32_t)(lo & 0x3FF);
    return (int)(offset + 0x10000);
}
/*
 * Convert the first code point of a UTF-16 sequence to UTF-8.
 *
 * in  - UTF-16 input, decoded with utf16_to_utf32().
 * out - passed to utf32_to_utf8(), which stores a malloc()ed buffer there;
 *       the caller must free() it.
 *
 * Returns the value of utf32_to_utf8() when decoding succeeds, otherwise
 * the negative code reported by utf16_to_utf32().
 */
int utf16_to_utf8(uint16_t *in, uint8_t **out)
{
    int ch = utf16_to_utf32(in);

    /* Stop on a decode error: converting the negative code to uint32_t
     * would hand the encoder a huge value and replace the real error with
     * INVALID_CODEPOINT. */
    if (ch < 0) {
        return ch;
    }
    return utf32_to_utf8((uint32_t)ch, out);
}
/*
 * Convert the first code point of a UTF-8 sequence to UTF-16.
 *
 * in  - UTF-8 input, decoded with utf8_to_utf32().
 * out - passed to utf32_to_utf16(), which stores a malloc()ed buffer there;
 *       the caller must free() it.
 *
 * Returns the value of utf32_to_utf16() when decoding succeeds, otherwise
 * the negative code reported by utf8_to_utf32().
 */
int utf8_to_utf16(uint8_t *in, uint16_t **out)
{
    int ch = utf8_to_utf32(in);

    /* Stop on a decode error: converting the negative code to uint32_t
     * would hand the encoder a huge value and replace the real error with
     * INVALID_CODEPOINT. */
    if (ch < 0) {
        return ch;
    }
    return utf32_to_utf16((uint32_t)ch, out);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment