Last active
May 22, 2021 13:49
-
-
Save fami-com/79f6f93220fba0759df80ad012c1240d to your computer and use it in GitHub Desktop.
A simple UTF-8/16/32 encoder/decoder
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Licensed under the CC0 license */ | |
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
/* Negative status codes returned by the conversion routines in this file. */
#define INVALID_CODEPOINT  (-1) /* input is not a valid Unicode code point */
#define UNPAIRED_SURROGATE (-2) /* input lies in the surrogate range U+D800..U+DFFF */
#ifndef ALLOCATION_FAILED
#define ALLOCATION_FAILED (-3) /* malloc could not provide the output buffer */
#endif

/*
 * Encode a single UTF-32 code point as a 0-terminated UTF-8 byte sequence.
 *
 * ch   Code point to encode.
 * out  On success receives a freshly malloc'ed buffer holding the encoded
 *      bytes followed by a 0 byte. The caller owns the buffer and must
 *      free() it. *out is left untouched on error.
 *
 * Returns the number of bytes stored in *out, INCLUDING the terminating 0
 * (so 2..5), or a negative code:
 *   UNPAIRED_SURROGATE  ch is in U+D800..U+DFFF
 *   INVALID_CODEPOINT   ch is above U+10FFFF
 *   ALLOCATION_FAILED   the output buffer could not be allocated
 */
int utf32_to_utf8(uint32_t ch, uint8_t **out)
{
    uint8_t obytes[5];
    int len;

    if (ch >= 0xD800 && ch <= 0xDFFF)
        return UNPAIRED_SURROGATE;

    if (ch <= 0x7F) {
        /* 0xxxxxxx */
        obytes[0] = (uint8_t)ch;
        len = 1;
    } else if (ch <= 0x7FF) {
        /* 110xxxxx 10xxxxxx */
        obytes[0] = (uint8_t)(0xC0 | (ch >> 6));
        obytes[1] = (uint8_t)(0x80 | (ch & 0x3F));
        len = 2;
    } else if (ch <= 0xFFFF) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        obytes[0] = (uint8_t)(0xE0 | (ch >> 12));
        obytes[1] = (uint8_t)(0x80 | ((ch >> 6) & 0x3F));
        obytes[2] = (uint8_t)(0x80 | (ch & 0x3F));
        len = 3;
    } else if (ch <= 0x10FFFF) {
        /*
         * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         * The lead byte carries the top THREE bits (mask 0x07); masking
         * with 070 here would always drop them and yield 0xF0.
         */
        obytes[0] = (uint8_t)(0xF0 | ((ch >> 18) & 0x07));
        obytes[1] = (uint8_t)(0x80 | ((ch >> 12) & 0x3F));
        obytes[2] = (uint8_t)(0x80 | ((ch >> 6) & 0x3F));
        obytes[3] = (uint8_t)(0x80 | (ch & 0x3F));
        len = 4;
    } else {
        return INVALID_CODEPOINT;
    }

    /* Append the terminator; it is counted in the returned length. */
    obytes[len++] = 0;

    uint8_t *buf = malloc((size_t)len);
    if (buf == NULL)
        return ALLOCATION_FAILED;

    memcpy(buf, obytes, (size_t)len);
    *out = buf;
    return len;
}
/*
 * Decode the UTF-8 sequence that starts at in[0] into one code point.
 *
 * in  Pointer to the lead byte of the sequence. Continuation bytes are read
 *     one at a time and decoding stops at the first byte that is not a
 *     continuation byte, so a 0-terminated buffer is never over-read.
 *
 * Returns the decoded code point (0..0x10FFFF), or a negative code:
 *   INVALID_CODEPOINT   in is NULL, the lead byte is not a valid lead byte,
 *                       a continuation byte is missing or malformed, the
 *                       encoding is overlong, or the value exceeds U+10FFFF
 *   UNPAIRED_SURROGATE  the sequence decodes to U+D800..U+DFFF
 */
int utf8_to_utf32(uint8_t *in)
{
    uint32_t ch;
    uint32_t min; /* smallest value that legitimately needs this length */
    int trail;    /* number of continuation bytes that must follow */

    if (in == NULL)
        return INVALID_CODEPOINT;

    if (in[0] <= 0x7F)
        return in[0];

    /* Classify by the bit pattern of the LEAD BYTE. */
    if ((in[0] & 0xE0) == 0xC0) {        /* 110xxxxx */
        ch = in[0] & 0x1Fu;
        trail = 1;
        min = 0x80;
    } else if ((in[0] & 0xF0) == 0xE0) { /* 1110xxxx */
        ch = in[0] & 0x0Fu;
        trail = 2;
        min = 0x800;
    } else if ((in[0] & 0xF8) == 0xF0) { /* 11110xxx */
        ch = in[0] & 0x07u;
        trail = 3;
        min = 0x10000;
    } else {
        /* Stray continuation byte (10xxxxxx) or 0xF8..0xFF. */
        return INVALID_CODEPOINT;
    }

    for (int i = 1; i <= trail; i++) {
        /* Every trailing byte must look like 10xxxxxx. */
        if ((in[i] & 0xC0) != 0x80)
            return INVALID_CODEPOINT;
        ch = (ch << 6) | (uint32_t)(in[i] & 0x3F);
    }

    /* Reject overlong forms and values beyond the Unicode range. */
    if (ch < min || ch > 0x10FFFF)
        return INVALID_CODEPOINT;

    if (ch >= 0xD800 && ch <= 0xDFFF)
        return UNPAIRED_SURROGATE;

    return (int)ch;
}
#ifndef ALLOCATION_FAILED
#define ALLOCATION_FAILED (-3) /* malloc could not provide the output buffer */
#endif

/*
 * Encode a single UTF-32 code point as a 0-terminated UTF-16 sequence.
 *
 * ch   Code point to encode.
 * out  On success receives a freshly malloc'ed array holding one code unit
 *      (BMP) or a high/low surrogate pair, followed by a 0 unit. The caller
 *      owns the array and must free() it. *out is left untouched on error.
 *
 * Returns the number of 16-bit units stored in *out, INCLUDING the
 * terminating 0 (2 or 3), or a negative code:
 *   UNPAIRED_SURROGATE  ch is in U+D800..U+DFFF
 *   INVALID_CODEPOINT   ch is above U+10FFFF
 *   ALLOCATION_FAILED   the output array could not be allocated
 */
int utf32_to_utf16(uint32_t ch, uint16_t **out)
{
    uint16_t ounits[3];
    int len;

    if (ch >= 0xD800 && ch <= 0xDFFF)
        return UNPAIRED_SURROGATE;
    if (ch > 0x10FFFF)
        return INVALID_CODEPOINT;

    if (ch <= 0xFFFF) {
        ounits[0] = (uint16_t)ch;
        len = 1;
    } else {
        /*
         * Supplementary planes: subtract 0x10000 to get a 20-bit value and
         * split it 10/10. Masking with 0xFFFF instead would discard the
         * plane bits and only be right for plane 1.
         */
        uint32_t v = ch - 0x10000;

        ounits[0] = (uint16_t)(0xD800 | (v >> 10));
        ounits[1] = (uint16_t)(0xDC00 | (v & 0x3FF));
        len = 2;
    }

    /* Append the terminator; it is counted in the returned length. */
    ounits[len++] = 0;

    uint16_t *buf = malloc(sizeof *buf * (size_t)len);
    if (buf == NULL)
        return ALLOCATION_FAILED;

    memcpy(buf, ounits, sizeof *buf * (size_t)len);
    *out = buf;
    return len;
}
/*
 * Decode the UTF-16 sequence that starts at in[0] into one code point.
 *
 * in  Pointer to the first code unit. in[1] is read only when in[0] is a
 *     high surrogate (0xD800..0xDBFF).
 *
 * Returns the decoded code point (0..0x10FFFF), or a negative code:
 *   INVALID_CODEPOINT   in is NULL
 *   UNPAIRED_SURROGATE  in[0] is a low surrogate, or in[0] is a high
 *                       surrogate not followed by a low surrogate
 */
int utf16_to_utf32(uint16_t *in)
{
    if (in == NULL)
        return INVALID_CODEPOINT;

    uint16_t hi = in[0];

    /* Anything outside the surrogate range is the code point itself. */
    if (hi < 0xD800 || hi > 0xDFFF)
        return hi;

    /* A sequence must not start with a low surrogate. */
    if (hi >= 0xDC00)
        return UNPAIRED_SURROGATE;

    uint16_t lo = in[1];

    if (lo < 0xDC00 || lo > 0xDFFF)
        return UNPAIRED_SURROGATE;

    /*
     * Recombine the two 10-bit halves and ADD the 0x10000 offset; OR-ing it
     * in collides with bit 16 of the payload for planes 2 and above.
     */
    uint32_t payload = ((uint32_t)(hi & 0x3FF) << 10) | (uint32_t)(lo & 0x3FF);

    return (int)(payload + 0x10000);
}
/*
 * Convert the UTF-16 sequence at `in` to UTF-8 by decoding it with
 * utf16_to_utf32() and re-encoding the result with utf32_to_utf8().
 *
 * in   Pointer to the first UTF-16 code unit; passed to utf16_to_utf32().
 * out  Passed through to utf32_to_utf8(); not touched if decoding fails.
 *
 * Returns the value of utf32_to_utf8() on success. If decoding yields a
 * negative code, that code is returned as is. Passing it on as a uint32_t
 * would turn every decoder error into INVALID_CODEPOINT.
 */
int utf16_to_utf8(uint16_t *in, uint8_t **out)
{
    int cp = utf16_to_utf32(in);

    if (cp < 0)
        return cp;

    return utf32_to_utf8((uint32_t)cp, out);
}
/*
 * Convert the UTF-8 sequence at `in` to UTF-16 by decoding it with
 * utf8_to_utf32() and re-encoding the result with utf32_to_utf16().
 *
 * in   Pointer to the first UTF-8 byte; passed to utf8_to_utf32().
 * out  Passed through to utf32_to_utf16(); not touched if decoding fails.
 *
 * Returns the value of utf32_to_utf16() on success. If decoding yields a
 * negative code, that code is returned as is. Passing it on as a uint32_t
 * would turn every decoder error into INVALID_CODEPOINT.
 */
int utf8_to_utf16(uint8_t *in, uint16_t **out)
{
    int cp = utf8_to_utf32(in);

    if (cp < 0)
        return cp;

    return utf32_to_utf16((uint32_t)cp, out);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment