Skip to content

Instantly share code, notes, and snippets.

@antonijn
Last active September 30, 2022 01:13
Show Gist options
  • Star 8 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save antonijn/9009746 to your computer and use it in GitHub Desktop.
Save antonijn/9009746 to your computer and use it in GitHub Desktop.
UTF-8 to UTF-32 converter in C
#include "unicode.h"
#include <stdint.h>
#include <stddef.h>
/*
 * Report whether ch is a Unicode scalar value.
 *
 * Returns 1 when ch lies in 0x0000..0x10FFFF and is not a UTF-16 surrogate
 * (0xD800..0xDFFF); returns 0 otherwise.  Values above 0x10FFFF are rejected
 * because they lie beyond the last Unicode code point.
 */
int is_valid_char(uint32_t ch)
{
	if (ch > 0x10ffff) {
		return 0;
	}
	return ch < 0xd800 || ch > 0xdfff;
}
/*
 * Report whether ch is one of the combining code points this library
 * recognises.
 *
 * Returns 1 when ch falls inside any of the inclusive ranges listed in the
 * table below, 0 otherwise.
 */
int is_combo_char(uint32_t ch)
{
	static const struct {
		uint32_t first;
		uint32_t last;
	} ranges[] = {
		{ 0x0300, 0x036f },	/* Combining Diacritical Marks */
		{ 0x20d0, 0x20ff },	/* Combining Diacritical Marks for Symbols */
		{ 0xfe20, 0xfe2f },	/* Combining Half Marks */
	};
	size_t k;

	for (k = 0; k < sizeof ranges / sizeof ranges[0]; ++k) {
		if (ch >= ranges[k].first && ch <= ranges[k].last) {
			return 1;
		}
	}
	return 0;
}
#ifndef UNICODE_H
#define UNICODE_H
#include <stdint.h>
#include <stddef.h>
/*
 * A run of consecutive code points treated as one character.
 *
 * codepoints is a borrowed pointer to the first code point of the run
 * (utf32_getchars() stores the address of an element of its input array, so
 * the run stays valid only as long as that array does); count is the number
 * of code points in the run.
 *
 * codepoints is a plain pointer rather than a flexible array member: a
 * flexible array member may only be the last member of a struct and cannot
 * be assigned to.
 */
struct character {
	uint32_t *codepoints;
	size_t count;
};
/*
 * Returns nonzero when ch is accepted as a code point; values in the
 * surrogate range 0xD800..0xDFFF are rejected.
 */
int is_valid_char(uint32_t ch);
/*
 * Returns nonzero when ch lies in one of the combining ranges tested by the
 * definition (0x0300..0x036F, 0x20D0..0x20FF, 0xFE20..0xFE2F).
 */
int is_combo_char(uint32_t ch);
#endif
#include "utf16.h"
#include <stdint.h>
#include <stddef.h>
/*
 * Decode one code point from a UTF-16 buffer.
 *
 * buf    - UTF-16 code units
 * idx    - in/out: index of the next unit to read; advanced past every unit
 *          that was consumed
 * strlen - number of units in buf
 * cp     - out: the decoded code point
 *
 * Returns 0 on success.  Returns -1 when the input is exhausted, when a high
 * surrogate is the last unit or is not followed by a low surrogate, or when
 * a low surrogate appears without a preceding high surrogate.  For a lone
 * low surrogate *cp still receives the raw unit; in the other failure cases
 * *cp is left untouched.
 */
static int getch(uint16_t buf[], unsigned long *idx, size_t strlen,
uint32_t *cp)
{
	uint16_t ch, nxt;

	if (*idx >= strlen) {
		return -1;
	}
	ch = buf[(*idx)++];

	if ((ch & 0xf800) != 0xd800) {
		/* Not a surrogate: the unit is the code point. */
		*cp = (uint32_t)ch;
		return 0;
	}
	if ((ch & 0xfc00) != 0xd800) {
		/* Low surrogate with no high surrogate in front of it. */
		*cp = (uint32_t)ch;
		return -1;
	}

	/* A high surrogate needs one more unit; never read past the end. */
	if (*idx >= strlen) {
		return -1;
	}
	nxt = buf[*idx];
	if ((nxt & 0xfc00) != 0xdc00) {
		/* Leave the unexpected unit unconsumed. */
		return -1;
	}
	++*idx;

	/* Surrogate pairs encode code points starting at U+10000. */
	*cp = 0x10000 + ((((uint32_t)ch & 0x03ff) << 10) | ((uint32_t)nxt & 0x03ff));
	return 0;
}
/*
 * Count the code points encoded in a UTF-16 buffer.
 *
 * chars    - UTF-16 code units
 * strlen   - number of units in chars
 * out_size - out: number of code points decoded
 *
 * Returns 0 when the whole buffer decodes cleanly, -1 as soon as getch()
 * fails or is_valid_char() rejects a decoded value.  On failure *out_size
 * holds the number of code points decoded before the error.
 */
int utf16_codepoint_count(uint16_t chars[], size_t strlen, size_t *out_size)
{
	unsigned long idx = 0;

	*out_size = 0;
	/*
	 * Iterate on consumed units, not on decoded code points: a surrogate
	 * pair uses two units, so the input may run out before the count
	 * reaches strlen.
	 */
	while (idx < strlen) {
		uint32_t cp = 0;

		if (getch(chars, &idx, strlen, &cp) != 0) {
			return -1;
		}
		if (!is_valid_char(cp)) {
			return -1;
		}
		++*out_size;
	}
	return 0;
}
/*
 * Convert a UTF-16 buffer to UTF-32.
 *
 * input    - UTF-16 code units
 * output   - receives one element per decoded code point; since no capacity
 *            is passed, the caller must provide room for count elements
 *            (the worst case, when the input has no surrogate pairs)
 * count    - number of units in input
 * out_size - out: number of code points written to output
 *
 * Returns 0 on success, -1 as soon as getch() fails or is_valid_char()
 * rejects a decoded value.  On failure *out_size holds the number of code
 * points written before the error.
 */
int utf16_to_utf32(uint16_t input[], uint32_t output[],
size_t count, size_t *out_size)
{
	unsigned long idx = 0;

	*out_size = 0;
	/* Stop when the input is consumed; pairs make it shorter than count. */
	while (idx < count) {
		uint32_t cp = 0;

		if (getch(input, &idx, count, &cp) != 0) {
			return -1;
		}
		if (!is_valid_char(cp)) {
			return -1;
		}
		output[(*out_size)++] = cp;
	}
	return 0;
}
#ifndef UTF16_H
#define UTF16_H
#include <stdint.h>
#include <stddef.h>
#include "unicode.h"
/*
 * Count the code points encoded in chars (strlen UTF-16 units); the count is
 * stored in *out_size.  Returns 0, or -1 when is_valid_char() rejects a
 * decoded value.
 */
int utf16_codepoint_count(uint16_t chars[], size_t strlen, size_t *out_size);
/*
 * Decode count UTF-16 units from input into output, one element per code
 * point; *out_size receives the number of elements produced.  Returns 0, or
 * -1 when is_valid_char() rejects a decoded value.
 * NOTE(review): the capacity of output is not passed; presumably the caller
 * must size it for count elements - confirm.
 */
int utf16_to_utf32(uint16_t input[], uint32_t output[], size_t count,
size_t *out_size);
#endif
#include "utf32.h"
/*
 * Group a UTF-32 buffer into characters: a base code point followed by any
 * combining code points (as reported by is_combo_char()) that come after it.
 *
 * utf32    - code points to group
 * output   - receives one entry per character; each entry's codepoints
 *            pointer refers into utf32 and count is the run length.  Since
 *            no capacity is passed, the caller must provide room for count
 *            entries (the worst case, when nothing combines)
 * count    - number of code points in utf32
 * out_size - out: number of entries written to output (reset to 0 on entry)
 *
 * Returns 0 on success, or -1 when the buffer starts with a combining code
 * point, which has no base character to attach to.
 */
int utf32_getchars(uint32_t utf32[], struct character output[],
size_t count, size_t *out_size)
{
	size_t i = 0;

	*out_size = 0;

	/*
	 * Only the very first code point can be an orphaned combining mark:
	 * later ones are always absorbed by the run in front of them.
	 */
	if (count > 0 && is_combo_char(utf32[0])) {
		return -1;
	}

	while (i < count) {
		size_t cur_size = 1;

		output[*out_size].codepoints = &utf32[i];
		/* Extend the run over every combining code point that follows. */
		while (++i < count && is_combo_char(utf32[i])) {
			++cur_size;
		}
		output[*out_size].count = cur_size;
		++*out_size;
	}
	return 0;
}
#ifndef UTF32_H
#define UTF32_H
#include <stdint.h>
#include <stddef.h>
#include "unicode.h"
/*
 * Split utf32 (count code points) into entries of output.  Each entry points
 * at a code point inside utf32 and counts it together with the code points
 * after it for which is_combo_char() is nonzero; *out_size is advanced once
 * per entry.  Returns 0, or -1 on error.
 */
int utf32_getchars(uint32_t utf32[], struct character output[],
size_t count, size_t *out_size);
#endif
#include "utf8.h"
#include <stdint.h>
#include <stddef.h>
/*
 * Decode one code point from a UTF-8 buffer.
 *
 * buf    - UTF-8 bytes
 * idx    - in/out: index of the next byte to read; advanced past every byte
 *          that was consumed
 * strlen - number of bytes in buf
 * cp     - out: the decoded code point (meaningful only when 0 is returned)
 *
 * Returns 0 on success.  Returns -1 when the input is exhausted, the lead
 * byte is not a valid sequence start (a stray continuation byte or
 * 0xF8..0xFF), the sequence is truncated, a trailing byte is not of the form
 * 10xxxxxx, the encoding is overlong, or the value exceeds 0x10FFFF.
 * Surrogate values are not rejected here; callers filter them with
 * is_valid_char().
 */
static int getch(uint8_t buf[], unsigned long *idx, size_t strlen, uint32_t *cp)
{
	/* Smallest value that legitimately needs N trailing bytes. */
	static const uint32_t min_value[] = { 0x0, 0x80, 0x800, 0x10000 };
	int remunits;
	int k;
	uint8_t lead;
	uint32_t value;

	if (*idx >= strlen) {
		return -1;
	}
	lead = buf[(*idx)++];

	/* The lead byte's high bits give the sequence length and payload. */
	if ((lead & 0x80) == 0x00) {
		remunits = 0;
		value = lead;
	} else if ((lead & 0xe0) == 0xc0) {
		remunits = 1;
		value = lead & 0x1f;
	} else if ((lead & 0xf0) == 0xe0) {
		remunits = 2;
		value = lead & 0x0f;
	} else if ((lead & 0xf8) == 0xf0) {
		remunits = 3;
		value = lead & 0x07;
	} else {
		return -1;
	}

	for (k = 0; k < remunits; ++k) {
		uint8_t cont;

		if (*idx >= strlen) {
			return -1;
		}
		cont = buf[*idx];
		if ((cont & 0xc0) != 0x80) {
			/* Leave the unexpected byte unconsumed. */
			return -1;
		}
		++*idx;
		value = (value << 6) | (cont & 0x3f);
	}

	if (value < min_value[remunits] || value > 0x10ffff) {
		return -1;
	}
	*cp = value;
	return 0;
}
/*
 * Count the code points encoded in a UTF-8 buffer.
 *
 * chars    - UTF-8 bytes
 * strlen   - number of bytes in chars
 * out_size - out: number of code points decoded
 *
 * Returns 0 when the whole buffer decodes cleanly, -1 as soon as getch()
 * fails or is_valid_char() rejects a decoded value.  On failure *out_size
 * holds the number of code points decoded before the error.
 */
int utf8_codepoint_count(uint8_t chars[], size_t strlen, size_t *out_size)
{
	unsigned long idx = 0;

	*out_size = 0;
	/*
	 * Iterate on consumed bytes, not on decoded code points: a multi-byte
	 * sequence yields one code point, so the input may run out before the
	 * count reaches strlen.
	 */
	while (idx < strlen) {
		uint32_t cp = 0;

		if (getch(chars, &idx, strlen, &cp) != 0) {
			return -1;
		}
		if (!is_valid_char(cp)) {
			return -1;
		}
		++*out_size;
	}
	return 0;
}
/*
 * Convert a UTF-8 buffer to UTF-32.
 *
 * input    - UTF-8 bytes
 * output   - receives one element per decoded code point; since no capacity
 *            is passed, the caller must provide room for count elements
 *            (the worst case, when the input is pure ASCII)
 * count    - number of bytes in input
 * out_size - out: number of code points written to output
 *
 * Returns 0 on success, -1 as soon as getch() fails or is_valid_char()
 * rejects a decoded value.  On failure *out_size holds the number of code
 * points written before the error.
 */
int utf8_to_utf32(uint8_t input[], uint32_t output[], size_t count,
size_t *out_size)
{
	unsigned long idx = 0;

	*out_size = 0;
	/* Stop when the input is consumed; multi-byte sequences shorten it. */
	while (idx < count) {
		uint32_t cp = 0;

		if (getch(input, &idx, count, &cp) != 0) {
			return -1;
		}
		if (!is_valid_char(cp)) {
			return -1;
		}
		output[(*out_size)++] = cp;
	}
	return 0;
}
#ifndef UTF8_H
#define UTF8_H
#include <stdint.h>
#include <stddef.h>
#include "unicode.h"
/*
 * Count the code points encoded in chars (strlen UTF-8 bytes); the count is
 * stored in *out_size.  Returns 0, or -1 when is_valid_char() rejects a
 * decoded value.
 */
int utf8_codepoint_count(uint8_t chars[], size_t strlen, size_t *out_size);
/*
 * Decode count UTF-8 bytes from input into output, one element per code
 * point; *out_size receives the number of elements produced.  Returns 0, or
 * -1 when is_valid_char() rejects a decoded value.
 * NOTE(review): the capacity of output is not passed; presumably the caller
 * must size it for count elements - confirm.
 */
int utf8_to_utf32(uint8_t input[], uint32_t output[], size_t count,
size_t *out_size);
#endif
@sarikaya
Copy link

Why do you need to separate your code?

@m-7761
Copy link

m-7761 commented Jun 14, 2017

utf8_to_utf32...
getch(input, &idx, count, &output[i]); if (!is_valid_char(output[i])) { return -1; }
There is no i in this scope. Probably want to look elsewhere for such algorithms.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment