antonijn/unicode.c

## unicode.c
#include "unicode.h"
#include <stdint.h>
#include <stddef.h>

/*
 * Report whether `ch` is a Unicode scalar value.
 *
 * Returns non-zero when `ch` is at most U+10FFFF and is not a surrogate
 * (U+D800..U+DFFF); returns 0 otherwise.
 */
int is_valid_char(uint32_t ch)
{
	/* Surrogates exist only as UTF-16 code units, never as characters. */
	if (ch >= 0xd800 && ch <= 0xdfff) {
		return 0;
	}
	/* The Unicode code space ends at U+10FFFF. */
	return ch <= 0x10ffff;
}

/*
 * Report whether `ch` lies in one of the Unicode combining-mark blocks:
 *   U+0300..U+036F  Combining Diacritical Marks
 *   U+1AB0..U+1AFF  Combining Diacritical Marks Extended
 *   U+1DC0..U+1DFF  Combining Diacritical Marks Supplement
 *   U+20D0..U+20FF  Combining Diacritical Marks for Symbols
 *   U+FE20..U+FE2F  Combining Half Marks
 *
 * Returns non-zero when it does, 0 otherwise.
 */
int is_combo_char(uint32_t ch)
{
	return (ch >= 0x0300 && ch <= 0x036f)
	    || (ch >= 0x1ab0 && ch <= 0x1aff)
	    || (ch >= 0x1dc0 && ch <= 0x1dff)
	    || (ch >= 0x20d0 && ch <= 0x20ff)
	    || (ch >= 0xfe20 && ch <= 0xfe2f);
}

## unicode.h
#ifndef UNICODE_H
#define UNICODE_H

#include <stdint.h>
#include <stddef.h>

/*
 * A run of `count` code points starting at `codepoints`.
 *
 * `codepoints` is a pointer, not a flexible array member: a flexible array
 * member must be the last member of its struct and cannot be assigned to,
 * while utf32_getchars() points this field at a position inside its input
 * array.
 */
struct character {
	uint32_t *codepoints;
	size_t count;
};

int is_valid_char(uint32_t ch);
int is_combo_char(uint32_t ch);

#endif

## utf16.c
#include "utf16.h"
#include <stdint.h>
#include <stddef.h>

/*
 * Decode one code point from the UTF-16 buffer `buf`, which holds `strlen`
 * code units, starting at index `*idx`.
 *
 * On success stores the code point in `*cp`, advances `*idx` past the code
 * units consumed (one, or two for a surrogate pair) and returns 0.
 * Returns -1 when `*idx` is already at the end of the buffer, when a high
 * surrogate is the last code unit, or when a high surrogate is not followed
 * by a low surrogate. On failure `*cp` is not written; `*idx` may have
 * advanced.
 *
 * A low surrogate that is not preceded by a high surrogate is stored in
 * `*cp` as-is; callers are expected to reject it with is_valid_char().
 */
static int getch(uint16_t buf[], unsigned long *idx, size_t strlen,
                 uint32_t *cp)
{
	if (*idx >= strlen) {
		return -1;
	}
	uint16_t ch = buf[(*idx)++];
	if ((ch & 0xfc00) != 0xd800) {
		*cp = (uint32_t)ch;
		return 0;
	}
	/* A high surrogate needs a second unit; `>=` keeps the read in bounds. */
	if (*idx >= strlen) {
		return -1;
	}
	uint16_t nxt = buf[(*idx)++];
	if ((nxt & 0xfc00) != 0xdc00) {
		return -1;
	}
	/* A pair encodes the offset above U+FFFF, so 0x10000 is added back. */
	*cp = 0x10000
	    + (((uint32_t)(ch & 0x03ff) << 10) | (uint32_t)(nxt & 0x03ff));
	return 0;
}

/*
 * Count the code points encoded by the `strlen` UTF-16 code units in `chars`.
 *
 * Stores the count in `*out_size` and returns 0 when the whole buffer
 * decodes. Returns -1 as soon as getch() fails or yields a value that
 * is_valid_char() rejects; `*out_size` then holds the number of code points
 * accepted before the failure.
 */
int utf16_codepoint_count(uint16_t chars[], size_t strlen, size_t *out_size)
{
	unsigned long idx = 0;

	*out_size = 0;
	/* Loop on code units consumed, not code points produced: a surrogate
	 * pair spends two units on one code point. */
	while (idx < strlen) {
		uint32_t cp;
		if (getch(chars, &idx, strlen, &cp) != 0 || !is_valid_char(cp)) {
			return -1;
		}
		++*out_size;
	}
	return 0;
}

/*
 * Convert the `count` UTF-16 code units in `input` to code points in
 * `output`.
 *
 * `output` must have room for `count` elements (the worst case, when no
 * surrogate pairs occur). The number of code points written is stored in
 * `*out_size`.
 *
 * Returns 0 when the whole input decodes. Returns -1 as soon as getch()
 * fails or yields a value that is_valid_char() rejects; `*out_size` then
 * holds the number of code points written before the failure.
 */
int utf16_to_utf32(uint16_t input[], uint32_t output[],
                   size_t count, size_t *out_size)
{
	unsigned long idx = 0;

	*out_size = 0;
	/* Loop on code units consumed, not code points produced. */
	while (idx < count) {
		uint32_t cp;
		if (getch(input, &idx, count, &cp) != 0 || !is_valid_char(cp)) {
			return -1;
		}
		output[(*out_size)++] = cp;
	}
	return 0;
}

## utf16.h
#ifndef UTF16_H
#define UTF16_H

#include <stdint.h>
#include <stddef.h>
#include "unicode.h"

int utf16_codepoint_count(uint16_t chars[], size_t strlen, size_t *out_size);
int utf16_to_utf32(uint16_t input[], uint32_t output[], size_t count,
                   size_t *out_size);

#endif

## utf32.c
#include "utf32.h"

/*
 * Split the `count` code points in `utf32` into groups, each made of one
 * leading code point plus every code point directly after it for which
 * is_combo_char() is true.
 *
 * For group k, output[k].codepoints points at the group's first element
 * inside `utf32` (nothing is copied) and output[k].count is the group's
 * length. `output` must have room for `count` entries. The number of groups
 * is stored in `*out_size`.
 *
 * Returns 0 on success, or -1 when the input starts with a combining mark,
 * since there is no preceding code point for it to attach to.
 */
int utf32_getchars(uint32_t utf32[], struct character output[],
                   size_t count, size_t *out_size)
{
	size_t i = 0;

	*out_size = 0;
	if (count > 0 && is_combo_char(utf32[0])) {
		return -1;
	}
	while (i < count) {
		size_t start = i++;
		/* Absorb the combining marks that follow the leading code point. */
		while (i < count && is_combo_char(utf32[i])) {
			++i;
		}
		output[*out_size].codepoints = &utf32[start];
		output[*out_size].count = i - start;
		++*out_size;
	}
	return 0;
}

## utf32.h
#ifndef UTF32_H
#define UTF32_H

#include <stdint.h>
#include <stddef.h>
#include "unicode.h"

int utf32_getchars(uint32_t utf32[], struct character output[],
                   size_t count, size_t *out_size);

#endif

## utf8.c
#include "utf8.h"
#include <stdint.h>
#include <stddef.h>

/*
 * Decode one code point from the UTF-8 buffer `buf`, which holds `strlen`
 * bytes, starting at index `*idx`.
 *
 * On success stores the code point in `*cp`, advances `*idx` past the bytes
 * consumed (one to four) and returns 0.
 * Returns -1 when `*idx` is already at the end of the buffer, when the lead
 * byte is not a valid sequence start (a stray continuation byte or
 * 0xF8..0xFF), when the sequence is cut short by the end of the buffer, when
 * a trailing byte is not of the form 10xxxxxx, when the encoding is overlong,
 * or when the value exceeds U+10FFFF. On failure `*cp` is not written;
 * `*idx` may have advanced.
 *
 * Surrogate values (U+D800..U+DFFF) are not rejected here; callers are
 * expected to filter them with is_valid_char().
 */
static int getch(uint8_t buf[], unsigned long *idx, size_t strlen, uint32_t *cp)
{
	int remunits;
	uint32_t value;
	uint32_t min; /* smallest value that needs this sequence length */

	if (*idx >= strlen) {
		return -1;
	}
	uint8_t lead = buf[(*idx)++];

	if (lead < 0x80) {
		*cp = lead;
		return 0;
	} else if ((lead & 0xe0) == 0xc0) {
		remunits = 1;
		value = lead & 0x1f;
		min = 0x80;
	} else if ((lead & 0xf0) == 0xe0) {
		remunits = 2;
		value = lead & 0x0f;
		min = 0x800;
	} else if ((lead & 0xf8) == 0xf0) {
		remunits = 3;
		value = lead & 0x07;
		min = 0x10000;
	} else {
		return -1;
	}

	while (remunits-- > 0) {
		if (*idx >= strlen) {
			return -1;
		}
		uint8_t nxt = buf[*idx];
		if ((nxt & 0xc0) != 0x80) {
			return -1;
		}
		++*idx;
		value = (value << 6) | (uint32_t)(nxt & 0x3f);
	}

	/* Reject overlong forms and values past the end of the code space. */
	if (value < min || value > 0x10ffff) {
		return -1;
	}
	*cp = value;
	return 0;
}

/*
 * Count the code points encoded by the `strlen` UTF-8 bytes in `chars`.
 *
 * Stores the count in `*out_size` and returns 0 when the whole buffer
 * decodes. Returns -1 as soon as getch() fails or yields a value that
 * is_valid_char() rejects; `*out_size` then holds the number of code points
 * accepted before the failure.
 */
int utf8_codepoint_count(uint8_t chars[], size_t strlen, size_t *out_size)
{
	unsigned long idx = 0;

	*out_size = 0;
	/* Loop on bytes consumed, not code points produced: one code point can
	 * take several bytes. */
	while (idx < strlen) {
		uint32_t cp;
		if (getch(chars, &idx, strlen, &cp) != 0 || !is_valid_char(cp)) {
			return -1;
		}
		++*out_size;
	}
	return 0;
}

/*
 * Convert the `count` UTF-8 bytes in `input` to code points in `output`.
 *
 * `output` must have room for `count` elements (the worst case, when every
 * byte is a one-byte sequence). The number of code points written is stored
 * in `*out_size`.
 *
 * Returns 0 when the whole input decodes. Returns -1 as soon as getch()
 * fails or yields a value that is_valid_char() rejects; `*out_size` then
 * holds the number of code points written before the failure.
 */
int utf8_to_utf32(uint8_t input[], uint32_t output[], size_t count,
                  size_t *out_size)
{
	unsigned long idx = 0;

	*out_size = 0;
	/* Loop on bytes consumed, not code points produced. */
	while (idx < count) {
		uint32_t cp;
		if (getch(input, &idx, count, &cp) != 0 || !is_valid_char(cp)) {
			return -1;
		}
		output[(*out_size)++] = cp;
	}
	return 0;
}

## utf8.h
#ifndef UTF8_H
#define UTF8_H

#include <stdint.h>
#include <stddef.h>
#include "unicode.h"

int utf8_codepoint_count(uint8_t chars[], size_t strlen, size_t *out_size);
int utf8_to_utf32(uint8_t input[], uint32_t output[], size_t count,
                  size_t *out_size);

#endif
	#include "unicode.h"
	#include <stdint.h>
	#include <stddef.h>

	/*
	 * Report whether `ch` is a Unicode scalar value.
	 *
	 * Returns non-zero when `ch` is at most U+10FFFF and is not a surrogate
	 * (U+D800..U+DFFF); returns 0 otherwise.
	 */
	int is_valid_char(uint32_t ch)
	{
		/* Surrogates exist only as UTF-16 code units, never as characters. */
		if (ch >= 0xd800 && ch <= 0xdfff) {
			return 0;
		}
		/* The Unicode code space ends at U+10FFFF. */
		return ch <= 0x10ffff;
	}

	/*
	 * Report whether `ch` lies in one of the Unicode combining-mark blocks:
	 *   U+0300..U+036F  Combining Diacritical Marks
	 *   U+1AB0..U+1AFF  Combining Diacritical Marks Extended
	 *   U+1DC0..U+1DFF  Combining Diacritical Marks Supplement
	 *   U+20D0..U+20FF  Combining Diacritical Marks for Symbols
	 *   U+FE20..U+FE2F  Combining Half Marks
	 *
	 * Returns non-zero when it does, 0 otherwise.
	 */
	int is_combo_char(uint32_t ch)
	{
		return (ch >= 0x0300 && ch <= 0x036f)
		    || (ch >= 0x1ab0 && ch <= 0x1aff)
		    || (ch >= 0x1dc0 && ch <= 0x1dff)
		    || (ch >= 0x20d0 && ch <= 0x20ff)
		    || (ch >= 0xfe20 && ch <= 0xfe2f);
	}
	#ifndef UNICODE_H
	#define UNICODE_H

	#include <stdint.h>
	#include <stddef.h>

	/*
	 * A run of `count` code points starting at `codepoints`.
	 *
	 * `codepoints` is a pointer, not a flexible array member: a flexible
	 * array member must be the last member of its struct and cannot be
	 * assigned to, while utf32_getchars() points this field at a position
	 * inside its input array.
	 */
	struct character {
		uint32_t *codepoints;
		size_t count;
	};

	int is_valid_char(uint32_t ch);
	int is_combo_char(uint32_t ch);

	#endif
	#include "utf16.h"
	#include <stdint.h>
	#include <stddef.h>

	/*
	 * Decode one code point from the UTF-16 buffer `buf`, which holds
	 * `strlen` code units, starting at index `*idx`.
	 *
	 * On success stores the code point in `*cp`, advances `*idx` past the
	 * code units consumed (one, or two for a surrogate pair) and returns 0.
	 * Returns -1 when `*idx` is already at the end of the buffer, when a
	 * high surrogate is the last code unit, or when a high surrogate is not
	 * followed by a low surrogate. On failure `*cp` is not written; `*idx`
	 * may have advanced.
	 *
	 * A low surrogate that is not preceded by a high surrogate is stored in
	 * `*cp` as-is; callers are expected to reject it with is_valid_char().
	 */
	static int getch(uint16_t buf[], unsigned long *idx, size_t strlen,
	                 uint32_t *cp)
	{
		if (*idx >= strlen) {
			return -1;
		}
		uint16_t ch = buf[(*idx)++];
		if ((ch & 0xfc00) != 0xd800) {
			*cp = (uint32_t)ch;
			return 0;
		}
		/* A high surrogate needs a second unit; `>=` keeps the read in bounds. */
		if (*idx >= strlen) {
			return -1;
		}
		uint16_t nxt = buf[(*idx)++];
		if ((nxt & 0xfc00) != 0xdc00) {
			return -1;
		}
		/* A pair encodes the offset above U+FFFF, so 0x10000 is added back. */
		*cp = 0x10000
		    + (((uint32_t)(ch & 0x03ff) << 10) | (uint32_t)(nxt & 0x03ff));
		return 0;
	}

	/*
	 * Count the code points encoded by the `strlen` UTF-16 code units in
	 * `chars`.
	 *
	 * Stores the count in `*out_size` and returns 0 when the whole buffer
	 * decodes. Returns -1 as soon as getch() fails or yields a value that
	 * is_valid_char() rejects; `*out_size` then holds the number of code
	 * points accepted before the failure.
	 */
	int utf16_codepoint_count(uint16_t chars[], size_t strlen, size_t *out_size)
	{
		unsigned long idx = 0;

		*out_size = 0;
		/* Loop on code units consumed, not code points produced: a
		 * surrogate pair spends two units on one code point. */
		while (idx < strlen) {
			uint32_t cp;
			if (getch(chars, &idx, strlen, &cp) != 0 || !is_valid_char(cp)) {
				return -1;
			}
			++*out_size;
		}
		return 0;
	}

	/*
	 * Convert the `count` UTF-16 code units in `input` to code points in
	 * `output`.
	 *
	 * `output` must have room for `count` elements (the worst case, when no
	 * surrogate pairs occur). The number of code points written is stored in
	 * `*out_size`.
	 *
	 * Returns 0 when the whole input decodes. Returns -1 as soon as getch()
	 * fails or yields a value that is_valid_char() rejects; `*out_size` then
	 * holds the number of code points written before the failure.
	 */
	int utf16_to_utf32(uint16_t input[], uint32_t output[],
	                   size_t count, size_t *out_size)
	{
		unsigned long idx = 0;

		*out_size = 0;
		/* Loop on code units consumed, not code points produced. */
		while (idx < count) {
			uint32_t cp;
			if (getch(input, &idx, count, &cp) != 0 || !is_valid_char(cp)) {
				return -1;
			}
			output[(*out_size)++] = cp;
		}
		return 0;
	}
	#ifndef UTF16_H
	#define UTF16_H

	#include <stdint.h>
	#include <stddef.h>
	#include "unicode.h"

	int utf16_codepoint_count(uint16_t chars[], size_t strlen, size_t *out_size);
	int utf16_to_utf32(uint16_t input[], uint32_t output[], size_t count,
	size_t *out_size);

	#endif
	#include "utf32.h"

	/*
	 * Split the `count` code points in `utf32` into groups, each made of one
	 * leading code point plus every code point directly after it for which
	 * is_combo_char() is true.
	 *
	 * For group k, output[k].codepoints points at the group's first element
	 * inside `utf32` (nothing is copied) and output[k].count is the group's
	 * length. `output` must have room for `count` entries. The number of
	 * groups is stored in `*out_size`.
	 *
	 * Returns 0 on success, or -1 when the input starts with a combining
	 * mark, since there is no preceding code point for it to attach to.
	 */
	int utf32_getchars(uint32_t utf32[], struct character output[],
	                   size_t count, size_t *out_size)
	{
		size_t i = 0;

		*out_size = 0;
		if (count > 0 && is_combo_char(utf32[0])) {
			return -1;
		}
		while (i < count) {
			size_t start = i++;
			/* Absorb the combining marks that follow the leading code point. */
			while (i < count && is_combo_char(utf32[i])) {
				++i;
			}
			output[*out_size].codepoints = &utf32[start];
			output[*out_size].count = i - start;
			++*out_size;
		}
		return 0;
	}
	#ifndef UTF32_H
	#define UTF32_H

	#include <stdint.h>
	#include <stddef.h>
	#include "unicode.h"

	int utf32_getchars(uint32_t utf32[], struct character output[],
	size_t count, size_t *out_size);

	#endif
	#include "utf8.h"
	#include <stdint.h>
	#include <stddef.h>

	/*
	 * Decode one code point from the UTF-8 buffer `buf`, which holds `strlen`
	 * bytes, starting at index `*idx`.
	 *
	 * On success stores the code point in `*cp`, advances `*idx` past the
	 * bytes consumed (one to four) and returns 0.
	 * Returns -1 when `*idx` is already at the end of the buffer, when the
	 * lead byte is not a valid sequence start (a stray continuation byte or
	 * 0xF8..0xFF), when the sequence is cut short by the end of the buffer,
	 * when a trailing byte is not of the form 10xxxxxx, when the encoding is
	 * overlong, or when the value exceeds U+10FFFF. On failure `*cp` is not
	 * written; `*idx` may have advanced.
	 *
	 * Surrogate values (U+D800..U+DFFF) are not rejected here; callers are
	 * expected to filter them with is_valid_char().
	 */
	static int getch(uint8_t buf[], unsigned long *idx, size_t strlen, uint32_t *cp)
	{
		int remunits;
		uint32_t value;
		uint32_t min; /* smallest value that needs this sequence length */

		if (*idx >= strlen) {
			return -1;
		}
		uint8_t lead = buf[(*idx)++];

		if (lead < 0x80) {
			*cp = lead;
			return 0;
		} else if ((lead & 0xe0) == 0xc0) {
			remunits = 1;
			value = lead & 0x1f;
			min = 0x80;
		} else if ((lead & 0xf0) == 0xe0) {
			remunits = 2;
			value = lead & 0x0f;
			min = 0x800;
		} else if ((lead & 0xf8) == 0xf0) {
			remunits = 3;
			value = lead & 0x07;
			min = 0x10000;
		} else {
			return -1;
		}

		while (remunits-- > 0) {
			if (*idx >= strlen) {
				return -1;
			}
			uint8_t nxt = buf[*idx];
			if ((nxt & 0xc0) != 0x80) {
				return -1;
			}
			++*idx;
			value = (value << 6) | (uint32_t)(nxt & 0x3f);
		}

		/* Reject overlong forms and values past the end of the code space. */
		if (value < min || value > 0x10ffff) {
			return -1;
		}
		*cp = value;
		return 0;
	}

	/*
	 * Count the code points encoded by the `strlen` UTF-8 bytes in `chars`.
	 *
	 * Stores the count in `*out_size` and returns 0 when the whole buffer
	 * decodes. Returns -1 as soon as getch() fails or yields a value that
	 * is_valid_char() rejects; `*out_size` then holds the number of code
	 * points accepted before the failure.
	 */
	int utf8_codepoint_count(uint8_t chars[], size_t strlen, size_t *out_size)
	{
		unsigned long idx = 0;

		*out_size = 0;
		/* Loop on bytes consumed, not code points produced: one code point
		 * can take several bytes. */
		while (idx < strlen) {
			uint32_t cp;
			if (getch(chars, &idx, strlen, &cp) != 0 || !is_valid_char(cp)) {
				return -1;
			}
			++*out_size;
		}
		return 0;
	}

	/*
	 * Convert the `count` UTF-8 bytes in `input` to code points in `output`.
	 *
	 * `output` must have room for `count` elements (the worst case, when
	 * every byte is a one-byte sequence). The number of code points written
	 * is stored in `*out_size`.
	 *
	 * Returns 0 when the whole input decodes. Returns -1 as soon as getch()
	 * fails or yields a value that is_valid_char() rejects; `*out_size` then
	 * holds the number of code points written before the failure.
	 */
	int utf8_to_utf32(uint8_t input[], uint32_t output[], size_t count,
	                  size_t *out_size)
	{
		unsigned long idx = 0;

		*out_size = 0;
		/* Loop on bytes consumed, not code points produced. */
		while (idx < count) {
			uint32_t cp;
			if (getch(input, &idx, count, &cp) != 0 || !is_valid_char(cp)) {
				return -1;
			}
			output[(*out_size)++] = cp;
		}
		return 0;
	}
	#ifndef UTF8_H
	#define UTF8_H

	#include <stdint.h>
	#include <stddef.h>
	#include "unicode.h"

	int utf8_codepoint_count(uint8_t chars[], size_t strlen, size_t *out_size);
	int utf8_to_utf32(uint8_t input[], uint32_t output[], size_t count,
	size_t *out_size);

	#endif