Inndy/utf8_split_char.c

## utf8_split_char.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BUFFER_SIZE 4096

// Reference: http://en.wikipedia.org/wiki/UTF-8
// Marker of a body (continuation) byte: 0b10xxxxxx
#define UTF8_MASK_C 0x080 // 0b10000000

// Prefixes of the leading byte of a 2- to 6-byte sequence
#define UTF8_MASK_2 0x0C0 // 0b11000000
#define UTF8_MASK_3 0x0E0 // 0b11100000
#define UTF8_MASK_4 0x0F0 // 0b11110000
#define UTF8_MASK_5 0x0F8 // 0b11111000
#define UTF8_MASK_6 0x0FC // 0b11111100
// Widen a prefix by one bit so that the 0 bit which ends the prefix is
// tested as well, e.g. 0b11000000 -> 0b11100000
#define UTF8_MKMASK(X) ((X) | ((X) >> 1))
// Test leading bytes: X starts with prefix M followed by a 0 bit
#define UTF8_TEST_L(X, M) (((X) & UTF8_MKMASK(M)) == (M))
// Test body bytes: X must be 0b10xxxxxx. Both top bits are compared;
// checking the top bit alone would also accept leading bytes (0b11xxxxxx).
#define UTF8_TEST_B(X) (((X) & UTF8_MKMASK(UTF8_MASK_C)) == UTF8_MASK_C)

// check body bytes
// Returns 1 when each of the `l` bytes starting at `data` is a UTF-8 body
// (continuation) byte of the form 0b10xxxxxx, and 0 otherwise.
// A zero byte fails the test, so the scan stops at a string terminator.
// A non-positive `l` means there is nothing to check and yields 1.
int utf8_check_bytes(unsigned char * data, int l)
{
	if (l <= 0)
		return 1;
	if (!data)
		return 0;
	for (int i = 0; i < l; i++) {
		// Compare the two top bits: only 0b10 marks a body byte.
		// Testing the top bit alone would let a leading byte
		// (0b11xxxxxx) pass as a body byte.
		if ((data[i] & 0xC0) != 0x80)
			return 0;
	}
	return 1;
}

// get character size
// Classifies the first byte of a character:
//    0  the byte is zero (string end)
//    1  high bit clear, a single-byte character
//  2-6  leading byte of a sequence of that many bytes
//   -1  none of the above (e.g. a body byte)
int utf8_get_char_size(unsigned char data) {
	static const int prefixes[] = {
		UTF8_MASK_2, UTF8_MASK_3, UTF8_MASK_4, UTF8_MASK_5, UTF8_MASK_6
	};
	int i;

	if (data == 0)
		return 0;
	if (!(data & UTF8_MASK_C))
		return 1;
	// The leading-byte patterns are mutually exclusive, so the order in
	// which they are tried does not matter.
	for (i = 0; i < 5; i++) {
		int prefix = prefixes[i];
		if (UTF8_TEST_L(data, prefix))
			return i + 2;
	}
	return -1;
}

// Extract the UTF-8 character that starts at `data`.
//
// data    : position of the character's first byte
// buffer  : receives the bytes of the character (no terminator is added);
//           at least 6 bytes for output buffer
// out_len : receives the result code
//   out_len  >  0 // size of the character in bytes
//   out_len ==  0 // string end
//   out_len == -1 // failed: invalid leading byte, or `data` is NULL
//   out_len == -2 // failed: invalid body byte
// NULL for both buffer and out_len are allowed
// Return next position if success
// Return NULL if string is terminated or failed
unsigned char * utf8_get_char(unsigned char * data, unsigned char * buffer,
	int * out_len)
{
	// A NULL input is reported as a failure instead of being dereferenced.
	int len = data ? utf8_get_char_size(*data) : -1;

	// check every body byte of a multi-byte sequence
	if (len > 1 && ! utf8_check_bytes(data + 1, len - 1)) {
		len = -2;
	}
	if (out_len) {
		*out_len = len;
	}
	if (len <= 0) {
		return NULL;
	}
	if (buffer) {
		memcpy(buffer, data, (size_t)len);
	}
	return data + len;
}

// Count characters in UTF-8
// Counts the complete, valid characters found in the first `buffer_size`
// bytes of `data`. Counting stops at a zero byte, at the first invalid
// sequence, or at a character that would extend past the end of the buffer.
// No byte at or beyond data + buffer_size is ever read.
// Returns 0 when `data` is NULL or `buffer_size` is not positive.
int utf8_char_count(unsigned char * data, int buffer_size) {
	if (!data || buffer_size <= 0)
		return 0;

	unsigned char *p = data, *end = data + buffer_size;
	int n = 0;
	while (p < end) {
		// Look at the leading byte first so that the body bytes are only
		// inspected when the whole sequence lies inside the buffer.
		int len = utf8_get_char_size(*p);
		if (len <= 0 || len > end - p)
			break;
		unsigned char *next = utf8_get_char(p, NULL, NULL);
		if (!next)
			break;
		p = next;
		n++;
	}
	return n;
}

// Reads a file name from stdin, loads that file (it must be smaller than
// BUFFER_SIZE bytes), prints the number of UTF-8 characters it contains and
// then prints every character together with its length in bytes.
// Returns -1 when the name cannot be read or the file cannot be opened or
// measured, 0 otherwise (including the "too large" case).
int main(void)
{
	unsigned char buffer[BUFFER_SIZE], data[16] = { 0 };
	unsigned char *p = buffer;
	char filename[1024];
	long flen;
	int clen;

	if (scanf("%1000s", filename) != 1) {
		fprintf(stderr, "Read file name failed.");
		return -1;
	}
	FILE* fp = fopen(filename, "rb");
	if (!fp) {
		fprintf(stderr, "Open file failed.");
		return -1;
	}

	// ftell() returns -1 on failure; a negative size must never reach
	// fread(), where it would turn into a huge unsigned byte count.
	if (fseek(fp, 0, SEEK_END) != 0 || (flen = ftell(fp)) < 0
		|| fseek(fp, 0, SEEK_SET) != 0) {
		fprintf(stderr, "Get file size failed.");
		fclose(fp);
		return -1;
	}

	if (flen < BUFFER_SIZE) {
		// Only the bytes actually read are valid. Terminate them so the
		// character loop below stops at the end of the data; got is at
		// most flen, which is below BUFFER_SIZE, so the index is in range.
		size_t got = fread(buffer, 1, (size_t)flen, fp);
		buffer[got] = '\0';
		int total_chars = utf8_char_count(buffer, (int)got);
		printf("total char counts = %d\n", total_chars);
		while ((p = utf8_get_char(p, data, &clen)) != NULL) {
			data[clen] = '\0';
			printf("Len = %d, Char: %s\n", clen, data);
		}
	} else {
		fprintf(stderr, "File is too large (%ld)", flen);
	}

	fclose(fp);
	return 0;
}
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define BUFFER_SIZE 4096

	// Reference: http://en.wikipedia.org/wiki/UTF-8
	// Marker of a body (continuation) byte: 0b10xxxxxx
	#define UTF8_MASK_C 0x080 // 0b10000000

	// Prefixes of the leading byte of a 2- to 6-byte sequence
	#define UTF8_MASK_2 0x0C0 // 0b11000000
	#define UTF8_MASK_3 0x0E0 // 0b11100000
	#define UTF8_MASK_4 0x0F0 // 0b11110000
	#define UTF8_MASK_5 0x0F8 // 0b11111000
	#define UTF8_MASK_6 0x0FC // 0b11111100
	// Widen a prefix by one bit so that the 0 bit which ends the prefix is
	// tested as well, e.g. 0b11000000 -> 0b11100000
	#define UTF8_MKMASK(X) ((X) | ((X) >> 1))
	// Test leading bytes: X starts with prefix M followed by a 0 bit
	#define UTF8_TEST_L(X, M) (((X) & UTF8_MKMASK(M)) == (M))
	// Test body bytes: X must be 0b10xxxxxx. Both top bits are compared;
	// checking the top bit alone would also accept leading bytes (0b11xxxxxx).
	#define UTF8_TEST_B(X) (((X) & UTF8_MKMASK(UTF8_MASK_C)) == UTF8_MASK_C)

	// check body bytes
	// Returns 1 when each of the `l` bytes starting at `data` is a UTF-8 body
	// (continuation) byte of the form 0b10xxxxxx, and 0 otherwise.
	// A zero byte fails the test, so the scan stops at a string terminator.
	// A non-positive `l` means there is nothing to check and yields 1.
	int utf8_check_bytes(unsigned char * data, int l)
	{
		if (l <= 0)
			return 1;
		if (!data)
			return 0;
		for (int i = 0; i < l; i++) {
			// Compare the two top bits: only 0b10 marks a body byte.
			// Testing the top bit alone would let a leading byte
			// (0b11xxxxxx) pass as a body byte.
			if ((data[i] & 0xC0) != 0x80)
				return 0;
		}
		return 1;
	}

	// get character size
	// Classifies the first byte of a character:
	//    0  the byte is zero (string end)
	//    1  high bit clear, a single-byte character
	//  2-6  leading byte of a sequence of that many bytes
	//   -1  none of the above (e.g. a body byte)
	int utf8_get_char_size(unsigned char data) {
		static const int prefixes[] = {
			UTF8_MASK_2, UTF8_MASK_3, UTF8_MASK_4, UTF8_MASK_5, UTF8_MASK_6
		};
		int i;

		if (data == 0)
			return 0;
		if (!(data & UTF8_MASK_C))
			return 1;
		// The leading-byte patterns are mutually exclusive, so the order in
		// which they are tried does not matter.
		for (i = 0; i < 5; i++) {
			int prefix = prefixes[i];
			if (UTF8_TEST_L(data, prefix))
				return i + 2;
		}
		return -1;
	}

	// Extract the UTF-8 character that starts at `data`.
	//
	// data    : position of the character's first byte
	// buffer  : receives the bytes of the character (no terminator is added);
	//           at least 6 bytes for output buffer
	// out_len : receives the result code
	//   out_len  >  0 // size of the character in bytes
	//   out_len ==  0 // string end
	//   out_len == -1 // failed: invalid leading byte, or `data` is NULL
	//   out_len == -2 // failed: invalid body byte
	// NULL for both buffer and out_len are allowed
	// Return next position if success
	// Return NULL if string is terminated or failed
	unsigned char * utf8_get_char(unsigned char * data, unsigned char * buffer,
		int * out_len)
	{
		// A NULL input is reported as a failure instead of being dereferenced.
		int len = data ? utf8_get_char_size(*data) : -1;

		// check every body byte of a multi-byte sequence
		if (len > 1 && ! utf8_check_bytes(data + 1, len - 1)) {
			len = -2;
		}
		if (out_len) {
			*out_len = len;
		}
		if (len <= 0) {
			return NULL;
		}
		if (buffer) {
			memcpy(buffer, data, (size_t)len);
		}
		return data + len;
	}

	// Count characters in UTF-8
	// Counts the complete, valid characters found in the first `buffer_size`
	// bytes of `data`. Counting stops at a zero byte, at the first invalid
	// sequence, or at a character that would extend past the end of the buffer.
	// No byte at or beyond data + buffer_size is ever read.
	// Returns 0 when `data` is NULL or `buffer_size` is not positive.
	int utf8_char_count(unsigned char * data, int buffer_size) {
		if (!data || buffer_size <= 0)
			return 0;

		// Both cursors are pointers into the caller's buffer.
		unsigned char *p = data, *end = data + buffer_size;
		int n = 0;
		while (p < end) {
			// Look at the leading byte first so that the body bytes are only
			// inspected when the whole sequence lies inside the buffer.
			int len = utf8_get_char_size(*p);
			if (len <= 0 || len > end - p)
				break;
			unsigned char *next = utf8_get_char(p, NULL, NULL);
			if (!next)
				break;
			p = next;
			n++;
		}
		return n;
	}

	// Reads a file name from stdin, loads that file (it must be smaller than
	// BUFFER_SIZE bytes), prints the number of UTF-8 characters it contains and
	// then prints every character together with its length in bytes.
	// Returns -1 when the name cannot be read or the file cannot be opened or
	// measured, 0 otherwise (including the "too large" case).
	int main(void)
	{
		unsigned char buffer[BUFFER_SIZE], data[16] = { 0 };
		unsigned char *p = buffer;
		char filename[1024];
		long flen;
		int clen;

		if (scanf("%1000s", filename) != 1) {
			fprintf(stderr, "Read file name failed.");
			return -1;
		}
		FILE* fp = fopen(filename, "rb");
		if (!fp) {
			fprintf(stderr, "Open file failed.");
			return -1;
		}

		// ftell() returns -1 on failure; a negative size must never reach
		// fread(), where it would turn into a huge unsigned byte count.
		if (fseek(fp, 0, SEEK_END) != 0 || (flen = ftell(fp)) < 0
			|| fseek(fp, 0, SEEK_SET) != 0) {
			fprintf(stderr, "Get file size failed.");
			fclose(fp);
			return -1;
		}

		if (flen < BUFFER_SIZE) {
			// Only the bytes actually read are valid. Terminate them so the
			// character loop below stops at the end of the data; got is at
			// most flen, which is below BUFFER_SIZE, so the index is in range.
			size_t got = fread(buffer, 1, (size_t)flen, fp);
			buffer[got] = '\0';
			int total_chars = utf8_char_count(buffer, (int)got);
			printf("total char counts = %d\n", total_chars);
			while ((p = utf8_get_char(p, data, &clen)) != NULL) {
				data[clen] = '\0';
				printf("Len = %d, Char: %s\n", clen, data);
			}
		} else {
			fprintf(stderr, "File is too large (%ld)", flen);
		}

		fclose(fp);
		return 0;
	}