Skip to content

Instantly share code, notes, and snippets.

@Inndy
Last active August 29, 2015 13:59
Show Gist options
  • Save Inndy/10484418 to your computer and use it in GitHub Desktop.
Save Inndy/10484418 to your computer and use it in GitHub Desktop.
Count the byte length of each UTF-8 character and split a string into its characters.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define BUFFER_SIZE 4096
// Reference: http://en.wikipedia.org/wiki/UTF-8
// Tag bits that open each class of UTF-8 byte. A tag is a run of 1 bits; the
// bit right below the run must be 0 for the byte to belong to that class.
#define UTF8_MASK_C 0x080 // 0b10000000 (body / continuation byte)
#define UTF8_MASK_2 0x0C0 // 0b11000000 (lead byte of a 2-byte sequence)
#define UTF8_MASK_3 0x0E0 // 0b11100000 (lead byte of a 3-byte sequence)
#define UTF8_MASK_4 0x0F0 // 0b11110000 (lead byte of a 4-byte sequence)
#define UTF8_MASK_5 0x0F8 // 0b11111000 (lead byte of a 5-byte sequence)
#define UTF8_MASK_6 0x0FC // 0b11111100 (lead byte of a 6-byte sequence)
// Widen a tag by one bit so the terminating 0 bit takes part in the
// comparison, e.g. 0xC0 -> 0xE0. Arguments are parenthesized so that
// expressions such as (A | B) expand correctly.
#define UTF8_MKMASK(X) ((X) | ((X) >> 1))
// Test leading bytes: true when X carries exactly the tag M
#define UTF8_TEST_L(X, M) (((X) & UTF8_MKMASK(M)) == (M))
// Test body bytes: true only for 10xxxxxx. Checking the high bit alone would
// also accept lead bytes (11xxxxxx), so the top two bits are compared.
// X is evaluated exactly once, so arguments with side effects are safe.
#define UTF8_TEST_B(X) (((X) & UTF8_MKMASK(UTF8_MASK_C)) == UTF8_MASK_C)
// Check body bytes.
// Verifies that the `l` bytes starting at `data` are all UTF-8 body
// (continuation) bytes, i.e. have the bit pattern 10xxxxxx.
//   data: first byte to inspect
//   l:    number of bytes to inspect; l <= 0 inspects nothing
// Returns 1 when every inspected byte is a body byte (including the case where
// nothing is inspected), 0 at the first byte that is not, and 0 when `data`
// is NULL while l > 0.
int utf8_check_bytes(unsigned char * data, int l)
{
    // Compare the top two bits: looking at the high bit alone would also
    // accept lead bytes (11xxxxxx) in a body position.
    enum { BODY_MASK = 0xC0, BODY_TAG = 0x80 };

    if (l > 0 && data == NULL)
        return 0;
    // "> 0" keeps a negative count from walking off the buffer.
    while (l-- > 0) {
        if ((*data++ & BODY_MASK) != BODY_TAG)
            return 0;
    }
    return 1;
}
// Get character size.
// Classifies `data` as the first byte of a UTF-8 sequence and reports how many
// bytes the sequence occupies, lead byte included.
// Returns:
//    0     data is the NUL byte
//    1     data is a single-byte character (0xxxxxxx)
//    2..6  data is the lead byte of a sequence of that many bytes
//   -1     data matches none of the above (e.g. a body byte 10xxxxxx)
int utf8_get_char_size(unsigned char data) {
    // Lead-byte tags for 2..6 byte sequences, stored at index (size - 2).
    static const int lead_tags[] = {
        UTF8_MASK_2, UTF8_MASK_3, UTF8_MASK_4, UTF8_MASK_5, UTF8_MASK_6
    };
    int size;

    if (data == 0)
        return 0;
    if (!(data & UTF8_MASK_C))
        return 1;
    // Try the longest form first, down to the 2-byte form.
    for (size = 6; size >= 2; size--) {
        if (UTF8_TEST_L(data, lead_tags[size - 2]))
            return size;
    }
    return -1;
}
// Read one UTF-8 character starting at `data` and step past it.
//   data:    position to read from; NULL is reported as a failure
//   buffer:  receives the raw bytes of the character (NOT NUL-terminated);
//            needs room for at least 6 bytes. May be NULL to skip the copy.
//   out_len: receives the outcome. May be NULL.
//              > 0  byte length of the character
//             == 0  string end (NUL byte at `data`)
//             == -1 failed: invalid leading byte, or `data` is NULL
//             == -2 failed: a byte after the leading byte is not a body byte
// Return next position if success
// Return NULL if string is terminated or failed
unsigned char * utf8_get_char(unsigned char * data, unsigned char * buffer,
                              int * out_len)
{
    int len;

    // Nothing to read from: report it instead of dereferencing NULL.
    if (data == NULL) {
        if (out_len)
            *out_len = -1;
        return NULL;
    }

    len = utf8_get_char_size(*data);
    // A multi-byte lead must be followed by len - 1 body bytes.
    if (len > 1 && !utf8_check_bytes(data + 1, len - 1))
        len = -2;

    if (out_len)
        *out_len = len;
    if (len <= 0)
        return NULL;

    if (buffer)
        memcpy(buffer, data, (size_t)len);
    return data + len;
}
// Count characters in UTF-8.
//   data:        start of the bytes to scan
//   buffer_size: number of bytes available at `data`
// Counting stops at the first NUL byte, at the first invalid leading or body
// byte, or at the first character that does not fit completely inside the
// buffer. No byte at or beyond data + buffer_size is read.
// Returns the number of complete characters seen before stopping; 0 when
// `data` is NULL or buffer_size <= 0.
int utf8_char_count(unsigned char * data, int buffer_size) {
    unsigned char *p = data, *end;
    int n = 0;

    if (data == NULL || buffer_size <= 0)
        return 0;
    end = data + buffer_size;

    while (p < end) {
        int len = utf8_get_char_size(*p);

        // 0 = terminator, -1 = invalid lead; a sequence longer than what is
        // left in the buffer is truncated and must not be inspected further.
        if (len <= 0 || len > end - p)
            break;
        if (len > 1 && !utf8_check_bytes(p + 1, len - 1))
            break;
        p += len;
        n++;
    }
    return n;
}
// Reads a file name from standard input, loads that file (it must be smaller
// than BUFFER_SIZE bytes), prints how many UTF-8 characters it holds and then
// prints every character together with its byte length.
// Returns 0 on success or when the file is too large, -1 when the file name
// cannot be read, the file cannot be opened, or its size cannot be determined.
int main ()
{
    unsigned char buffer[BUFFER_SIZE], data[16] = { 0 };
    unsigned char *p = buffer;
    char filename[1024];
    long flen;
    int clen;
    FILE *fp;

    if (scanf("%1000s", filename) != 1) {
        fprintf(stderr, "No file name given.");
        return -1;
    }
    fp = fopen(filename, "rb");
    if (!fp) {
        fprintf(stderr, "Open file failed.");
        return -1;
    }

    // ftell() reports failure as -1 (e.g. on a non-seekable stream); using
    // that value as a read size would overflow the buffer.
    flen = -1;
    if (fseek(fp, 0, SEEK_END) == 0)
        flen = ftell(fp);
    if (flen < 0 || fseek(fp, 0, SEEK_SET) != 0) {
        fprintf(stderr, "Cannot determine file size.");
        fclose(fp);
        return -1;
    }

    if (flen < BUFFER_SIZE) {
        // Read byte-wise so the return value is the number of bytes obtained.
        size_t got = fread(buffer, 1, (size_t)flen, fp);
        // got <= flen < BUFFER_SIZE, so the terminator always fits. It stops
        // the walk below from running into uninitialized bytes.
        buffer[got] = '\0';

        int total_chars = utf8_char_count(buffer, (int)got);
        printf("total char counts = %d\n", total_chars);

        while ((p = utf8_get_char(p, data, &clen)) != NULL) {
            // clen is at most 6 here, well inside data[16].
            data[clen] = '\0';
            printf("Len = %d, Char: %s\n", clen, data);
        }
    } else {
        fprintf(stderr, "File is too large (%ld)", flen);
    }
    fclose(fp);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment