Skip to content

Instantly share code, notes, and snippets.

@s4y
Last active October 15, 2017 09:28
Show Gist options
  • Save s4y/7c95f1ebeb2c069cfb09db3c3251eca3 to your computer and use it in GitHub Desktop.
Save s4y/7c95f1ebeb2c069cfb09db3c3251eca3 to your computer and use it in GitHub Desktop.
A simple UTF-8 parser that happens to be fast.
#include <inttypes.h>
/*
 * Decoder state as seen by callers.
 *
 * Between calls the `state` field may also hold the plain integers 1, 2 or 3,
 * meaning "that many continuation bytes are still required to finish the
 * current code point". Those values are deliberately not enumerators so that
 * callers only ever need to test for UTF8_OK and UTF8_ERROR.
 */
typedef enum {
    UTF8_OK = 0,    /* a complete code point is available in `codepoint` */
    UTF8_ERROR = 4, /* ill-formed input was seen; the state is sticky */
} utf8_decode_state_t;

/*
 * Streaming decoder context. Initialise with { UTF8_OK, 0 } before feeding
 * the first byte.
 */
typedef struct {
    utf8_decode_state_t state; /* UTF8_OK, UTF8_ERROR, or 1..3 pending bytes */
    uint32_t codepoint;        /* code point accumulated so far */
} utf8_decode_context_t;

/*
 * Feed one byte of a UTF-8 stream into the decoder.
 *
 * After the call:
 *   - state == UTF8_OK:    `byte` completed a code point; read it from
 *                          context->codepoint.
 *   - state == 1, 2 or 3:  more continuation bytes are needed.
 *   - state == UTF8_ERROR: the input is not well-formed UTF-8. Further calls
 *                          leave the context untouched until the caller
 *                          resets `state` to UTF8_OK.
 *
 * Only well-formed sequences are accepted: overlong encodings, UTF-16
 * surrogates (U+D800..U+DFFF) and values above U+10FFFF are rejected.
 */
static inline void utf8_decode(
    utf8_decode_context_t *context, unsigned char byte
) {
    switch ((int)context->state) {
    case UTF8_OK:
        if (byte < 0x80) {
            // ASCII: a complete code point on its own.
            context->codepoint = byte;
        } else if (byte < 0xc2) {
            // 0x80..0xbf is a continuation byte with no lead byte;
            // 0xc0 and 0xc1 could only start an overlong 2-byte form.
            context->state = UTF8_ERROR;
        } else if (byte < 0xe0) {
            context->state = 1;
            context->codepoint = byte & 0x1f;
        } else if (byte < 0xf0) {
            context->state = 2;
            context->codepoint = byte & 0x0f;
        } else if (byte < 0xf5) {
            context->state = 3;
            context->codepoint = byte & 0x07;
        } else {
            // 0xf5..0xff would encode values above U+10FFFF or are not
            // valid lead bytes at all.
            context->state = UTF8_ERROR;
        }
        break;
    case 1:
    case 2:
    case 3: {
        // Allowed range for this continuation byte. It is narrowed only for
        // the byte that directly follows certain lead bytes.
        unsigned char lowest = 0x80;
        unsigned char highest = 0xbf;

        if ((int)context->state == 3) {
            // State 3 is only entered from a 4-byte lead, so `codepoint`
            // still holds the lead's payload bits (0..4).
            if (context->codepoint == 0x0) {
                lowest = 0x90;  // F0 80..8F would be overlong
            } else if (context->codepoint == 0x4) {
                highest = 0x8f; // F4 90..BF would exceed U+10FFFF
            }
        } else if ((int)context->state == 2) {
            // State 2 is entered either from a 3-byte lead (codepoint is
            // 0x0..0xF) or after the first continuation of a valid 4-byte
            // sequence (codepoint is then at least 0x10), so the two values
            // tested here can only come from an E0 or ED lead byte.
            if (context->codepoint == 0x0) {
                lowest = 0xa0;  // E0 80..9F would be overlong
            } else if (context->codepoint == 0xd) {
                highest = 0x9f; // ED A0..BF would be a UTF-16 surrogate
            }
        }

        if (byte >= lowest && byte <= highest) {
            context->state = (utf8_decode_state_t)((int)context->state - 1);
            context->codepoint =
                (context->codepoint << 6) | (uint32_t)(byte & 0x3f);
        } else {
            context->state = UTF8_ERROR;
        }
        break;
    }
    default:
        // UTF8_ERROR (or a corrupted state): stay put until the caller resets.
        break;
    }
}
// - - -
#include <stdio.h>
#include <unistd.h>
/*
 * Decode standard input as UTF-8, counting code points and XOR-folding them
 * into a small hash so the decoding work cannot be optimised away.
 *
 * Exit status:
 *   0  input decoded cleanly (a summary line is printed)
 *   1  the decoder reported ill-formed input
 *   2  input ended in the middle of a multi-byte sequence
 *   3  read() failed
 */
int main(void) {
    // Static storage keeps the 1 MiB buffer off the stack.
    static unsigned char buf[1024 * 1024];
    size_t count = 0;
    uint32_t codepoint_hash = 0;
    utf8_decode_context_t state = { UTF8_OK, 0 };
    // read() returns ssize_t; storing it in a size_t would turn the -1 error
    // return into a huge positive length.
    ssize_t avail;

    while ((avail = read(STDIN_FILENO, buf, sizeof(buf))) > 0) {
        for (size_t i = 0; i < (size_t)avail; i++) {
            utf8_decode(&state, buf[i]);
            switch (state.state) {
            case UTF8_OK:
                count += 1;
                codepoint_hash ^= state.codepoint;
                break;
            case UTF8_ERROR:
                return 1;
            default:
                // Mid-sequence: wait for the remaining continuation bytes.
                break;
            }
        }
    }
    if (avail < 0) {
        return 3;
    }
    if (state.state != UTF8_OK) {
        return 2;
    }
    printf("Decoded %zu code points and got a hash of %" PRIu32 ".\n",
           count, codepoint_hash);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment