Last active
November 27, 2023 12:47
-
-
Save s4y/344a355f8c1f99c6a4cb2347ec4323cc to your computer and use it in GitHub Desktop.
UTF-8 parser benchmarks (https://news.ycombinator.com/item?id=15425078)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <inttypes.h> | |
#include <stdlib.h> | |
typedef enum { | |
UTF8_OK = 0, | |
UTF8_ERROR = 4, | |
} utf8_decode_state_t; | |
typedef struct { | |
utf8_decode_state_t state; | |
uint32_t codepoint; | |
} utf8_decode_context_t; | |
static inline void utf8_decode( | |
utf8_decode_context_t *context, unsigned char byte | |
) { | |
switch ((int)context->state) { | |
case UTF8_OK: | |
if (byte < 0x80) { | |
context->codepoint = byte; | |
} else if (byte < 0xc0){ | |
context->state = UTF8_ERROR; | |
} else if (byte < 0xe0){ | |
context->state = 1; | |
context->codepoint = byte & 0x1f; | |
} else if (byte < 0xf0) { | |
context->state = 2; | |
context->codepoint = byte & 0xf; | |
} else if (byte < 0xf8) { | |
context->state = 3; | |
context->codepoint = byte & 0x7; | |
} else { | |
context->state = UTF8_ERROR; | |
} | |
break; | |
case 1: | |
case 2: | |
case 3: | |
if (byte >= 0x80 && byte <= 0xbf) { | |
context->state -= 1; | |
context->codepoint = | |
(context->codepoint << 6) | (byte & 0x3f); | |
} else { | |
context->state = UTF8_ERROR; | |
} | |
} | |
} | |
// - - - | |
#include <stdio.h> | |
#include <unistd.h> | |
int main() { | |
size_t bufsz = 1024 * 1024 * 1024; | |
unsigned char* buf = calloc(bufsz, 1); | |
size_t count = 0; | |
uint32_t codepoint_hash = 0; | |
utf8_decode_context_t state = { UTF8_OK, 0 }; | |
size_t avail = read(STDIN_FILENO, buf, bufsz); | |
for (size_t i = 0; i < avail; i++) { | |
utf8_decode(&state, buf[i]); | |
switch (state.state) { | |
case UTF8_OK: | |
count += 1; | |
codepoint_hash ^= state.codepoint; | |
break; | |
case UTF8_ERROR: | |
return 1; | |
} | |
} | |
if (state.state != UTF8_OK) { | |
return 2; | |
} | |
printf("Decoded %zu code points and got a hash of %u.\n", count, codepoint_hash); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <inttypes.h> | |
#include <stdlib.h> | |
/* Branchless UTF-8 decoder | |
* | |
* This is free and unencumbered software released into the public domain. | |
*/ | |
#ifndef UTF8_H | |
#define UTF8_H | |
#include <stdint.h> | |
/* Decode the next character, C, from BUF, reporting errors in E. | |
* | |
* Since this is a branchless decoder, four bytes will be read from the | |
* buffer regardless of the actual length of the next character. This | |
* means the buffer _must_ have at least three bytes of zero padding | |
* following the end of the data stream. | |
* | |
* Errors are reported in E, which will be non-zero if the parsed | |
* character was somehow invalid: invalid byte sequence, non-canonical | |
* encoding, or a surrogate half. | |
* | |
* The function returns a pointer to the next character. When an error | |
* occurs, this pointer will be a guess that depends on the particular | |
* error, but it will always advance at least one byte. | |
*/ | |
static void * | |
utf8_decode(void *buf, uint32_t *c, int *e) | |
{ | |
static const char lengths[] = { | |
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0 | |
}; | |
static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07}; | |
static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536}; | |
static const int shiftc[] = {0, 18, 12, 6, 0}; | |
static const int shifte[] = {0, 6, 4, 2, 0}; | |
unsigned char *s = buf; | |
int len = lengths[s[0] >> 3]; | |
/* Compute the pointer to the next character early so that the next | |
* iteration can start working on the next character. Neither Clang | |
* nor GCC figure out this reordering on their own. | |
*/ | |
unsigned char *next = s + len + !len; | |
/* Assume a four-byte character and load four bytes. Unused bits are | |
* shifted out. | |
*/ | |
*c = (uint32_t)(s[0] & masks[len]) << 18; | |
*c |= (uint32_t)(s[1] & 0x3f) << 12; | |
*c |= (uint32_t)(s[2] & 0x3f) << 6; | |
*c |= (uint32_t)(s[3] & 0x3f) << 0; | |
*c >>= shiftc[len]; | |
/* Accumulate the various error conditions. */ | |
*e = (*c < mins[len]) << 6; | |
*e |= ((*c >> 11) == 0x1b) << 7; // surrogate half? | |
*e |= (s[1] & 0xc0) >> 2; | |
*e |= (s[2] & 0xc0) >> 4; | |
*e |= (s[3] ) >> 6; | |
*e ^= 0x2a; // top two bits of each tail byte correct? | |
*e >>= shifte[len]; | |
return next; | |
} | |
#endif | |
// - - - | |
#include <stdio.h> | |
#include <unistd.h> | |
int main() { | |
size_t bufsz = 1024 * 1024 * 1024; | |
unsigned char* buf = calloc(bufsz, 1); | |
unsigned char* bufp = buf; | |
size_t count = 0; | |
uint32_t codepoint_hash = 0; | |
uint32_t codepoint = 0; | |
int e; | |
size_t avail = read(STDIN_FILENO, buf, bufsz); | |
while (bufp < buf + avail) { | |
bufp = utf8_decode(bufp, &codepoint, &e); | |
if(e) return e; | |
count += 1; | |
codepoint_hash ^= codepoint; | |
} | |
printf("Decoded %zu code points and got a hash of %u.\n", count, codepoint_hash); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <inttypes.h> | |
#include <fcntl.h> | |
#include <stdlib.h> | |
// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de> | |
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. | |
#define UTF8_ACCEPT 0 | |
#define UTF8_REJECT 12 | |
static const uint8_t utf8d[] = { | |
// The first part of the table maps bytes to character classes that | |
// to reduce the size of the transition table and create bitmasks. | |
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, | |
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, | |
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, | |
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, | |
// The second part is a transition table that maps a combination | |
// of a state of the automaton and a character class to a state. | |
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, | |
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, | |
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, | |
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, | |
12,36,12,12,12,12,12,12,12,12,12,12, | |
}; | |
static uint32_t inline | |
decode(uint32_t* state, uint32_t* codep, uint32_t byte) { | |
uint32_t type = utf8d[byte]; | |
*codep = (*state != UTF8_ACCEPT) ? | |
(byte & 0x3fu) | (*codep << 6) : | |
(0xff >> type) & (byte); | |
*state = utf8d[256 + *state + type]; | |
return *state; | |
} | |
// - - - | |
#include <stdio.h> | |
#include <unistd.h> | |
int main(int argc, char* argv[]) { | |
size_t bufsz = 1024 * 1024 * 1024; | |
unsigned char* buf = calloc(bufsz, 1); | |
size_t count = 0; | |
uint32_t codepoint_hash = 0; | |
uint32_t state = 0; | |
uint32_t codep = 0; | |
size_t avail = read(STDIN_FILENO, buf, bufsz); | |
for (size_t i = 0; i < avail; i++) { | |
decode(&state, &codep, buf[i]); | |
switch (state) { | |
case UTF8_ACCEPT: | |
count += 1; | |
codepoint_hash ^= codep; | |
break; | |
case UTF8_REJECT: | |
return 1; | |
} | |
} | |
if (state != UTF8_ACCEPT) { | |
return 2; | |
} | |
printf("Decoded %zu code points and got a hash of %u.\n", count, codepoint_hash); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment