Last active
May 30, 2024 22:33
-
-
Save N-R-K/6fb02994c7de82872dd2a3d367dcbb2a to your computer and use it in GitHub Desktop.
benchmarking utf8 decoders
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// gcc -march=x86-64-v3 -O3 -o bench bench.c | |
// src taken from: https://github.com/skeeto/branchless-utf8/blob/master/test/benchmark.c | |
#define _POSIX_C_SOURCE 200112L | |
#include <stdio.h> | |
#include <stdint.h> | |
#include <stdlib.h> | |
#include <signal.h> | |
#include <stddef.h> | |
#include <unistd.h> // alarm() | |
#define SECONDS 4 | |
#define BUFLEN 8 // MB | |
#define IS_SURROGATE(c) ((c) >= 0xD800U && (c) <= 0xDFFFU) | |
static void * | |
utf8_encode(void *buf, long c) | |
{ | |
unsigned char *s = buf; | |
if (c >= (1L << 16)) { | |
s[0] = 0xf0 | (c >> 18); | |
s[1] = 0x80 | ((c >> 12) & 0x3f); | |
s[2] = 0x80 | ((c >> 6) & 0x3f); | |
s[3] = 0x80 | ((c >> 0) & 0x3f); | |
return s + 4; | |
} else if (c >= (1L << 11)) { | |
s[0] = 0xe0 | (c >> 12); | |
s[1] = 0x80 | ((c >> 6) & 0x3f); | |
s[2] = 0x80 | ((c >> 0) & 0x3f); | |
return s + 3; | |
} else if (c >= (1L << 7)) { | |
s[0] = 0xc0 | (c >> 6); | |
s[1] = 0x80 | ((c >> 0) & 0x3f); | |
return s + 2; | |
} else { | |
s[0] = c; | |
return s + 1; | |
} | |
} | |
static uint32_t | |
pcg32(uint64_t *s) | |
{ | |
uint64_t m = 0x9b60933458e17d7d; | |
uint64_t a = 0xd737232eeccdf7ed; | |
*s = *s * m + a; | |
int shift = 29 - (*s >> 61); | |
return *s >> shift; | |
} | |
/* Generate a random codepoint whose UTF-8 length is uniformly selected. */ | |
static long | |
randchar(uint64_t *s) | |
{ | |
uint32_t r = pcg32(s); | |
int len = 1 + (r & 0x3); | |
r >>= 2; | |
switch (len) { | |
case 1: | |
return r % 128; | |
case 2: | |
return 128 + r % (2048 - 128); | |
case 3: | |
return 2048 + r % (65536 - 2048); | |
case 4: | |
return 65536 + r % (131072 - 65536); | |
} | |
abort(); | |
} | |
static volatile sig_atomic_t running; | |
static void | |
alarm_handler(int signum) | |
{ | |
(void)signum; | |
running = 0; | |
} | |
/* Fill buffer with random characters, with evenly-distributed encoded | |
* lengths. | |
*/ | |
static void * | |
buffer_fill(void *buf, size_t z) | |
{ | |
uint64_t s = 0; | |
char *p = buf; | |
char *end = p + z; | |
while (p < end) { | |
long c; | |
do | |
c = randchar(&s); | |
while (IS_SURROGATE(c)); | |
p = utf8_encode(p, c); | |
} | |
return p; | |
} | |
//////////////////////// | |
static unsigned char * | |
utf8_simple(unsigned char *s, long *c) | |
{ | |
unsigned char *next; | |
if (s[0] < 0x80) { | |
*c = s[0]; | |
next = s + 1; | |
} else if ((s[0] & 0xe0) == 0xc0) { | |
*c = ((long)(s[0] & 0x1f) << 6) | | |
((long)(s[1] & 0x3f) << 0); | |
if ((s[1] & 0xc0) != 0x80) | |
*c = -1; | |
next = s + 2; | |
} else if ((s[0] & 0xf0) == 0xe0) { | |
*c = ((long)(s[0] & 0x0f) << 12) | | |
((long)(s[1] & 0x3f) << 6) | | |
((long)(s[2] & 0x3f) << 0); | |
if ((s[1] & 0xc0) != 0x80 || | |
(s[2] & 0xc0) != 0x80) | |
*c = -1; | |
next = s + 3; | |
} else if ((s[0] & 0xf8) == 0xf0 && (s[0] <= 0xf4)) { | |
*c = ((long)(s[0] & 0x07) << 18) | | |
((long)(s[1] & 0x3f) << 12) | | |
((long)(s[2] & 0x3f) << 6) | | |
((long)(s[3] & 0x3f) << 0); | |
if ((s[1] & 0xc0) != 0x80 || | |
(s[2] & 0xc0) != 0x80 || | |
(s[3] & 0xc0) != 0x80) | |
*c = -1; | |
next = s + 4; | |
} else { | |
*c = -1; // invalid | |
next = s + 1; // skip this byte | |
} | |
if (*c >= 0xd800 && *c <= 0xdfff) | |
*c = -1; // surrogate half | |
return next; | |
} | |
//////////////////////// | |
#include <immintrin.h> | |
typedef uint8_t u8; | |
typedef uint32_t u32; | |
#if 1 | |
static u32 pext32(u32 v, u32 mask) { return _pext_u32(v, mask); } | |
#else | |
static u32 | |
pext32(u32 v, u32 mask) | |
{ | |
u32 ret = 0; | |
u32 test = 0x1; | |
while (mask != 0) { | |
u32 ontarget = mask & test; | |
ret |= v & ontarget; | |
test <<= !!ontarget; | |
v >>= !ontarget; | |
mask >>= !ontarget; | |
} | |
return ret; | |
} | |
#endif | |
static uint8_t * | |
utf8_pext(uint8_t *s, u32 *outcp, int *err) | |
{ | |
static const u8 lens[] = { | |
/* 0XXX */ 1, 1, 1, 1, 1, 1, 1, 1, | |
/* 10XX */ 1, 1, 1, 1, /* invalid */ | |
/* 110X */ 2, 2, | |
/* 1110 */ 3, | |
/* 1111 */ 4, /* maybe */ | |
}; | |
static const u32 pextmask[] = { | |
0x7F000000, 0x1F3F0000, 0x0F3F3F00, 0x073F3F3F, | |
}; | |
static const u32 emask[] = { | |
0x80000000, 0xE0C00000, 0xF0C0C000, 0xF8C0C0C0, | |
}; | |
static const u32 eexpect[] = { | |
0x00000000, 0xC0800000, 0xE0808000, 0xF0808080, | |
}; | |
static const u32 overlong[] = { 0x0, 0x80, 0x0800, 0x10000 }; | |
u32 v = 0; | |
switch (4) { | |
case 4: v |= ((u32)s[3] << 0); // fallthrough | |
case 3: v |= ((u32)s[2] << 8); // fallthrough | |
case 2: v |= ((u32)s[1] << 16); // fallthrough | |
case 1: v |= ((u32)s[0] << 24); break; | |
default: __builtin_unreachable(); break; // string is expected to be not empty | |
} | |
u32 len = lens[s[0] >> 4]; | |
u32 idx = len - 1; | |
s += len; | |
u32 cp = pext32(v, pextmask[idx]); | |
*outcp = cp; | |
*err |= (v & emask[idx]) ^ (eexpect[idx]); | |
*err |= cp > 0x10FFFF; // out of range | |
*err |= (cp >> 11) == 0x1B; // surrogate | |
*err |= cp < overlong[idx]; | |
return s; | |
} | |
///////////////// | |
// https://nullprogram.com/blog/2017/10/06/ | |
static void * | |
utf8_decode(void *buf, uint32_t *c, int *e) | |
{ | |
static const unsigned char lengths[] = { | |
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0 | |
}; | |
static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07}; | |
static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536}; | |
static const int shiftc[] = {0, 18, 12, 6, 0}; | |
static const int shifte[] = {0, 6, 4, 2, 0}; | |
unsigned char *s = buf; | |
unsigned int len = lengths[s[0] >> 3]; | |
/* Compute the pointer to the next character early so that the next | |
* iteration can start working on the next character. Neither Clang | |
* nor GCC figure out this reordering on their own. | |
*/ | |
unsigned char *next = s + len + !len; | |
/* Assume a four-byte character and load four bytes. Unused bits are | |
* shifted out. | |
*/ | |
*c = (uint32_t)(s[0] & masks[len]) << 18; | |
*c |= (uint32_t)(s[1] & 0x3f) << 12; | |
*c |= (uint32_t)(s[2] & 0x3f) << 6; | |
*c |= (uint32_t)(s[3] & 0x3f) << 0; | |
*c >>= shiftc[len]; | |
/* Accumulate the various error conditions. */ | |
*e = (*c < mins[len]) << 6; // non-canonical encoding | |
*e |= ((*c >> 11) == 0x1b) << 7; // surrogate half? | |
*e |= (*c > 0x10FFFF) << 8; // out of range? | |
*e |= (s[1] & 0xc0) >> 2; | |
*e |= (s[2] & 0xc0) >> 4; | |
*e |= (s[3] ) >> 6; | |
*e ^= 0x2a; // top two bits of each tail byte correct? | |
*e >>= shifte[len]; | |
return next; | |
} | |
/////////////// | |
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. | |
#define UTF8_ACCEPT 0 | |
#define UTF8_REJECT 12 | |
static const uint8_t utf8d[] = { | |
// The first part of the table maps bytes to character classes that | |
// to reduce the size of the transition table and create bitmasks. | |
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, | |
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, | |
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, | |
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, | |
// The second part is a transition table that maps a combination | |
// of a state of the automaton and a character class to a state. | |
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, | |
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, | |
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, | |
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, | |
12,36,12,12,12,12,12,12,12,12,12,12, | |
}; | |
static uint32_t | |
bh_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) { | |
uint32_t type = utf8d[byte]; | |
*codep = (*state != UTF8_ACCEPT) ? | |
(byte & 0x3fu) | (*codep << 6) : | |
(0xff >> type) & (byte); | |
*state = utf8d[256 + *state + type]; | |
return *state; | |
} | |
int | |
main(void) | |
{ | |
double rate; | |
long errors, n; | |
size_t z = BUFLEN * 1024L * 1024; | |
unsigned char *buffer = malloc(z); | |
unsigned char *end = buffer_fill(buffer, z - 4); | |
/* Benchmark the branchless decoder */ | |
running = 1; | |
signal(SIGALRM, alarm_handler); | |
alarm(SECONDS); | |
errors = n = 0; | |
do { | |
unsigned char *p = buffer; | |
int e = 0; | |
uint32_t c; | |
long count = 0; | |
while (p < end) { | |
p = utf8_decode(p, &c, &e); | |
errors += !!e; // force errors to be checked | |
count++; | |
} | |
if (p == end) // reached the end successfully? | |
n++; | |
} while (running); | |
rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024; | |
printf("branchless: %f MB/s, %ld errors\n", rate, errors); | |
/* Benchmark Bjoern Hoehrmann's decoder */ | |
running = 1; | |
signal(SIGALRM, alarm_handler); | |
alarm(SECONDS); | |
errors = n = 0; | |
do { | |
unsigned char *p = buffer; | |
uint32_t c; | |
uint32_t state = 0; | |
long count = 0; | |
for (; p < end; p++) { | |
if (!bh_utf8_decode(&state, &c, *p)) | |
count++; | |
else if (state == UTF8_REJECT) | |
errors++; // force errors to be checked | |
} | |
if (p == end) // reached the end successfully? | |
n++; | |
} while (running); | |
rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024; | |
printf("Hoehrmann: %f MB/s, %ld errors\n", rate, errors); | |
/* Benchmark simple decoder */ | |
running = 1; | |
signal(SIGALRM, alarm_handler); | |
alarm(SECONDS); | |
errors = n = 0; | |
do { | |
unsigned char *p = buffer; | |
long c; | |
long count = 0; | |
while (p < end) { | |
p = utf8_simple(p, &c); | |
count++; | |
if (c < 0) | |
errors++; | |
} | |
if (p == end) // reached the end successfully? | |
n++; | |
} while (running); | |
rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024; | |
printf("Simple: %f MB/s, %ld errors\n", rate, errors); | |
/* Benchmark pext decoder */ | |
running = 1; | |
signal(SIGALRM, alarm_handler); | |
alarm(SECONDS); | |
errors = n = 0; | |
do { | |
unsigned char *p = buffer; | |
u32 cp; | |
long count = 0; | |
while (p < end) { | |
int err = 0; | |
p = utf8_pext(p, &cp, &err); | |
count++; | |
if (err) | |
errors++; | |
} | |
if (p == end) // reached the end successfully? | |
n++; | |
} while (running); | |
rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024; | |
printf("Pext: %f MB/s, %ld errors\n", rate, errors); | |
free(buffer); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I re-ran it to share the numbers:
I had not noticed the 2x difference for "branchless" between the architecture options. This is in "performance" mode, I usually run in "powersave" (which gives different results, and not by a constant factor for all the variants). Another thing I checked yesterday was changing the types of the arguments to the decode function. You can easily make it run a lot slower by using uint8_t where that is all is needed (so despite inlining everything, compilers are still not smart enough to use the optimal types automatically; I used 32 bit ints because those worked best on the 32 bit system I developed the decoder on).
Another thing I noticed over the years is that people like to abort early on decoding errors. For my decoder, that adds a lot of work for each byte, it would be better to optimistically process a whole chunk and re-parse if there is an error to get the first error position. But if you can fiddle with the numbers so easily by picking a different compiler or settings or types, with counter-intuitive results, I suspect you probably have to write the assembly code by hand to an ideal representation. (Also the input matters, randomized input can be the worst case for some decoders, depending on character distribution; a popular addition to my decoder is special casing of longer ASCII inputs, but then you have to study CPU manuals to check on the cost of unaligned reads and related issues; similarily, assuming aligned and/or padded input gives other decoders an unfair advantage).
(Edit: For gcc k8, opteron, and x86-64-v4 are all faster than x86-64-v3 and native, not by much, but very constantly.)