Skip to content

Instantly share code, notes, and snippets.

@N-R-K
Last active May 6, 2024 19:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save N-R-K/6fb02994c7de82872dd2a3d367dcbb2a to your computer and use it in GitHub Desktop.
Save N-R-K/6fb02994c7de82872dd2a3d367dcbb2a to your computer and use it in GitHub Desktop.
benchmarking utf8 decoders
// gcc -march=x86-64-v3 -O3 -o bench bench.c
// src taken from: https://github.com/skeeto/branchless-utf8/blob/master/test/benchmark.c
#define _POSIX_C_SOURCE 200112L
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <signal.h>
#include <stddef.h>
#include <unistd.h> // alarm()
#define SECONDS 4
#define BUFLEN 8 // MB
#define IS_SURROGATE(c) ((c) >= 0xD800U && (c) <= 0xDFFFU)
/* Encode codepoint c as UTF-8 into buf.
 * Writes 1-4 bytes and returns a pointer one past the last byte written.
 * Performs no validity checking; callers filter surrogates before encoding.
 */
static void *
utf8_encode(void *buf, long c)
{
	unsigned char *out = buf;
	if (c < (1L << 7)) {
		/* 1 byte: 0xxxxxxx */
		*out++ = (unsigned char)c;
	} else if (c < (1L << 11)) {
		/* 2 bytes: 110xxxxx 10xxxxxx */
		*out++ = 0xc0 | (unsigned char)(c >> 6);
		*out++ = 0x80 | (c & 0x3f);
	} else if (c < (1L << 16)) {
		/* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
		*out++ = 0xe0 | (unsigned char)(c >> 12);
		*out++ = 0x80 | ((c >> 6) & 0x3f);
		*out++ = 0x80 | (c & 0x3f);
	} else {
		/* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		*out++ = 0xf0 | (unsigned char)(c >> 18);
		*out++ = 0x80 | ((c >> 12) & 0x3f);
		*out++ = 0x80 | ((c >> 6) & 0x3f);
		*out++ = 0x80 | (c & 0x3f);
	}
	return out;
}
/* PCG-style 32-bit PRNG (multiplicative LCG state update, variable-shift
 * output).  Advances *s and returns 32 pseudorandom bits. */
static uint32_t
pcg32(uint64_t *s)
{
	const uint64_t mul = 0x9b60933458e17d7d;
	const uint64_t inc = 0xd737232eeccdf7ed;
	*s = *s * mul + inc;
	/* Output shift depends on the top 3 state bits (22..29). */
	int rot = 29 - (int)(*s >> 61);
	return (uint32_t)(*s >> rot);
}
/* Generate a random codepoint whose UTF-8 encoded length (1..4) is
 * uniformly selected.  May return surrogate values; callers must filter. */
static long
randchar(uint64_t *s)
{
	uint32_t bits = pcg32(s);
	uint32_t r = bits >> 2;
	/* Low two bits pick the encoded length; remaining bits pick the
	 * codepoint within that length's range. */
	switch (1 + (bits & 0x3)) {
	case 1: return r % 128;
	case 2: return 128 + r % (2048 - 128);
	case 3: return 2048 + r % (65536 - 2048);
	case 4: return 65536 + r % (131072 - 65536);
	}
	abort(); /* unreachable: length is always 1..4 */
}
/* Cleared by the SIGALRM handler to end the currently running benchmark loop. */
static volatile sig_atomic_t running;
/* SIGALRM handler: tells the benchmark loop in main() to stop. */
static void
alarm_handler(int signum)
{
(void)signum; // unused; parameter required by the signal handler signature
running = 0;
}
/* Fill buf with random UTF-8 characters whose encoded lengths are evenly
 * distributed, skipping surrogate codepoints.  Uses a fixed seed, so the
 * output is deterministic.
 * NOTE: the final character may be written up to 3 bytes past buf+z, so
 * callers must leave slack after the buffer (main passes z-4).
 * Returns a pointer one past the last encoded byte.
 */
static void *
buffer_fill(void *buf, size_t z)
{
	uint64_t seed = 0;
	char *dst = buf;
	char *limit = dst + z;
	while (dst < limit) {
		long cp = randchar(&seed);
		while (IS_SURROGATE(cp))
			cp = randchar(&seed);
		dst = utf8_encode(dst, cp);
	}
	return dst;
}
////////////////////////
/* Decode one UTF-8 character starting at s into *c and return a pointer to
 * the next character.  On a malformed sequence *c is set to -1; a bad lead
 * byte skips one byte, otherwise the full expected length is consumed.
 * NOTE(review): reads up to 3 bytes past s for multi-byte lead bytes, so
 * the input needs trailing slack.  Overlong encodings are NOT rejected
 * (there is no minimum-value check), and 4-byte sequences are only coarsely
 * range-checked via s[0] <= 0xF4, so some values above 0x10FFFF can pass.
 */
static unsigned char *
utf8_simple(unsigned char *s, long *c)
{
unsigned char *next;
if (s[0] < 0x80) {
// 1-byte (ASCII)
*c = s[0];
next = s + 1;
} else if ((s[0] & 0xe0) == 0xc0) {
// 2-byte sequence: 110xxxxx 10xxxxxx
*c = ((long)(s[0] & 0x1f) << 6) |
((long)(s[1] & 0x3f) << 0);
if ((s[1] & 0xc0) != 0x80)
*c = -1;
next = s + 2;
} else if ((s[0] & 0xf0) == 0xe0) {
// 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
*c = ((long)(s[0] & 0x0f) << 12) |
((long)(s[1] & 0x3f) << 6) |
((long)(s[2] & 0x3f) << 0);
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80)
*c = -1;
next = s + 3;
} else if ((s[0] & 0xf8) == 0xf0 && (s[0] <= 0xf4)) {
// 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// (lead bytes above 0xF4 rejected as a coarse range check)
*c = ((long)(s[0] & 0x07) << 18) |
((long)(s[1] & 0x3f) << 12) |
((long)(s[2] & 0x3f) << 6) |
((long)(s[3] & 0x3f) << 0);
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
(s[3] & 0xc0) != 0x80)
*c = -1;
next = s + 4;
} else {
*c = -1; // invalid
next = s + 1; // skip this byte
}
// UTF-8 must not encode UTF-16 surrogate halves
if (*c >= 0xd800 && *c <= 0xdfff)
*c = -1; // surrogate half
return next;
}
////////////////////////
#include <immintrin.h>
typedef uint8_t u8;
typedef uint32_t u32;
#if 1
/* Hardware parallel bit extract (BMI2 PEXT); requires a -march with BMI2
 * support such as x86-64-v3 (see build comment at the top of the file). */
static u32 pext32(u32 v, u32 mask) { return _pext_u32(v, mask); }
#else
/* Portable software emulation of PEXT: walks the mask one bit position per
 * iteration, packing each selected bit of v into the low end of the result. */
static u32
pext32(u32 v, u32 mask)
{
u32 ret = 0;
u32 test = 0x1;
while (mask != 0) {
// If the current mask bit is set, keep the matching bit of v at the
// current output position; otherwise shift v and mask down past it.
u32 ontarget = mask & test;
ret |= v & ontarget;
test <<= !!ontarget;
v >>= !ontarget;
mask >>= !ontarget;
}
return ret;
}
#endif
/* Decode one UTF-8 character at s, using PEXT to gather the payload bits of
 * all four possible bytes in a single step.  Stores the codepoint in *outcp,
 * ORs a nonzero value into *err on any malformed/overlong/surrogate/
 * out-of-range sequence, and returns a pointer past the consumed bytes.
 * NOTE(review): unconditionally loads s[0..3], so the buffer needs at least
 * 3 bytes of slack after the final character (main arranges this).
 */
static uint8_t *
utf8_pext(uint8_t *s, u32 *outcp, int *err)
{
// Sequence length indexed by the lead byte's top nibble.  Stray
// continuation bytes (10xx) map to length 1; the emask/eexpect check
// below flags them as errors.
static const u8 lens[] = {
/* 0XXX */ 1, 1, 1, 1, 1, 1, 1, 1,
/* 10XX */ 1, 1, 1, 1, /* invalid */
/* 110X */ 2, 2,
/* 1110 */ 3,
/* 1111 */ 4, /* maybe */
};
// PEXT masks selecting the payload bits of the big-endian packed word,
// one mask per sequence length.
static const u32 pextmask[] = {
0x7F000000, 0x1F3F0000, 0x0F3F3F00, 0x073F3F3F,
};
// Fixed-bit masks and their expected values: the lead byte's prefix plus
// the 10xxxxxx tag of each continuation byte.
static const u32 emask[] = {
0x80000000, 0xE0C00000, 0xF0C0C000, 0xF8C0C0C0,
};
static const u32 eexpect[] = {
0x00000000, 0xC0800000, 0xE0808000, 0xF0808080,
};
// Minimum codepoint for each length; anything below is overlong.
static const u32 overlong[] = { 0x0, 0x80, 0x0800, 0x10000 };
u32 v = 0;
// Pack the next four bytes big-endian into v.  The switch shape would
// allow switching on a smaller remaining-byte count near the end of a
// buffer; here it is fixed at 4 (input has trailing slack).
switch (4) {
case 4: v |= ((u32)s[3] << 0); // fallthrough
case 3: v |= ((u32)s[2] << 8); // fallthrough
case 2: v |= ((u32)s[1] << 16); // fallthrough
case 1: v |= ((u32)s[0] << 24); break;
default: __builtin_unreachable(); break; // string is expected to be not empty
}
u32 len = lens[s[0] >> 4];
u32 idx = len - 1;
s += len;
u32 cp = pext32(v, pextmask[idx]);
*outcp = cp;
// Accumulate all error conditions branchlessly; any nonzero bit in *err
// marks the sequence invalid.
*err |= (v & emask[idx]) ^ (eexpect[idx]);
*err |= cp > 0x10FFFF; // out of range
*err |= (cp >> 11) == 0x1B; // surrogate
*err |= cp < overlong[idx];
return s;
}
/////////////////
// https://nullprogram.com/blog/2017/10/06/
/* Branchless UTF-8 decoder: decodes one character at buf into *c, ORs a
 * nonzero value into *e on error, and returns a pointer to the next
 * character (advancing 1 byte on an invalid lead byte).
 * NOTE(review): always reads s[0..3], so the input needs 3 bytes of slack
 * after the last character.
 */
static void *
utf8_decode(void *buf, uint32_t *c, int *e)
{
// Sequence length indexed by the lead byte's top 5 bits; 0 = invalid.
static const unsigned char lengths[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
};
// Payload mask for the lead byte, minimum codepoint (overlong check),
// and per-length shift amounts, all indexed by length (0 = invalid).
static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
static const int shiftc[] = {0, 18, 12, 6, 0};
static const int shifte[] = {0, 6, 4, 2, 0};
unsigned char *s = buf;
unsigned int len = lengths[s[0] >> 3];
/* Compute the pointer to the next character early so that the next
 * iteration can start working on the next character. Neither Clang
 * nor GCC figure out this reordering on their own.
 */
unsigned char *next = s + len + !len;
/* Assume a four-byte character and load four bytes. Unused bits are
 * shifted out.
 */
*c = (uint32_t)(s[0] & masks[len]) << 18;
*c |= (uint32_t)(s[1] & 0x3f) << 12;
*c |= (uint32_t)(s[2] & 0x3f) << 6;
*c |= (uint32_t)(s[3] & 0x3f) << 0;
*c >>= shiftc[len];
/* Accumulate the various error conditions. */
*e = (*c < mins[len]) << 6; // non-canonical encoding
*e |= ((*c >> 11) == 0x1b) << 7; // surrogate half?
*e |= (*c > 0x10FFFF) << 8; // out of range?
*e |= (s[1] & 0xc0) >> 2;
*e |= (s[2] & 0xc0) >> 4;
*e |= (s[3] ) >> 6;
*e ^= 0x2a; // top two bits of each tail byte correct?
*e >>= shifte[len];
return next;
}
///////////////
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
// DFA states: ACCEPT means a complete codepoint was produced; REJECT means
// the input is malformed.  Any other state is mid-sequence.
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12
// Combined lookup table for the Hoehrmann DFA decoder.
static const uint8_t utf8d[] = {
// The first part of the table maps bytes to character classes, which
// reduces the size of the transition table and creates bitmasks.
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
// The second part is a transition table that maps a combination
// of a state of the automaton and a character class to a state.
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
12,36,12,12,12,12,12,12,12,12,12,12,
};
/* Feed one byte into the Hoehrmann DFA decoder.  Accumulates the codepoint
 * being built in *codep and advances *state via the transition table.
 * Returns the new state: UTF8_ACCEPT when *codep holds a complete
 * codepoint, UTF8_REJECT on malformed input, otherwise mid-sequence. */
static uint32_t
bh_utf8_decode(uint32_t *state, uint32_t *codep, uint32_t byte)
{
	uint32_t cclass = utf8d[byte];
	if (*state == UTF8_ACCEPT) {
		/* Start of a new character: keep only the payload bits of
		 * the lead byte, as selected by its class. */
		*codep = byte & (0xffu >> cclass);
	} else {
		/* Continuation byte: append its low 6 bits. */
		*codep = (*codep << 6) | (byte & 0x3fu);
	}
	*state = utf8d[256 + *state + cclass];
	return *state;
}
/*
 * Entry point: allocates a BUFLEN-MiB buffer, fills it with random UTF-8
 * (leaving 4 bytes of slack because the decoders and buffer_fill may touch
 * up to 3 bytes past the last character), then benchmarks four decoders
 * for SECONDS wall-clock seconds each using a SIGALRM-terminated loop,
 * printing throughput and the accumulated error count (0 on valid input).
 */
int
main(void)
{
	double rate;
	long errors, n;
	size_t z = BUFLEN * 1024L * 1024;
	unsigned char *buffer = malloc(z);
	/* FIX: malloc() was previously unchecked; on allocation failure
	 * buffer_fill() would dereference NULL (undefined behavior). */
	if (buffer == NULL) {
		fprintf(stderr, "bench: failed to allocate %zu bytes\n", z);
		return 1;
	}
	unsigned char *end = buffer_fill(buffer, z - 4);
	/* NOTE(review): the `count` locals below are never read; compilers
	 * may discard work feeding them (see the gist discussion about the
	 * DFA decoder's codepoint computation being optimized out). */
	/* Benchmark the branchless decoder */
	running = 1;
	signal(SIGALRM, alarm_handler);
	alarm(SECONDS);
	errors = n = 0;
	do {
		unsigned char *p = buffer;
		int e = 0;
		uint32_t c;
		long count = 0;
		while (p < end) {
			p = utf8_decode(p, &c, &e);
			errors += !!e; // force errors to be checked
			count++;
		}
		if (p == end) // reached the end successfully?
			n++;
	} while (running);
	rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024;
	printf("branchless: %f MB/s, %ld errors\n", rate, errors);
	/* Benchmark Bjoern Hoehrmann's decoder */
	running = 1;
	signal(SIGALRM, alarm_handler);
	alarm(SECONDS);
	errors = n = 0;
	do {
		unsigned char *p = buffer;
		uint32_t c;
		uint32_t state = 0;
		long count = 0;
		for (; p < end; p++) {
			if (!bh_utf8_decode(&state, &c, *p))
				count++;
			else if (state == UTF8_REJECT)
				errors++; // force errors to be checked
		}
		if (p == end) // reached the end successfully?
			n++;
	} while (running);
	rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024;
	printf("Hoehrmann: %f MB/s, %ld errors\n", rate, errors);
	/* Benchmark simple decoder */
	running = 1;
	signal(SIGALRM, alarm_handler);
	alarm(SECONDS);
	errors = n = 0;
	do {
		unsigned char *p = buffer;
		long c;
		long count = 0;
		while (p < end) {
			p = utf8_simple(p, &c);
			count++;
			if (c < 0)
				errors++;
		}
		if (p == end) // reached the end successfully?
			n++;
	} while (running);
	rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024;
	printf("Simple: %f MB/s, %ld errors\n", rate, errors);
	/* Benchmark pext decoder */
	running = 1;
	signal(SIGALRM, alarm_handler);
	alarm(SECONDS);
	errors = n = 0;
	do {
		unsigned char *p = buffer;
		u32 cp;
		long count = 0;
		while (p < end) {
			int err = 0;
			p = utf8_pext(p, &cp, &err);
			count++;
			if (err)
				errors++;
		}
		if (p == end) // reached the end successfully?
			n++;
	} while (running);
	rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024;
	printf("Pext: %f MB/s, %ld errors\n", rate, errors);
	free(buffer);
	return 0;
}
@hoehrmann
Copy link

Cool. Note that compilers omit the code point calculation code from my decoder here since you do not use the values. That is part of the design of the decoder, but probably gives it an unfair advantage here. Also, I ran this with clang-15 and gcc-11 and compared -march=native and -march=x86-64-v3. On this Ryzen 7 5700U gcc generates around 15% faster code than clang, and clang with -march=x86-64-v3 is substantially faster than (unexpectedly) -march=native. I had pretty much the same problems when writing the original article 15 years ago...

@N-R-K
Copy link
Author

N-R-K commented May 5, 2024

Note that compilers omit the code point calculation code from my decoder here since you do not use the values.

Aha, good catch. If I add a simple checksum that sums up all the decoded codepoint, the DFA decoder takes almost a 2x perf hit whereas the other ones remain mostly unchanged.

[/tmp]~> clang -march=x86-64-v3 -O3 -o bench bench.c
[/tmp]~> ./bench
branchless: 659.999764 MB/s, 0 errors, 0x2169290DB64C cksum (prev: 680~690 MB/s)
Hoehrmann:  283.999898 MB/s, 0 errors, 0xE6073693284 cksum (prev: ~640 MB/s)
Simple:     403.999856 MB/s, 0 errors, 0x14739225DF4C cksum (prev: ~405 MB/s)
Pext:       119.999957 MB/s, 0 errors, 0x6131EBCACC8 cksum (prev: 120 MB/s)

(Note that this is on my 3700x machine which has a micro-code emulated pext. The checksums between each version also don't match due to the non-deterministic sigalarm style benchmark.)

On this Ryzen 7 5700U gcc generates around 15% faster code than clang, and clang with -march=x86-64-v3 is substantially faster than (unexpectedly) -march=native.

Interesting. Is this on the DFA decoder or the pext one?

I'd presume that whatever fine-tuned instruction cost model gcc has for 5700U is backfiring compared to the "generic" x86-64-v3 model.

Reminds me of the time I was benchmarking small integer sorting and gcc -O3 was surprisingly 2-3x worse than gcc -O1.

@hoehrmann
Copy link

hoehrmann commented May 6, 2024

I re-ran it to share the numbers:

% git log -n1
commit 84de6694abbf39d021cade9b6b7d01d8d5dd7d82 (HEAD -> master, origin/master, origin/HEAD)
Author: NRK <nrk@disroot.org>
Date:   Wed Mar 20 00:20:25 2024 +0000

% for cc in gcc clang-15; do for arch in native x86-64-v3; do "$cc" -O3 -march="$arch"  -fno-stack-protector -Wno-unused-but-set-variable -Wall -lm bench.c -o bench; ./bench | sed -re "s/^/$cc $arch /"; done; done

gcc native branchless: 357.999872 MB/s, 0 errors
gcc native Hoehrmann:  675.999758 MB/s, 0 errors
gcc native Simple:     449.999839 MB/s, 0 errors
gcc native Pext:       119.999957 MB/s, 0 errors

gcc x86-64-v3 branchless: 723.999741 MB/s, 0 errors
gcc x86-64-v3 Hoehrmann:  687.999754 MB/s, 0 errors
gcc x86-64-v3 Simple:     463.999834 MB/s, 0 errors
gcc x86-64-v3 Pext:       123.999956 MB/s, 0 errors

clang-15 native branchless: 731.999738 MB/s, 0 errors
clang-15 native Hoehrmann:  603.999784 MB/s, 0 errors
clang-15 native Simple:     429.999846 MB/s, 0 errors
clang-15 native Pext:       119.999957 MB/s, 0 errors

clang-15 x86-64-v3 branchless: 729.999739 MB/s, 0 errors
clang-15 x86-64-v3 Hoehrmann:  653.999766 MB/s, 0 errors
clang-15 x86-64-v3 Simple:     423.999848 MB/s, 0 errors
clang-15 x86-64-v3 Pext:       121.999956 MB/s, 0 errors

I had not noticed the 2x difference for "branchless" between the architecture options. This is in "performance" mode, I usually run in "powersave" (which gives different results, and not by a constant factor for all the variants). Another thing I checked yesterday was changing the types of the arguments to the decode function. You can easily make it run a lot slower by using uint8_t where that is all is needed (so despite inlining everything, compilers are still not smart enough to use the optimal types automatically; I used 32 bit ints because those worked best on the 32 bit system I developed the decoder on).

Another thing I noticed over the years is that people like to abort early on decoding errors. For my decoder, that adds a lot of work for each byte; it would be better to optimistically process a whole chunk and re-parse if there is an error to get the first error position. But if you can fiddle with the numbers so easily by picking a different compiler or settings or types, with counter-intuitive results, I suspect you probably have to write the assembly code by hand to an ideal representation. (Also the input matters, randomized input can be the worst case for some decoders, depending on character distribution; a popular addition to my decoder is special casing of longer ASCII inputs, but then you have to study CPU manuals to check on the cost of unaligned reads and related issues; similarly, assuming aligned and/or padded input gives other decoders an unfair advantage).

(Edit: For gcc k8, opteron, and x86-64-v4 are all faster than x86-64-v3 and native, not by much, but very constantly.)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment