Skip to content

Instantly share code, notes, and snippets.

@N-R-K
Last active May 6, 2024 19:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save N-R-K/6fb02994c7de82872dd2a3d367dcbb2a to your computer and use it in GitHub Desktop.
Save N-R-K/6fb02994c7de82872dd2a3d367dcbb2a to your computer and use it in GitHub Desktop.
benchmarking utf8 decoders
// gcc -march=x86-64-v3 -O3 -o bench bench.c
// src taken from: https://github.com/skeeto/branchless-utf8/blob/master/test/benchmark.c
#define _POSIX_C_SOURCE 200112L
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <signal.h>
#include <stddef.h>
#include <unistd.h> // alarm()
#define SECONDS 4
#define BUFLEN 8 // MB
#define IS_SURROGATE(c) ((c) >= 0xD800U && (c) <= 0xDFFFU)
/* Encode codepoint c as UTF-8 into buf.
 * Writes 1-4 bytes and returns a pointer one past the last byte written.
 * Performs no validity checking; callers filter surrogates before encoding.
 */
static void *
utf8_encode(void *buf, long c)
{
	unsigned char *out = buf;
	if (c < (1L << 7)) {
		/* 1 byte: 0xxxxxxx */
		*out++ = (unsigned char)c;
	} else if (c < (1L << 11)) {
		/* 2 bytes: 110xxxxx 10xxxxxx */
		*out++ = 0xc0 | (unsigned char)(c >> 6);
		*out++ = 0x80 | (c & 0x3f);
	} else if (c < (1L << 16)) {
		/* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
		*out++ = 0xe0 | (unsigned char)(c >> 12);
		*out++ = 0x80 | ((c >> 6) & 0x3f);
		*out++ = 0x80 | (c & 0x3f);
	} else {
		/* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		*out++ = 0xf0 | (unsigned char)(c >> 18);
		*out++ = 0x80 | ((c >> 12) & 0x3f);
		*out++ = 0x80 | ((c >> 6) & 0x3f);
		*out++ = 0x80 | (c & 0x3f);
	}
	return out;
}
/* PCG-style 32-bit PRNG (multiplicative LCG state update, variable-shift
 * output).  Advances *s and returns 32 pseudorandom bits. */
static uint32_t
pcg32(uint64_t *s)
{
	const uint64_t mul = 0x9b60933458e17d7d;
	const uint64_t inc = 0xd737232eeccdf7ed;
	*s = *s * mul + inc;
	/* Output shift depends on the top 3 state bits (22..29). */
	int rot = 29 - (int)(*s >> 61);
	return (uint32_t)(*s >> rot);
}
/* Generate a random codepoint whose UTF-8 encoded length (1..4) is
 * uniformly selected.  May return surrogate values; callers must filter. */
static long
randchar(uint64_t *s)
{
	uint32_t bits = pcg32(s);
	uint32_t r = bits >> 2;
	/* Low two bits pick the encoded length; remaining bits pick the
	 * codepoint within that length's range. */
	switch (1 + (bits & 0x3)) {
	case 1: return r % 128;
	case 2: return 128 + r % (2048 - 128);
	case 3: return 2048 + r % (65536 - 2048);
	case 4: return 65536 + r % (131072 - 65536);
	}
	abort(); /* unreachable: length is always 1..4 */
}
/* Cleared by the SIGALRM handler to end the currently running benchmark loop. */
static volatile sig_atomic_t running;
/* SIGALRM handler: tells the benchmark loop in main() to stop. */
static void
alarm_handler(int signum)
{
(void)signum; // unused; parameter required by the signal handler signature
running = 0;
}
/* Fill buf with random UTF-8 characters whose encoded lengths are evenly
 * distributed, skipping surrogate codepoints.  Uses a fixed seed, so the
 * output is deterministic.
 * NOTE: the final character may be written up to 3 bytes past buf+z, so
 * callers must leave slack after the buffer (main passes z-4).
 * Returns a pointer one past the last encoded byte.
 */
static void *
buffer_fill(void *buf, size_t z)
{
	uint64_t seed = 0;
	char *dst = buf;
	char *limit = dst + z;
	while (dst < limit) {
		long cp = randchar(&seed);
		while (IS_SURROGATE(cp))
			cp = randchar(&seed);
		dst = utf8_encode(dst, cp);
	}
	return dst;
}
////////////////////////
/* Decode one UTF-8 character starting at s into *c and return a pointer to
 * the next character.  On a malformed sequence *c is set to -1; a bad lead
 * byte skips one byte, otherwise the full expected length is consumed.
 * NOTE(review): reads up to 3 bytes past s for multi-byte lead bytes, so
 * the input needs trailing slack.  Overlong encodings are NOT rejected
 * (there is no minimum-value check), and 4-byte sequences are only coarsely
 * range-checked via s[0] <= 0xF4, so some values above 0x10FFFF can pass.
 */
static unsigned char *
utf8_simple(unsigned char *s, long *c)
{
unsigned char *next;
if (s[0] < 0x80) {
// 1-byte (ASCII)
*c = s[0];
next = s + 1;
} else if ((s[0] & 0xe0) == 0xc0) {
// 2-byte sequence: 110xxxxx 10xxxxxx
*c = ((long)(s[0] & 0x1f) << 6) |
((long)(s[1] & 0x3f) << 0);
if ((s[1] & 0xc0) != 0x80)
*c = -1;
next = s + 2;
} else if ((s[0] & 0xf0) == 0xe0) {
// 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
*c = ((long)(s[0] & 0x0f) << 12) |
((long)(s[1] & 0x3f) << 6) |
((long)(s[2] & 0x3f) << 0);
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80)
*c = -1;
next = s + 3;
} else if ((s[0] & 0xf8) == 0xf0 && (s[0] <= 0xf4)) {
// 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// (lead bytes above 0xF4 rejected as a coarse range check)
*c = ((long)(s[0] & 0x07) << 18) |
((long)(s[1] & 0x3f) << 12) |
((long)(s[2] & 0x3f) << 6) |
((long)(s[3] & 0x3f) << 0);
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
(s[3] & 0xc0) != 0x80)
*c = -1;
next = s + 4;
} else {
*c = -1; // invalid
next = s + 1; // skip this byte
}
// UTF-8 must not encode UTF-16 surrogate halves
if (*c >= 0xd800 && *c <= 0xdfff)
*c = -1; // surrogate half
return next;
}
////////////////////////
#include <immintrin.h>
typedef uint8_t u8;
typedef uint32_t u32;
#if 1
/* Hardware parallel bit extract (BMI2 PEXT); requires a -march with BMI2
 * support such as x86-64-v3 (see build comment at the top of the file). */
static u32 pext32(u32 v, u32 mask) { return _pext_u32(v, mask); }
#else
/* Portable software emulation of PEXT: walks the mask one bit position per
 * iteration, packing each selected bit of v into the low end of the result. */
static u32
pext32(u32 v, u32 mask)
{
u32 ret = 0;
u32 test = 0x1;
while (mask != 0) {
// If the current mask bit is set, keep the matching bit of v at the
// current output position; otherwise shift v and mask down past it.
u32 ontarget = mask & test;
ret |= v & ontarget;
test <<= !!ontarget;
v >>= !ontarget;
mask >>= !ontarget;
}
return ret;
}
#endif
/* Decode one UTF-8 character at s, using PEXT to gather the payload bits of
 * all four possible bytes in a single step.  Stores the codepoint in *outcp,
 * ORs a nonzero value into *err on any malformed/overlong/surrogate/
 * out-of-range sequence, and returns a pointer past the consumed bytes.
 * NOTE(review): unconditionally loads s[0..3], so the buffer needs at least
 * 3 bytes of slack after the final character (main arranges this).
 */
static uint8_t *
utf8_pext(uint8_t *s, u32 *outcp, int *err)
{
// Sequence length indexed by the lead byte's top nibble.  Stray
// continuation bytes (10xx) map to length 1; the emask/eexpect check
// below flags them as errors.
static const u8 lens[] = {
/* 0XXX */ 1, 1, 1, 1, 1, 1, 1, 1,
/* 10XX */ 1, 1, 1, 1, /* invalid */
/* 110X */ 2, 2,
/* 1110 */ 3,
/* 1111 */ 4, /* maybe */
};
// PEXT masks selecting the payload bits of the big-endian packed word,
// one mask per sequence length.
static const u32 pextmask[] = {
0x7F000000, 0x1F3F0000, 0x0F3F3F00, 0x073F3F3F,
};
// Fixed-bit masks and their expected values: the lead byte's prefix plus
// the 10xxxxxx tag of each continuation byte.
static const u32 emask[] = {
0x80000000, 0xE0C00000, 0xF0C0C000, 0xF8C0C0C0,
};
static const u32 eexpect[] = {
0x00000000, 0xC0800000, 0xE0808000, 0xF0808080,
};
// Minimum codepoint for each length; anything below is overlong.
static const u32 overlong[] = { 0x0, 0x80, 0x0800, 0x10000 };
u32 v = 0;
// Pack the next four bytes big-endian into v.  The switch shape would
// allow switching on a smaller remaining-byte count near the end of a
// buffer; here it is fixed at 4 (input has trailing slack).
switch (4) {
case 4: v |= ((u32)s[3] << 0); // fallthrough
case 3: v |= ((u32)s[2] << 8); // fallthrough
case 2: v |= ((u32)s[1] << 16); // fallthrough
case 1: v |= ((u32)s[0] << 24); break;
default: __builtin_unreachable(); break; // string is expected to be not empty
}
u32 len = lens[s[0] >> 4];
u32 idx = len - 1;
s += len;
u32 cp = pext32(v, pextmask[idx]);
*outcp = cp;
// Accumulate all error conditions branchlessly; any nonzero bit in *err
// marks the sequence invalid.
*err |= (v & emask[idx]) ^ (eexpect[idx]);
*err |= cp > 0x10FFFF; // out of range
*err |= (cp >> 11) == 0x1B; // surrogate
*err |= cp < overlong[idx];
return s;
}
/////////////////
// https://nullprogram.com/blog/2017/10/06/
/* Branchless UTF-8 decoder: decodes one character at buf into *c, ORs a
 * nonzero value into *e on error, and returns a pointer to the next
 * character (advancing 1 byte on an invalid lead byte).
 * NOTE(review): always reads s[0..3], so the input needs 3 bytes of slack
 * after the last character.
 */
static void *
utf8_decode(void *buf, uint32_t *c, int *e)
{
// Sequence length indexed by the lead byte's top 5 bits; 0 = invalid.
static const unsigned char lengths[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
};
// Payload mask for the lead byte, minimum codepoint (overlong check),
// and per-length shift amounts, all indexed by length (0 = invalid).
static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
static const int shiftc[] = {0, 18, 12, 6, 0};
static const int shifte[] = {0, 6, 4, 2, 0};
unsigned char *s = buf;
unsigned int len = lengths[s[0] >> 3];
/* Compute the pointer to the next character early so that the next
 * iteration can start working on the next character. Neither Clang
 * nor GCC figure out this reordering on their own.
 */
unsigned char *next = s + len + !len;
/* Assume a four-byte character and load four bytes. Unused bits are
 * shifted out.
 */
*c = (uint32_t)(s[0] & masks[len]) << 18;
*c |= (uint32_t)(s[1] & 0x3f) << 12;
*c |= (uint32_t)(s[2] & 0x3f) << 6;
*c |= (uint32_t)(s[3] & 0x3f) << 0;
*c >>= shiftc[len];
/* Accumulate the various error conditions. */
*e = (*c < mins[len]) << 6; // non-canonical encoding
*e |= ((*c >> 11) == 0x1b) << 7; // surrogate half?
*e |= (*c > 0x10FFFF) << 8; // out of range?
*e |= (s[1] & 0xc0) >> 2;
*e |= (s[2] & 0xc0) >> 4;
*e |= (s[3] ) >> 6;
*e ^= 0x2a; // top two bits of each tail byte correct?
*e >>= shifte[len];
return next;
}
///////////////
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
// DFA states: ACCEPT means a complete codepoint was produced; REJECT means
// the input is malformed.  Any other state is mid-sequence.
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12
// Combined lookup table for the Hoehrmann DFA decoder.
static const uint8_t utf8d[] = {
// The first part of the table maps bytes to character classes, which
// reduces the size of the transition table and creates bitmasks.
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
// The second part is a transition table that maps a combination
// of a state of the automaton and a character class to a state.
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
12,36,12,12,12,12,12,12,12,12,12,12,
};
/* Feed one byte into the Hoehrmann DFA decoder.  Accumulates the codepoint
 * being built in *codep and advances *state via the transition table.
 * Returns the new state: UTF8_ACCEPT when *codep holds a complete
 * codepoint, UTF8_REJECT on malformed input, otherwise mid-sequence. */
static uint32_t
bh_utf8_decode(uint32_t *state, uint32_t *codep, uint32_t byte)
{
	uint32_t cclass = utf8d[byte];
	if (*state == UTF8_ACCEPT) {
		/* Start of a new character: keep only the payload bits of
		 * the lead byte, as selected by its class. */
		*codep = byte & (0xffu >> cclass);
	} else {
		/* Continuation byte: append its low 6 bits. */
		*codep = (*codep << 6) | (byte & 0x3fu);
	}
	*state = utf8d[256 + *state + cclass];
	return *state;
}
/*
 * Entry point: allocates a BUFLEN-MiB buffer, fills it with random UTF-8
 * (leaving 4 bytes of slack because the decoders and buffer_fill may touch
 * up to 3 bytes past the last character), then benchmarks four decoders
 * for SECONDS wall-clock seconds each using a SIGALRM-terminated loop,
 * printing throughput and the accumulated error count (0 on valid input).
 */
int
main(void)
{
	double rate;
	long errors, n;
	size_t z = BUFLEN * 1024L * 1024;
	unsigned char *buffer = malloc(z);
	/* FIX: malloc() was previously unchecked; on allocation failure
	 * buffer_fill() would dereference NULL (undefined behavior). */
	if (buffer == NULL) {
		fprintf(stderr, "bench: failed to allocate %zu bytes\n", z);
		return 1;
	}
	unsigned char *end = buffer_fill(buffer, z - 4);
	/* NOTE(review): the `count` locals below are never read; compilers
	 * may discard work feeding them (see the gist discussion about the
	 * DFA decoder's codepoint computation being optimized out). */
	/* Benchmark the branchless decoder */
	running = 1;
	signal(SIGALRM, alarm_handler);
	alarm(SECONDS);
	errors = n = 0;
	do {
		unsigned char *p = buffer;
		int e = 0;
		uint32_t c;
		long count = 0;
		while (p < end) {
			p = utf8_decode(p, &c, &e);
			errors += !!e; // force errors to be checked
			count++;
		}
		if (p == end) // reached the end successfully?
			n++;
	} while (running);
	rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024;
	printf("branchless: %f MB/s, %ld errors\n", rate, errors);
	/* Benchmark Bjoern Hoehrmann's decoder */
	running = 1;
	signal(SIGALRM, alarm_handler);
	alarm(SECONDS);
	errors = n = 0;
	do {
		unsigned char *p = buffer;
		uint32_t c;
		uint32_t state = 0;
		long count = 0;
		for (; p < end; p++) {
			if (!bh_utf8_decode(&state, &c, *p))
				count++;
			else if (state == UTF8_REJECT)
				errors++; // force errors to be checked
		}
		if (p == end) // reached the end successfully?
			n++;
	} while (running);
	rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024;
	printf("Hoehrmann: %f MB/s, %ld errors\n", rate, errors);
	/* Benchmark simple decoder */
	running = 1;
	signal(SIGALRM, alarm_handler);
	alarm(SECONDS);
	errors = n = 0;
	do {
		unsigned char *p = buffer;
		long c;
		long count = 0;
		while (p < end) {
			p = utf8_simple(p, &c);
			count++;
			if (c < 0)
				errors++;
		}
		if (p == end) // reached the end successfully?
			n++;
	} while (running);
	rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024;
	printf("Simple: %f MB/s, %ld errors\n", rate, errors);
	/* Benchmark pext decoder */
	running = 1;
	signal(SIGALRM, alarm_handler);
	alarm(SECONDS);
	errors = n = 0;
	do {
		unsigned char *p = buffer;
		u32 cp;
		long count = 0;
		while (p < end) {
			int err = 0;
			p = utf8_pext(p, &cp, &err);
			count++;
			if (err)
				errors++;
		}
		if (p == end) // reached the end successfully?
			n++;
	} while (running);
	rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024;
	printf("Pext: %f MB/s, %ld errors\n", rate, errors);
	free(buffer);
	return 0;
}
@hoehrmann
Copy link

Cool. Note that compilers omit the code point calculation code from my decoder here since you do not use the values. That is part of the design of the decoder, but probably gives it an unfair advantage here. Also, I ran this with clang-15 and gcc-11 and compared -march=native and -march=x86-64-v3. On this Ryzen 7 5700U gcc generates around 15% faster code than clang, and clang with -march=x86-64-v3 is substantially faster than (unexpectedly) -march=native. I had pretty much the same problems when writing the original article 15 years ago...

@N-R-K
Copy link
Author

N-R-K commented May 5, 2024

Note that compilers omit the code point calculation code from my decoder here since you do not use the values.

Aha, good catch. If I add a simple checksum that sums up all the decoded codepoint, the DFA decoder takes almost a 2x perf hit whereas the other ones remain mostly unchanged.

[/tmp]~> clang -march=x86-64-v3 -O3 -o bench bench.c
[/tmp]~> ./bench
branchless: 659.999764 MB/s, 0 errors, 0x2169290DB64C cksum (prev: 680~690 MB/s)
Hoehrmann:  283.999898 MB/s, 0 errors, 0xE6073693284 cksum (prev: ~640 MB/s)
Simple:     403.999856 MB/s, 0 errors, 0x14739225DF4C cksum (prev: ~405 MB/s)
Pext:       119.999957 MB/s, 0 errors, 0x6131EBCACC8 cksum (prev: 120 MB/s)

(Note that this is on my 3700x machine which has a micro-code emulated pext. The checksums between each version also don't match due to the non-deterministic sigalarm style benchmark.)

On this Ryzen 7 5700U gcc generates around 15% faster code than clang, and clang with -march=x86-64-v3 is substantially faster than (unexpectedly) -march=native.

Interesting. Is this on the DFA decoder or the pext one?

I'd presume that whatever fine-tuned instruction cost model gcc has for 5700U is backfiring compared to the "generic" x86-64-v3 model.

Reminds me of the time I was benchmarking small integer sorting and gcc -O3 was surprisingly 2-3x worse than gcc -O1.

@hoehrmann
Copy link

hoehrmann commented May 6, 2024

I re-ran it to share the numbers:

% git log -n1
commit 84de6694abbf39d021cade9b6b7d01d8d5dd7d82 (HEAD -> master, origin/master, origin/HEAD)
Author: NRK <nrk@disroot.org>
Date:   Wed Mar 20 00:20:25 2024 +0000

% for cc in gcc clang-15; do for arch in native x86-64-v3; do "$cc" -O3 -march="$arch"  -fno-stack-protector -Wno-unused-but-set-variable -Wall -lm bench.c -o bench; ./bench | sed -re "s/^/$cc $arch /"; done; done

gcc native branchless: 357.999872 MB/s, 0 errors
gcc native Hoehrmann:  675.999758 MB/s, 0 errors
gcc native Simple:     449.999839 MB/s, 0 errors
gcc native Pext:       119.999957 MB/s, 0 errors

gcc x86-64-v3 branchless: 723.999741 MB/s, 0 errors
gcc x86-64-v3 Hoehrmann:  687.999754 MB/s, 0 errors
gcc x86-64-v3 Simple:     463.999834 MB/s, 0 errors
gcc x86-64-v3 Pext:       123.999956 MB/s, 0 errors

clang-15 native branchless: 731.999738 MB/s, 0 errors
clang-15 native Hoehrmann:  603.999784 MB/s, 0 errors
clang-15 native Simple:     429.999846 MB/s, 0 errors
clang-15 native Pext:       119.999957 MB/s, 0 errors

clang-15 x86-64-v3 branchless: 729.999739 MB/s, 0 errors
clang-15 x86-64-v3 Hoehrmann:  653.999766 MB/s, 0 errors
clang-15 x86-64-v3 Simple:     423.999848 MB/s, 0 errors
clang-15 x86-64-v3 Pext:       121.999956 MB/s, 0 errors

I had not noticed the 2x difference for "branchless" between the architecture options. This is in "performance" mode, I usually run in "powersave" (which gives different results, and not by a constant factor for all the variants). Another thing I checked yesterday was changing the types of the arguments to the decode function. You can easily make it run a lot slower by using uint8_t where that is all is needed (so despite inlining everything, compilers are still not smart enough to use the optimal types automatically; I used 32 bit ints because those worked best on the 32 bit system I developed the decoder on).

Another thing I noticed over the years is that people like to abort early on decoding errors. For my decoder, that adds a lot of work for each byte; it would be better to optimistically process a whole chunk and re-parse if there is an error to get the first error position. But if you can fiddle with the numbers so easily by picking a different compiler or settings or types, with counter-intuitive results, I suspect you probably have to write the assembly code by hand to an ideal representation. (Also the input matters, randomized input can be the worst case for some decoders, depending on character distribution; a popular addition to my decoder is special casing of longer ASCII inputs, but then you have to study CPU manuals to check on the cost of unaligned reads and related issues; similarly, assuming aligned and/or padded input gives other decoders an unfair advantage).

(Edit: For gcc k8, opteron, and x86-64-v4 are all faster than x86-64-v3 and native, not by much, but very constantly.)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment