s4y/naive_utf8.c

## naive_utf8.c
#include <inttypes.h>
#include <stdlib.h>

typedef enum {
  UTF8_OK = 0,
  UTF8_ERROR = 4,
} utf8_decode_state_t;

typedef struct {
  utf8_decode_state_t state;
  uint32_t codepoint;
} utf8_decode_context_t;

static inline void utf8_decode(
  utf8_decode_context_t *context, unsigned char byte
) {
  switch ((int)context->state) {
    case UTF8_OK:
      if (byte < 0x80) {
        context->codepoint = byte;
      } else if (byte < 0xc0){
        context->state = UTF8_ERROR;
      } else if (byte < 0xe0){
        context->state = 1;
        context->codepoint = byte & 0x1f;
      } else if (byte < 0xf0) {
        context->state = 2;
        context->codepoint = byte & 0xf;
      } else if (byte < 0xf8) {
        context->state = 3;
        context->codepoint = byte & 0x7;
      } else {
        context->state = UTF8_ERROR;
      }
      break;
    case 1:
    case 2:
    case 3:
      if (byte >= 0x80 && byte <= 0xbf) {
        context->state -= 1;
        context->codepoint =
          (context->codepoint << 6) | (byte & 0x3f);
      } else {
        context->state = UTF8_ERROR;
      }
  }
}

// - - -

#include <stdio.h>
#include <unistd.h>

int main() {
  size_t bufsz = 1024 * 1024 * 1024;
  unsigned char* buf = calloc(bufsz, 1);
  size_t count = 0;
  uint32_t codepoint_hash = 0;
  utf8_decode_context_t state = { UTF8_OK, 0 };
  size_t avail = read(STDIN_FILENO, buf, bufsz);
  for (size_t i = 0; i < avail; i++) {
    utf8_decode(&state, buf[i]);
    switch (state.state) {
      case UTF8_OK:
        count += 1;
        codepoint_hash ^= state.codepoint;
        break;
      case UTF8_ERROR:
        return 1;
    }
  }
  if (state.state != UTF8_OK) {
    return 2;
  }
  printf("Decoded %zu code points and got a hash of %u.\n", count, codepoint_hash);
}

## skeeto_utf8.c
#include <inttypes.h>
#include <stdlib.h>

/* Branchless UTF-8 decoder
 *
 * This is free and unencumbered software released into the public domain.
 */
#ifndef UTF8_H
#define UTF8_H

#include <stdint.h>

/* Decode the next character, C, from BUF, reporting errors in E.
 *
 * Since this is a branchless decoder, four bytes will be read from the
 * buffer regardless of the actual length of the next character. This
 * means the buffer _must_ have at least three bytes of zero padding
 * following the end of the data stream.
 *
 * Errors are reported in E, which will be non-zero if the parsed
 * character was somehow invalid: invalid byte sequence, non-canonical
 * encoding, or a surrogate half.
 *
 * The function returns a pointer to the next character. When an error
 * occurs, this pointer will be a guess that depends on the particular
 * error, but it will always advance at least one byte.
 */
static void *
utf8_decode(void *buf, uint32_t *c, int *e)
{
    static const char lengths[] = {
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
    };
    static const int masks[]  = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
    static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
    static const int shiftc[] = {0, 18, 12, 6, 0};
    static const int shifte[] = {0, 6, 4, 2, 0};

    unsigned char *s = buf;
    int len = lengths[s[0] >> 3];

    /* Compute the pointer to the next character early so that the next
     * iteration can start working on the next character. Neither Clang
     * nor GCC figure out this reordering on their own.
     */
    unsigned char *next = s + len + !len;

    /* Assume a four-byte character and load four bytes. Unused bits are
     * shifted out.
     */
    *c  = (uint32_t)(s[0] & masks[len]) << 18;
    *c |= (uint32_t)(s[1] & 0x3f) << 12;
    *c |= (uint32_t)(s[2] & 0x3f) <<  6;
    *c |= (uint32_t)(s[3] & 0x3f) <<  0;
    *c >>= shiftc[len];

    /* Accumulate the various error conditions. */
    *e  = (*c < mins[len]) << 6;
    *e |= ((*c >> 11) == 0x1b) << 7;  // surrogate half?
    *e |= (s[1] & 0xc0) >> 2;
    *e |= (s[2] & 0xc0) >> 4;
    *e |= (s[3]       ) >> 6;
    *e ^= 0x2a; // top two bits of each tail byte correct?
    *e >>= shifte[len];

    return next;
}

#endif

// - - -

#include <stdio.h>
#include <unistd.h>

int main() {
  size_t bufsz = 1024 * 1024 * 1024;
  unsigned char* buf = calloc(bufsz, 1);
  unsigned char* bufp = buf;
  size_t count = 0;
  uint32_t codepoint_hash = 0;
  uint32_t codepoint = 0;
  int e;
  size_t avail = read(STDIN_FILENO, buf, bufsz);
  while (bufp < buf + avail) {
    bufp = utf8_decode(bufp, &codepoint, &e);
    if(e) return e;
    count += 1;
    codepoint_hash ^= codepoint;
  }
  printf("Decoded %zu code points and got a hash of %u.\n", count, codepoint_hash);
}

## utf8.c
#include <inttypes.h>
#include <fcntl.h>
#include <stdlib.h>

// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.

#define UTF8_ACCEPT 0
#define UTF8_REJECT 12

static const uint8_t utf8d[] = {
  // The first part of the table maps bytes to character classes that
  // to reduce the size of the transition table and create bitmasks.
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

  // The second part is a transition table that maps a combination
  // of a state of the automaton and a character class to a state.
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
  12,36,12,12,12,12,12,12,12,12,12,12,
};

static uint32_t inline
decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
  uint32_t type = utf8d[byte];

  *codep = (*state != UTF8_ACCEPT) ?
    (byte & 0x3fu) | (*codep << 6) :
    (0xff >> type) & (byte);

  *state = utf8d[256 + *state + type];
  return *state;
}


// - - -

#include <stdio.h>
#include <unistd.h>

int main(int argc, char* argv[]) {
  size_t bufsz = 1024 * 1024 * 1024;
  unsigned char* buf = calloc(bufsz, 1);
  size_t count = 0;
  uint32_t codepoint_hash = 0;
  uint32_t state = 0;
  uint32_t codep = 0;
  size_t avail = read(STDIN_FILENO, buf, bufsz);

  for (size_t i = 0; i < avail; i++) {
    decode(&state, &codep, buf[i]);
    switch (state) {
      case UTF8_ACCEPT:
        count += 1;
        codepoint_hash ^= codep;
        break;
      case UTF8_REJECT:
        return 1;
    }
  }
  if (state != UTF8_ACCEPT) {
    return 2;
  }
  printf("Decoded %zu code points and got a hash of %u.\n", count, codepoint_hash);
}
	#include <inttypes.h>
	#include <stdlib.h>

	typedef enum {
	UTF8_OK = 0,
	UTF8_ERROR = 4,
	} utf8_decode_state_t;

	typedef struct {
	utf8_decode_state_t state;
	uint32_t codepoint;
	} utf8_decode_context_t;

	static inline void utf8_decode(
	utf8_decode_context_t *context, unsigned char byte
	) {
	switch ((int)context->state) {
	case UTF8_OK:
	if (byte < 0x80) {
	context->codepoint = byte;
	} else if (byte < 0xc0){
	context->state = UTF8_ERROR;
	} else if (byte < 0xe0){
	context->state = 1;
	context->codepoint = byte & 0x1f;
	} else if (byte < 0xf0) {
	context->state = 2;
	context->codepoint = byte & 0xf;
	} else if (byte < 0xf8) {
	context->state = 3;
	context->codepoint = byte & 0x7;
	} else {
	context->state = UTF8_ERROR;
	}
	break;
	case 1:
	case 2:
	case 3:
	if (byte >= 0x80 && byte <= 0xbf) {
	context->state -= 1;
	context->codepoint =
	(context->codepoint << 6) \| (byte & 0x3f);
	} else {
	context->state = UTF8_ERROR;
	}
	}
	}

	// - - -

	#include <stdio.h>
	#include <unistd.h>

	int main() {
	size_t bufsz = 1024 * 1024 * 1024;
	unsigned char* buf = calloc(bufsz, 1);
	size_t count = 0;
	uint32_t codepoint_hash = 0;
	utf8_decode_context_t state = { UTF8_OK, 0 };
	size_t avail = read(STDIN_FILENO, buf, bufsz);
	for (size_t i = 0; i < avail; i++) {
	utf8_decode(&state, buf[i]);
	switch (state.state) {
	case UTF8_OK:
	count += 1;
	codepoint_hash ^= state.codepoint;
	break;
	case UTF8_ERROR:
	return 1;
	}
	}
	if (state.state != UTF8_OK) {
	return 2;
	}
	printf("Decoded %zu code points and got a hash of %u.\n", count, codepoint_hash);
	}
	#include <inttypes.h>
	#include <stdlib.h>

	/* Branchless UTF-8 decoder
	*
	* This is free and unencumbered software released into the public domain.
	*/
	#ifndef UTF8_H
	#define UTF8_H

	#include <stdint.h>

	/* Decode the next character, C, from BUF, reporting errors in E.
	*
	* Since this is a branchless decoder, four bytes will be read from the
	* buffer regardless of the actual length of the next character. This
	* means the buffer _must_ have at least three bytes of zero padding
	* following the end of the data stream.
	*
	* Errors are reported in E, which will be non-zero if the parsed
	* character was somehow invalid: invalid byte sequence, non-canonical
	* encoding, or a surrogate half.
	*
	* The function returns a pointer to the next character. When an error
	* occurs, this pointer will be a guess that depends on the particular
	* error, but it will always advance at least one byte.
	*/
	static void *
	utf8_decode(void buf, uint32_t c, int *e)
	{
	static const char lengths[] = {
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
	};
	static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
	static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
	static const int shiftc[] = {0, 18, 12, 6, 0};
	static const int shifte[] = {0, 6, 4, 2, 0};

	unsigned char *s = buf;
	int len = lengths[s[0] >> 3];

	/* Compute the pointer to the next character early so that the next
	* iteration can start working on the next character. Neither Clang
	* nor GCC figure out this reordering on their own.
	*/
	unsigned char *next = s + len + !len;

	/* Assume a four-byte character and load four bytes. Unused bits are
	* shifted out.
	*/
	*c = (uint32_t)(s[0] & masks[len]) << 18;
	*c \|= (uint32_t)(s[1] & 0x3f) << 12;
	*c \|= (uint32_t)(s[2] & 0x3f) << 6;
	*c \|= (uint32_t)(s[3] & 0x3f) << 0;
	*c >>= shiftc[len];

	/* Accumulate the various error conditions. */
	e = (c < mins[len]) << 6;
	e \|= ((c >> 11) == 0x1b) << 7; // surrogate half?
	*e \|= (s[1] & 0xc0) >> 2;
	*e \|= (s[2] & 0xc0) >> 4;
	*e \|= (s[3] ) >> 6;
	*e ^= 0x2a; // top two bits of each tail byte correct?
	*e >>= shifte[len];

	return next;
	}

	#endif

	// - - -

	#include <stdio.h>
	#include <unistd.h>

	int main() {
	size_t bufsz = 1024 * 1024 * 1024;
	unsigned char* buf = calloc(bufsz, 1);
	unsigned char* bufp = buf;
	size_t count = 0;
	uint32_t codepoint_hash = 0;
	uint32_t codepoint = 0;
	int e;
	size_t avail = read(STDIN_FILENO, buf, bufsz);
	while (bufp < buf + avail) {
	bufp = utf8_decode(bufp, &codepoint, &e);
	if(e) return e;
	count += 1;
	codepoint_hash ^= codepoint;
	}
	printf("Decoded %zu code points and got a hash of %u.\n", count, codepoint_hash);
	}
	#include <inttypes.h>
	#include <fcntl.h>
	#include <stdlib.h>

	// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
	// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.

	#define UTF8_ACCEPT 0
	#define UTF8_REJECT 12

	static const uint8_t utf8d[] = {
	// The first part of the table maps bytes to character classes that
	// to reduce the size of the transition table and create bitmasks.
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

	// The second part is a transition table that maps a combination
	// of a state of the automaton and a character class to a state.
	0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
	12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
	12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
	12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
	12,36,12,12,12,12,12,12,12,12,12,12,
	};

	static uint32_t inline
	decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
	uint32_t type = utf8d[byte];

	codep = (state != UTF8_ACCEPT) ?
	(byte & 0x3fu) \| (*codep << 6) :
	(0xff >> type) & (byte);

	state = utf8d[256 + state + type];
	return *state;
	}


	// - - -

	#include <stdio.h>
	#include <unistd.h>

	int main(int argc, char* argv[]) {
	size_t bufsz = 1024 * 1024 * 1024;
	unsigned char* buf = calloc(bufsz, 1);
	size_t count = 0;
	uint32_t codepoint_hash = 0;
	uint32_t state = 0;
	uint32_t codep = 0;
	size_t avail = read(STDIN_FILENO, buf, bufsz);

	for (size_t i = 0; i < avail; i++) {
	decode(&state, &codep, buf[i]);
	switch (state) {
	case UTF8_ACCEPT:
	count += 1;
	codepoint_hash ^= codep;
	break;
	case UTF8_REJECT:
	return 1;
	}
	}
	if (state != UTF8_ACCEPT) {
	return 2;
	}
	printf("Decoded %zu code points and got a hash of %u.\n", count, codepoint_hash);
	}