Skip to content

Instantly share code, notes, and snippets.

@s4y
Last active November 27, 2023 12:47
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save s4y/344a355f8c1f99c6a4cb2347ec4323cc to your computer and use it in GitHub Desktop.
Save s4y/344a355f8c1f99c6a4cb2347ec4323cc to your computer and use it in GitHub Desktop.
#include <inttypes.h>
#include <stdlib.h>
typedef enum {
UTF8_OK = 0,
UTF8_ERROR = 4,
} utf8_decode_state_t;
typedef struct {
utf8_decode_state_t state;
uint32_t codepoint;
} utf8_decode_context_t;
static inline void utf8_decode(
utf8_decode_context_t *context, unsigned char byte
) {
switch ((int)context->state) {
case UTF8_OK:
if (byte < 0x80) {
context->codepoint = byte;
} else if (byte < 0xc0){
context->state = UTF8_ERROR;
} else if (byte < 0xe0){
context->state = 1;
context->codepoint = byte & 0x1f;
} else if (byte < 0xf0) {
context->state = 2;
context->codepoint = byte & 0xf;
} else if (byte < 0xf8) {
context->state = 3;
context->codepoint = byte & 0x7;
} else {
context->state = UTF8_ERROR;
}
break;
case 1:
case 2:
case 3:
if (byte >= 0x80 && byte <= 0xbf) {
context->state -= 1;
context->codepoint =
(context->codepoint << 6) | (byte & 0x3f);
} else {
context->state = UTF8_ERROR;
}
}
}
// - - -
#include <stdio.h>
#include <unistd.h>
int main() {
size_t bufsz = 1024 * 1024 * 1024;
unsigned char* buf = calloc(bufsz, 1);
size_t count = 0;
uint32_t codepoint_hash = 0;
utf8_decode_context_t state = { UTF8_OK, 0 };
size_t avail = read(STDIN_FILENO, buf, bufsz);
for (size_t i = 0; i < avail; i++) {
utf8_decode(&state, buf[i]);
switch (state.state) {
case UTF8_OK:
count += 1;
codepoint_hash ^= state.codepoint;
break;
case UTF8_ERROR:
return 1;
}
}
if (state.state != UTF8_OK) {
return 2;
}
printf("Decoded %zu code points and got a hash of %u.\n", count, codepoint_hash);
}
#include <inttypes.h>
#include <stdlib.h>
/* Branchless UTF-8 decoder
*
* This is free and unencumbered software released into the public domain.
*/
#ifndef UTF8_H
#define UTF8_H
#include <stdint.h>
/* Decode the next character, C, from BUF, reporting errors in E.
*
* Since this is a branchless decoder, four bytes will be read from the
* buffer regardless of the actual length of the next character. This
* means the buffer _must_ have at least three bytes of zero padding
* following the end of the data stream.
*
* Errors are reported in E, which will be non-zero if the parsed
* character was somehow invalid: invalid byte sequence, non-canonical
* encoding, or a surrogate half.
*
* The function returns a pointer to the next character. When an error
* occurs, this pointer will be a guess that depends on the particular
* error, but it will always advance at least one byte.
*/
static void *
utf8_decode(void *buf, uint32_t *c, int *e)
{
static const char lengths[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
};
static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
static const int shiftc[] = {0, 18, 12, 6, 0};
static const int shifte[] = {0, 6, 4, 2, 0};
unsigned char *s = buf;
int len = lengths[s[0] >> 3];
/* Compute the pointer to the next character early so that the next
* iteration can start working on the next character. Neither Clang
* nor GCC figure out this reordering on their own.
*/
unsigned char *next = s + len + !len;
/* Assume a four-byte character and load four bytes. Unused bits are
* shifted out.
*/
*c = (uint32_t)(s[0] & masks[len]) << 18;
*c |= (uint32_t)(s[1] & 0x3f) << 12;
*c |= (uint32_t)(s[2] & 0x3f) << 6;
*c |= (uint32_t)(s[3] & 0x3f) << 0;
*c >>= shiftc[len];
/* Accumulate the various error conditions. */
*e = (*c < mins[len]) << 6;
*e |= ((*c >> 11) == 0x1b) << 7; // surrogate half?
*e |= (s[1] & 0xc0) >> 2;
*e |= (s[2] & 0xc0) >> 4;
*e |= (s[3] ) >> 6;
*e ^= 0x2a; // top two bits of each tail byte correct?
*e >>= shifte[len];
return next;
}
#endif
// - - -
#include <stdio.h>
#include <unistd.h>
int main() {
size_t bufsz = 1024 * 1024 * 1024;
unsigned char* buf = calloc(bufsz, 1);
unsigned char* bufp = buf;
size_t count = 0;
uint32_t codepoint_hash = 0;
uint32_t codepoint = 0;
int e;
size_t avail = read(STDIN_FILENO, buf, bufsz);
while (bufp < buf + avail) {
bufp = utf8_decode(bufp, &codepoint, &e);
if(e) return e;
count += 1;
codepoint_hash ^= codepoint;
}
printf("Decoded %zu code points and got a hash of %u.\n", count, codepoint_hash);
}
#include <inttypes.h>
#include <fcntl.h>
#include <stdlib.h>
// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12
static const uint8_t utf8d[] = {
// The first part of the table maps bytes to character classes that
// to reduce the size of the transition table and create bitmasks.
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
// The second part is a transition table that maps a combination
// of a state of the automaton and a character class to a state.
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
12,36,12,12,12,12,12,12,12,12,12,12,
};
static uint32_t inline
decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
uint32_t type = utf8d[byte];
*codep = (*state != UTF8_ACCEPT) ?
(byte & 0x3fu) | (*codep << 6) :
(0xff >> type) & (byte);
*state = utf8d[256 + *state + type];
return *state;
}
// - - -
#include <stdio.h>
#include <unistd.h>
int main(int argc, char* argv[]) {
size_t bufsz = 1024 * 1024 * 1024;
unsigned char* buf = calloc(bufsz, 1);
size_t count = 0;
uint32_t codepoint_hash = 0;
uint32_t state = 0;
uint32_t codep = 0;
size_t avail = read(STDIN_FILENO, buf, bufsz);
for (size_t i = 0; i < avail; i++) {
decode(&state, &codep, buf[i]);
switch (state) {
case UTF8_ACCEPT:
count += 1;
codepoint_hash ^= codep;
break;
case UTF8_REJECT:
return 1;
}
}
if (state != UTF8_ACCEPT) {
return 2;
}
printf("Decoded %zu code points and got a hash of %u.\n", count, codepoint_hash);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment