Created
July 19, 2021 02:20
-
-
Save saidwho12/58d25a977a329b026a2cc97cf394d550 to your computer and use it in GitHub Desktop.
UTF-8 SIMD tests
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <memory.h> | |
#include <inttypes.h> | |
#include <time.h> | |
#include <x86intrin.h> | |
#ifdef WIN32 | |
#include <windows.h> | |
double get_time() | |
{ | |
LARGE_INTEGER t, f; | |
QueryPerformanceCounter(&t); | |
QueryPerformanceFrequency(&f); | |
return (double)t.QuadPart/(double)f.QuadPart; | |
} | |
#else | |
#include <sys/time.h> | |
#include <sys/resource.h> | |
double get_time() | |
{ | |
struct timeval t; | |
/*struct timezone tzp;*/ | |
gettimeofday(&t, NULL/*&tzp*/); | |
return t.tv_sec + t.tv_usec*1e-6; | |
} | |
#endif | |
#define KIB 1024 | |
#define MIB 1048576 | |
#define GIB 1073741824 | |
const char * | |
mem_unit (size_t n) | |
{ | |
if (n > GIB) return "GiB"; | |
else if (n > MIB) return "MiB"; | |
else if (n > KIB) return "KiB"; | |
else return "byte(s)"; | |
} | |
double | |
mem_value (size_t n) | |
{ | |
if (n > GIB) return (double)n / (double)GIB; | |
else if (n > MIB) return (double)n / (double)MIB; | |
else if (n > KIB) return (double)n / (double)KIB; | |
else return n; | |
} | |
#define PRINTMEM(n) mem_value(n), mem_unit(n) | |
#define NS 1.e-9 | |
#define US 1.e-6 | |
#define MS 1.e-3 | |
#define SEC 1. | |
const double | |
time_print_val(double t) | |
{ | |
if (t < US) return t/NS; | |
else if (t < MS) return t/US; | |
else if (t < SEC) return t/MS; | |
else return t; | |
} | |
const char * | |
get_time_period_suffix(double t) | |
{ | |
if (t < US) return "ns"; | |
else if (t < MS) return "us"; | |
else if (t < SEC) return "ms"; | |
else return "s"; | |
} | |
#define PRINTTIME(t) time_print_val(t), get_time_period_suffix(t) | |
#define UC_UNK 0xfffd | |
#define UC_NULL 0x0000 | |
typedef uint8_t char8_t; | |
typedef uint32_t char32_t, unicode_t; | |
typedef uint16_t ucs2_char_t; | |
typedef struct DecoderState { | |
const char8_t *data; | |
unicode_t code; | |
uintptr_t ptr; | |
size_t len; /* if 0, no limit */ | |
int step; | |
} DecoderState; | |
/* | |
This table decodes the first byte of a utf-8 sequence. | |
It returns the number of expected continuation bytes | |
in the rest of the sequence. | |
From wikipedia: | |
# Bits First Last Sequence Byte 1 Byte 2 Byte 3 Byte 4 Byte 5 Byte 6 | |
# 7 0x0000 0x007F 1 0xxxxxxx | |
# 11 0x0080 0x07FF 2 110xxxxx 10xxxxxx | |
# 16 0x0800 0xFFFF 3 1110xxxx 10xxxxxx 10xxxxxx | |
# 21 0x10000 0x1FFFFF 4 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | |
# 26 0x200000 0x3FFFFFF 5 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx | |
# 31 0x4000000 0x7FFFFFFF 6 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx | |
*/ | |
static const int8_t decode_byte_table[] = { | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00 */ | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10 */ | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20 */ | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 */ | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 */ | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x50 */ | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 */ | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x70 */ | |
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 0x80 */ | |
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 0x90 */ | |
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 0xa0 */ | |
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 0xb0 */ | |
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xc0 */ | |
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xd0 */ | |
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xe0 */ | |
3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, -1, -1, /* 0xf0 */ | |
}; | |
/* This table returns the mask for the data in the first byte | |
of an n-byte sequence, where n is the index into the table. | |
*/ | |
static const uint8_t decode_mask_table[] = { | |
0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01, | |
}; | |
#define START 0 | |
void | |
decoder_state_init(DecoderState *state) | |
{ | |
state->data = NULL; | |
state->ptr = 0; | |
state->len = 0; | |
state->code = 0; | |
state->step = START; | |
} | |
void decoder_state_reset(DecoderState *state) | |
{ | |
state->ptr = 0; | |
state->len = 0; | |
state->code = 0; | |
state->step = START; | |
} | |
void | |
decoder_state_set_data(DecoderState *state, const char8_t *data) | |
{ | |
state->data = data; | |
} | |
/* | |
Implementation of fast UTF-8 decoding by Chris Venter: | |
https://gist.github.com/gorb314/7888804 | |
*/ | |
void | |
decode_byte_chris_venter(DecoderState *state) | |
{ | |
char8_t byte; | |
retry: | |
byte = state->data[state->ptr++]; | |
if (state->step == START) | |
{ | |
/* decode the number of steps using the first byte */ | |
state->step = decode_byte_table[byte]; | |
if (state->step < 0) goto error; /* invalid starting byte */ | |
/* extract the data from the first byte, using the proper mask */ | |
state->code = ((uint32_t)byte & decode_mask_table[state->step]) << (6*state->step); | |
} | |
else | |
{ | |
if ((byte & 0xc0) != 0x80) /* invalid continuation byte */ | |
{ | |
/* retry as starting byte */ | |
state->step = START; | |
goto retry; | |
} | |
if (state->step == 0) goto error; /* too many continuation bytes */ | |
state->step -= 1; | |
/* extract the data from the continuation byte */ | |
state->code |= ((uint32_t)byte & 0x3f) << (6*state->step); | |
} | |
return; | |
error: | |
/* return the unicode "unknown" character and start again */ | |
state->step = START; | |
state->code = UC_UNK; | |
} | |
unicode_t | |
decode_next_codepoint_chris_venter(DecoderState *state) | |
{ | |
do { | |
decode_byte_chris_venter(state); | |
} while (state->step != START); | |
return state->code; | |
} | |
void | |
bench_decode_chris_venter_scalar(const char8_t *data, size_t len, int runs) | |
{ | |
int r; | |
double t1, t2, delta; | |
DecoderState state; | |
printf("Benchmarking Chris Venter UTF-8 scalar decode...\n"); | |
printf("Decoding %0.2lf %s of UTF-8...\n", PRINTMEM(len)); | |
decoder_state_init(&state); | |
decoder_state_set_data(&state, data); | |
for (r = 0; r < runs; ++r) { | |
unicode_t code; | |
t1 = get_time(); | |
decoder_state_reset(&state); | |
do { | |
code = decode_next_codepoint_chris_venter(&state); | |
/*printf("U+%X\n", code);*/ | |
} while (state.ptr < len); | |
t2 = get_time(); | |
delta = t2 - t1; | |
printf("Run %d: took %0.2lf %s\n", r+1, PRINTTIME(delta)); | |
} | |
} | |
/* | |
Returns 1 if little endian, 0 if big endian | |
*/ | |
static inline int BIGENDIAN(void) | |
{ | |
int x = 1; | |
return (int) (((char *)&x)[0]); | |
} | |
static const int8_t prefix_to_length_table[32] = { | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3 | |
}; | |
typedef union vector128 { | |
uint8_t _u8[16]; | |
int8_t _i8[16]; | |
uint16_t _u16[8]; | |
int16_t _i16[8]; | |
uint32_t _u32[4]; | |
int32_t _i32[4]; | |
uint64_t _u64[2]; | |
int64_t _i64[2]; | |
} vector128; | |
vector128 | |
shuffle_v128(const vector128 a, const vector128 b) | |
{ | |
vector128 v; | |
int i; | |
memset(v._u8, 0, 16); | |
for (i = 0; i < 16; ++i) | |
if (b._i8[i] >= 0) | |
v._u8[i] = a._u8[b._i8[i]]; | |
return v; | |
} | |
vector128 | |
and_v128(const register vector128 a, const register vector128 b) | |
{ | |
vector128 v; | |
v._u64[0] = a._u64[0] & b._u64[0]; | |
v._u64[1] = a._u64[1] & b._u64[1]; | |
return v; | |
} | |
vector128 | |
load_v128(const void *ptr) | |
{ | |
return *(vector128 *)ptr; | |
} | |
vector128 | |
loadi8_v128(int8_t x) | |
{ | |
vector128 v; | |
int i; | |
for (i = 0; i < 16; ++i) { | |
v._i8[i] = x; | |
} | |
return v; | |
} | |
/* 256 * 16 = 4096 */ | |
static const vector128 ibm_pattern_lut_128[256] = { | |
{0,-1,-1,-1,1,-1,-1,-1,2,-1,-1,-1,3,-1,-1,-1}, | |
{1,0,-1,-1,2,-1,-1,-1,3,-1,-1,-1,4,-1,-1,-1}, | |
{2,1,0,-1,3,-1,-1,-1,4,-1,-1,-1,5,-1,-1,-1}, | |
{3,2,1,0,4,-1,-1,-1,5,-1,-1,-1,6,-1,-1,-1}, | |
{0,-1,-1,-1,2,1,-1,-1,3,-1,-1,-1,4,-1,-1,-1}, | |
{1,0,-1,-1,3,2,-1,-1,4,-1,-1,-1,5,-1,-1,-1}, | |
{2,1,0,-1,4,3,-1,-1,5,-1,-1,-1,6,-1,-1,-1}, | |
{3,2,1,0,5,4,-1,-1,6,-1,-1,-1,7,-1,-1,-1}, | |
{0,-1,-1,-1,3,2,1,-1,4,-1,-1,-1,5,-1,-1,-1}, | |
{1,0,-1,-1,4,3,2,-1,5,-1,-1,-1,6,-1,-1,-1}, | |
{2,1,0,-1,5,4,3,-1,6,-1,-1,-1,7,-1,-1,-1}, | |
{3,2,1,0,6,5,4,-1,7,-1,-1,-1,8,-1,-1,-1}, | |
{0,-1,-1,-1,4,3,2,1,5,-1,-1,-1,6,-1,-1,-1}, | |
{1,0,-1,-1,5,4,3,2,6,-1,-1,-1,7,-1,-1,-1}, | |
{2,1,0,-1,6,5,4,3,7,-1,-1,-1,8,-1,-1,-1}, | |
{3,2,1,0,7,6,5,4,8,-1,-1,-1,9,-1,-1,-1}, | |
{0,-1,-1,-1,1,-1,-1,-1,3,2,-1,-1,4,-1,-1,-1}, | |
{1,0,-1,-1,2,-1,-1,-1,4,3,-1,-1,5,-1,-1,-1}, | |
{2,1,0,-1,3,-1,-1,-1,5,4,-1,-1,6,-1,-1,-1}, | |
{3,2,1,0,4,-1,-1,-1,6,5,-1,-1,7,-1,-1,-1}, | |
{0,-1,-1,-1,2,1,-1,-1,4,3,-1,-1,5,-1,-1,-1}, | |
{1,0,-1,-1,3,2,-1,-1,5,4,-1,-1,6,-1,-1,-1}, | |
{2,1,0,-1,4,3,-1,-1,6,5,-1,-1,7,-1,-1,-1}, | |
{3,2,1,0,5,4,-1,-1,7,6,-1,-1,8,-1,-1,-1}, | |
{0,-1,-1,-1,3,2,1,-1,5,4,-1,-1,6,-1,-1,-1}, | |
{1,0,-1,-1,4,3,2,-1,6,5,-1,-1,7,-1,-1,-1}, | |
{2,1,0,-1,5,4,3,-1,7,6,-1,-1,8,-1,-1,-1}, | |
{3,2,1,0,6,5,4,-1,8,7,-1,-1,9,-1,-1,-1}, | |
{0,-1,-1,-1,4,3,2,1,6,5,-1,-1,7,-1,-1,-1}, | |
{1,0,-1,-1,5,4,3,2,7,6,-1,-1,8,-1,-1,-1}, | |
{2,1,0,-1,6,5,4,3,8,7,-1,-1,9,-1,-1,-1}, | |
{3,2,1,0,7,6,5,4,9,8,-1,-1,10,-1,-1,-1}, | |
{0,-1,-1,-1,1,-1,-1,-1,4,3,2,-1,5,-1,-1,-1}, | |
{1,0,-1,-1,2,-1,-1,-1,5,4,3,-1,6,-1,-1,-1}, | |
{2,1,0,-1,3,-1,-1,-1,6,5,4,-1,7,-1,-1,-1}, | |
{3,2,1,0,4,-1,-1,-1,7,6,5,-1,8,-1,-1,-1}, | |
{0,-1,-1,-1,2,1,-1,-1,5,4,3,-1,6,-1,-1,-1}, | |
{1,0,-1,-1,3,2,-1,-1,6,5,4,-1,7,-1,-1,-1}, | |
{2,1,0,-1,4,3,-1,-1,7,6,5,-1,8,-1,-1,-1}, | |
{3,2,1,0,5,4,-1,-1,8,7,6,-1,9,-1,-1,-1}, | |
{0,-1,-1,-1,3,2,1,-1,6,5,4,-1,7,-1,-1,-1}, | |
{1,0,-1,-1,4,3,2,-1,7,6,5,-1,8,-1,-1,-1}, | |
{2,1,0,-1,5,4,3,-1,8,7,6,-1,9,-1,-1,-1}, | |
{3,2,1,0,6,5,4,-1,9,8,7,-1,10,-1,-1,-1}, | |
{0,-1,-1,-1,4,3,2,1,7,6,5,-1,8,-1,-1,-1}, | |
{1,0,-1,-1,5,4,3,2,8,7,6,-1,9,-1,-1,-1}, | |
{2,1,0,-1,6,5,4,3,9,8,7,-1,10,-1,-1,-1}, | |
{3,2,1,0,7,6,5,4,10,9,8,-1,11,-1,-1,-1}, | |
{0,-1,-1,-1,1,-1,-1,-1,5,4,3,2,6,-1,-1,-1}, | |
{1,0,-1,-1,2,-1,-1,-1,6,5,4,3,7,-1,-1,-1}, | |
{2,1,0,-1,3,-1,-1,-1,7,6,5,4,8,-1,-1,-1}, | |
{3,2,1,0,4,-1,-1,-1,8,7,6,5,9,-1,-1,-1}, | |
{0,-1,-1,-1,2,1,-1,-1,6,5,4,3,7,-1,-1,-1}, | |
{1,0,-1,-1,3,2,-1,-1,7,6,5,4,8,-1,-1,-1}, | |
{2,1,0,-1,4,3,-1,-1,8,7,6,5,9,-1,-1,-1}, | |
{3,2,1,0,5,4,-1,-1,9,8,7,6,10,-1,-1,-1}, | |
{0,-1,-1,-1,3,2,1,-1,7,6,5,4,8,-1,-1,-1}, | |
{1,0,-1,-1,4,3,2,-1,8,7,6,5,9,-1,-1,-1}, | |
{2,1,0,-1,5,4,3,-1,9,8,7,6,10,-1,-1,-1}, | |
{3,2,1,0,6,5,4,-1,10,9,8,7,11,-1,-1,-1}, | |
{0,-1,-1,-1,4,3,2,1,8,7,6,5,9,-1,-1,-1}, | |
{1,0,-1,-1,5,4,3,2,9,8,7,6,10,-1,-1,-1}, | |
{2,1,0,-1,6,5,4,3,10,9,8,7,11,-1,-1,-1}, | |
{3,2,1,0,7,6,5,4,11,10,9,8,12,-1,-1,-1}, | |
{0,-1,-1,-1,1,-1,-1,-1,2,-1,-1,-1,4,3,-1,-1}, | |
{1,0,-1,-1,2,-1,-1,-1,3,-1,-1,-1,5,4,-1,-1}, | |
{2,1,0,-1,3,-1,-1,-1,4,-1,-1,-1,6,5,-1,-1}, | |
{3,2,1,0,4,-1,-1,-1,5,-1,-1,-1,7,6,-1,-1}, | |
{0,-1,-1,-1,2,1,-1,-1,3,-1,-1,-1,5,4,-1,-1}, | |
{1,0,-1,-1,3,2,-1,-1,4,-1,-1,-1,6,5,-1,-1}, | |
{2,1,0,-1,4,3,-1,-1,5,-1,-1,-1,7,6,-1,-1}, | |
{3,2,1,0,5,4,-1,-1,6,-1,-1,-1,8,7,-1,-1}, | |
{0,-1,-1,-1,3,2,1,-1,4,-1,-1,-1,6,5,-1,-1}, | |
{1,0,-1,-1,4,3,2,-1,5,-1,-1,-1,7,6,-1,-1}, | |
{2,1,0,-1,5,4,3,-1,6,-1,-1,-1,8,7,-1,-1}, | |
{3,2,1,0,6,5,4,-1,7,-1,-1,-1,9,8,-1,-1}, | |
{0,-1,-1,-1,4,3,2,1,5,-1,-1,-1,7,6,-1,-1}, | |
{1,0,-1,-1,5,4,3,2,6,-1,-1,-1,8,7,-1,-1}, | |
{2,1,0,-1,6,5,4,3,7,-1,-1,-1,9,8,-1,-1}, | |
{3,2,1,0,7,6,5,4,8,-1,-1,-1,10,9,-1,-1}, | |
{0,-1,-1,-1,1,-1,-1,-1,3,2,-1,-1,5,4,-1,-1}, | |
{1,0,-1,-1,2,-1,-1,-1,4,3,-1,-1,6,5,-1,-1}, | |
{2,1,0,-1,3,-1,-1,-1,5,4,-1,-1,7,6,-1,-1}, | |
{3,2,1,0,4,-1,-1,-1,6,5,-1,-1,8,7,-1,-1}, | |
{0,-1,-1,-1,2,1,-1,-1,4,3,-1,-1,6,5,-1,-1}, | |
{1,0,-1,-1,3,2,-1,-1,5,4,-1,-1,7,6,-1,-1}, | |
{2,1,0,-1,4,3,-1,-1,6,5,-1,-1,8,7,-1,-1}, | |
{3,2,1,0,5,4,-1,-1,7,6,-1,-1,9,8,-1,-1}, | |
{0,-1,-1,-1,3,2,1,-1,5,4,-1,-1,7,6,-1,-1}, | |
{1,0,-1,-1,4,3,2,-1,6,5,-1,-1,8,7,-1,-1}, | |
{2,1,0,-1,5,4,3,-1,7,6,-1,-1,9,8,-1,-1}, | |
{3,2,1,0,6,5,4,-1,8,7,-1,-1,10,9,-1,-1}, | |
{0,-1,-1,-1,4,3,2,1,6,5,-1,-1,8,7,-1,-1}, | |
{1,0,-1,-1,5,4,3,2,7,6,-1,-1,9,8,-1,-1}, | |
{2,1,0,-1,6,5,4,3,8,7,-1,-1,10,9,-1,-1}, | |
{3,2,1,0,7,6,5,4,9,8,-1,-1,11,10,-1,-1}, | |
{0,-1,-1,-1,1,-1,-1,-1,4,3,2,-1,6,5,-1,-1}, | |
{1,0,-1,-1,2,-1,-1,-1,5,4,3,-1,7,6,-1,-1}, | |
{2,1,0,-1,3,-1,-1,-1,6,5,4,-1,8,7,-1,-1}, | |
{3,2,1,0,4,-1,-1,-1,7,6,5,-1,9,8,-1,-1}, | |
{0,-1,-1,-1,2,1,-1,-1,5,4,3,-1,7,6,-1,-1}, | |
{1,0,-1,-1,3,2,-1,-1,6,5,4,-1,8,7,-1,-1}, | |
{2,1,0,-1,4,3,-1,-1,7,6,5,-1,9,8,-1,-1}, | |
{3,2,1,0,5,4,-1,-1,8,7,6,-1,10,9,-1,-1}, | |
{0,-1,-1,-1,3,2,1,-1,6,5,4,-1,8,7,-1,-1}, | |
{1,0,-1,-1,4,3,2,-1,7,6,5,-1,9,8,-1,-1}, | |
{2,1,0,-1,5,4,3,-1,8,7,6,-1,10,9,-1,-1}, | |
{3,2,1,0,6,5,4,-1,9,8,7,-1,11,10,-1,-1}, | |
{0,-1,-1,-1,4,3,2,1,7,6,5,-1,9,8,-1,-1}, | |
{1,0,-1,-1,5,4,3,2,8,7,6,-1,10,9,-1,-1}, | |
{2,1,0,-1,6,5,4,3,9,8,7,-1,11,10,-1,-1}, | |
{3,2,1,0,7,6,5,4,10,9,8,-1,12,11,-1,-1}, | |
{0,-1,-1,-1,1,-1,-1,-1,5,4,3,2,7,6,-1,-1}, | |
{1,0,-1,-1,2,-1,-1,-1,6,5,4,3,8,7,-1,-1}, | |
{2,1,0,-1,3,-1,-1,-1,7,6,5,4,9,8,-1,-1}, | |
{3,2,1,0,4,-1,-1,-1,8,7,6,5,10,9,-1,-1}, | |
{0,-1,-1,-1,2,1,-1,-1,6,5,4,3,8,7,-1,-1}, | |
{1,0,-1,-1,3,2,-1,-1,7,6,5,4,9,8,-1,-1}, | |
{2,1,0,-1,4,3,-1,-1,8,7,6,5,10,9,-1,-1}, | |
{3,2,1,0,5,4,-1,-1,9,8,7,6,11,10,-1,-1}, | |
{0,-1,-1,-1,3,2,1,-1,7,6,5,4,9,8,-1,-1}, | |
{1,0,-1,-1,4,3,2,-1,8,7,6,5,10,9,-1,-1}, | |
{2,1,0,-1,5,4,3,-1,9,8,7,6,11,10,-1,-1}, | |
{3,2,1,0,6,5,4,-1,10,9,8,7,12,11,-1,-1}, | |
{0,-1,-1,-1,4,3,2,1,8,7,6,5,10,9,-1,-1}, | |
{1,0,-1,-1,5,4,3,2,9,8,7,6,11,10,-1,-1}, | |
{2,1,0,-1,6,5,4,3,10,9,8,7,12,11,-1,-1}, | |
{3,2,1,0,7,6,5,4,11,10,9,8,13,12,-1,-1}, | |
{0,-1,-1,-1,1,-1,-1,-1,2,-1,-1,-1,5,4,3,-1}, | |
{1,0,-1,-1,2,-1,-1,-1,3,-1,-1,-1,6,5,4,-1}, | |
{2,1,0,-1,3,-1,-1,-1,4,-1,-1,-1,7,6,5,-1}, | |
{3,2,1,0,4,-1,-1,-1,5,-1,-1,-1,8,7,6,-1}, | |
{0,-1,-1,-1,2,1,-1,-1,3,-1,-1,-1,6,5,4,-1}, | |
{1,0,-1,-1,3,2,-1,-1,4,-1,-1,-1,7,6,5,-1}, | |
{2,1,0,-1,4,3,-1,-1,5,-1,-1,-1,8,7,6,-1}, | |
{3,2,1,0,5,4,-1,-1,6,-1,-1,-1,9,8,7,-1}, | |
{0,-1,-1,-1,3,2,1,-1,4,-1,-1,-1,7,6,5,-1}, | |
{1,0,-1,-1,4,3,2,-1,5,-1,-1,-1,8,7,6,-1}, | |
{2,1,0,-1,5,4,3,-1,6,-1,-1,-1,9,8,7,-1}, | |
{3,2,1,0,6,5,4,-1,7,-1,-1,-1,10,9,8,-1}, | |
{0,-1,-1,-1,4,3,2,1,5,-1,-1,-1,8,7,6,-1}, | |
{1,0,-1,-1,5,4,3,2,6,-1,-1,-1,9,8,7,-1}, | |
{2,1,0,-1,6,5,4,3,7,-1,-1,-1,10,9,8,-1}, | |
{3,2,1,0,7,6,5,4,8,-1,-1,-1,11,10,9,-1}, | |
{0,-1,-1,-1,1,-1,-1,-1,3,2,-1,-1,6,5,4,-1}, | |
{1,0,-1,-1,2,-1,-1,-1,4,3,-1,-1,7,6,5,-1}, | |
{2,1,0,-1,3,-1,-1,-1,5,4,-1,-1,8,7,6,-1}, | |
{3,2,1,0,4,-1,-1,-1,6,5,-1,-1,9,8,7,-1}, | |
{0,-1,-1,-1,2,1,-1,-1,4,3,-1,-1,7,6,5,-1}, | |
{1,0,-1,-1,3,2,-1,-1,5,4,-1,-1,8,7,6,-1}, | |
{2,1,0,-1,4,3,-1,-1,6,5,-1,-1,9,8,7,-1}, | |
{3,2,1,0,5,4,-1,-1,7,6,-1,-1,10,9,8,-1}, | |
{0,-1,-1,-1,3,2,1,-1,5,4,-1,-1,8,7,6,-1}, | |
{1,0,-1,-1,4,3,2,-1,6,5,-1,-1,9,8,7,-1}, | |
{2,1,0,-1,5,4,3,-1,7,6,-1,-1,10,9,8,-1}, | |
{3,2,1,0,6,5,4,-1,8,7,-1,-1,11,10,9,-1}, | |
{0,-1,-1,-1,4,3,2,1,6,5,-1,-1,9,8,7,-1}, | |
{1,0,-1,-1,5,4,3,2,7,6,-1,-1,10,9,8,-1}, | |
{2,1,0,-1,6,5,4,3,8,7,-1,-1,11,10,9,-1}, | |
{3,2,1,0,7,6,5,4,9,8,-1,-1,12,11,10,-1}, | |
{0,-1,-1,-1,1,-1,-1,-1,4,3,2,-1,7,6,5,-1}, | |
{1,0,-1,-1,2,-1,-1,-1,5,4,3,-1,8,7,6,-1}, | |
{2,1,0,-1,3,-1,-1,-1,6,5,4,-1,9,8,7,-1}, | |
{3,2,1,0,4,-1,-1,-1,7,6,5,-1,10,9,8,-1}, | |
{0,-1,-1,-1,2,1,-1,-1,5,4,3,-1,8,7,6,-1}, | |
{1,0,-1,-1,3,2,-1,-1,6,5,4,-1,9,8,7,-1}, | |
{2,1,0,-1,4,3,-1,-1,7,6,5,-1,10,9,8,-1}, | |
{3,2,1,0,5,4,-1,-1,8,7,6,-1,11,10,9,-1}, | |
{0,-1,-1,-1,3,2,1,-1,6,5,4,-1,9,8,7,-1}, | |
{1,0,-1,-1,4,3,2,-1,7,6,5,-1,10,9,8,-1}, | |
{2,1,0,-1,5,4,3,-1,8,7,6,-1,11,10,9,-1}, | |
{3,2,1,0,6,5,4,-1,9,8,7,-1,12,11,10,-1}, | |
{0,-1,-1,-1,4,3,2,1,7,6,5,-1,10,9,8,-1}, | |
{1,0,-1,-1,5,4,3,2,8,7,6,-1,11,10,9,-1}, | |
{2,1,0,-1,6,5,4,3,9,8,7,-1,12,11,10,-1}, | |
{3,2,1,0,7,6,5,4,10,9,8,-1,13,12,11,-1}, | |
{0,-1,-1,-1,1,-1,-1,-1,5,4,3,2,8,7,6,-1}, | |
{1,0,-1,-1,2,-1,-1,-1,6,5,4,3,9,8,7,-1}, | |
{2,1,0,-1,3,-1,-1,-1,7,6,5,4,10,9,8,-1}, | |
{3,2,1,0,4,-1,-1,-1,8,7,6,5,11,10,9,-1}, | |
{0,-1,-1,-1,2,1,-1,-1,6,5,4,3,9,8,7,-1}, | |
{1,0,-1,-1,3,2,-1,-1,7,6,5,4,10,9,8,-1}, | |
{2,1,0,-1,4,3,-1,-1,8,7,6,5,11,10,9,-1}, | |
{3,2,1,0,5,4,-1,-1,9,8,7,6,12,11,10,-1}, | |
{0,-1,-1,-1,3,2,1,-1,7,6,5,4,10,9,8,-1}, | |
{1,0,-1,-1,4,3,2,-1,8,7,6,5,11,10,9,-1}, | |
{2,1,0,-1,5,4,3,-1,9,8,7,6,12,11,10,-1}, | |
{3,2,1,0,6,5,4,-1,10,9,8,7,13,12,11,-1}, | |
{0,-1,-1,-1,4,3,2,1,8,7,6,5,11,10,9,-1}, | |
{1,0,-1,-1,5,4,3,2,9,8,7,6,12,11,10,-1}, | |
{2,1,0,-1,6,5,4,3,10,9,8,7,13,12,11,-1}, | |
{3,2,1,0,7,6,5,4,11,10,9,8,14,13,12,-1}, | |
{0,-1,-1,-1,1,-1,-1,-1,2,-1,-1,-1,6,5,4,3}, | |
{1,0,-1,-1,2,-1,-1,-1,3,-1,-1,-1,7,6,5,4}, | |
{2,1,0,-1,3,-1,-1,-1,4,-1,-1,-1,8,7,6,5}, | |
{3,2,1,0,4,-1,-1,-1,5,-1,-1,-1,9,8,7,6}, | |
{0,-1,-1,-1,2,1,-1,-1,3,-1,-1,-1,7,6,5,4}, | |
{1,0,-1,-1,3,2,-1,-1,4,-1,-1,-1,8,7,6,5}, | |
{2,1,0,-1,4,3,-1,-1,5,-1,-1,-1,9,8,7,6}, | |
{3,2,1,0,5,4,-1,-1,6,-1,-1,-1,10,9,8,7}, | |
{0,-1,-1,-1,3,2,1,-1,4,-1,-1,-1,8,7,6,5}, | |
{1,0,-1,-1,4,3,2,-1,5,-1,-1,-1,9,8,7,6}, | |
{2,1,0,-1,5,4,3,-1,6,-1,-1,-1,10,9,8,7}, | |
{3,2,1,0,6,5,4,-1,7,-1,-1,-1,11,10,9,8}, | |
{0,-1,-1,-1,4,3,2,1,5,-1,-1,-1,9,8,7,6}, | |
{1,0,-1,-1,5,4,3,2,6,-1,-1,-1,10,9,8,7}, | |
{2,1,0,-1,6,5,4,3,7,-1,-1,-1,11,10,9,8}, | |
{3,2,1,0,7,6,5,4,8,-1,-1,-1,12,11,10,9}, | |
{0,-1,-1,-1,1,-1,-1,-1,3,2,-1,-1,7,6,5,4}, | |
{1,0,-1,-1,2,-1,-1,-1,4,3,-1,-1,8,7,6,5}, | |
{2,1,0,-1,3,-1,-1,-1,5,4,-1,-1,9,8,7,6}, | |
{3,2,1,0,4,-1,-1,-1,6,5,-1,-1,10,9,8,7}, | |
{0,-1,-1,-1,2,1,-1,-1,4,3,-1,-1,8,7,6,5}, | |
{1,0,-1,-1,3,2,-1,-1,5,4,-1,-1,9,8,7,6}, | |
{2,1,0,-1,4,3,-1,-1,6,5,-1,-1,10,9,8,7}, | |
{3,2,1,0,5,4,-1,-1,7,6,-1,-1,11,10,9,8}, | |
{0,-1,-1,-1,3,2,1,-1,5,4,-1,-1,9,8,7,6}, | |
{1,0,-1,-1,4,3,2,-1,6,5,-1,-1,10,9,8,7}, | |
{2,1,0,-1,5,4,3,-1,7,6,-1,-1,11,10,9,8}, | |
{3,2,1,0,6,5,4,-1,8,7,-1,-1,12,11,10,9}, | |
{0,-1,-1,-1,4,3,2,1,6,5,-1,-1,10,9,8,7}, | |
{1,0,-1,-1,5,4,3,2,7,6,-1,-1,11,10,9,8}, | |
{2,1,0,-1,6,5,4,3,8,7,-1,-1,12,11,10,9}, | |
{3,2,1,0,7,6,5,4,9,8,-1,-1,13,12,11,10}, | |
{0,-1,-1,-1,1,-1,-1,-1,4,3,2,-1,8,7,6,5}, | |
{1,0,-1,-1,2,-1,-1,-1,5,4,3,-1,9,8,7,6}, | |
{2,1,0,-1,3,-1,-1,-1,6,5,4,-1,10,9,8,7}, | |
{3,2,1,0,4,-1,-1,-1,7,6,5,-1,11,10,9,8}, | |
{0,-1,-1,-1,2,1,-1,-1,5,4,3,-1,9,8,7,6}, | |
{1,0,-1,-1,3,2,-1,-1,6,5,4,-1,10,9,8,7}, | |
{2,1,0,-1,4,3,-1,-1,7,6,5,-1,11,10,9,8}, | |
{3,2,1,0,5,4,-1,-1,8,7,6,-1,12,11,10,9}, | |
{0,-1,-1,-1,3,2,1,-1,6,5,4,-1,10,9,8,7}, | |
{1,0,-1,-1,4,3,2,-1,7,6,5,-1,11,10,9,8}, | |
{2,1,0,-1,5,4,3,-1,8,7,6,-1,12,11,10,9}, | |
{3,2,1,0,6,5,4,-1,9,8,7,-1,13,12,11,10}, | |
{0,-1,-1,-1,4,3,2,1,7,6,5,-1,11,10,9,8}, | |
{1,0,-1,-1,5,4,3,2,8,7,6,-1,12,11,10,9}, | |
{2,1,0,-1,6,5,4,3,9,8,7,-1,13,12,11,10}, | |
{3,2,1,0,7,6,5,4,10,9,8,-1,14,13,12,11}, | |
{0,-1,-1,-1,1,-1,-1,-1,5,4,3,2,9,8,7,6}, | |
{1,0,-1,-1,2,-1,-1,-1,6,5,4,3,10,9,8,7}, | |
{2,1,0,-1,3,-1,-1,-1,7,6,5,4,11,10,9,8}, | |
{3,2,1,0,4,-1,-1,-1,8,7,6,5,12,11,10,9}, | |
{0,-1,-1,-1,2,1,-1,-1,6,5,4,3,10,9,8,7}, | |
{1,0,-1,-1,3,2,-1,-1,7,6,5,4,11,10,9,8}, | |
{2,1,0,-1,4,3,-1,-1,8,7,6,5,12,11,10,9}, | |
{3,2,1,0,5,4,-1,-1,9,8,7,6,13,12,11,10}, | |
{0,-1,-1,-1,3,2,1,-1,7,6,5,4,11,10,9,8}, | |
{1,0,-1,-1,4,3,2,-1,8,7,6,5,12,11,10,9}, | |
{2,1,0,-1,5,4,3,-1,9,8,7,6,13,12,11,10}, | |
{3,2,1,0,6,5,4,-1,10,9,8,7,14,13,12,11}, | |
{0,-1,-1,-1,4,3,2,1,8,7,6,5,12,11,10,9}, | |
{1,0,-1,-1,5,4,3,2,9,8,7,6,13,12,11,10}, | |
{2,1,0,-1,6,5,4,3,10,9,8,7,14,13,12,11}, | |
{3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12} | |
}; | |
static const vector128 ibm_mask_lut_128[256] = { | |
}; | |
#define UTF_CHUNKSIZE (8 * KIB) | |
#define SUPPORT_SIX_BYTE_UTF8 0 | |
typedef enum utf_decoder_flags_t { | |
UTF_BOUNDS_CHECK = 0x00000001 | |
} utf_decoder_flags_t; | |
/* | |
Maximum characters loaded into chunk (in case of 7-bit ASCII) is | |
UTF_CHUNKSIZE | |
Otherwise, the least number of characters that would be loaded into the chunk is | |
UTF_CHUNKSIZE / 4 | |
If all characters are 4-byte characters. | |
*/ | |
typedef struct utf_chunk_decoder_t { | |
char8_t *data; | |
uintptr_t ptr; | |
size_t size; | |
/* loaded slices into the chunk */ | |
size_t groups_in; | |
/* see utf_decoder_flags_t */ | |
unsigned int flags; | |
/* keys for indexing into lookup tables (bit-packed lengths) */ | |
uint8_t lutids[UTF_CHUNKSIZE<<2]; | |
uintptr_t chunkptrs[UTF_CHUNKSIZE<<2]; /* pointers to start of groups cached for speed */ | |
uint8_t chunkin[UTF_CHUNKSIZE]; | |
uint8_t chunkout[UTF_CHUNKSIZE]; | |
} utf_chunk_decoder_t; | |
utf_chunk_decoder_t * | |
utf_chunk_decoder_create(void) | |
{ | |
utf_chunk_decoder_t *state = malloc(sizeof(*state)); | |
state->data = NULL; | |
state->ptr = 0; | |
state->size = 0; | |
state->groups_in = 0; | |
state->flags = UTF_BOUNDS_CHECK; | |
memset(state->chunkin, 0, UTF_CHUNKSIZE); | |
memset(state->chunkout, 0, UTF_CHUNKSIZE); | |
return state; | |
} | |
// void | |
// utf_chunk_decoder_set_chars_per_slice(utf_chunk_decoder_t *state, unsigned int n) | |
// { | |
// state->chars_per_group = n; | |
// } | |
void | |
utf_chunk_decoder_destroy(utf_chunk_decoder_t *state) | |
{ | |
free(state); | |
} | |
__attribute__((always_inline)) int | |
utf_chunk_decoder_load_next_chunk(utf_chunk_decoder_t *state) | |
{ | |
uintptr_t startptr; | |
uintptr_t chunkptr; | |
/* early quit if decoder has no text or ptr is at the end of the data */ | |
if (state->data == NULL && state->size == 0 || state->ptr >= state->size) | |
return 0; | |
state->groups_in = 0; | |
startptr = state->ptr; | |
chunkptr = 0; | |
while (chunkptr + 16 < UTF_CHUNKSIZE) { | |
size_t groupsize; | |
uint_fast8_t i; | |
uint8_t lutid = 0; | |
uintptr_t currpos = state->ptr; | |
#pragma GCC ivdep | |
for (i = 0; i < 4; ++i) { | |
int8_t len; | |
char8_t byte; | |
if ((state->ptr + currpos) >= state->size) | |
return 0; /* can't continue reading or copying */ | |
byte = state->data[currpos]; | |
len = prefix_to_length_table[byte >> 3]; | |
currpos += len + 1; | |
lutid = (lutid << 2) | len; | |
} | |
/* store (cache) accumulated lengths for decoding step */ | |
state->lutids[state->groups_in] = lutid; | |
state->chunkptrs[state->groups_in] = chunkptr; | |
/* memcpy another group */ | |
groupsize = (size_t)((intptr_t)currpos - (intptr_t)state->ptr); | |
// memcpy(state->chunkin + chunkptr, state->data + state->ptr, groupsize); | |
chunkptr += groupsize; | |
state->ptr = currpos; | |
state->groups_in ++; | |
} | |
memcpy(state->chunkin, state->data + startptr, chunkptr); | |
return 1; | |
} | |
/* decodes 4 characters */ | |
static __m128i | |
decode_utf8_to_utf32_ssse3(const __m128i data, | |
const vector128 patternptr) | |
{ | |
register __m128i v, mask, pattern; | |
v = _mm_load_si128((__m128i const *)&data); | |
pattern = _mm_load_si128((__m128i const *)&patternptr); | |
v = _mm_shuffle_epi8(v, pattern); | |
return v; | |
} | |
static int | |
decode_utf8_chunk_to_utf32(utf_chunk_decoder_t *state) | |
{ | |
unsigned int i, j; | |
for (i = 0; i < state->groups_in; ++i) { | |
uint8_t lutid = state->lutids[i]; | |
uintptr_t chunkptr = state->chunkptrs[i]; | |
decode_utf8_to_utf32_ssse3(*(__m128i *)(state->chunkin + chunkptr), | |
ibm_pattern_lut_128[lutid]); | |
// for (j = 0; j < 4; ++j) { | |
// printf("U+%0X, ", v._u32[j]); | |
// } | |
// printf("\n"); | |
} | |
} | |
void | |
decode_utf8_to_ucs2() | |
{} | |
/* | |
* Reads in groups of 4 codepoints, or 16 bytes of original UTF-8 data (which can overflow) | |
*/ | |
void | |
decode_4_chars_utf8_to_utf32(utf_chunk_decoder_t *decoder) | |
{ | |
} | |
/* | |
((((((((0*3+3)*3+3)*3+3)*3+3)*3+3)*3+3)*3+3)*3+3) | |
9840 | |
((((0*3+3)*3+3)*3+3)*3+3) | |
120 | |
*/ | |
/* Accept only 4-byte UTF-8 (21-bits), | |
* decodes in chunks of 4 unicode codepoints (in a vector128). | |
* https://researcher.watson.ibm.com/researcher/files/jp-INOUEHRS/IPSJPRO2008_SIMDdecoding.pdf | |
*/ | |
static vector128 | |
decode_utf8_ibm_emulate_sse1(utf_chunk_decoder_t *state) | |
{ | |
/* | |
mask is the data mask for the loaded lengths, | |
pattern is the permutation pattern | |
*/ | |
register vector128 vin,vout; | |
register vector128 mask,pattern; | |
unsigned int i; | |
uintptr_t pos; /* currpos from last offset */ | |
uint16_t lenreg; /* gathered prefix, accumulated lengths (interleaved) */ | |
uint8_t prefix, len; | |
for (lenreg = 0, pos = 0, i = 0; i < 4; ++i) { | |
prefix = state->data[state->ptr + pos] >> 3; | |
len = prefix_to_length_table[prefix]; | |
lenreg = (lenreg<<2) | len; | |
pos += len+1; | |
} | |
pattern = ibm_pattern_lut_128[lenreg]; | |
mask = ibm_mask_lut_128[lenreg]; | |
vin = load_v128(state->data + state->ptr); | |
vin = shuffle_v128(vin, pattern); | |
vout = and_v128(vin, mask); | |
return vout; | |
} | |
static void | |
generate_ibm_pattern_lut_128(void) | |
{ | |
int i, j, k; | |
for (i = 0; i < 256; ++i) { | |
uint8_t index; | |
int8_t counter; | |
vector128 pattern, mask; | |
pattern = loadi8_v128(-1); | |
index = i; | |
counter = 0; /* byte index counter */ | |
for (j = 0; j < 4; ++j) { | |
uint8_t len = ((index>>(j<<1)) & 0x03) + 1; | |
for (k = 0; k < len; ++k) { | |
/* I think the following step depends on endianness | |
(big-endian & little-endian) */ | |
if (BIGENDIAN()) | |
pattern._i8[j*4+(len-k-1)] = counter++; | |
else | |
pattern._i8[j*4+k] = counter++; | |
} | |
} | |
/* print table entry */ | |
printf("{"); | |
for (j = 0; j < 16; ++j) { | |
int8_t v = pattern._i8[j]; | |
if (j < 15) | |
printf("%d,",v); | |
else | |
printf("%d",v); | |
} | |
if (i < 255) | |
printf("},\n"); | |
else | |
printf("}\n"); | |
} | |
} | |
static void | |
generate_ibm_mask_lut_128(void) | |
{ | |
} | |
static void | |
generate_length_table(void) | |
{ | |
int i = 0, l; | |
for (i = 0; i < 32; ++i) { | |
/* | |
xxx0xxxx | |
xxx110xx | |
xxx1110x | |
xxx11110 | |
*/ | |
if (~i & 0x10) { | |
l = 0; | |
} else if ((i & 0x1c) == 0x18) { | |
l = 1; | |
} else if ((i & 0x1e) == 0x1c) { | |
l = 2; | |
} else if ((i & 0x1f) == 0x1e) { | |
l = 3; | |
} | |
printf("%d, ", l); | |
} | |
fflush(stdout); | |
} | |
int main(int argc, char *argv[]) | |
{ | |
char8_t *data; | |
size_t len; | |
FILE *infile; | |
// infile = fopen("./UTF-8_sequence_separated/utf8_sequence_0-0x10ffff_assigned_printable.txt", "rb"); | |
infile = fopen("./utf-8-test-files/islam-wikipedia-china.html", "rb"); | |
if (infile == NULL) { | |
fprintf(stderr, "Failed to load input file.\n"); | |
exit(-1); | |
} | |
fseek(infile,0,SEEK_END); | |
len = ftell(infile); | |
fseek(infile,0,SEEK_SET); | |
data = malloc(len+1); | |
fread(data, 1, len, infile); | |
data[len] = '\0'; | |
bench_decode_chris_venter_scalar(data, len+1, 20); | |
utf_chunk_decoder_t *state = utf_chunk_decoder_create(); | |
state->data = data; | |
state->size = len+1; | |
{ | |
int i; | |
for (i = 0; i < 20; ++i) | |
{ | |
double t1, t2, delta; | |
state->ptr = 0; | |
t1 = get_time(); | |
int readchunks = 0; | |
while (utf_chunk_decoder_load_next_chunk(state)) { | |
decode_utf8_chunk_to_utf32(state); | |
// printf("CHUNK: \n\n%s\n\n", state->chunkin); | |
readchunks++; | |
} | |
t2 = get_time(); | |
delta = t2-t1; | |
double decoderate = (double)(len+1) / delta; | |
printf("sample %d: decoded %.2f %s (%d chunks of %d bytes) of UTF-8 in %.2f %s (%.2f %s / s)\n", | |
i+1, PRINTMEM(len+1), readchunks, UTF_CHUNKSIZE, PRINTTIME(delta), PRINTMEM(decoderate)); | |
} | |
} | |
free(data); | |
// generate_ibm_pattern_lut_128(); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment