Skip to content

Instantly share code, notes, and snippets.

@Andersama
Last active July 16, 2023 14:08
Show Gist options
  • Save Andersama/436fe6c6ab9ecdcaf9782c77b67cffef to your computer and use it in GitHub Desktop.
Save Andersama/436fe6c6ab9ecdcaf9782c77b67cffef to your computer and use it in GitHub Desktop.
Simple c++ sub nanosecond/byte utf8 validator
#pragma once
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <string_view>
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <immintrin.h>
#endif
namespace utf8 {
// see: https://github.com/BobSteagall/utf_utils/blob/master/src/utf_utils.h
/// Role a single octet can play inside a UTF-8 byte stream.
///
/// The numeric values are used as column indices when the validators index
/// the transition tables, so they must stay dense and in this exact order.
enum char_class : uint8_t {
    /// C0..C1, F5..FF: octets that must never appear in a UTF-8 sequence.
    ILL = 0,

    /// 00..7F: ASCII, a complete one-byte sequence.
    ASC = 1,

    /// 80..8F: continuation byte, range 1.
    CR1 = 2,
    /// 90..9F: continuation byte, range 2.
    CR2 = 3,
    /// A0..BF: continuation byte, range 3.
    CR3 = 4,

    /// C2..DF: lead byte of a 2-byte sequence.
    L2A = 5,

    /// E0: lead byte of a 3-byte sequence, range A.
    L3A = 6,
    /// E1..EC, EE..EF: lead byte of a 3-byte sequence, range B.
    L3B = 7,
    /// ED: lead byte of a 3-byte sequence, range C.
    L3C = 8,

    /// F0: lead byte of a 4-byte sequence, range A.
    L4A = 9,
    /// F1..F3: lead byte of a 4-byte sequence, range B.
    L4B = 10,
    /// F4: lead byte of a 4-byte sequence, range C.
    L4C = 11,
};
/// States of the UTF-8 validation automaton.
///
/// Each value is the plain row index of the state in the transition table;
/// the table lookup itself multiplies by the row width.
enum utf8_state : uint8_t {
    /// Start state: no sequence is in progress.
    BGN = 0,
    /// An invalid sequence has been seen.
    ERR = 1,

    /// Continuation state 1.
    CS1 = 2,
    /// Continuation state 2.
    CS2 = 3,
    /// Continuation state 3.
    CS3 = 4,

    /// Partial 3-byte sequence, state A.
    P3A = 5,
    /// Partial 3-byte sequence, state B.
    P3B = 6,

    /// Partial 4-byte sequence, state A.
    P4A = 7,
    /// Partial 4-byte sequence, state B.
    P4B = 8,

    /// Finishing a sequence lands back on the start state, so END aliases BGN.
    END = BGN,
    /// Lower-case alias of ERR, kept for readability in the transition table.
    err = ERR,
};
// clang-format off
std::array<char_class, 256> cclass = {
char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, //- 00..0F
char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, //- 10..1F
char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, //- 20..2F
char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, //- 30..3F
//
char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, //- 40..4F
char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, //- 50..5F
char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, //- 60..6F
char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, //- 70..7F
//
char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, //- 80..8F
char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, //- 90..9F
char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, //- A0..AF
char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, //- B0..BF
//
char_class::ILL, char_class::ILL, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, //- C0..CF
char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, //- D0..DF
char_class::L3A, char_class::L3B, char_class::L3B, char_class::L3B, char_class::L3B, char_class::L3B, char_class::L3B, char_class::L3B, char_class::L3B, char_class::L3B, char_class::L3B, char_class::L3B, char_class::L3B, char_class::L3C, char_class::L3B, char_class::L3B, //- E0..EF
char_class::L4A, char_class::L4B, char_class::L4B, char_class::L4B, char_class::L4C, char_class::ILL, char_class::ILL, char_class::ILL, char_class::ILL, char_class::ILL, char_class::ILL, char_class::ILL, char_class::ILL, char_class::ILL, char_class::ILL, char_class::ILL, //- F0..FF
};
// ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS/STATE
//=========================================================================
std::array<uint8_t, 9*12> ttable = {
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::END, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR,
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::CS1, (uint8_t)utf8_state::P3A, (uint8_t)utf8_state::CS2,
(uint8_t)utf8_state::P3B, (uint8_t)utf8_state::P4A, (uint8_t)utf8_state::CS3, (uint8_t)utf8_state::P4B, //- BGN|END
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR,
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR,
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, //- ERR
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::END, (uint8_t)utf8_state::END,
(uint8_t)utf8_state::END, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR,
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, //- CS1
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::CS1, (uint8_t)utf8_state::CS1,
(uint8_t)utf8_state::CS1, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR,
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, //- CS2
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::CS2, (uint8_t)utf8_state::CS2,
(uint8_t)utf8_state::CS2, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR,
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, //- CS3
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR,
(uint8_t)utf8_state::CS1, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR,
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, //- P3A
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::CS1, (uint8_t)utf8_state::CS1,
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR,
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, //- P3B
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::CS2,
(uint8_t)utf8_state::CS2, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR,
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, //- P4A
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::CS2, (uint8_t)utf8_state::ERR,
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR,
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, //- P4B
};
// clang-format on
// utf8 validator satisfies the requirements to run this specialized dfa
// implementation, so we'll use it see:
// https://branchfree.org/2018/05/25/say-hello-to-my-little-friend-sheng-a-small-but-fast-deterministic-finite-automaton/
// this immediately executed lamda rotates the transition table above (to
// match sheng's needs)
__declspec(align(16)) std::array<uint8_t, 12 * 16> mtable = []() {
__declspec(align(16)) std::array<uint8_t, 12 * 16> t{};
// default "state" is an error (there are empty slots because sheng is a
// bit larger than original table)
for (size_t i = 0; i < (12 * 16); i++) {
t[i] = (uint8_t)utf8_state::ERR;
}
// rotate the transition table, sheng performs a 16 byte lookup w/
// pshufb in the we're going from state = table[state][input] to state =
// table[input][state]
for (size_t y = 0; y < 12; y++) {
for (size_t x = 0; x < 9; x++) {
t[y * 16 + x] = ttable[x * 12 + y];
}
}
return t;
}();
// Scalar reference validator: feeds `src` through the DFA one byte at a time.
//
// Returns true when every byte is accepted and the automaton finishes in its
// start state (no sequence is left unfinished); returns false as soon as the
// automaton enters ERR. An empty input is valid.
//
// `inline` because this is defined in a header; without it, including the
// header from two translation units is a multiple-definition link error.
// Could additionally be constexpr once both lookup tables are constant
// expressions.
inline bool validate_utf8_table(std::string_view src) noexcept {
    uint8_t state = static_cast<uint8_t>(utf8_state::BGN);
    for (const char ch : src) {
        // Go through uint8_t so bytes >= 0x80 index the table correctly even
        // where plain char is signed.
        const uint8_t cls = cclass[static_cast<uint8_t>(ch)];
        state = ttable[state * 12 + cls];
        if (state == static_cast<uint8_t>(utf8_state::ERR)) {
            // obviously invalid, bail
            return false;
        }
    }
    // Anything other than the start state means the input ended mid-sequence.
    return state == static_cast<uint8_t>(utf8_state::BGN);
}
// Validates `src` as UTF-8 using the table-driven DFA, stepping the automaton
// with pshufb (the "sheng" technique) and skipping pure-ASCII stretches
// 16 bytes at a time.
//
// Returns true when the whole input is accepted and the automaton ends in its
// start state; false otherwise. An empty input is valid.
//
// Notes:
//  - Despite the name, _mm_shuffle_epi8 is an SSSE3 instruction; the caller
//    must make sure the CPU supports it. On GCC/Clang the target attribute
//    lets this compile without a global -mssse3.
//  - Every lane of `state` holds the same value: it starts as all zeros and
//    each shuffle replaces every lane with row[lane], so reading lane 0 is
//    enough.
//  - The DFA bytes MUST be applied in input order. The previous version
//    applied the 4th and 5th byte before the 3rd, which mis-validated
//    multi-byte sequences that fell on those positions.
#if defined(__GNUC__) || defined(__clang__)
__attribute__((target("ssse3")))
#endif
inline bool validate_utf8_table_sse3(std::string_view src) noexcept {
    const auto *bytes = reinterpret_cast<const uint8_t *>(src.data());
    const auto *classes = reinterpret_cast<const uint8_t *>(cclass.data());
    const auto *rows = reinterpret_cast<const __m128i *>(mtable.data());
    const size_t size = src.size();

    size_t i = 0;
    __m128i state = _mm_setzero_si128();

    for (;;) {
        // ASCII fast path. movemask gathers the top bit of each byte; a zero
        // mask means all 16 bytes are < 0x80. The automaton is always in its
        // start state here, so plain ASCII can be skipped without touching it.
        for (; i + 16 <= size; i += 16) {
            const __m128i chunk =
                _mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes + i));
            if (_mm_movemask_epi8(chunk) != 0) {
                break;
            }
        }

        // DFA, unrolled 8 bytes per step. The state is only inspected after
        // each group of 8: ERR is a trap state in the transition table, so an
        // error inside the group is still visible at the end of it.
        bool back_at_start = false;
        while (!back_at_start && i + 8 <= size) {
            const uint8_t c0 = classes[bytes[i + 0]];
            const uint8_t c1 = classes[bytes[i + 1]];
            const uint8_t c2 = classes[bytes[i + 2]];
            const uint8_t c3 = classes[bytes[i + 3]];
            const uint8_t c4 = classes[bytes[i + 4]];
            const uint8_t c5 = classes[bytes[i + 5]];
            const uint8_t c6 = classes[bytes[i + 6]];
            const uint8_t c7 = classes[bytes[i + 7]];
            state = _mm_shuffle_epi8(rows[c0], state);
            state = _mm_shuffle_epi8(rows[c1], state);
            state = _mm_shuffle_epi8(rows[c2], state);
            state = _mm_shuffle_epi8(rows[c3], state);
            state = _mm_shuffle_epi8(rows[c4], state);
            state = _mm_shuffle_epi8(rows[c5], state);
            state = _mm_shuffle_epi8(rows[c6], state);
            state = _mm_shuffle_epi8(rows[c7], state);
            i += 8;

            const uint8_t now =
                static_cast<uint8_t>(_mm_cvtsi128_si32(state) & 0xff);
            if (now == static_cast<uint8_t>(utf8_state::ERR)) {
                return false;
            }
            // Back on a sequence boundary: try the ASCII fast path again.
            back_at_start = (now == static_cast<uint8_t>(utf8_state::BGN));
        }
        if (!back_at_start) {
            break; // fewer than 8 bytes left
        }
    }

    // Remaining 0..7 bytes, one at a time.
    for (; i < size; ++i) {
        state = _mm_shuffle_epi8(rows[classes[bytes[i]]], state);
    }
    const uint8_t final_state =
        static_cast<uint8_t>(_mm_cvtsi128_si32(state) & 0xff);
    return final_state == static_cast<uint8_t>(utf8_state::BGN);
}
// Validates `src` as UTF-8 with the nibble-lookup algorithm, see:
// https://github.com/simdjson/simdjson/tree/master/src/generic/stage1/utf8_lookup4_algorithm.h
//
// Input is processed in 16-byte blocks. For every byte, three 16-entry tables
// are consulted (high nibble of the previous byte, low nibble of the previous
// byte, high nibble of the current byte); a bit that survives the AND of all
// three lookups identifies an invalid two-byte combination. A separate check
// verifies that the bytes 2 and 3 positions after a 3-/4-byte lead are
// continuations.
//
// Returns true when the input is valid UTF-8 (an empty input is valid).
//
// Notes:
//  - Uses SSSE3 instructions (pshufb, palignr); the caller must make sure the
//    CPU supports them. On GCC/Clang the target attribute lets this compile
//    without a global -mssse3.
//  - The final block is always a copy padded with ASCII spaces, even when no
//    input bytes remain. A sequence cut off at the end of the input is then
//    followed by an ASCII byte and reported by the regular checks, so no
//    separate "incomplete" tracking is needed.
#if defined(__GNUC__) || defined(__clang__)
__attribute__((target("ssse3")))
#endif
inline bool validate_utf8_sse_lookup(std::string_view src) noexcept {
    // Error classes; bit patterns are <previous byte> <current byte>.
    constexpr uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                           // 11______ 11______
    constexpr uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
    constexpr uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
    constexpr uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
    constexpr uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
    constexpr uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
    constexpr uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                           // 11110100 101_____
                                           // 11110101 1001____
                                           // 11110101 101_____
                                           // 1111011_ 1001____
                                           // 1111011_ 101_____
                                           // 11111___ 1001____
                                           // 11111___ 101_____
    constexpr uint8_t TOO_LARGE_1000 = 1 << 6;
                                           // 11110101 1000____
                                           // 1111011_ 1000____
                                           // 11111___ 1000____
    // Deliberately shares a bit with TOO_LARGE_1000.
    constexpr uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
    // These all have ____ in byte 1.
    constexpr uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;

    // The tables are kept as unsigned bytes and loaded from memory so that
    // entries >= 0x80 never go through an int -> char conversion.
    alignas(16) static constexpr uint8_t byte_1_high_table[16] = {
        // 0_______ ________ <ASCII in byte 1>
        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
        // 10______ ________ <continuation in byte 1>
        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
        // 1100____ ________ <two byte lead in byte 1>
        TOO_SHORT | OVERLONG_2,
        // 1101____ ________ <two byte lead in byte 1>
        TOO_SHORT,
        // 1110____ ________ <three byte lead in byte 1>
        TOO_SHORT | OVERLONG_3 | SURROGATE,
        // 1111____ ________ <four+ byte lead in byte 1>
        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4,
    };
    alignas(16) static constexpr uint8_t byte_1_low_table[16] = {
        // ____0000 ________
        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
        // ____0001 ________
        CARRY | OVERLONG_2,
        // ____001_ ________
        CARRY,
        CARRY,
        // ____0100 ________
        CARRY | TOO_LARGE,
        // ____0101 ________
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        // ____011_ ________
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        // ____1___ ________
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        // ____1101 ________
        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        CARRY | TOO_LARGE | TOO_LARGE_1000,
    };
    alignas(16) static constexpr uint8_t byte_2_high_table[16] = {
        // ________ 0_______ <ASCII in byte 2>
        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
        // ________ 1000____
        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
        // ________ 1001____
        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
        // ________ 101_____
        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
        // ________ 11______
        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    };

    const __m128i byte_1_high =
        _mm_load_si128(reinterpret_cast<const __m128i *>(byte_1_high_table));
    const __m128i byte_1_low =
        _mm_load_si128(reinterpret_cast<const __m128i *>(byte_1_low_table));
    const __m128i byte_2_high =
        _mm_load_si128(reinterpret_cast<const __m128i *>(byte_2_high_table));

    const __m128i zero = _mm_setzero_si128();
    const __m128i nibble_mask = _mm_set1_epi8(0x0f);
    const __m128i high_bit = _mm_set1_epi8(static_cast<char>(0x80));
    // Saturating-subtract thresholds: only 111_____ (resp. 1111____) bytes
    // leave a non-zero remainder.
    const __m128i third_threshold =
        _mm_set1_epi8(static_cast<char>(0b11100000u - 1));
    const __m128i fourth_threshold =
        _mm_set1_epi8(static_cast<char>(0b11110000u - 1));

    const auto *bytes = reinterpret_cast<const uint8_t *>(src.data());
    const size_t size = src.size();
    size_t i = 0;

    // ASCII fast path for the leading part of the input: movemask gathers the
    // top bit of every byte, so a zero mask means 16 bytes of plain ASCII.
    for (; i + 16 <= size; i += 16) {
        const __m128i chunk =
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes + i));
        if (_mm_movemask_epi8(chunk) != 0) {
            break;
        }
    }

    // Everything before `i` is ASCII (or nothing), so an all-zero "previous
    // block" is an accurate stand-in for it.
    __m128i previous = zero;
    for (;;) {
        const size_t remaining = size - i;
        const bool is_last = remaining < 16;

        __m128i current;
        if (is_last) {
            // Temporary block filled with spaces, then the real tail on top.
            current = _mm_set1_epi8(0x20);
            if (remaining != 0) {
                std::memcpy(&current, bytes + i, remaining);
            }
        } else {
            current =
                _mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes + i));
        }

        // Special cases: classify each (previous byte, current byte) pair.
        const __m128i prev1 = _mm_alignr_epi8(current, previous, 16 - 1);
        const __m128i prev1_hi =
            _mm_and_si128(_mm_srli_epi16(prev1, 4), nibble_mask);
        const __m128i prev1_lo = _mm_and_si128(prev1, nibble_mask);
        const __m128i cur_hi =
            _mm_and_si128(_mm_srli_epi16(current, 4), nibble_mask);
        const __m128i special_cases = _mm_and_si128(
            _mm_and_si128(_mm_shuffle_epi8(byte_1_high, prev1_hi),
                          _mm_shuffle_epi8(byte_1_low, prev1_lo)),
            _mm_shuffle_epi8(byte_2_high, cur_hi));

        // Multi-byte lengths: the byte two after a 3-/4-byte lead and the
        // byte three after a 4-byte lead must be continuations.
        const __m128i prev2 = _mm_alignr_epi8(current, previous, 16 - 2);
        const __m128i prev3 = _mm_alignr_epi8(current, previous, 16 - 3);
        const __m128i is_third = _mm_subs_epu8(prev2, third_threshold);
        const __m128i is_fourth = _mm_subs_epu8(prev3, fourth_threshold);
        // The remainders are small positive values, so a signed compare is
        // fine for turning "non-zero" into an all-ones mask.
        const __m128i must_continue =
            _mm_cmpgt_epi8(_mm_or_si128(is_third, is_fourth), zero);
        // TWO_CONTS (0x80) is expected exactly where a continuation is
        // required; XOR cancels the expected ones and flags the missing ones.
        const __m128i block_error = _mm_xor_si128(
            _mm_and_si128(must_continue, high_bit), special_cases);

        // Any non-zero byte is an error.
        if (_mm_movemask_epi8(_mm_cmpeq_epi8(block_error, zero)) != 0xffff) {
            return false;
        }
        if (is_last) {
            return true;
        }
        previous = current;
        i += 16;
    }
}
}; // namespace utf8
@Andersama
Copy link
Author

The title is a bit of a joke. This is a conversion of simdutf8's validator (as it was at the time) into SIMD intrinsic calls, so it's fairly lengthy.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment