Last active
July 16, 2023 14:08
-
-
Save Andersama/436fe6c6ab9ecdcaf9782c77b67cffef to your computer and use it in GitHub Desktop.
Simple C++ sub-nanosecond-per-byte UTF-8 validator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <string_view>

#if defined(_MSC_VER)
#include <intrin.h>
#endif
#include <immintrin.h>
namespace utf8 { | |
// see: https://github.com/BobSteagall/utf_utils/blob/master/src/utf_utils.h

/// Character class of a single input byte.
///
/// The numeric value of each enumerator is used as the column index into the
/// state transition table, so the values must stay dense and start at 0.
/// The enum is intentionally unscoped: the tables rely on the implicit
/// conversion to an 8-bit integer.
enum char_class : std::uint8_t {
    ILL = 0,  //- C0..C1, F5..FF  ILLEGAL octets that should never appear in
              //                  a UTF-8 sequence
    //
    ASC = 1,  //- 00..7F          ASCII leading byte range
    //
    CR1 = 2,  //- 80..8F          Continuation range 1
    CR2 = 3,  //- 90..9F          Continuation range 2
    CR3 = 4,  //- A0..BF          Continuation range 3
    //
    L2A = 5,  //- C2..DF          Leading byte range A / 2-byte sequence
    //
    L3A = 6,  //- E0              Leading byte range A / 3-byte sequence
    L3B = 7,  //- E1..EC, EE..EF  Leading byte range B / 3-byte sequence
    L3C = 8,  //- ED              Leading byte range C / 3-byte sequence
    //
    L4A = 9,  //- F0              Leading byte range A / 4-byte sequence
    L4B = 10, //- F1..F3          Leading byte range B / 4-byte sequence
    L4C = 11, //- F4              Leading byte range C / 4-byte sequence
};
/// States of the UTF-8 validation DFA.
///
/// The numeric value of each enumerator is the row index into the state
/// transition table (row stride: 12 character classes), so the values must
/// stay dense and start at 0. BGN must remain 0 because the validators
/// start from a zero-initialised state.
enum utf8_state : std::uint8_t {
    BGN = 0,   //- Start
    ERR = 1,   //- Invalid sequence (absorbing: every transition stays in ERR)
    //
    CS1 = 2,   //- Continuation state 1: one more continuation byte expected
    CS2 = 3,   //- Continuation state 2: two more continuation bytes expected
    CS3 = 4,   //- Continuation state 3: three more continuation bytes expected
    //
    P3A = 5,   //- Partial 3-byte sequence state A (after E0)
    P3B = 6,   //- Partial 3-byte sequence state B (after ED)
    //
    P4A = 7,   //- Partial 4-byte sequence state A (after F0)
    P4B = 8,   //- Partial 4-byte sequence state B (after F4)
    //
    END = BGN, //- Start and End are the same state!
    err = ERR, //- For readability in the state transition table
};
// clang-format off | |
std::array<char_class, 256> cclass = { | |
char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, //- 00..0F | |
char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, //- 10..1F | |
char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, //- 20..2F | |
char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, //- 30..3F | |
// | |
char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, //- 40..4F | |
char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, //- 50..5F | |
char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, //- 60..6F | |
char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, char_class::ASC, //- 70..7F | |
// | |
char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, char_class::CR1, //- 80..8F | |
char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, char_class::CR2, //- 90..9F | |
char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, //- A0..AF | |
char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, char_class::CR3, //- B0..BF | |
// | |
char_class::ILL, char_class::ILL, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, //- C0..CF | |
char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, char_class::L2A, //- D0..DF | |
char_class::L3A, char_class::L3B, char_class::L3B, char_class::L3B, char_class::L3B, char_class::L3B, char_class::L3B, char_class::L3B, char_class::L3B, char_class::L3B, char_class::L3B, char_class::L3B, char_class::L3B, char_class::L3C, char_class::L3B, char_class::L3B, //- E0..EF | |
char_class::L4A, char_class::L4B, char_class::L4B, char_class::L4B, char_class::L4C, char_class::ILL, char_class::ILL, char_class::ILL, char_class::ILL, char_class::ILL, char_class::ILL, char_class::ILL, char_class::ILL, char_class::ILL, char_class::ILL, char_class::ILL, //- F0..FF | |
}; | |
// ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS/STATE | |
//========================================================================= | |
std::array<uint8_t, 9*12> ttable = { | |
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::END, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, | |
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::CS1, (uint8_t)utf8_state::P3A, (uint8_t)utf8_state::CS2, | |
(uint8_t)utf8_state::P3B, (uint8_t)utf8_state::P4A, (uint8_t)utf8_state::CS3, (uint8_t)utf8_state::P4B, //- BGN|END | |
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, | |
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, | |
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, //- ERR | |
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::END, (uint8_t)utf8_state::END, | |
(uint8_t)utf8_state::END, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, | |
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, //- CS1 | |
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::CS1, (uint8_t)utf8_state::CS1, | |
(uint8_t)utf8_state::CS1, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, | |
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, //- CS2 | |
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::CS2, (uint8_t)utf8_state::CS2, | |
(uint8_t)utf8_state::CS2, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, | |
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, //- CS3 | |
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, | |
(uint8_t)utf8_state::CS1, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, | |
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, //- P3A | |
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::CS1, (uint8_t)utf8_state::CS1, | |
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, | |
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, //- P3B | |
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::CS2, | |
(uint8_t)utf8_state::CS2, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, | |
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, //- P4A | |
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::CS2, (uint8_t)utf8_state::ERR, | |
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, | |
(uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, (uint8_t)utf8_state::ERR, //- P4B | |
}; | |
// clang-format on | |
// utf8 validator satisfies the requirements to run this specialized dfa | |
// implementation, so we'll use it see: | |
// https://branchfree.org/2018/05/25/say-hello-to-my-little-friend-sheng-a-small-but-fast-deterministic-finite-automaton/ | |
// this immediately executed lamda rotates the transition table above (to | |
// match sheng's needs) | |
__declspec(align(16)) std::array<uint8_t, 12 * 16> mtable = []() { | |
__declspec(align(16)) std::array<uint8_t, 12 * 16> t{}; | |
// default "state" is an error (there are empty slots because sheng is a | |
// bit larger than original table) | |
for (size_t i = 0; i < (12 * 16); i++) { | |
t[i] = (uint8_t)utf8_state::ERR; | |
} | |
// rotate the transition table, sheng performs a 16 byte lookup w/ | |
// pshufb in the we're going from state = table[state][input] to state = | |
// table[input][state] | |
for (size_t y = 0; y < 12; y++) { | |
for (size_t x = 0; x < 9; x++) { | |
t[y * 16 + x] = ttable[x * 12 + y]; | |
} | |
} | |
return t; | |
}(); | |
// can be made constexpr | |
bool validate_utf8_table(std::string_view src) noexcept { | |
uint8_t state = 0; | |
for (size_t i = 0; i < src.size(); i++) { | |
uint8_t c1 = cclass[(uint8_t)src[i]]; | |
state = ttable[state * 12 + c1]; | |
if (state == (uint8_t)utf8_state::ERR) { | |
// obviously invalid, bail | |
return false; | |
} | |
} | |
// double check we're in starting state (no weird sequence here) | |
return state == (uint8_t)utf8_state::BGN; | |
} | |
bool validate_utf8_table_sse3(std::string_view src) noexcept { | |
size_t i = 0; | |
__m128i state = _mm_setzero_si128(); | |
uint8_t *sptr = (uint8_t *)src.data(); | |
uint8_t *cptr = (uint8_t *)cclass.data(); | |
__m128i *mptr = (__m128i *)mtable.data(); | |
// process ascii w/o table (since most utf8 is ascii anyway, this can | |
// accelerate validation on utf8 only data) | |
/* we're going to do this, but faster | |
for (; i < src.size(); i++) { | |
if (sptr[i] > 0x7f) | |
break; | |
} | |
*/ | |
ascii_start: | |
for (; i + 15 < src.size(); i += 16) { | |
// we could do alignment tricks, on modern hardware this is stupid | |
// fast | |
__m128i a = _mm_loadu_si128((__m128i *)&sptr[i]); | |
// movemask creates a mask out of leading 1 bits...leading 1s in | |
// utf8 indicate a utf8 sequence...0 -> ascii ergo...we can do this | |
// as a 1 instruction ascii validation check (the wider / faster | |
// instruction the better) | |
int m = _mm_movemask_epi8(a); | |
if (m) | |
// break should optimize to a jump directly into the state | |
// machine | |
break; | |
} | |
// unroll 8 loops (can load c1-c8 in 1 instruction) | |
for (; i + 7 < src.size(); i += 8) { | |
uint8_t c1 = cptr[sptr[i + 0]]; | |
uint8_t c2 = cptr[sptr[i + 1]]; | |
uint8_t c3 = cptr[sptr[i + 2]]; | |
uint8_t c4 = cptr[sptr[i + 3]]; | |
uint8_t c5 = cptr[sptr[i + 4]]; | |
uint8_t c6 = cptr[sptr[i + 5]]; | |
uint8_t c7 = cptr[sptr[i + 6]]; | |
uint8_t c8 = cptr[sptr[i + 7]]; | |
state = _mm_shuffle_epi8(mptr[c1], state); | |
state = _mm_shuffle_epi8(mptr[c2], state); | |
state = _mm_shuffle_epi8(mptr[c4], state); | |
state = _mm_shuffle_epi8(mptr[c5], state); | |
state = _mm_shuffle_epi8(mptr[c3], state); | |
state = _mm_shuffle_epi8(mptr[c6], state); | |
state = _mm_shuffle_epi8(mptr[c7], state); | |
state = _mm_shuffle_epi8(mptr[c8], state); | |
uint8_t rstate = (_mm_cvtsi128_si32(state) & 0xff); | |
if (rstate == (uint8_t)utf8_state::ERR) { | |
return false; | |
} else if (rstate == (uint8_t)utf8_state::BGN) { | |
// restart ascii processing | |
i += 8; | |
goto ascii_start; | |
} | |
} | |
for (; i < src.size(); i++) { | |
uint8_t c1 = cptr[sptr[i + 0]]; | |
state = _mm_shuffle_epi8(mptr[c1], state); | |
} | |
uint8_t rstate = (_mm_cvtsi128_si32(state) & 0xff); | |
return rstate == (uint8_t)utf8_state::BGN; | |
} | |
// Lets GCC/Clang compile the SSSE3 code paths without a global -mssse3 flag.
#if !defined(UTF8_TARGET_SSSE3)
#if defined(__GNUC__) || defined(__clang__)
#define UTF8_TARGET_SSSE3 __attribute__((target("ssse3")))
#else
#define UTF8_TARGET_SSSE3
#endif
#endif

namespace detail {

/// Checks one 16-byte block with the nibble-lookup algorithm, see:
/// https://github.com/simdjson/simdjson/tree/master/src/generic/stage1/utf8_lookup4_algorithm.h
///
/// @param current  the 16 bytes being checked.
/// @param previous the 16 bytes immediately before `current`; needed because a
///                 sequence may straddle the block boundary.
/// @return a vector that is all-zero iff no error was detected in this block.
UTF8_TARGET_SSSE3 inline __m128i sse_lookup_block_errors(__m128i current,
                                                         __m128i previous) noexcept {
    // Error bits. Each pair of adjacent bytes (byte 1, byte 2) is classified by
    // three 16-entry lookups; a bit that survives the AND of all three is an
    // error of that kind.
    constexpr std::uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                                // 11______ 11______
    constexpr std::uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
    constexpr std::uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
    constexpr std::uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
    constexpr std::uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
    constexpr std::uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
    constexpr std::uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                                // 11110100 101_____
                                                // 11110101 1001____
                                                // 11110101 101_____
                                                // 1111011_ 1001____
                                                // 1111011_ 101_____
                                                // 11111___ 1001____
                                                // 11111___ 101_____
    constexpr std::uint8_t TOO_LARGE_1000 = 1 << 6;
                                                // 11110101 1000____
                                                // 1111011_ 1000____
                                                // 11111___ 1000____
    constexpr std::uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
    // These all have ____ in the low nibble of byte 1.
    constexpr std::uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;

    // Indexed by the high nibble of byte 1.
    alignas(16) static constexpr std::uint8_t byte_1_high[16] = {
        // 0_______ ________ <ASCII in byte 1>
        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
        // 10______ ________ <continuation in byte 1>
        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
        // 1100____ ________ <two byte lead in byte 1>
        TOO_SHORT | OVERLONG_2,
        // 1101____ ________ <two byte lead in byte 1>
        TOO_SHORT,
        // 1110____ ________ <three byte lead in byte 1>
        TOO_SHORT | OVERLONG_3 | SURROGATE,
        // 1111____ ________ <four+ byte lead in byte 1>
        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4,
    };
    // Indexed by the low nibble of byte 1.
    alignas(16) static constexpr std::uint8_t byte_1_low[16] = {
        // ____0000 ________
        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
        // ____0001 ________
        CARRY | OVERLONG_2,
        // ____001_ ________
        CARRY, CARRY,
        // ____0100 ________
        CARRY | TOO_LARGE,
        // ____0101 ________
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        // ____011_ ________
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        // ____1___ ________
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        // ____1101 ________
        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        CARRY | TOO_LARGE | TOO_LARGE_1000,
    };
    // Indexed by the high nibble of byte 2.
    alignas(16) static constexpr std::uint8_t byte_2_high[16] = {
        // ________ 0_______ <ASCII in byte 2>
        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
        // ________ 1000____
        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
        // ________ 1001____
        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
        // ________ 101_____
        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
        // ________ 11______
        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    };

    const __m128i table_1_high =
        _mm_load_si128(reinterpret_cast<const __m128i *>(byte_1_high));
    const __m128i table_1_low =
        _mm_load_si128(reinterpret_cast<const __m128i *>(byte_1_low));
    const __m128i table_2_high =
        _mm_load_si128(reinterpret_cast<const __m128i *>(byte_2_high));
    const __m128i nibble_mask = _mm_set1_epi8(0x0f);

    // prev1[k] is the byte preceding current[k] (taken from `previous` for k == 0).
    const __m128i prev1 = _mm_alignr_epi8(current, previous, 16 - 1);

    // There is no 8-bit shift, so shift 16-bit lanes and mask off the bits
    // that leaked in from the neighbouring byte.
    const __m128i prev1_high = _mm_and_si128(_mm_srli_epi16(prev1, 4), nibble_mask);
    const __m128i prev1_low = _mm_and_si128(prev1, nibble_mask);
    const __m128i current_high = _mm_and_si128(_mm_srli_epi16(current, 4), nibble_mask);

    const __m128i special_cases = _mm_and_si128(
        _mm_and_si128(_mm_shuffle_epi8(table_1_high, prev1_high),
                      _mm_shuffle_epi8(table_1_low, prev1_low)),
        _mm_shuffle_epi8(table_2_high, current_high));

    // Multi-byte lengths: the byte two positions after a 3/4-byte lead, and
    // the byte three positions after a 4-byte lead, must be continuations.
    const __m128i prev2 = _mm_alignr_epi8(current, previous, 16 - 2);
    const __m128i prev3 = _mm_alignr_epi8(current, previous, 16 - 3);
    // Saturating subtraction: only 111_____ stays > 0 ...
    const __m128i is_third_byte =
        _mm_subs_epu8(prev2, _mm_set1_epi8(static_cast<char>(0b11100000u - 1)));
    // ... and only 1111____ stays > 0.
    const __m128i is_fourth_byte =
        _mm_subs_epu8(prev3, _mm_set1_epi8(static_cast<char>(0b11110000u - 1)));
    // The differences are small positive values, so a signed compare is fine.
    const __m128i must_be_continuation = _mm_cmpgt_epi8(
        _mm_or_si128(is_third_byte, is_fourth_byte), _mm_setzero_si128());
    const __m128i must_be_continuation_80 =
        _mm_and_si128(must_be_continuation, _mm_set1_epi8(static_cast<char>(0x80)));

    // Bit 7 of special_cases is TWO_CONTS. It is legitimate exactly where a
    // continuation is required, so XOR clears it there and raises it where a
    // required continuation is missing.
    return _mm_xor_si128(must_be_continuation_80, special_cases);
}

} // namespace detail

/// UTF-8 validation using the simdjson-style nibble-lookup algorithm,
/// 16 bytes at a time (requires SSSE3).
///
/// @param src bytes to validate (may be empty).
/// @return true if no error was detected in `src`.
UTF8_TARGET_SSSE3 inline bool validate_utf8_sse_lookup(std::string_view src) noexcept {
    const auto *bytes = reinterpret_cast<const std::uint8_t *>(src.data());
    const std::size_t len = src.size();
    const __m128i zero = _mm_setzero_si128();
    std::size_t i = 0;

    // Skip leading blocks that are pure ASCII. movemask gathers the top bit of
    // every byte, so a zero mask means all 16 bytes are < 0x80.
    for (; i + 15 < len; i += 16) {
        const __m128i chunk =
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes + i));
        if (_mm_movemask_epi8(chunk) != 0) {
            break;
        }
    }

    // The skipped blocks were ASCII, so an all-zero (also ASCII) block is an
    // equivalent predecessor for the first block that gets checked.
    __m128i previous_block = zero;
    __m128i errors = zero;

    for (; i + 15 < len; i += 16) {
        const __m128i current_block =
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes + i));
        errors = _mm_or_si128(
            errors, detail::sse_lookup_block_errors(current_block, previous_block));
        previous_block = current_block;
        if (_mm_movemask_epi8(_mm_cmpeq_epi8(errors, zero)) != 0xffff) {
            // Some byte of the error accumulator is non-zero.
            return false;
        }
    }

    // Tail: at most 15 bytes remain. Copy them into a block pre-filled with
    // spaces; the ASCII padding makes a sequence cut off at the end of the
    // input show up as an error.
    __m128i tail_block = _mm_set1_epi8(0x20);
    if (i < len) {
        std::memcpy(&tail_block, bytes + i, len - i);
    }
    errors = _mm_or_si128(
        errors, detail::sse_lookup_block_errors(tail_block, previous_block));

    // Valid only if every byte of the error accumulator is zero.
    return _mm_movemask_epi8(_mm_cmpeq_epi8(errors, zero)) == 0xffff;
}
}; // namespace utf8 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The title's a bit of a joke. This is a conversion of simdutf8's validator (at the time) into SIMD calls; it's fairly lengthy.