-
-
Save aqrit/5c914da98006874d0401983eb687e30e to your computer and use it in GitHub Desktop.
two cache lines used for the shuffle table. untested. not benchmarked.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstddef> | |
#include <cstdint> | |
#include <string.h> | |
#include <bit> | |
#include <climits> | |
#include <cstdio> | |
#include <nmmintrin.h> | |
// The destination buffer size is unknown. The buffer is required to | |
// have enough space for decoding. But it is not required to be | |
// over-provisioned. | |
// | |
size_t sse_convert_latin1_to_utf8( | |
size_t input_length, | |
const char* latin_input, | |
char* utf8_output) | |
{ | |
const uint8_t* src = (const uint8_t*)latin_input; | |
const uint8_t* src_end = &src[input_length]; | |
uint8_t* dst = (uint8_t*)utf8_output; | |
static const uint64_t compress_table[16] = { | |
0x03FFFFFF07050301, 0x04FFFF0705030100, 0x04FFFF0705030201, | |
0x05FF070503020100, 0x04FFFF0705040301, 0x05FF070504030100, | |
0x05FF070504030201, 0x0607050403020100, 0x04FFFF0706050301, | |
0x05FF070605030100, 0x05FF070605030201, 0x0607060503020100, | |
0x05FF070605040301, 0x0607060504030100, 0x0607060504030201, | |
0x0706050403020100 | |
}; | |
// Four inputs bytes expand to 4..8 bytes. Therefore, each 8 byte | |
// store potentially outputs 4 bytes of garbage. So break out of | |
// chunking loop w/at least 4 input bytes remaining. | |
if (input_length >= (16 + 4)) { | |
const uint8_t* chunking_end = &src[((input_length - 4) | 15) ^ 15]; | |
do { | |
__m128i v = _mm_loadu_si128((__m128i*)src); | |
src += 16; | |
size_t mask = (size_t)(unsigned)_mm_movemask_epi8(v); | |
if(mask == 0) { // US ASCII fast path | |
_mm_storeu_si128((__m128i*)dst, v); | |
dst += 16; | |
continue; | |
} | |
// If the input char's MSB is set then it becomes 2 bytes in UTF8. | |
// The UTF8 prefix byte would be either 0xC2 or --if bit6 is set-- 0xC3. | |
// Always clear bit6 of any continuation byte. | |
__m128i continuation = _mm_and_si128(_mm_set1_epi8(-65), v); | |
__m128i is_bit6_zero = _mm_cmpeq_epi8(v, continuation); | |
v = _mm_blendv_epi8(v, continuation, v); | |
__m128i prefix = _mm_add_epi8(is_bit6_zero, _mm_set1_epi8(-61)); | |
// Extend every input char to two bytes. Then conditionally | |
// discard any unwanted prefix bytes using a shuffle. | |
__m128i lo = _mm_unpacklo_epi8(prefix, v); | |
__m128i hi = _mm_unpackhi_epi8(prefix, v); | |
uint64_t shuf0 = compress_table[mask & 0x0F]; | |
uint64_t shuf1 = compress_table[(mask >> 4) & 0x0F]; | |
uint64_t shuf2 = compress_table[(mask >> 8) & 0x0F]; | |
uint64_t shuf3 = compress_table[mask >> 12]; | |
__m128i out0 = _mm_shuffle_epi8(lo, _mm_cvtsi64_si128(shuf0)); | |
__m128i out1 = _mm_shuffle_epi8(lo, _mm_cvtsi64_si128(shuf1 | 0x0808080808080808)); | |
__m128i out2 = _mm_shuffle_epi8(hi, _mm_cvtsi64_si128(shuf2)); | |
__m128i out3 = _mm_shuffle_epi8(hi, _mm_cvtsi64_si128(shuf3 | 0x0808080808080808)); | |
_mm_storel_epi64((__m128i*)dst, out0); dst += (shuf0 >> 56) + 1; | |
_mm_storel_epi64((__m128i*)dst, out1); dst += (shuf1 >> 56) + 1; | |
_mm_storel_epi64((__m128i*)dst, out2); dst += (shuf2 >> 56) + 1; | |
_mm_storel_epi64((__m128i*)dst, out3); dst += (shuf3 >> 56) + 1; | |
} while (src < chunking_end); | |
} | |
// scalar tail loop for up to 19 remaining input bytes | |
while (src < src_end) { | |
uint8_t x = *src++; | |
uint8_t fixbits = x >> 6; | |
*dst = fixbits | 0xC0; | |
dst += x >> 7; | |
*dst++ = x ^ ((fixbits + 0x3D) & 0x40); | |
} | |
// we know we're on a amd64... | |
return (size_t)(dst - (uint8_t*)utf8_output); | |
} | |
std::size_t scalar_convert_latin1_to_utf8((std::size_t len, unsigned char* src, unsigned char* dst){int fn(std::size_t len, unsigned char* src, unsigned char* dst){ | |
using namespace std; | |
const unsigned char *const src_end = &src[len]; | |
size_t n = 0; // dst index to avoid UB with ptrdiff_t ... | |
if constexpr ((CHAR_BIT == 8) && (sizeof(size_t) >= 4)) { | |
if (len >= sizeof(size_t)) { | |
unsigned char* vec_end = &src[len - (sizeof(size_t) - 1)]; | |
while (src < vec_end) { | |
size_t v; | |
memcpy(&v, src, sizeof(v)); | |
memcpy(&dst[n], &v, sizeof(v)); | |
src += sizeof(v); | |
n += sizeof(v); | |
constexpr size_t mask_msb_epi8 = ((~size_t{0}) / 0xFF) << 7; | |
v &= mask_msb_epi8; // v &= 0x8080808080808080 | |
if (v) { | |
size_t backtrack; | |
if constexpr (endian::native == endian::little) { // C++20 | |
backtrack = sizeof(v) - (countr_zero(v) >> 3); | |
} else if constexpr (endian::native == endian::big) { | |
backtrack = sizeof(v) - (countl_zero(v) >> 3); | |
} else { | |
unsigned char* s = src - sizeof(v); | |
size_t i = 0; | |
do { | |
if (s[i] & 0x80) break; | |
} while (++i < sizeof(v)); | |
backtrack = sizeof(v) - i; | |
} | |
src -= backtrack; | |
n -= backtrack; | |
unsigned char x = *src++; | |
dst[n++] = (x >> 6) | 0xC0; | |
dst[n++] = x & 0xBF; | |
} | |
} | |
} | |
} | |
// tail loop | |
while (src < src_end) { | |
unsigned char x = *src++; | |
unsigned char msb = x >> 7; | |
dst[n] = (x >> 6) | 0xC0; // lead_byte.. if has multi_byte | |
n += msb; // if not multi_byte then overwrite lead_byte | |
dst[n++] = x & ~(msb << 6); // if bit7 then clear bit6 | |
} | |
return n; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Seems, there's a possibility of a buffer overflow.
_mm_storel_epi64
always writes 8 bytes, whereas output expects 4-8.