Skip to content

Instantly share code, notes, and snippets.

@aqrit
Last active October 22, 2023 04:43
Show Gist options
  • Save aqrit/5c914da98006874d0401983eb687e30e to your computer and use it in GitHub Desktop.
Save aqrit/5c914da98006874d0401983eb687e30e to your computer and use it in GitHub Desktop.
two cache lines used for the shuffle table. untested. not benchmarked.
#include <cstddef>
#include <cstdint>
#include <string.h>
#include <bit>
#include <climits>
#include <cstdio>
#include <nmmintrin.h>
// The destination buffer size is unknown. The buffer is required to
// have enough space for decoding. But it is not required to be
// over-provisioned.
//
size_t sse_convert_latin1_to_utf8(
size_t input_length,
const char* latin_input,
char* utf8_output)
{
const uint8_t* src = (const uint8_t*)latin_input;
const uint8_t* src_end = &src[input_length];
uint8_t* dst = (uint8_t*)utf8_output;
static const uint64_t compress_table[16] = {
0x03FFFFFF07050301, 0x04FFFF0705030100, 0x04FFFF0705030201,
0x05FF070503020100, 0x04FFFF0705040301, 0x05FF070504030100,
0x05FF070504030201, 0x0607050403020100, 0x04FFFF0706050301,
0x05FF070605030100, 0x05FF070605030201, 0x0607060503020100,
0x05FF070605040301, 0x0607060504030100, 0x0607060504030201,
0x0706050403020100
};
// Four inputs bytes expand to 4..8 bytes. Therefore, each 8 byte
// store potentially outputs 4 bytes of garbage. So break out of
// chunking loop w/at least 4 input bytes remaining.
if (input_length >= (16 + 4)) {
const uint8_t* chunking_end = &src[((input_length - 4) | 15) ^ 15];
do {
__m128i v = _mm_loadu_si128((__m128i*)src);
src += 16;
size_t mask = (size_t)(unsigned)_mm_movemask_epi8(v);
if(mask == 0) { // US ASCII fast path
_mm_storeu_si128((__m128i*)dst, v);
dst += 16;
continue;
}
// If the input char's MSB is set then it becomes 2 bytes in UTF8.
// The UTF8 prefix byte would be either 0xC2 or --if bit6 is set-- 0xC3.
// Always clear bit6 of any continuation byte.
__m128i continuation = _mm_and_si128(_mm_set1_epi8(-65), v);
__m128i is_bit6_zero = _mm_cmpeq_epi8(v, continuation);
v = _mm_blendv_epi8(v, continuation, v);
__m128i prefix = _mm_add_epi8(is_bit6_zero, _mm_set1_epi8(-61));
// Extend every input char to two bytes. Then conditionally
// discard any unwanted prefix bytes using a shuffle.
__m128i lo = _mm_unpacklo_epi8(prefix, v);
__m128i hi = _mm_unpackhi_epi8(prefix, v);
uint64_t shuf0 = compress_table[mask & 0x0F];
uint64_t shuf1 = compress_table[(mask >> 4) & 0x0F];
uint64_t shuf2 = compress_table[(mask >> 8) & 0x0F];
uint64_t shuf3 = compress_table[mask >> 12];
__m128i out0 = _mm_shuffle_epi8(lo, _mm_cvtsi64_si128(shuf0));
__m128i out1 = _mm_shuffle_epi8(lo, _mm_cvtsi64_si128(shuf1 | 0x0808080808080808));
__m128i out2 = _mm_shuffle_epi8(hi, _mm_cvtsi64_si128(shuf2));
__m128i out3 = _mm_shuffle_epi8(hi, _mm_cvtsi64_si128(shuf3 | 0x0808080808080808));
_mm_storel_epi64((__m128i*)dst, out0); dst += (shuf0 >> 56) + 1;
_mm_storel_epi64((__m128i*)dst, out1); dst += (shuf1 >> 56) + 1;
_mm_storel_epi64((__m128i*)dst, out2); dst += (shuf2 >> 56) + 1;
_mm_storel_epi64((__m128i*)dst, out3); dst += (shuf3 >> 56) + 1;
} while (src < chunking_end);
}
// scalar tail loop for up to 19 remaining input bytes
while (src < src_end) {
uint8_t x = *src++;
uint8_t fixbits = x >> 6;
*dst = fixbits | 0xC0;
dst += x >> 7;
*dst++ = x ^ ((fixbits + 0x3D) & 0x40);
}
// we know we're on a amd64...
return (size_t)(dst - (uint8_t*)utf8_output);
}
std::size_t scalar_convert_latin1_to_utf8((std::size_t len, unsigned char* src, unsigned char* dst){int fn(std::size_t len, unsigned char* src, unsigned char* dst){
using namespace std;
const unsigned char *const src_end = &src[len];
size_t n = 0; // dst index to avoid UB with ptrdiff_t ...
if constexpr ((CHAR_BIT == 8) && (sizeof(size_t) >= 4)) {
if (len >= sizeof(size_t)) {
unsigned char* vec_end = &src[len - (sizeof(size_t) - 1)];
while (src < vec_end) {
size_t v;
memcpy(&v, src, sizeof(v));
memcpy(&dst[n], &v, sizeof(v));
src += sizeof(v);
n += sizeof(v);
constexpr size_t mask_msb_epi8 = ((~size_t{0}) / 0xFF) << 7;
v &= mask_msb_epi8; // v &= 0x8080808080808080
if (v) {
size_t backtrack;
if constexpr (endian::native == endian::little) { // C++20
backtrack = sizeof(v) - (countr_zero(v) >> 3);
} else if constexpr (endian::native == endian::big) {
backtrack = sizeof(v) - (countl_zero(v) >> 3);
} else {
unsigned char* s = src - sizeof(v);
size_t i = 0;
do {
if (s[i] & 0x80) break;
} while (++i < sizeof(v));
backtrack = sizeof(v) - i;
}
src -= backtrack;
n -= backtrack;
unsigned char x = *src++;
dst[n++] = (x >> 6) | 0xC0;
dst[n++] = x & 0xBF;
}
}
}
}
// tail loop
while (src < src_end) {
unsigned char x = *src++;
unsigned char msb = x >> 7;
dst[n] = (x >> 6) | 0xC0; // lead_byte.. if has multi_byte
n += msb; // if not multi_byte then overwrite lead_byte
dst[n++] = x & ~(msb << 6); // if bit7 then clear bit6
}
return n;
}
@aspic-fish
Copy link

Seems, there's a possibility of a buffer overflow. _mm_storel_epi64 always writes 8 bytes, whereas output expects 4-8.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment