aqrit/latin_to_utf8.cpp Secret

## latin_to_utf8.cpp
#include <cstddef>
#include <cstdint>
#include <string.h>
#include <bit>
#include <climits>
#include <cstdio>

#include <nmmintrin.h>


// The destination buffer size is unknown. The buffer is required to
// have enough space for decoding. But it is not required to be
// over-provisioned.
//
size_t sse_convert_latin1_to_utf8(
    size_t input_length,
    const char* latin_input,
    char* utf8_output)
{
    const uint8_t* src = (const uint8_t*)latin_input;
    const uint8_t* src_end = &src[input_length];
    uint8_t* dst = (uint8_t*)utf8_output;

    static const uint64_t compress_table[16] = {
        0x03FFFFFF07050301, 0x04FFFF0705030100, 0x04FFFF0705030201,
        0x05FF070503020100, 0x04FFFF0705040301, 0x05FF070504030100,
        0x05FF070504030201, 0x0607050403020100, 0x04FFFF0706050301,
        0x05FF070605030100, 0x05FF070605030201, 0x0607060503020100,
        0x05FF070605040301, 0x0607060504030100, 0x0607060504030201,
        0x0706050403020100
    };

    // Four inputs bytes expand to 4..8 bytes. Therefore, each 8 byte
    // store potentially outputs 4 bytes of garbage. So break out of
    // chunking loop w/at least 4 input bytes remaining.
    if (input_length >= (16 + 4)) {
        const uint8_t* chunking_end = &src[((input_length - 4) | 15) ^ 15];
        do {
            __m128i v = _mm_loadu_si128((__m128i*)src);
            src += 16;

            size_t mask = (size_t)(unsigned)_mm_movemask_epi8(v);
            if(mask == 0) { // US ASCII fast path
                _mm_storeu_si128((__m128i*)dst, v);
                dst += 16;
                continue;
            }

            // If the input char's MSB is set then it becomes 2 bytes in UTF8.
            // The UTF8 prefix byte would be either 0xC2 or --if bit6 is set-- 0xC3.
            // Always clear bit6 of any continuation byte.
            __m128i continuation = _mm_and_si128(_mm_set1_epi8(-65), v);
            __m128i is_bit6_zero = _mm_cmpeq_epi8(v, continuation);
            v = _mm_blendv_epi8(v, continuation, v);
            __m128i prefix = _mm_add_epi8(is_bit6_zero, _mm_set1_epi8(-61));

            // Extend every input char to two bytes. Then conditionally
            // discard any unwanted prefix bytes using a shuffle.
            __m128i lo = _mm_unpacklo_epi8(prefix, v);
            __m128i hi = _mm_unpackhi_epi8(prefix, v);
            uint64_t shuf0 = compress_table[mask & 0x0F];
            uint64_t shuf1 = compress_table[(mask >> 4) & 0x0F];
            uint64_t shuf2 = compress_table[(mask >> 8) & 0x0F];
            uint64_t shuf3 = compress_table[mask >> 12];
            __m128i out0 = _mm_shuffle_epi8(lo, _mm_cvtsi64_si128(shuf0));
            __m128i out1 = _mm_shuffle_epi8(lo, _mm_cvtsi64_si128(shuf1 | 0x0808080808080808));
            __m128i out2 = _mm_shuffle_epi8(hi, _mm_cvtsi64_si128(shuf2));
            __m128i out3 = _mm_shuffle_epi8(hi, _mm_cvtsi64_si128(shuf3 | 0x0808080808080808));

            _mm_storel_epi64((__m128i*)dst, out0); dst += (shuf0 >> 56) + 1;
            _mm_storel_epi64((__m128i*)dst, out1); dst += (shuf1 >> 56) + 1;
            _mm_storel_epi64((__m128i*)dst, out2); dst += (shuf2 >> 56) + 1;
            _mm_storel_epi64((__m128i*)dst, out3); dst += (shuf3 >> 56) + 1;
        } while (src < chunking_end);
    }

    // scalar tail loop for up to 19 remaining input bytes
    while (src < src_end) {
        uint8_t x = *src++;
        uint8_t fixbits = x >> 6;
        *dst = fixbits | 0xC0;
        dst += x >> 7;
        *dst++ = x ^ ((fixbits + 0x3D) & 0x40);
    }

    // we know we're on a amd64...
    return (size_t)(dst - (uint8_t*)utf8_output);
}

std::size_t scalar_convert_latin1_to_utf8((std::size_t len, unsigned char* src, unsigned char* dst){int fn(std::size_t len, unsigned char* src, unsigned char* dst){
    using namespace std;
    const unsigned char *const src_end = &src[len];
    size_t n = 0; // dst index to avoid UB with ptrdiff_t ...
    if constexpr ((CHAR_BIT == 8) && (sizeof(size_t) >= 4)) {
        if (len >= sizeof(size_t)) {
            unsigned char* vec_end = &src[len - (sizeof(size_t) - 1)];
            while (src < vec_end) {
                size_t v;
                memcpy(&v, src, sizeof(v));
                memcpy(&dst[n], &v, sizeof(v));
                src += sizeof(v);
                n += sizeof(v);

                constexpr size_t mask_msb_epi8 = ((~size_t{0}) / 0xFF) << 7;
                v &= mask_msb_epi8; // v &= 0x8080808080808080
                if (v) {
                    size_t backtrack;
                    if constexpr (endian::native == endian::little) { // C++20
                        backtrack = sizeof(v) - (countr_zero(v) >> 3);
                    } else if constexpr (endian::native == endian::big) {
                        backtrack = sizeof(v) - (countl_zero(v) >> 3);
                    } else {
                        unsigned char* s = src - sizeof(v);
                        size_t i = 0;
                        do {
                            if (s[i] & 0x80) break;
                        } while (++i < sizeof(v));
                        backtrack = sizeof(v) - i;
                    }
                    src -= backtrack;
                    n -= backtrack;

                    unsigned char x = *src++;
                    dst[n++] = (x >> 6) | 0xC0;
                    dst[n++] = x & 0xBF;
                }
            }
        }
    }

    // tail loop
    while (src < src_end) {
        unsigned char x = *src++;
        unsigned char msb = x >> 7;
        dst[n] = (x >> 6) | 0xC0; // lead_byte.. if has multi_byte
        n += msb; // if not multi_byte then overwrite lead_byte
        dst[n++] = x & ~(msb << 6); // if bit7 then clear bit6
    }

    return n;
}
	#include <cstddef>
	#include <cstdint>
	#include <string.h>
	#include <bit>
	#include <climits>
	#include <cstdio>

	#include <nmmintrin.h>


	// The destination buffer size is unknown. The buffer is required to
	// have enough space for decoding. But it is not required to be
	// over-provisioned.
	//
	size_t sse_convert_latin1_to_utf8(
	size_t input_length,
	const char* latin_input,
	char* utf8_output)
	{
	const uint8_t* src = (const uint8_t*)latin_input;
	const uint8_t* src_end = &src[input_length];
	uint8_t* dst = (uint8_t*)utf8_output;

	static const uint64_t compress_table[16] = {
	0x03FFFFFF07050301, 0x04FFFF0705030100, 0x04FFFF0705030201,
	0x05FF070503020100, 0x04FFFF0705040301, 0x05FF070504030100,
	0x05FF070504030201, 0x0607050403020100, 0x04FFFF0706050301,
	0x05FF070605030100, 0x05FF070605030201, 0x0607060503020100,
	0x05FF070605040301, 0x0607060504030100, 0x0607060504030201,
	0x0706050403020100
	};

	// Four inputs bytes expand to 4..8 bytes. Therefore, each 8 byte
	// store potentially outputs 4 bytes of garbage. So break out of
	// chunking loop w/at least 4 input bytes remaining.
	if (input_length >= (16 + 4)) {
	const uint8_t* chunking_end = &src[((input_length - 4) \| 15) ^ 15];
	do {
	__m128i v = _mm_loadu_si128((__m128i*)src);
	src += 16;

	size_t mask = (size_t)(unsigned)_mm_movemask_epi8(v);
	if(mask == 0) { // US ASCII fast path
	_mm_storeu_si128((__m128i*)dst, v);
	dst += 16;
	continue;
	}

	// If the input char's MSB is set then it becomes 2 bytes in UTF8.
	// The UTF8 prefix byte would be either 0xC2 or --if bit6 is set-- 0xC3.
	// Always clear bit6 of any continuation byte.
	__m128i continuation = _mm_and_si128(_mm_set1_epi8(-65), v);
	__m128i is_bit6_zero = _mm_cmpeq_epi8(v, continuation);
	v = _mm_blendv_epi8(v, continuation, v);
	__m128i prefix = _mm_add_epi8(is_bit6_zero, _mm_set1_epi8(-61));

	// Extend every input char to two bytes. Then conditionally
	// discard any unwanted prefix bytes using a shuffle.
	__m128i lo = _mm_unpacklo_epi8(prefix, v);
	__m128i hi = _mm_unpackhi_epi8(prefix, v);
	uint64_t shuf0 = compress_table[mask & 0x0F];
	uint64_t shuf1 = compress_table[(mask >> 4) & 0x0F];
	uint64_t shuf2 = compress_table[(mask >> 8) & 0x0F];
	uint64_t shuf3 = compress_table[mask >> 12];
	__m128i out0 = _mm_shuffle_epi8(lo, _mm_cvtsi64_si128(shuf0));
	__m128i out1 = _mm_shuffle_epi8(lo, _mm_cvtsi64_si128(shuf1 \| 0x0808080808080808));
	__m128i out2 = _mm_shuffle_epi8(hi, _mm_cvtsi64_si128(shuf2));
	__m128i out3 = _mm_shuffle_epi8(hi, _mm_cvtsi64_si128(shuf3 \| 0x0808080808080808));

	_mm_storel_epi64((__m128i*)dst, out0); dst += (shuf0 >> 56) + 1;
	_mm_storel_epi64((__m128i*)dst, out1); dst += (shuf1 >> 56) + 1;
	_mm_storel_epi64((__m128i*)dst, out2); dst += (shuf2 >> 56) + 1;
	_mm_storel_epi64((__m128i*)dst, out3); dst += (shuf3 >> 56) + 1;
	} while (src < chunking_end);
	}

	// scalar tail loop for up to 19 remaining input bytes
	while (src < src_end) {
	uint8_t x = *src++;
	uint8_t fixbits = x >> 6;
	*dst = fixbits \| 0xC0;
	dst += x >> 7;
	*dst++ = x ^ ((fixbits + 0x3D) & 0x40);
	}

	// we know we're on a amd64...
	return (size_t)(dst - (uint8_t*)utf8_output);
	}

	std::size_t scalar_convert_latin1_to_utf8((std::size_t len, unsigned char* src, unsigned char* dst){int fn(std::size_t len, unsigned char* src, unsigned char* dst){
	using namespace std;
	const unsigned char *const src_end = &src[len];
	size_t n = 0; // dst index to avoid UB with ptrdiff_t ...
	if constexpr ((CHAR_BIT == 8) && (sizeof(size_t) >= 4)) {
	if (len >= sizeof(size_t)) {
	unsigned char* vec_end = &src[len - (sizeof(size_t) - 1)];
	while (src < vec_end) {
	size_t v;
	memcpy(&v, src, sizeof(v));
	memcpy(&dst[n], &v, sizeof(v));
	src += sizeof(v);
	n += sizeof(v);

	constexpr size_t mask_msb_epi8 = ((~size_t{0}) / 0xFF) << 7;
	v &= mask_msb_epi8; // v &= 0x8080808080808080
	if (v) {
	size_t backtrack;
	if constexpr (endian::native == endian::little) { // C++20
	backtrack = sizeof(v) - (countr_zero(v) >> 3);
	} else if constexpr (endian::native == endian::big) {
	backtrack = sizeof(v) - (countl_zero(v) >> 3);
	} else {
	unsigned char* s = src - sizeof(v);
	size_t i = 0;
	do {
	if (s[i] & 0x80) break;
	} while (++i < sizeof(v));
	backtrack = sizeof(v) - i;
	}
	src -= backtrack;
	n -= backtrack;

	unsigned char x = *src++;
	dst[n++] = (x >> 6) \| 0xC0;
	dst[n++] = x & 0xBF;
	}
	}
	}
	}

	// tail loop
	while (src < src_end) {
	unsigned char x = *src++;
	unsigned char msb = x >> 7;
	dst[n] = (x >> 6) \| 0xC0; // lead_byte.. if has multi_byte
	n += msb; // if not multi_byte then overwrite lead_byte
	dst[n++] = x & ~(msb << 6); // if bit7 then clear bit6
	}

	return n;
	}