@Const-me
Created October 30, 2023 15:45
#include <immintrin.h>
#include <stdint.h>
// 1 = use `vpgatherdq` to load 4 numbers with 1 instruction, 0 = load them with scalar loads
// It seems on AMD CPUs scalar loads are slightly faster
#define USE_GATHER_INSTRUCTIONS 0
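// Encoded block layout, as I read it from the decoder below; the matching encoder is not
// part of this gist, so treat this as an inferred description rather than a spec.
// Each block stores 16 int64_t numbers: an 8-byte header followed by a variable-length payload.
// Header byte [ i ] describes numbers 2*i (low nibble) and 2*i+1 (high nibble).
// In each nibble, bits 0-2 are the payload length in bytes, with the code 7 meaning 8 bytes,
// and bit 3 is set when the number is negative.
// The payload is the concatenation of the magnitudes, each stored in that many little-endian bytes.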
// Inclusive prefix sum of unsigned bytes = offsets of the ends of the numbers
// When the sum of all bytes exceeds 0xFF, the output is garbage,
// which is fine here because our bytes are in the [ 0 .. 8 ] interval
inline __m128i inclusivePrefixSum( __m128i v )
{
    // https://en.wikipedia.org/wiki/Prefix_sum#/media/File:Hillis-Steele_Prefix_Sum.svg
    v = _mm_add_epi8( v, _mm_slli_si128( v, 1 ) );
    v = _mm_add_epi8( v, _mm_slli_si128( v, 2 ) );
    v = _mm_add_epi8( v, _mm_slli_si128( v, 4 ) );
    v = _mm_add_epi8( v, _mm_slli_si128( v, 8 ) );
    return v;
}
// Extract the last byte from the vector
inline uint8_t extractLast( __m128i v )
{
    uint16_t tmp = _mm_extract_epi16( v, 7 );
    tmp >>= 8;
    return (uint8_t)tmp;
}
// Load 4 uint64_t numbers from the correct locations, without AVX2 gathers
inline __m256i loadNumbers( const uint8_t* rsi, uint32_t offsets )
{
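    // Note: each offset is the inclusive prefix sum of the lengths, so rsi + offset points
    // 8 bytes before the end of that number's payload. Each 8-byte load window therefore ends
    // exactly at the end of the number, the wanted bytes land in the high part of the uint64_t,
    // and shiftNumbers() moves them down, discarding the unwanted low bytes.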
    const int64_t* s0 = (const int64_t*)( rsi + (uint8_t)offsets );
    const int64_t* s1 = (const int64_t*)( rsi + (uint8_t)( offsets >> 8 ) );
    const int64_t* s2 = (const int64_t*)( rsi + (uint8_t)( offsets >> 16 ) );
    const int64_t* s3 = (const int64_t*)( rsi + ( offsets >> 24 ) );
    return _mm256_setr_epi64x( *s0, *s1, *s2, *s3 );
}
// Load 4 uint64_t numbers from the correct locations, using AVX2 gathers
inline __m256i loadNumbers( const uint8_t* rsi, __m128i offsets )
{
    // Zero extend bytes to int32
    __m128i off = _mm_cvtepu8_epi32( offsets );
    // Load 4 numbers with 1 instruction; unfortunately, on AMD this is slower
    return _mm256_i32gather_epi64( (const int64_t*)rsi, off, 1 );
}
// Shift the highest ( 64 - bits[ i ] ) bits in the int64 numbers into the low position
inline __m256i shiftNumbers( __m256i v, __m128i bits )
{
    // Zero extend bytes to int64
    __m256i shift = _mm256_cvtepu8_epi64( bits );
    // Shift the numbers; srlv zeroes a lane when its count exceeds 63,
    // which is what makes zero-length numbers decode to 0
    return _mm256_srlv_epi64( v, shift );
}
// Conditionally negate int64 numbers based on the 0x80 bit in the lowest 4 bytes of the second argument
inline __m256i applySigns( __m256i v, __m128i signs )
{
// Sign extend the masks from bytes to int64
__m256i mask = _mm256_cvtepi8_epi64( signs );
// Conditionally negate
__m256i neg = _mm256_sub_epi64( _mm256_setzero_si256(), v );
return _mm256_blendv_epi8( v, neg, mask );
}
struct BlockHeader
{
    // Load offsets in bytes relative to the start of the block header
    __m128i offsetBytes;
    // Right shift amounts to move loaded values to the correct positions, [ 0 .. 64 ]
    __m128i shifts;
    // 16 bytes with the 0x80 bit set when the corresponding input was negative; the rest of the bits are unused
    __m128i signs;
    // Count of payload bytes in the complete block
    size_t payloadBytes;
};
inline BlockHeader loadHeader( const uint8_t* rsi )
{
    // Load 8 bytes, and zero extend them into uint16_t
    const __m128i v = _mm_cvtepu8_epi16( _mm_loadu_si64( rsi ) );
    // Unpack lengths
    const __m128i seven = _mm_set1_epi8( 7 );
    const __m128i l4 = _mm_slli_epi16( v, 4 );
    __m128i lengths = _mm_or_si128( v, l4 );
    lengths = _mm_and_si128( lengths, seven );
    // Transform 7 into 8: cmpeq makes the byte -1 where the length is 7, and subtracting -1 adds 1
    __m128i tmp = _mm_cmpeq_epi8( lengths, seven );
    lengths = _mm_sub_epi8( lengths, tmp );
    BlockHeader header;
    // Byte offsets to load 16 numbers, relative to the start of the header
    header.offsetBytes = inclusivePrefixSum( lengths );
    // Count of payload bytes in the complete block
    header.payloadBytes = extractLast( header.offsetBytes );
    // Shift amounts, 64 - lengths * 8
    header.shifts = _mm_sub_epi8( _mm_set1_epi8( 64 ), _mm_slli_epi16( lengths, 3 ) );
    // Signs vector, we only use the highest 0x80 bit in these bytes
    header.signs = _mm_or_si128( _mm_slli_epi16( v, 8 ), l4 );
    return header;
}
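// For reference, here is a scalar encoder that produces the block layout assumed above.
// This is my reconstruction from the decoder in this gist, not the author's original code,
// and the name encodeBlockScalar is made up; treat it as a sketch.
// Encodes exactly 16 numbers into rdi, and returns the pointer past the written block.
inline uint8_t* encodeBlockScalar( uint8_t* rdi, const int64_t* rsi )
{
    uint8_t* const header = rdi;
    uint8_t* payload = rdi + 8;
    for( int i = 0; i < 16; i++ )
    {
        const int64_t x = rsi[ i ];
        // Magnitude of the number; unsigned math avoids UB when x == INT64_MIN
        const uint64_t mag = ( x < 0 ) ? ( 0 - (uint64_t)x ) : (uint64_t)x;
        // Count of payload bytes; the 3-bit length code can't express 7, so 7 bytes are stored as 8
        uint32_t len = 0;
        while( len < 8 && ( mag >> ( len * 8 ) ) != 0 )
            len++;
        if( len == 7 )
            len = 8;
        // 4-bit header field: bits 0-2 = length code, where 7 means 8 bytes; bit 3 = sign
        uint32_t field = ( len == 8 ) ? 7 : len;
        if( x < 0 )
            field |= 8;
        // Even numbers go to the low nibble of the header byte, odd numbers to the high nibble
        if( 0 == ( i % 2 ) )
            header[ i / 2 ] = (uint8_t)field;
        else
            header[ i / 2 ] |= (uint8_t)( field << 4 );
        // Append the magnitude, little endian, using `len` bytes
        for( uint32_t j = 0; j < len; j++ )
            payload[ j ] = (uint8_t)( mag >> ( j * 8 ) );
        payload += len;
    }
    return payload;
}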
template<int slice>
inline void decodeSlice( const BlockHeader& block, int64_t* rdi, const uint8_t* payload )
{
#if USE_GATHER_INSTRUCTIONS
    __m128i off;
#else
    uint32_t off;
#endif
    __m128i bits, s;
    if constexpr( slice != 0 )
    {
        constexpr int imm = _MM_SHUFFLE( slice, slice, slice, slice );
#if USE_GATHER_INSTRUCTIONS
        off = _mm_shuffle_epi32( block.offsetBytes, imm );
#else
        off = (uint32_t)_mm_extract_epi32( block.offsetBytes, slice );
#endif
        bits = _mm_shuffle_epi32( block.shifts, imm );
        s = _mm_shuffle_epi32( block.signs, imm );
    }
    else
    {
        // For the first slice of the block, the 4 lowest bytes are in the correct locations already
#if USE_GATHER_INSTRUCTIONS
        off = block.offsetBytes;
#else
        off = (uint32_t)_mm_cvtsi128_si32( block.offsetBytes );
#endif
        bits = block.shifts;
        s = block.signs;
    }
    __m256i v = loadNumbers( payload, off );
    v = shiftNumbers( v, bits );
    v = applySigns( v, s );
    _mm256_storeu_si256( ( __m256i* )rdi, v );
}
// Decode and store a block of 16 numbers, and return pointer to the next encoded block.
// BTW, it helps to make sure this function is inlined by the compiler
const uint8_t* decodeBlock( int64_t* rdi, const uint8_t* rsi )
{
    const BlockHeader block = loadHeader( rsi );
    decodeSlice<0>( block, rdi, rsi );
    decodeSlice<1>( block, rdi + 4, rsi );
    decodeSlice<2>( block, rdi + 8, rsi );
    decodeSlice<3>( block, rdi + 12, rsi );
    return rsi + block.payloadBytes + 8;
}
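// A minimal usage sketch, not part of the original gist: decode `count` numbers from a
// stream of consecutive blocks. It assumes `count` is a multiple of 16 and that the output
// buffer has room for all of them; handling a partial final block is out of scope here.
inline const uint8_t* decodeStream( int64_t* rdi, const uint8_t* rsi, size_t count )
{
    for( size_t i = 0; i < count; i += 16 )
        rsi = decodeBlock( rdi + i, rsi );
    return rsi;
}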