@Const-me
Created October 30, 2023 15:45
#include <immintrin.h>
#include <stdint.h>
// 1 = use `vpgatherdq` to load 4 numbers with 1 instruction, 0 = load them with scalar loads
// It seems on AMD CPUs scalar loads are slightly faster
#define USE_GATHER_INSTRUCTIONS 0
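// Encoded block layout, as I read it from the decoder below; the matching encoder is not
// part of this gist, so treat this as an inferred description rather than a spec.
// Each block stores 16 int64_t numbers: an 8-byte header followed by a variable-length payload.
// Header byte [ i ] describes numbers 2*i (low nibble) and 2*i+1 (high nibble).
// In each nibble, bits 0-2 are the payload length in bytes, with the code 7 meaning 8 bytes,
// and bit 3 is set when the number is negative.
// The payload is the concatenation of the magnitudes, each stored in that many little-endian bytes.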
// Inclusive prefix sum of unsigned bytes = offsets of the ends of the numbers
// When the sum of all bytes exceeds 0xFF, the output is garbage,
// which is fine here because our bytes are in the [ 0 .. 8 ] interval
inline __m128i inclusivePrefixSum( __m128i v )
{
    // https://en.wikipedia.org/wiki/Prefix_sum#/media/File:Hillis-Steele_Prefix_Sum.svg
    v = _mm_add_epi8( v, _mm_slli_si128( v, 1 ) );
    v = _mm_add_epi8( v, _mm_slli_si128( v, 2 ) );
    v = _mm_add_epi8( v, _mm_slli_si128( v, 4 ) );
    v = _mm_add_epi8( v, _mm_slli_si128( v, 8 ) );
    return v;
}
// Extract the last byte from the vector
inline uint8_t extractLast( __m128i v )
{
    uint16_t tmp = _mm_extract_epi16( v, 7 );
    tmp >>= 8;
    return (uint8_t)tmp;
}
// Load 4 uint64_t numbers from the correct locations, without AVX2 gathers
inline __m256i loadNumbers( const uint8_t* rsi, uint32_t offsets )
{
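    // Note: each offset is the inclusive prefix sum of the lengths, so rsi + offset points
    // 8 bytes before the end of that number's payload. Each 8-byte load window therefore ends
    // exactly at the end of the number, the wanted bytes land in the high part of the uint64_t,
    // and shiftNumbers() moves them down, discarding the unwanted low bytes.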
    const int64_t* s0 = (const int64_t*)( rsi + (uint8_t)offsets );
    const int64_t* s1 = (const int64_t*)( rsi + (uint8_t)( offsets >> 8 ) );
    const int64_t* s2 = (const int64_t*)( rsi + (uint8_t)( offsets >> 16 ) );
    const int64_t* s3 = (const int64_t*)( rsi + ( offsets >> 24 ) );
    return _mm256_setr_epi64x( *s0, *s1, *s2, *s3 );
}
// Load 4 uint64_t numbers from the correct locations, using AVX2 gathers
inline __m256i loadNumbers( const uint8_t* rsi, __m128i offsets )
{
    // Zero extend bytes to int32
    __m128i off = _mm_cvtepu8_epi32( offsets );
    // Load 4 numbers with 1 instruction; unfortunately, on AMD this is slower
    return _mm256_i32gather_epi64( (const int64_t*)rsi, off, 1 );
}
// Shift the highest ( 64 - bits[ i ] ) bits in the int64 numbers into the low position
inline __m256i shiftNumbers( __m256i v, __m128i bits )
{
    // Zero extend bytes to int64
    __m256i shift = _mm256_cvtepu8_epi64( bits );
    // Shift the numbers; srlv zeroes a lane when its count exceeds 63,
    // which is what makes zero-length numbers decode to 0
    return _mm256_srlv_epi64( v, shift );
}
// Conditionally negate int64 numbers based on the 0x80 bit in the lowest 4 bytes of the second argument
inline __m256i applySigns( __m256i v, __m128i signs )
{
// Sign extend the masks from bytes to int64
__m256i mask = _mm256_cvtepi8_epi64( signs );
// Conditionally negate
__m256i neg = _mm256_sub_epi64( _mm256_setzero_si256(), v );
return _mm256_blendv_epi8( v, neg, mask );
}
struct BlockHeader
{
    // Load offsets in bytes relative to the start of the block header
    __m128i offsetBytes;
    // Right shift amounts to move loaded values to the correct positions, [ 0 .. 64 ]
    __m128i shifts;
    // 16 bytes with the 0x80 bit set when the corresponding input was negative; the rest of the bits are unused
    __m128i signs;
    // Count of payload bytes in the complete block
    size_t payloadBytes;
};
inline BlockHeader loadHeader( const uint8_t* rsi )
{
    // Load 8 bytes, and zero extend them into uint16_t
    const __m128i v = _mm_cvtepu8_epi16( _mm_loadu_si64( rsi ) );
    // Unpack lengths
    const __m128i seven = _mm_set1_epi8( 7 );
    const __m128i l4 = _mm_slli_epi16( v, 4 );
    __m128i lengths = _mm_or_si128( v, l4 );
    lengths = _mm_and_si128( lengths, seven );
    // Transform 7 into 8: cmpeq makes the byte -1 where the length is 7, and subtracting -1 adds 1
    __m128i tmp = _mm_cmpeq_epi8( lengths, seven );
    lengths = _mm_sub_epi8( lengths, tmp );
    BlockHeader header;
    // Byte offsets to load 16 numbers, relative to the start of the header
    header.offsetBytes = inclusivePrefixSum( lengths );
    // Count of payload bytes in the complete block
    header.payloadBytes = extractLast( header.offsetBytes );
    // Shift amounts, 64 - lengths * 8
    header.shifts = _mm_sub_epi8( _mm_set1_epi8( 64 ), _mm_slli_epi16( lengths, 3 ) );
    // Signs vector, we only use the highest 0x80 bit in these bytes
    header.signs = _mm_or_si128( _mm_slli_epi16( v, 8 ), l4 );
    return header;
}
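// For reference, here is a scalar encoder that produces the block layout assumed above.
// This is my reconstruction from the decoder in this gist, not the author's original code,
// and the name encodeBlockScalar is made up; treat it as a sketch.
// Encodes exactly 16 numbers into rdi, and returns the pointer past the written block.
inline uint8_t* encodeBlockScalar( uint8_t* rdi, const int64_t* rsi )
{
    uint8_t* const header = rdi;
    uint8_t* payload = rdi + 8;
    for( int i = 0; i < 16; i++ )
    {
        const int64_t x = rsi[ i ];
        // Magnitude of the number; unsigned math avoids UB when x == INT64_MIN
        const uint64_t mag = ( x < 0 ) ? ( 0 - (uint64_t)x ) : (uint64_t)x;
        // Count of payload bytes; the 3-bit length code can't express 7, so 7 bytes are stored as 8
        uint32_t len = 0;
        while( len < 8 && ( mag >> ( len * 8 ) ) != 0 )
            len++;
        if( len == 7 )
            len = 8;
        // 4-bit header field: bits 0-2 = length code, where 7 means 8 bytes; bit 3 = sign
        uint32_t field = ( len == 8 ) ? 7 : len;
        if( x < 0 )
            field |= 8;
        // Even numbers go to the low nibble of the header byte, odd numbers to the high nibble
        if( 0 == ( i % 2 ) )
            header[ i / 2 ] = (uint8_t)field;
        else
            header[ i / 2 ] |= (uint8_t)( field << 4 );
        // Append the magnitude, little endian, using `len` bytes
        for( uint32_t j = 0; j < len; j++ )
            payload[ j ] = (uint8_t)( mag >> ( j * 8 ) );
        payload += len;
    }
    return payload;
}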
template<int slice>
inline void decodeSlice( const BlockHeader& block, int64_t* rdi, const uint8_t* payload )
{
#if USE_GATHER_INSTRUCTIONS
    __m128i off;
#else
    uint32_t off;
#endif
    __m128i bits, s;
    if constexpr( slice != 0 )
    {
        constexpr int imm = _MM_SHUFFLE( slice, slice, slice, slice );
#if USE_GATHER_INSTRUCTIONS
        off = _mm_shuffle_epi32( block.offsetBytes, imm );
#else
        off = (uint32_t)_mm_extract_epi32( block.offsetBytes, slice );
#endif
        bits = _mm_shuffle_epi32( block.shifts, imm );
        s = _mm_shuffle_epi32( block.signs, imm );
    }
    else
    {
        // For the first slice of the block, the 4 lowest bytes are in the correct locations already
#if USE_GATHER_INSTRUCTIONS
        off = block.offsetBytes;
#else
        off = (uint32_t)_mm_cvtsi128_si32( block.offsetBytes );
#endif
        bits = block.shifts;
        s = block.signs;
    }
    __m256i v = loadNumbers( payload, off );
    v = shiftNumbers( v, bits );
    v = applySigns( v, s );
    _mm256_storeu_si256( ( __m256i* )rdi, v );
}
// Decode and store a block of 16 numbers, and return pointer to the next encoded block.
// BTW, it helps to make sure this function is inlined by the compiler
const uint8_t* decodeBlock( int64_t* rdi, const uint8_t* rsi )
{
    const BlockHeader block = loadHeader( rsi );
    decodeSlice<0>( block, rdi, rsi );
    decodeSlice<1>( block, rdi + 4, rsi );
    decodeSlice<2>( block, rdi + 8, rsi );
    decodeSlice<3>( block, rdi + 12, rsi );
    return rsi + block.payloadBytes + 8;
}
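// A minimal usage sketch, not part of the original gist: decode `count` numbers from a
// stream of consecutive blocks. It assumes `count` is a multiple of 16 and that the output
// buffer has room for all of them; handling a partial final block is out of scope here.
inline const uint8_t* decodeStream( int64_t* rdi, const uint8_t* rsi, size_t count )
{
    for( size_t i = 0; i < count; i += 16 )
        rsi = decodeBlock( rdi + i, rsi );
    return rsi;
}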