Skip to content

Instantly share code, notes, and snippets.

Last active June 30, 2023 16:22
Show Gist options
Save Const-me/90a52f291c1fcb06142307facdb8e54e to your computer and use it in GitHub Desktop.
// Transform 4 inputs with 4 independent lookup tables, producing 4 outputs.
//
// i4      - the 4 inputs packed into one uint32_t, one per byte; input #k is byte #k
//           ( byte 0 is the least significant ). Each byte is expected to be in [ 0 .. 15 ].
// tables4 - the 4 tables in a single AVX2 vector: 64-bit lane #k is the table for input #k.
//           A table is 16 entries of 4 bits each, entry #n occupies bits [ 4*n .. 4*n+3 ] of the lane.
//
// Returns the 4 outputs with the same packing as the inputs: byte #k is tables[ k ][ input #k ],
// the upper 4 bits of every byte are zero.
//
// Note: an input byte of 16 or more results in a shift count of 64 or more; the variable shift
// instruction ( VPSRLVQ ) produces zero for such counts, so the corresponding output is 0.
//
// Requires AVX2; the intrinsics come from <immintrin.h>.
uint32_t applyLookup4( uint32_t i4, __m256i tables4 )
{
	// Move the 4 packed bytes into the lowest 32 bits of an SSE vector
	const __m128i packedIndices = _mm_cvtsi32_si128( (int)i4 );

	// Zero-extend every byte into its own uint64_t lane, so lane #k holds input #k
	const __m256i indices = _mm256_cvtepu8_epi64( packedIndices );

	// Entries are 4 bits wide: multiply the indices by 4 to get shift amounts in bits
	const __m256i shiftBits = _mm256_slli_epi64( indices, 2 );

	// Shift every table right by its own amount; the selected entry lands in the lowest 4 bits of the lane
	const __m256i shifted = _mm256_srlv_epi64( tables4, shiftBits );

	// Gather the lowest byte of every lane into the final positions.
	// _mm256_shuffle_epi8 cannot cross 16-byte halves, so the lower half delivers output bytes 0 and 1,
	// the upper half delivers output bytes 2 and 3, and -1 zeroes out all the remaining bytes.
	const __m256i perm = _mm256_setr_epi8(
		0, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
		-1, -1, 0, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 );
	const __m256i gathered = _mm256_shuffle_epi8( shifted, perm );

	// Merge the two 16-byte halves into a single vector; the non-zero bytes do not overlap
	const __m128i high = _mm256_extracti128_si256( gathered, 1 );
	const __m128i low = _mm256_castsi256_si128( gathered );
	const __m128i merged = _mm_or_si128( high, low );

	// Move the result into a scalar register
	const uint32_t scalar = (uint32_t)_mm_cvtsi128_si32( merged );

	// Mask away the higher 4 bits of each byte:
	// they are leftovers of the neighbouring table entries which were shifted along
	return scalar & 0x0F0F0F0Fu;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment