Skip to content

Instantly share code, notes, and snippets.

@Const-me
Created March 15, 2024 13:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Const-me/dfee9f414982262df5e959713f5f63ac to your computer and use it in GitHub Desktop.
Save Const-me/dfee9f414982262df5e959713f5f63ac to your computer and use it in GitHub Desktop.
#include <stdint.h>
#include <emmintrin.h> // SSE 2
#include <tmmintrin.h> // SSSE 3
#include <smmintrin.h> // SSE 4.1
// Vector constants for dot4Sse function.
// Build an instance with makeConstantsSse() and pass it to dot4Sse() by pointer.
struct ConstantVectorsSse
{
// The four byte-sized multipliers packed into one 32-bit value
// (first multiplier in the lowest byte), replicated to all 4 int32 lanes
__m128i abcd;
// 0x0F in every byte; dot4Sse uses it to keep only the low 4 bits of each input byte
__m128i lowNibbleMask;
// All bits zero; dot4Sse uses it as the second source of the 16-bit blend
__m128i zero;
};
// Combine four byte values into a single uint32_t:
// `a` lands in bits 0-7, `b` in bits 8-15, `c` in bits 16-23, `d` in bits 24-31.
// No masking is applied, so an argument above 0xFF overlaps the neighbouring
// fields (or, for `d`, loses its upper bits in the shift).
uint32_t packBytes( uint32_t a, uint32_t b, uint32_t c, uint32_t d )
{
	const uint32_t lowHalf = a | ( b << 8 );
	const uint32_t highHalf = ( c << 16 ) | ( d << 24 );
	return lowHalf | highHalf;
}
// Initialize vector constants for dot4Sse function
struct ConstantVectorsSse makeConstantsSse( uint8_t a, uint8_t b, uint8_t c, uint8_t d )
{
struct ConstantVectorsSse cv;
cv.abcd = _mm_set1_epi32( (int)packBytes( a, b, c, d ) );
cv.lowNibbleMask = _mm_set1_epi8( 0x0F );
cv.zero = _mm_setzero_si128();
return cv;
}
// Dot products of 4 groups of 4 bytes in memory against 4 small constants.
//
// rsi: pointer to 16 readable bytes; no alignment is required (unaligned load).
//      Only the low 4 bits of every byte take part in the computation.
// cv:  constants prepared by makeConstantsSse( a, b, c, d ).
//
// Returns a vector of 4 int32 lanes, where lane i is
//   a * ( rsi[ 4*i ] & 15 ) + b * ( rsi[ 4*i + 1 ] & 15 ) + c * ( rsi[ 4*i + 2 ] & 15 ) + d * ( rsi[ 4*i + 3 ] & 15 )
__m128i dot4Sse( const uint8_t* rsi, const struct ConstantVectorsSse* cv )
{
	// Load 16 bytes, and mask away higher 4 bits in each byte
	__m128i v = _mm_loadu_si128( ( const __m128i* )rsi );
	v = _mm_and_si128( cv->lowNibbleMask, v );

	// Compute products, add pairwise into int16 lanes.
	// The instruction treats its first operand as unsigned bytes and its second as signed bytes;
	// the masked input is 0..15, so the signed interpretation is harmless, and the largest
	// possible pair sum is 2 * 255 * 15 = 7650, so the int16 saturation never triggers.
	v = _mm_maddubs_epi16( cv->abcd, v );

	// Final reduction step, add adjacent pairs of uint16_t lanes.
	// `high` is the odd 16-bit lane of each pair, shifted down into the low half of its int32 lane.
	const __m128i high = _mm_srli_epi32( v, 16 );
	// `low` keeps the even 16-bit lanes and takes the odd ones from cv->zero.
	// The mask is written in hex (0xAA == binary 10101010) because binary
	// integer literals are not part of ISO C before C23.
	const __m128i low = _mm_blend_epi16( v, cv->zero, 0xAA );
	return _mm_add_epi32( high, low );
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment