// Gist: slaren/869800a9e80717d0b5b9b9c6cbf5e5f0
// File: test-avx1-avx2.cpp
#include <assert.h>
#include <float.h>
#include <math.h>
#include <memory.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>

#include <immintrin.h>

#include <benchmark/benchmark.h>
// Number of 4-bit quants stored in one q4_0 block.
#define QK 32

// Double-precision scalar type available for accumulating sums.
typedef double ggml_float;

// One q4_0 block: a float scale followed by QK 4-bit quants packed two per byte.
typedef struct {
    float   d;          // delta
    uint8_t qs[QK / 2]; // nibbles / quants
} block_q4_0;

// The kernels step through arrays of blocks, so the struct must be exactly
// "scale + packed quants" with no padding.
static_assert(sizeof(block_q4_0) == sizeof(float) + QK / 2, "wrong q4_0 block size/padding");
// C99 `restrict` is not a C++ keyword. On GNU-compatible compilers map it to the
// `__restrict` extension so the no-alias promise on the kernel pointers is kept;
// on any other compiler it expands to nothing.
#if defined(__GNUC__)
#define restrict __restrict
#else
#define restrict
#endif
// AVX | |
// Expands 8 packed bytes at `rsi` into 16 bytes holding one 4-bit value each.
// Output byte 2*k is the low nibble of rsi[k]; output byte 2*k+1 is its high nibble.
// Requires SSE4.1 (_mm_cvtepu8_epi16). Reads exactly 8 bytes; no alignment needed.
static inline __m128i bytesFromNibbles128( const uint8_t* rsi )
{
    // Load 8 bytes from memory into the low half of the register
    const __m128i packed = _mm_loadu_si64( ( const __m128i* )rsi );

    // Expand bytes into uint16_t values
    const __m128i lanes = _mm_cvtepu8_epi16( packed );

    // Unpack values into individual bytes: the low nibble stays in the low byte of
    // each 16-bit lane, the high nibble is shifted up into the lane's high byte
    const __m128i lowMask = _mm_set1_epi8( 0xF );
    const __m128i low     = _mm_and_si128( lowMask, lanes );
    const __m128i high    = _mm_slli_epi16( _mm_andnot_si128( lowMask, lanes ), 4 );

    return _mm_or_si128( low, high );
}
// Packs 32 four-bit values (one per byte, 16 in each input vector) into 16 bytes.
// Output byte k combines bytes 2*k (low nibble) and 2*k+1 (high nibble) of bytes1
// for k < 8, and of bytes2 for k >= 8.
// Each input byte is expected to hold a value in [0, 15].
static inline __m128i packNibbles128( __m128i bytes1, __m128i bytes2 )
{
    const __m128i lowByte = _mm_set1_epi16( 0xFF );

    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
    const auto foldLanes = [lowByte]( __m128i v ) {
        const __m128i high = _mm_srli_epi16( _mm_andnot_si128( lowByte, v ), 4 );
        const __m128i low  = _mm_and_si128( lowByte, v );
        return _mm_or_si128( low, high );
    };

    const __m128i folded1 = foldLanes( bytes1 );
    const __m128i folded2 = foldLanes( bytes2 );

    // Compress uint16_t lanes into bytes (every lane is <= 0xFF, so no saturation)
    return _mm_packus_epi16( folded1, folded2 );
}
// AVX2 | |
// Expands 16 packed bytes at `rsi` into 32 bytes holding one 4-bit value each.
// Output byte 2*k is the low nibble of rsi[k]; output byte 2*k+1 is its high nibble.
// Requires AVX2. Reads exactly 16 bytes; no alignment needed.
static inline __m256i bytesFromNibbles( const uint8_t* rsi )
{
    // Load 16 bytes from memory
    const __m128i packed = _mm_loadu_si128( ( const __m128i* )rsi );

    // Expand bytes into uint16_t values
    const __m256i lanes = _mm256_cvtepu8_epi16( packed );

    // Unpack values into individual bytes: the low nibble stays in the low byte of
    // each 16-bit lane, the high nibble is shifted up into the lane's high byte
    const __m256i lowMask = _mm256_set1_epi8( 0xF );
    const __m256i low     = _mm256_and_si256( lowMask, lanes );
    const __m256i high    = _mm256_slli_epi16( _mm256_andnot_si256( lowMask, lanes ), 4 );

    return _mm256_or_si256( low, high );
}
// Packs 32 four-bit values (one per byte of `bytes`) into 16 bytes.
// Output byte k combines input byte 2*k (low nibble) and input byte 2*k+1 (high nibble).
// Each input byte is expected to hold a value in [0, 15]. Requires AVX2.
static inline __m128i packNibbles( __m256i bytes )
{
    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
    const __m256i lowByte = _mm256_set1_epi16( 0xFF );
    const __m256i high    = _mm256_srli_epi16( _mm256_andnot_si256( lowByte, bytes ), 4 );
    const __m256i low     = _mm256_and_si256( lowByte, bytes );
    const __m256i folded  = _mm256_or_si256( low, high );

    // Compress uint16_t lanes into bytes: lower 128 bits first, then the upper 128 bits
    const __m128i r0 = _mm256_castsi256_si128( folded );
    const __m128i r1 = _mm256_extracti128_si256( folded, 1 );
    return _mm_packus_epi16( r0, r1 );
}
static void ggml_vec_dot_q4_0_avx(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { | |
const int nb = n / QK; | |
assert(n % QK == 0); | |
assert(nb % 2 == 0); | |
const block_q4_0 * restrict x = (const block_q4_0*)vx; | |
const block_q4_0 * restrict y = (const block_q4_0*)vy; | |
ggml_float sumf = 0.0; | |
// Initialize accumulator with zeros | |
__m256 acc = _mm256_setzero_ps(); | |
// Main loop | |
for (int i = 0; i < nb; ++i) { | |
// Compute combined scale for the block | |
const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); | |
__m128i i32[2]; | |
for (int j = 0; j < 2; ++j) { | |
// Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes | |
__m128i bx = bytesFromNibbles128( x[i].qs + 8*j ); | |
__m128i by = bytesFromNibbles128( y[i].qs + 8*j ); | |
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. | |
const __m128i off = _mm_set1_epi8( 8 ); | |
bx = _mm_sub_epi8( bx, off ); | |
by = _mm_sub_epi8( by, off ); | |
// Get absolute values of x vectors | |
const __m128i ax = _mm_sign_epi8(bx, bx); | |
// Sign the values of the y vectors | |
const __m128i sy = _mm_sign_epi8(by, bx); | |
// Perform multiplication and create 16-bit values | |
const __m128i dot = _mm_maddubs_epi16(ax, sy); | |
const __m128i ones = _mm_set1_epi16(1); | |
i32[j] = _mm_madd_epi16(ones, dot); | |
} | |
// Convert int32_t to float | |
__m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] )); | |
// Apply the scale, and accumulate | |
acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); | |
} | |
// Return horizontal sum of the acc vector | |
__m128 res = _mm256_extractf128_ps( acc, 1 ); | |
res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) ); | |
res = _mm_add_ps( res, _mm_movehl_ps( res, res ) ); | |
res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); | |
sumf = _mm_cvtss_f32( res ); | |
*s = sumf; | |
} | |
static void ggml_vec_dot_q4_0_avx2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { | |
const int nb = n / QK; | |
assert(n % QK == 0); | |
assert(nb % 2 == 0); | |
const block_q4_0 * restrict x = (const block_q4_0*)vx; | |
const block_q4_0 * restrict y = (const block_q4_0*)vy; | |
ggml_float sumf = 0.0; | |
__m256 acc = _mm256_setzero_ps(); | |
// Main loop | |
for (int i = 0; i < nb; ++i) { | |
// Compute combined scale for the block | |
const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); | |
// Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes | |
__m256i bx = bytesFromNibbles( x[i].qs ); | |
__m256i by = bytesFromNibbles( y[i].qs ); | |
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. | |
const __m256i off = _mm256_set1_epi8( 8 ); | |
bx = _mm256_sub_epi8( bx, off ); | |
by = _mm256_sub_epi8( by, off ); | |
// Sign-extend first 16 signed bytes into int16_t | |
__m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) ); | |
__m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) ); | |
// Compute products of int16_t integers, add pairwise | |
__m256i i32 = _mm256_madd_epi16( x16, y16 ); | |
// Sign-extend last 16 signed bytes into int16_t vectors | |
x16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) ); | |
y16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) ); | |
// Accumulate products of int16_t integers | |
i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16, y16 ) ); | |
// Convert int32_t to float | |
__m256 p = _mm256_cvtepi32_ps( i32 ); | |
// Apply the scale, and accumulate | |
acc = _mm256_fmadd_ps( d, p, acc ); | |
} | |
// Return horizontal sum of the acc vector | |
__m128 res = _mm256_extractf128_ps( acc, 1 ); | |
res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) ); | |
res = _mm_add_ps( res, _mm_movehl_ps( res, res ) ); | |
res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); | |
sumf = _mm_cvtss_f32( res ); | |
*s = sumf; | |
} | |
static void ggml_vec_dot_q4_0_avx2_new(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { | |
const int nb = n / QK; | |
assert(n % QK == 0); | |
assert(nb % 2 == 0); | |
const block_q4_0 * restrict x = (const block_q4_0*) vx; | |
const block_q4_0 * restrict y = (const block_q4_0*) vy; | |
ggml_float sumf = 0.0; | |
// Initialize accumulator with zeros | |
__m256 acc = _mm256_setzero_ps(); | |
// Main loop | |
for (int i = 0; i < nb; ++i) { | |
// Compute combined scale for the block | |
const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); | |
// Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes | |
__m256i bx = bytesFromNibbles( x[i].qs ); | |
__m256i by = bytesFromNibbles( y[i].qs ); | |
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. | |
const __m256i off = _mm256_set1_epi8( 8 ); | |
bx = _mm256_sub_epi8( bx, off ); | |
by = _mm256_sub_epi8( by, off ); | |
// Get absolute values of x vectors | |
const __m256i ax = _mm256_sign_epi8(bx, bx); | |
// Sign the values of the y vectors | |
const __m256i sy = _mm256_sign_epi8(by, bx); | |
// Perform multiplication and create 16-bit values | |
const __m256i dot = _mm256_maddubs_epi16(ax, sy); | |
const __m256i ones = _mm256_set1_epi16(1); | |
__m256i i32 = _mm256_madd_epi16(ones, dot); | |
// Convert int32_t to float | |
__m256 p = _mm256_cvtepi32_ps( i32 ); | |
// Apply the scale, and accumulate | |
acc = _mm256_fmadd_ps( d, p, acc ); | |
//acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); | |
} | |
// Return horizontal sum of the acc vector | |
__m128 res = _mm256_extractf128_ps( acc, 1 ); | |
res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) ); | |
res = _mm_add_ps( res, _mm_movehl_ps( res, res ) ); | |
res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); | |
sumf = _mm_cvtss_f32( res ); | |
*s = sumf; | |
} | |
const int n=4096; | |
void BM_ggml_vec_dot_q4_0_avx(benchmark::State& state) { | |
float x[n] __attribute__ ((aligned (32))); | |
float y[n] __attribute__ ((aligned (32))); | |
float z[n] __attribute__ ((aligned (32))); | |
srand(time(0)); | |
for (int i=0;i<n;i++) x[i] = ((float)rand()/(float)RAND_MAX-0.5f)*1e3f; | |
for (int i=0;i<n;i++) y[i] = ((float)rand()/(float)RAND_MAX-0.5f)*1e3f; | |
for (auto _ : state) { | |
ggml_vec_dot_q4_0_avx(n, z, x, y); | |
benchmark::DoNotOptimize(z); | |
} | |
} | |
BENCHMARK(BM_ggml_vec_dot_q4_0_avx); | |
void BM_ggml_vec_dot_q4_0_avx2(benchmark::State& state) { | |
float x[n] __attribute__ ((aligned (32))); | |
float y[n] __attribute__ ((aligned (32))); | |
float z[n] __attribute__ ((aligned (32))); | |
srand(time(0)); | |
for (int i=0;i<n;i++) x[i] = ((float)rand()/(float)RAND_MAX-0.5f)*1e3f; | |
for (int i=0;i<n;i++) y[i] = ((float)rand()/(float)RAND_MAX-0.5f)*1e3f; | |
for (auto _ : state) { | |
ggml_vec_dot_q4_0_avx2(n, z, x, y); | |
benchmark::DoNotOptimize(z); | |
} | |
} | |
BENCHMARK(BM_ggml_vec_dot_q4_0_avx2); | |
void BM_ggml_vec_dot_q4_0_avx2_new(benchmark::State& state) { | |
float x[n] __attribute__ ((aligned (32))); | |
float y[n] __attribute__ ((aligned (32))); | |
float z[n] __attribute__ ((aligned (32))); | |
srand(time(0)); | |
for (int i=0;i<n;i++) x[i] = ((float)rand()/(float)RAND_MAX-0.5f)*1e3f; | |
for (int i=0;i<n;i++) y[i] = ((float)rand()/(float)RAND_MAX-0.5f)*1e3f; | |
for (auto _ : state) { | |
ggml_vec_dot_q4_0_avx2_new(n, z, x, y); | |
benchmark::DoNotOptimize(z); | |
} | |
} | |
BENCHMARK(BM_ggml_vec_dot_q4_0_avx2_new); | |
// Run the benchmark: BENCHMARK_MAIN() expands to the program's main(), which runs
// every benchmark registered above with BENCHMARK().
BENCHMARK_MAIN();
// (GitHub page footer, not part of the source file: "Sign up for free to join this
// conversation on GitHub. Already have an account? Sign in to comment")