// test-avx1-avx2.cpp
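//
// Microbenchmark comparing three implementations of ggml's q4_0 dot product:
// a 128-bit AVX kernel, the then-current AVX2 kernel, and a reworked AVX2
// kernel ("_new"), timed with Google Benchmark.
// Gist by @slaren, created March 30, 2023.
//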
#include <assert.h> // assert
#include <stdint.h> // uint8_t
#include <stdlib.h>
#include <time.h>
#include <benchmark/benchmark.h>
#include <float.h>
#include <math.h>
#include <immintrin.h>
#include <memory.h>
#define QK 32

typedef double ggml_float;

typedef struct {
    float d;            // delta
    uint8_t qs[QK / 2]; // nibbles / quants
} block_q4_0;

// 'restrict' is a C keyword, not C++; define it away so the C kernels compile here
#define restrict
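
// For reference, a minimal scalar sketch (not part of the original gist) of how
// one block dequantizes, assuming the q4_0 layout of this era: each qs byte
// packs two consecutive elements, low nibble first, and values are stored with
// a +8 offset.
static inline void dequantize_block_q4_0_ref(const block_q4_0 * b, float * out) {
    for (int j = 0; j < QK / 2; ++j) {
        out[2*j + 0] = ((b->qs[j] & 0x0F) - 8) * b->d; // low nibble
        out[2*j + 1] = ((b->qs[j] >> 4)   - 8) * b->d; // high nibble
    }
}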
// AVX
static inline __m128i bytesFromNibbles128( const uint8_t* rsi )
{
    // Load 8 bytes from memory
    __m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi );
    // Expand bytes into uint16_t values
    __m128i bytes = _mm_cvtepu8_epi16( tmp );
    // Unpack values into individual bytes
    const __m128i lowMask = _mm_set1_epi8( 0xF );
    __m128i high = _mm_andnot_si128( lowMask, bytes );
    __m128i low = _mm_and_si128( lowMask, bytes );
    high = _mm_slli_epi16( high, 4 );
    bytes = _mm_or_si128( low, high );
    return bytes;
}
static inline __m128i packNibbles128( __m128i bytes1, __m128i bytes2 )
{
    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
    const __m128i lowByte = _mm_set1_epi16( 0xFF );
    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
    __m128i low = _mm_and_si128( lowByte, bytes1 );
    high = _mm_srli_epi16( high, 4 );
    bytes1 = _mm_or_si128( low, high );
    high = _mm_andnot_si128( lowByte, bytes2 );
    low = _mm_and_si128( lowByte, bytes2 );
    high = _mm_srli_epi16( high, 4 );
    bytes2 = _mm_or_si128( low, high );
    return _mm_packus_epi16( bytes1, bytes2 );
}
// AVX2
static inline __m256i bytesFromNibbles( const uint8_t* rsi )
{
    // Load 16 bytes from memory
    __m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi );
    // Expand bytes into uint16_t values
    __m256i bytes = _mm256_cvtepu8_epi16( tmp );
    // Unpack values into individual bytes
    const __m256i lowMask = _mm256_set1_epi8( 0xF );
    __m256i high = _mm256_andnot_si256( lowMask, bytes );
    __m256i low = _mm256_and_si256( lowMask, bytes );
    high = _mm256_slli_epi16( high, 4 );
    bytes = _mm256_or_si256( low, high );
    return bytes;
}
static inline __m128i packNibbles( __m256i bytes )
{
    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
    const __m256i lowByte = _mm256_set1_epi16( 0xFF );
    __m256i high = _mm256_andnot_si256( lowByte, bytes );
    __m256i low = _mm256_and_si256( lowByte, bytes );
    high = _mm256_srli_epi16( high, 4 );
    bytes = _mm256_or_si256( low, high );
    // Compress uint16_t lanes into bytes
    __m128i r0 = _mm256_castsi256_si128( bytes );
    __m128i r1 = _mm256_extracti128_si256( bytes, 1 );
    return _mm_packus_epi16( r0, r1 );
}
static void ggml_vec_dot_q4_0_avx(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
    const int nb = n / QK;

    assert(n % QK == 0);
    assert(nb % 2 == 0);

    const block_q4_0 * restrict x = (const block_q4_0*)vx;
    const block_q4_0 * restrict y = (const block_q4_0*)vy;

    ggml_float sumf = 0.0;

    // Initialize accumulator with zeros
    __m256 acc = _mm256_setzero_ps();

    // Main loop
    for (int i = 0; i < nb; ++i) {
        // Compute combined scale for the block
        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );

        __m128i i32[2];
        for (int j = 0; j < 2; ++j) {
            // Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes
            __m128i bx = bytesFromNibbles128( x[i].qs + 8*j );
            __m128i by = bytesFromNibbles128( y[i].qs + 8*j );

            // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
            const __m128i off = _mm_set1_epi8( 8 );
            bx = _mm_sub_epi8( bx, off );
            by = _mm_sub_epi8( by, off );

            // Get absolute values of x vectors
            const __m128i ax = _mm_sign_epi8(bx, bx);
            // Sign the values of the y vectors
            const __m128i sy = _mm_sign_epi8(by, bx);
            // Perform multiplication and create 16-bit values
            const __m128i dot = _mm_maddubs_epi16(ax, sy);
            const __m128i ones = _mm_set1_epi16(1);
            i32[j] = _mm_madd_epi16(ones, dot);
        }

        // Convert int32_t to float
        __m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] ) );
        // Apply the scale, and accumulate
        acc = _mm256_add_ps( _mm256_mul_ps( d, p ), acc );
    }

    // Return horizontal sum of the acc vector
    __m128 res = _mm256_extractf128_ps( acc, 1 );
    res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
    res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
    res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
    sumf = _mm_cvtss_f32( res );

    *s = sumf;
}
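
// Note on the trick above: _mm_maddubs_epi16 treats its first operand as
// unsigned bytes and its second as signed bytes, so the signed product
// bx*by is obtained via the identity a*b == |a| * (sign(a) * b): ax holds
// |bx| and sy holds by with bx's sign applied. Lanes where bx == 0 are
// zeroed by _mm_sign_epi8, but |bx| is also 0 there, so nothing is lost.
// _mm_madd_epi16 against a vector of ones then widens the int16 pair sums
// to int32.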
static void ggml_vec_dot_q4_0_avx2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
    const int nb = n / QK;

    assert(n % QK == 0);
    assert(nb % 2 == 0);

    const block_q4_0 * restrict x = (const block_q4_0*)vx;
    const block_q4_0 * restrict y = (const block_q4_0*)vy;

    ggml_float sumf = 0.0;

    // Initialize accumulator with zeros
    __m256 acc = _mm256_setzero_ps();

    // Main loop
    for (int i = 0; i < nb; ++i) {
        // Compute combined scale for the block
        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );

        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
        __m256i bx = bytesFromNibbles( x[i].qs );
        __m256i by = bytesFromNibbles( y[i].qs );

        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
        const __m256i off = _mm256_set1_epi8( 8 );
        bx = _mm256_sub_epi8( bx, off );
        by = _mm256_sub_epi8( by, off );

        // Sign-extend the first 16 signed bytes into int16_t
        __m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) );
        __m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) );
        // Compute products of int16_t integers, add pairwise
        __m256i i32 = _mm256_madd_epi16( x16, y16 );

        // Sign-extend the last 16 signed bytes into int16_t vectors
        x16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) );
        y16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) );
        // Accumulate products of int16_t integers
        i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16, y16 ) );

        // Convert int32_t to float
        __m256 p = _mm256_cvtepi32_ps( i32 );
        // Apply the scale, and accumulate
        acc = _mm256_fmadd_ps( d, p, acc );
    }

    // Return horizontal sum of the acc vector
    __m128 res = _mm256_extractf128_ps( acc, 1 );
    res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
    res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
    res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
    sumf = _mm_cvtss_f32( res );

    *s = sumf;
}
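
// The "_new" variant below applies the same sign trick as the 128-bit AVX
// kernel, but at 256-bit width: one _mm256_maddubs_epi16 plus one
// _mm256_madd_epi16 against ones replaces the two sign-extensions and two
// _mm256_madd_epi16 calls per block used above.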
static void ggml_vec_dot_q4_0_avx2_new(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
    const int nb = n / QK;

    assert(n % QK == 0);
    assert(nb % 2 == 0);

    const block_q4_0 * restrict x = (const block_q4_0*) vx;
    const block_q4_0 * restrict y = (const block_q4_0*) vy;

    ggml_float sumf = 0.0;

    // Initialize accumulator with zeros
    __m256 acc = _mm256_setzero_ps();

    // Main loop
    for (int i = 0; i < nb; ++i) {
        // Compute combined scale for the block
        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );

        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
        __m256i bx = bytesFromNibbles( x[i].qs );
        __m256i by = bytesFromNibbles( y[i].qs );

        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
        const __m256i off = _mm256_set1_epi8( 8 );
        bx = _mm256_sub_epi8( bx, off );
        by = _mm256_sub_epi8( by, off );

        // Get absolute values of x vectors
        const __m256i ax = _mm256_sign_epi8(bx, bx);
        // Sign the values of the y vectors
        const __m256i sy = _mm256_sign_epi8(by, bx);
        // Perform multiplication and create 16-bit values
        const __m256i dot = _mm256_maddubs_epi16(ax, sy);

        const __m256i ones = _mm256_set1_epi16(1);
        __m256i i32 = _mm256_madd_epi16(ones, dot);

        // Convert int32_t to float
        __m256 p = _mm256_cvtepi32_ps( i32 );
        // Apply the scale, and accumulate
        acc = _mm256_fmadd_ps( d, p, acc );
        //acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
    }

    // Return horizontal sum of the acc vector
    __m128 res = _mm256_extractf128_ps( acc, 1 );
    res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
    res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
    res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
    sumf = _mm_cvtss_f32( res );

    *s = sumf;
}
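
// A quick way to sanity-check the kernels against the scalar reference (a
// sketch, not part of the original gist). The per-block integer math is
// exact, but the kernels sum blocks in a different order than the scalar
// loop, so compare with a small tolerance:
//
//   block_q4_0 bx[2], by[2];  // fill with valid quantized data first
//   float out;
//   ggml_vec_dot_q4_0_avx2_new(2 * QK, &out, bx, by);
//   assert(fabsf(out - vec_dot_q4_0_scalar_ref(2 * QK, bx, by)) < 1e-3f);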
const int n = 4096;

void BM_ggml_vec_dot_q4_0_avx(benchmark::State& state) {
    // Note: x and y hold random float bits that the kernel reinterprets as
    // block_q4_0 data; that is fine for measuring throughput, but the
    // results are meaningless numerically.
    float x[n] __attribute__ ((aligned (32)));
    float y[n] __attribute__ ((aligned (32)));
    float z[n] __attribute__ ((aligned (32)));
    srand(time(0));
    for (int i = 0; i < n; i++) x[i] = ((float)rand()/(float)RAND_MAX - 0.5f) * 1e3f;
    for (int i = 0; i < n; i++) y[i] = ((float)rand()/(float)RAND_MAX - 0.5f) * 1e3f;
    for (auto _ : state) {
        ggml_vec_dot_q4_0_avx(n, z, x, y);
        benchmark::DoNotOptimize(z);
    }
}
BENCHMARK(BM_ggml_vec_dot_q4_0_avx);
void BM_ggml_vec_dot_q4_0_avx2(benchmark::State& state) {
    float x[n] __attribute__ ((aligned (32)));
    float y[n] __attribute__ ((aligned (32)));
    float z[n] __attribute__ ((aligned (32)));
    srand(time(0));
    for (int i = 0; i < n; i++) x[i] = ((float)rand()/(float)RAND_MAX - 0.5f) * 1e3f;
    for (int i = 0; i < n; i++) y[i] = ((float)rand()/(float)RAND_MAX - 0.5f) * 1e3f;
    for (auto _ : state) {
        ggml_vec_dot_q4_0_avx2(n, z, x, y);
        benchmark::DoNotOptimize(z);
    }
}
BENCHMARK(BM_ggml_vec_dot_q4_0_avx2);
void BM_ggml_vec_dot_q4_0_avx2_new(benchmark::State& state) {
    float x[n] __attribute__ ((aligned (32)));
    float y[n] __attribute__ ((aligned (32)));
    float z[n] __attribute__ ((aligned (32)));
    srand(time(0));
    for (int i = 0; i < n; i++) x[i] = ((float)rand()/(float)RAND_MAX - 0.5f) * 1e3f;
    for (int i = 0; i < n; i++) y[i] = ((float)rand()/(float)RAND_MAX - 0.5f) * 1e3f;
    for (auto _ : state) {
        ggml_vec_dot_q4_0_avx2_new(n, z, x, y);
        benchmark::DoNotOptimize(z);
    }
}
BENCHMARK(BM_ggml_vec_dot_q4_0_avx2_new);
// Run the benchmark
BENCHMARK_MAIN();
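
// One way to build and run, assuming Google Benchmark is installed (build
// flags are an assumption, not part of the original gist); -mfma is needed
// for _mm256_fmadd_ps:
//
//   g++ -O3 -mavx2 -mfma test-avx1-avx2.cpp -lbenchmark -lpthread -o bench
//   ./bench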