@ashafq
Last active February 4, 2023 15:59
WIP AVX512F mix function optimization
azureuser@Ardour:~/src2$ g++ -fsanitize=address -O3 -ffast-math -fopenmp -mfma -mavx512f -o test test.cc -lm && ./test 3333 4
compute_peak [def, AVX]: 5.000000e+00, 5.000000e+00
find_peaks [def, AVX]: (-5.000000e+00, 5.000000e+00) (-5.000000e+00, 5.000000e+00)
apply_gain_to_buffer [Error]: 0.000000e+00
mix_buffers_no_gain [Error]: 0.000000e+00
mix_buffers_with_gain [Error]: 0.000000e+00
MICRO_BENCH: default_compute_peak
Average time: 2.86104e-13
MICRO_BENCH: x86_avx512f_compute_peak
Average time: 6.29599e-07
MICRO_BENCH: default_find_peaks
Average time: 6.59345e-06
MICRO_BENCH: x86_avx512f_find_peaks
Average time: 6.56885e-07
MICRO_BENCH: default_apply_gain_to_buffer
Average time: 3.89993e-06
MICRO_BENCH: x86_avx512f_apply_gain_to_buffer
Average time: 5.88325e-07
MICRO_BENCH: default_mix_buffers_no_gain
Average time: 1.04519e-05
MICRO_BENCH: x86_avx512f_mix_buffers_no_gain
Average time: 1.18523e-06
MICRO_BENCH: default_mix_buffers_with_gain
Average time: 1.03626e-05
MICRO_BENCH: x86_avx512f_mix_buffers_with_gain
Average time: 1.79619e-06
MICRO_BENCH: default_copy_vector
Average time: 6.84109e-07
MICRO_BENCH: x86_avx512f_copy_vector
Average time: 1.15288e-06
/*
* Copyright (C) 2023 Ayan Shafqat <ayan@shafq.at>
* Copyright (C) 2023 Paul Davis <paul@linuxaudiosystems.com>
* Copyright (C) 2023 Robin Gareus <robin@gareus.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
// To compile: g++ -fopenmp -fsanitize=address -Os -mavx512f -mfma -o test test.cc -lm && ./test 32768 8
#include <assert.h>
#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
void fill_rand_f32(float *dst, size_t nframes);
float sum_abs_diff_f32(const float *src_a, const float *src_b, uint32_t nframes);
float frandf(void);
/**
* Default "unoptimized" functions
**/
float default_compute_peak(const float *src, uint32_t nframes, float current);
void default_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain);
void default_mix_buffers_with_gain(float *dst, const float *src, uint32_t nframes, float gain);
void default_mix_buffers_no_gain(float *dst, const float *src, uint32_t nframes);
void default_copy_vector(float *dst, const float *src, uint32_t nframes);
void default_find_peaks(const float *buf, uint32_t nsamples, float *minf, float *maxf);
/**
* Optimized AVX-512F functions
**/
extern "C" {
float x86_avx512f_compute_peak(const float *src, uint32_t nframes, float current);
void x86_avx512f_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain);
void x86_avx512f_mix_buffers_with_gain(float *dst, const float *src, uint32_t nframes, float gain);
void x86_avx512f_mix_buffers_no_gain(float *dst, const float *src, uint32_t nframes);
void x86_avx512f_copy_vector(float *dst, const float *src, uint32_t nframes);
void x86_avx512f_find_peaks(const float *buf, uint32_t nsamples, float *minf, float *maxf);
}
#include <omp.h>
#define MICRO_BENCH(__statement, iter) \
do \
{ \
double start = omp_get_wtime(); \
for (long i = 0; i < iter; ++i) \
{ \
do \
{ \
__statement \
} while (0); \
} \
double end = omp_get_wtime(); \
double duration = (end - start) / ((double)iter); \
printf("Average time: %g\n", duration); \
} while (0)
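/*
 * Usage note: MICRO_BENCH runs the given statement block `iter` times and
 * prints the average wall-clock time per iteration, measured with
 * omp_get_wtime(). A minimal sketch of its use:
 *
 *   MICRO_BENCH({
 *       default_copy_vector(dst, src, nframes);
 *   }, 1 << 20);
 *
 * Loop overhead is not subtracted and results are discarded, so a cheap
 * pure function may be optimized away entirely, which may explain the
 * near-zero default_compute_peak time in the captured output above.
 */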
int main(int argc, char **argv)
{
if (argc != 3) {
puts("D [num] [alignment]");
return 1;
}
uint32_t nframes = atoi(argv[1]);
size_t alignment = atoi(argv[2]);
float *src = (float *) aligned_alloc(alignment, nframes * sizeof(float));
float *dst = (float *) aligned_alloc(alignment, nframes * sizeof(float));
float *ref = (float *) aligned_alloc(alignment, nframes * sizeof(float));
srand(((uintptr_t)src) ^ ((uintptr_t)dst));
assert(src && "src is NULL");
assert(dst && "dst is NULL");
assert(ref && "ref is NULL");
fill_rand_f32(src, nframes);
fill_rand_f32(dst, nframes);
/* Compute peak */
{
src[5] = 5.0F;
src[6] = -5.0F;
float peak_d = default_compute_peak(src, nframes, 0.0F);
float peak_a = x86_avx512f_compute_peak(src, nframes, 0.0F);
printf("compute_peak [def, AVX]: %e, %e\n", peak_d, peak_a);
}
/* Find peak */
{
float amin, bmin, amax, bmax;
amin = bmin = __builtin_inf();
amax = bmax = 0.0F;
default_find_peaks(src, nframes, &amin, &amax);
x86_avx512f_find_peaks(src + 1, nframes - 1, &bmin, &bmax);
printf("find_peaks [def, AVX]: (%e, %e) (%e, %e)\n", amin, amax, bmin, bmax);
}
/* Apply gain */
{
float gain = frandf();
fill_rand_f32(src, nframes);
default_copy_vector(ref, src, nframes);
default_apply_gain_to_buffer(src, nframes, gain);
x86_avx512f_apply_gain_to_buffer(ref, nframes, gain);
printf("apply_gain_to_bufer [Error]: %e\n", sum_abs_diff_f32(ref, src, nframes));
}
/* Mix buffer no gain */
{
fill_rand_f32(src, nframes);
fill_rand_f32(dst, nframes);
default_copy_vector(ref, dst, nframes);
default_mix_buffers_no_gain(ref, src, nframes);
x86_avx512f_mix_buffers_no_gain(dst, src, nframes);
printf("mix_buffers_no_gain [Error]: %e\n", sum_abs_diff_f32(ref, dst, nframes));
}
/* Mix buffer with gain */
{
float gain = frandf();
fill_rand_f32(src, nframes);
fill_rand_f32(dst, nframes);
default_copy_vector(ref, dst, nframes);
default_mix_buffers_with_gain(ref, src, nframes, gain);
x86_avx512f_mix_buffers_with_gain(dst, src, nframes, gain);
printf("mix_buffers_with_gain [Error]: %e\n", sum_abs_diff_f32(ref, dst, nframes));
}
#define ITER (1 << 20)
puts("MICRO_BENCH: default_compute_peak");
MICRO_BENCH({
(void)default_compute_peak(src, nframes, 0.0F);
},ITER);
puts("MICRO_BENCH: x86_avx_compute_peak");
MICRO_BENCH({
(void)x86_avx512f_compute_peak(src, nframes, 0.0F);
},ITER);
puts("MICRO_BENCH: default_find_peaks");
float a = 0.0F, b = 0.0F;
MICRO_BENCH({
(void)default_find_peaks(src, nframes, &a, &b);
},ITER);
puts("MICRO_BENCH: x86_avx512f_find_peaks");
MICRO_BENCH({
(void)x86_avx512f_find_peaks(src, nframes, &a, &b);
},ITER);
float gain = frandf();
puts("MICRO_BENCH: default_apply_gain_to_buffer");
MICRO_BENCH({
default_apply_gain_to_buffer(src, nframes, gain);
},ITER);
puts("MICRO_BENCH: x86_avx512f_apply_gain_to_buffer");
MICRO_BENCH({
x86_avx512f_apply_gain_to_buffer(src, nframes, gain);
},ITER);
puts("MICRO_BENCH: default_mix_buffers_no_gain");
MICRO_BENCH({
default_mix_buffers_no_gain(dst, src, nframes);
},ITER);
puts("MICRO_BENCH: x86_avx512f_mix_buffers_no_gain");
MICRO_BENCH({
x86_avx512f_mix_buffers_no_gain(dst, src, nframes);
},ITER);
puts("MICRO_BENCH: default_mix_buffers_with_gain");
MICRO_BENCH({
default_mix_buffers_with_gain(dst, src, nframes, gain);
},ITER);
puts("MICRO_BENCH: x86_avx512f_mix_buffers_with_gain");
MICRO_BENCH({
x86_avx512f_mix_buffers_with_gain(dst, src, nframes, gain);
},ITER);
puts("MICRO_BENCH: default_copy_vector");
MICRO_BENCH({
default_copy_vector(dst, src, nframes);
},ITER);
puts("MICRO_BENCH: x86_avx512f_copy_vector");
MICRO_BENCH({
x86_avx512f_copy_vector(dst, src, nframes);
},ITER);
free(src);
free(dst);
free(ref);
}
float frandf(void)
{
const float scale = 1.0F / ((float)RAND_MAX);
return scale * ((float)(rand()));
}
void fill_rand_f32(float *dst, size_t nframes)
{
const float scale = 2.0F / ((float)RAND_MAX);
for (size_t i = 0; i < nframes; ++i) {
float rval = rand();
dst[i] = rval * scale - 1.0F;
}
}
float sum_abs_diff_f32(const float *src_a, const float *src_b, uint32_t nframes)
{
float sum = 0.0F;
for (uint32_t i = 0; i < nframes; ++i) {
sum += fabsf(src_a[i] - src_b[i]);
}
return sum;
}
/**
* Default "unoptimized" functions
**/
float default_compute_peak(const float *src, uint32_t nframes, float current)
{
for (uint32_t i = 0; i < nframes; ++i) {
current = fmaxf(current, fabsf(src[i]));
}
return current;
}
void default_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain)
{
for (uint32_t i = 0; i < nframes; ++i)
dst[i] *= gain;
}
void default_mix_buffers_with_gain(float *dst, const float *src, uint32_t nframes, float gain)
{
for (uint32_t i = 0; i < nframes; ++i)
dst[i] = dst[i] + (src[i] * gain);
}
void default_mix_buffers_no_gain(float *dst, const float *src, uint32_t nframes)
{
for (uint32_t i = 0; i < nframes; ++i)
dst[i] += src[i];
}
void default_copy_vector(float *dst, const float *src, uint32_t nframes)
{
memcpy(dst, src, nframes * sizeof(float));
}
void default_find_peaks(const float *buf, uint32_t nframes, float *minf, float *maxf)
{
uint32_t i;
float a, b;
a = *maxf;
b = *minf;
for (i = 0; i < nframes; i++)
{
a = fmaxf(buf[i], a);
b = fminf(buf[i], b);
}
*maxf = a;
*minf = b;
}
/**
* Optimized supercharged method!
**/
#include <immintrin.h>
#include <xmmintrin.h>
#ifndef __AVX512F__
#error "__AVX512F__ must be enabled for this module to work"
#endif
#define LIKELY(cond) \
__builtin_expect((cond), 1)
#define UNLIKELY(cond) \
__builtin_expect((cond), 0)
#define IS_ALIGNED_TO(ptr, bytes) (reinterpret_cast<uintptr_t>(ptr) % (bytes) == 0)
#if defined(__GNUC__)
#define IS_NOT_ALIGNED_TO(ptr, bytes) \
__builtin_expect(!!(reinterpret_cast<intptr_t>(ptr) % (bytes)), 0)
#else
#define IS_NOT_ALIGNED_TO(ptr, bytes) \
(!!(reinterpret_cast<intptr_t>(ptr) % (bytes)))
#endif
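/*
 * Example: IS_ALIGNED_TO(ptr, sizeof(__m512)) is true when ptr is a
 * multiple of 64 bytes and therefore safe for aligned ZMM accesses such
 * as _mm512_load_ps(). Likewise, sizeof(__m256) checks 32-byte alignment
 * for YMM accesses and sizeof(__m128) checks 16-byte alignment for XMM.
 */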
#ifdef __cplusplus
#define C_FUNC extern "C"
#else
#define C_FUNC
#endif
/**
* Local functions
*/
static inline __m256 avx_abs_ps(__m256 x);
static inline __m256 avx_getmax_ps(__m256 vmax);
static inline __m256 avx_getmin_ps(__m256 vmin);
static void
x86_avx512f_mix_buffers_with_gain_unaligned(float *dst, const float *src, uint32_t nframes, float gain);
static void
x86_avx512f_mix_buffers_with_gain_aligned(float *dst, const float *src, uint32_t nframes, float gain);
static void
x86_avx512f_mix_buffers_no_gain_unaligned(float *dst, const float *src, uint32_t nframes);
static void
x86_avx512f_mix_buffers_no_gain_aligned(float *dst, const float *src, uint32_t nframes);
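/*
 * Note: the static helpers declared above (avx_abs_ps, avx_getmax_ps,
 * avx_getmin_ps and the *_aligned/*_unaligned mix variants) are neither
 * defined nor referenced in this WIP file; they appear to be carried over
 * from the earlier AVX implementation this code is based on.
 */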
/**
* Module implementation
*/
/**
* @brief x86-64 AVX-512F optimized routine for the compute-peak procedure
* @param src Pointer to source buffer
* @param nframes Number of frames to process
* @param current Current peak value
* @return float New peak value
*/
C_FUNC float
x86_avx512f_compute_peak(const float *src, uint32_t nframes, float current)
{
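// Overall strategy: peel off the unaligned head with scalar/XMM/YMM loads
// until src is 64-byte aligned, stream the bulk through unrolled ZMM loops
// (256/128/64 samples per iteration), finish the tail in 16/8/4/1-sample
// steps, and finally reduce the accumulated ZMM maximum to a scalar.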
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = (int32_t)nframes;
// Broadcast the current max values to all elements of the ZMM register
__m512 zmax = _mm512_set1_ps(current);
// Process the unaligned head in 1/4/8-sample steps until 64-byte alignment is reached
while (frames > 0) {
if (IS_ALIGNED_TO(src, sizeof(__m512))) {
break;
}
if ((frames >= 8) && IS_ALIGNED_TO(src, sizeof(__m256))) {
__m512 x = _mm512_castps256_ps512(_mm256_load_ps(src));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
src += 8;
frames -= 8;
continue;
}
if ((frames >= 4) && IS_ALIGNED_TO(src, sizeof(__m128))) {
__m512 x = _mm512_castps128_ps512(_mm_load_ps(src));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
src += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m512 x = _mm512_castps128_ps512(_mm_load_ss(src));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
++src;
--frames;
}
while (frames >= 256) {
_mm_prefetch(reinterpret_cast<void const *>(src + 256), _mm_hint(0));
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
__m512 x8 = _mm512_load_ps(src + 128);
__m512 x9 = _mm512_load_ps(src + 144);
__m512 x10 = _mm512_load_ps(src + 160);
__m512 x11 = _mm512_load_ps(src + 176);
__m512 x12 = _mm512_load_ps(src + 192);
__m512 x13 = _mm512_load_ps(src + 208);
__m512 x14 = _mm512_load_ps(src + 224);
__m512 x15 = _mm512_load_ps(src + 240);
x0 = _mm512_abs_ps(x0);
x1 = _mm512_abs_ps(x1);
x2 = _mm512_abs_ps(x2);
x3 = _mm512_abs_ps(x3);
x4 = _mm512_abs_ps(x4);
x5 = _mm512_abs_ps(x5);
x6 = _mm512_abs_ps(x6);
x7 = _mm512_abs_ps(x7);
x8 = _mm512_abs_ps(x8);
x9 = _mm512_abs_ps(x9);
x10 = _mm512_abs_ps(x10);
x11 = _mm512_abs_ps(x11);
x12 = _mm512_abs_ps(x12);
x13 = _mm512_abs_ps(x13);
x14 = _mm512_abs_ps(x14);
x15 = _mm512_abs_ps(x15);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
zmax = _mm512_max_ps(zmax, x4);
zmax = _mm512_max_ps(zmax, x5);
zmax = _mm512_max_ps(zmax, x6);
zmax = _mm512_max_ps(zmax, x7);
zmax = _mm512_max_ps(zmax, x8);
zmax = _mm512_max_ps(zmax, x9);
zmax = _mm512_max_ps(zmax, x10);
zmax = _mm512_max_ps(zmax, x11);
zmax = _mm512_max_ps(zmax, x12);
zmax = _mm512_max_ps(zmax, x13);
zmax = _mm512_max_ps(zmax, x14);
zmax = _mm512_max_ps(zmax, x15);
src += 256;
frames -= 256;
}
while (frames >= 128) {
_mm_prefetch(reinterpret_cast<void const *>(src + 128), _mm_hint(0));
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
x0 = _mm512_abs_ps(x0);
x1 = _mm512_abs_ps(x1);
x2 = _mm512_abs_ps(x2);
x3 = _mm512_abs_ps(x3);
x4 = _mm512_abs_ps(x4);
x5 = _mm512_abs_ps(x5);
x6 = _mm512_abs_ps(x6);
x7 = _mm512_abs_ps(x7);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
zmax = _mm512_max_ps(zmax, x4);
zmax = _mm512_max_ps(zmax, x5);
zmax = _mm512_max_ps(zmax, x6);
zmax = _mm512_max_ps(zmax, x7);
src += 128;
frames -= 128;
}
while (frames >= 64) {
_mm_prefetch(reinterpret_cast<void const *>(src + 64), _mm_hint(0));
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
x0 = _mm512_abs_ps(x0);
x1 = _mm512_abs_ps(x1);
x2 = _mm512_abs_ps(x2);
x3 = _mm512_abs_ps(x3);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
src += 64;
frames -= 64;
}
// Process the remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(src);
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
src += 16;
frames -= 16;
}
// Process the remaining samples 8 at a time
while (frames >= 8) {
__m512 x = _mm512_castps256_ps512(_mm256_load_ps(src));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
src += 8;
frames -= 8;
}
// Process the remaining samples 4 at a time
while (frames >= 4) {
__m512 x = _mm512_castps128_ps512(_mm_load_ps(src));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
src += 4;
frames -= 4;
}
// If 1-3 samples are still left, process them one at a time.
while (frames > 0) {
__m512 x = _mm512_castps128_ps512(_mm_load_ss(src));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
++src;
--frames;
}
// Horizontally reduce the ZMM register to a single scalar maximum
current = _mm512_reduce_max_ps(zmax);
// There's a penalty going from AVX mode to SSE mode. This can
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM registers.
_mm256_zeroupper(); // zeros the upper portion of the YMM registers
return current;
}
/**
* @brief x86-64 AVX-512F optimized routine for the find-peaks procedure
* @param src Pointer to source buffer
* @param nframes Number of frames to process
* @param[in,out] minf Current minimum value, updated
* @param[in,out] maxf Current maximum value, updated
*/
C_FUNC void
x86_avx512f_find_peaks(const float *src, uint32_t nframes, float *minf, float *maxf)
{
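// Same structure as x86_avx512f_compute_peak: align the head, run the
// unrolled ZMM loops over the 64-byte-aligned bulk, handle the tail in
// smaller steps, then reduce the accumulated ZMM min/max to scalars.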
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = (int32_t)nframes;
// Broadcast the current min and max values to all elements of the ZMM register
__m512 zmin = _mm512_set1_ps(*minf);
__m512 zmax = _mm512_set1_ps(*maxf);
// Compute min/max of the unaligned head in 1/4/8-sample steps until 64-byte alignment is reached
while (frames > 0) {
if (IS_ALIGNED_TO(src, sizeof(__m512))) {
break;
}
if ((frames >= 8) && IS_ALIGNED_TO(src, sizeof(__m256))) {
__m512 x = _mm512_castps256_ps512(_mm256_load_ps(src));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
src += 8;
frames -= 8;
continue;
}
if ((frames >= 4) && IS_ALIGNED_TO(src, sizeof(__m128))) {
__m512 x = _mm512_castps128_ps512(_mm_load_ps(src));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
src += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m512 x = _mm512_castps128_ps512(_mm_load_ss(src));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
++src;
--frames;
}
while (frames >= 256) {
_mm_prefetch(reinterpret_cast<void const *>(src + 256), _mm_hint(0));
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
__m512 x8 = _mm512_load_ps(src + 128);
__m512 x9 = _mm512_load_ps(src + 144);
__m512 x10 = _mm512_load_ps(src + 160);
__m512 x11 = _mm512_load_ps(src + 176);
__m512 x12 = _mm512_load_ps(src + 192);
__m512 x13 = _mm512_load_ps(src + 208);
__m512 x14 = _mm512_load_ps(src + 224);
__m512 x15 = _mm512_load_ps(src + 240);
zmin = _mm512_min_ps(zmin, x0);
zmin = _mm512_min_ps(zmin, x1);
zmin = _mm512_min_ps(zmin, x2);
zmin = _mm512_min_ps(zmin, x3);
zmin = _mm512_min_ps(zmin, x4);
zmin = _mm512_min_ps(zmin, x5);
zmin = _mm512_min_ps(zmin, x6);
zmin = _mm512_min_ps(zmin, x7);
zmin = _mm512_min_ps(zmin, x8);
zmin = _mm512_min_ps(zmin, x9);
zmin = _mm512_min_ps(zmin, x10);
zmin = _mm512_min_ps(zmin, x11);
zmin = _mm512_min_ps(zmin, x12);
zmin = _mm512_min_ps(zmin, x13);
zmin = _mm512_min_ps(zmin, x14);
zmin = _mm512_min_ps(zmin, x15);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
zmax = _mm512_max_ps(zmax, x4);
zmax = _mm512_max_ps(zmax, x5);
zmax = _mm512_max_ps(zmax, x6);
zmax = _mm512_max_ps(zmax, x7);
zmax = _mm512_max_ps(zmax, x8);
zmax = _mm512_max_ps(zmax, x9);
zmax = _mm512_max_ps(zmax, x10);
zmax = _mm512_max_ps(zmax, x11);
zmax = _mm512_max_ps(zmax, x12);
zmax = _mm512_max_ps(zmax, x13);
zmax = _mm512_max_ps(zmax, x14);
zmax = _mm512_max_ps(zmax, x15);
src += 256;
frames -= 256;
}
while (frames >= 128) {
_mm_prefetch(reinterpret_cast<void const *>(src + 128), _mm_hint(0));
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
zmin = _mm512_min_ps(zmin, x0);
zmin = _mm512_min_ps(zmin, x1);
zmin = _mm512_min_ps(zmin, x2);
zmin = _mm512_min_ps(zmin, x3);
zmin = _mm512_min_ps(zmin, x4);
zmin = _mm512_min_ps(zmin, x5);
zmin = _mm512_min_ps(zmin, x6);
zmin = _mm512_min_ps(zmin, x7);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
zmax = _mm512_max_ps(zmax, x4);
zmax = _mm512_max_ps(zmax, x5);
zmax = _mm512_max_ps(zmax, x6);
zmax = _mm512_max_ps(zmax, x7);
src += 128;
frames -= 128;
}
while (frames >= 64) {
_mm_prefetch(reinterpret_cast<void const *>(src + 64), _mm_hint(0));
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
zmin = _mm512_min_ps(zmin, x0);
zmin = _mm512_min_ps(zmin, x1);
zmin = _mm512_min_ps(zmin, x2);
zmin = _mm512_min_ps(zmin, x3);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
src += 64;
frames -= 64;
}
// Process the remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(src);
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
src += 16;
frames -= 16;
}
// Process the remaining samples 8 at a time
while (frames >= 8) {
__m512 x = _mm512_castps256_ps512(_mm256_load_ps(src));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
src += 8;
frames -= 8;
}
// Process the remaining samples 4 at a time
while (frames >= 4) {
__m512 x = _mm512_castps128_ps512(_mm_load_ps(src));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
src += 4;
frames -= 4;
}
// If 1-3 samples are still left, process them one at a time.
while (frames > 0) {
__m512 x = _mm512_castps128_ps512(_mm_load_ss(src));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
++src;
--frames;
}
// Get min and max of the ZMM registers
*minf = _mm512_reduce_min_ps(zmin);
*maxf = _mm512_reduce_max_ps(zmax);
// There's a penalty going from AVX mode to SSE mode. This can
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM registers.
_mm256_zeroupper(); // zeros the upper portion of the YMM registers
}
/**
* @brief x86-64 AVX-512F optimized routine for applying gain to a buffer
* @param[in,out] dst Pointer to the destination buffer, which gets updated
* @param nframes Number of frames (or samples) to process
* @param gain Gain to apply
*/
C_FUNC void
x86_avx512f_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain)
{
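// In-place scaling, so only dst needs an alignment check: the head is
// processed with scalar/XMM/YMM multiplies until dst is 64-byte aligned,
// the bulk with an unrolled 128-sample ZMM loop, and the tail in
// 16/8/4/1-sample steps.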
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = (int32_t)nframes;
// Load gain vector to all elements of XMM, YMM, and ZMM register
// It's the same register, but used for SSE, AVX, and AVX512 calculation
__m512 zgain = _mm512_set1_ps(gain);
__m256 ygain = _mm512_castps512_ps256(zgain);
__m128 xgain = _mm512_castps512_ps128(zgain);
while (frames > 0) {
if (LIKELY(IS_ALIGNED_TO(dst, sizeof(__m512)))) {
break;
}
if ((frames >= 8) && IS_ALIGNED_TO(dst, sizeof(__m256))) {
__m256 x = _mm256_load_ps(dst);
__m256 y = _mm256_mul_ps(ygain, x);
_mm256_store_ps(dst, y);
dst += 8;
frames -= 8;
continue;
}
if ((frames >= 4) && IS_ALIGNED_TO(dst, sizeof(__m128))) {
__m128 x = _mm_load_ps(dst);
__m128 y = _mm_mul_ps(xgain, x);
_mm_store_ps(dst, y);
dst += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m128 x = _mm_load_ss(dst);
__m128 y = _mm_mul_ss(xgain, x);
_mm_store_ss(dst, y);
++dst;
--frames;
}
// Process the remaining samples 128 at a time
while (frames >= 128) {
_mm_prefetch(reinterpret_cast<void const *>(dst + 128), _mm_hint(0));
__m512 x0 = _mm512_load_ps(dst + 0);
__m512 x1 = _mm512_load_ps(dst + 16);
__m512 x2 = _mm512_load_ps(dst + 32);
__m512 x3 = _mm512_load_ps(dst + 48);
__m512 x4 = _mm512_load_ps(dst + 64);
__m512 x5 = _mm512_load_ps(dst + 80);
__m512 x6 = _mm512_load_ps(dst + 96);
__m512 x7 = _mm512_load_ps(dst + 112);
__m512 y0 = _mm512_mul_ps(zgain, x0);
__m512 y1 = _mm512_mul_ps(zgain, x1);
__m512 y2 = _mm512_mul_ps(zgain, x2);
__m512 y3 = _mm512_mul_ps(zgain, x3);
__m512 y4 = _mm512_mul_ps(zgain, x4);
__m512 y5 = _mm512_mul_ps(zgain, x5);
__m512 y6 = _mm512_mul_ps(zgain, x6);
__m512 y7 = _mm512_mul_ps(zgain, x7);
_mm512_store_ps(dst + 0, y0);
_mm512_store_ps(dst + 16, y1);
_mm512_store_ps(dst + 32, y2);
_mm512_store_ps(dst + 48, y3);
_mm512_store_ps(dst + 64, y4);
_mm512_store_ps(dst + 80, y5);
_mm512_store_ps(dst + 96, y6);
_mm512_store_ps(dst + 112, y7);
dst += 128;
frames -= 128;
}
// Process the remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(dst);
__m512 y = _mm512_mul_ps(zgain, x);
_mm512_store_ps(dst, y);
dst += 16;
frames -= 16;
}
// Process remaining samples x8
while (frames >= 8) {
__m256 x = _mm256_load_ps(dst);
__m256 y = _mm256_mul_ps(ygain, x);
_mm256_store_ps(dst, y);
dst += 8;
frames -= 8;
}
// Process remaining samples x4
while (frames >= 4) {
__m128 x = _mm_load_ps(dst);
__m128 y = _mm_mul_ps(xgain, x);
_mm_store_ps(dst, y);
dst += 4;
frames -= 4;
}
// Process remaining samples
while (frames > 0) {
__m128 x = _mm_load_ss(dst);
__m128 y = _mm_mul_ss(xgain, x);
_mm_store_ss(dst, y);
++dst;
--frames;
}
// There's a penalty going from AVX mode to SSE mode. This can
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM registers.
_mm256_zeroupper(); // zeros the upper portion of the YMM registers
}
/**
* @brief x86-64 AVX-512F optimized routine for mixing a buffer with gain.
*
* This function falls back to AVX or SSE operations while the pointers
* are only 32- or 16-byte aligned, and switches to full 64-byte AVX-512
* loads and stores once both pointers reach 64-byte alignment.
*
* @param[in,out] dst Pointer to destination buffer, which gets updated
* @param[in] src Pointer to source buffer (not updated)
* @param nframes Number of samples to process
* @param gain Gain to apply
*/
C_FUNC void
x86_avx512f_mix_buffers_with_gain(float *dst, const float *src, uint32_t nframes, float gain)
{
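// Overview: both src and dst must reach 64-byte alignment before the ZMM
// loops are entered. If the two pointers never become 64-byte aligned at
// the same time, the whole buffer is handled by the head loop below at
// the widest width both pointers support (8, 4, or 1 samples per step).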
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = (int32_t)nframes;
// Load gain vector to all elements of XMM, YMM, and ZMM register
// It's the same register, but used for SSE, AVX, and AVX512 calculation
__m512 zgain = _mm512_set1_ps(gain);
__m256 ygain = _mm512_castps512_ps256(zgain);
__m128 xgain = _mm512_castps512_ps128(zgain);
while (frames > 0)
{
if (IS_ALIGNED_TO(src, sizeof(__m512)) &&
IS_ALIGNED_TO(dst, sizeof(__m512))) {
break;
}
if (IS_ALIGNED_TO(src, sizeof(__m256)) &&
IS_ALIGNED_TO(dst, sizeof(__m256))) {
__m256 x = _mm256_load_ps(src);
__m256 y = _mm256_load_ps(dst);
y = _mm256_fmadd_ps(ygain, x, y);
_mm256_store_ps(dst, y);
src += 8;
dst += 8;
frames -= 8;
continue;
}
if (IS_ALIGNED_TO(src, sizeof(__m128)) &&
IS_ALIGNED_TO(dst, sizeof(__m128))) {
__m128 x = _mm_load_ps(src);
__m128 y = _mm_load_ps(dst);
y = _mm_fmadd_ps(xgain, x, y);
_mm_store_ps(dst, y);
src += 4;
dst += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m128 x = _mm_load_ss(src);
__m128 y = _mm_load_ss(dst);
y = _mm_fmadd_ss(xgain, x, y);
_mm_store_ss(dst, y);
++src;
++dst;
--frames;
}
// Process the remaining samples 128 at a time
while (frames >= 128) {
_mm_prefetch(reinterpret_cast<void const *>(src + 128), _mm_hint(0));
_mm_prefetch(reinterpret_cast<void const *>(dst + 128), _mm_hint(0));
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
__m512 y0 = _mm512_load_ps(dst + 0);
__m512 y1 = _mm512_load_ps(dst + 16);
__m512 y2 = _mm512_load_ps(dst + 32);
__m512 y3 = _mm512_load_ps(dst + 48);
__m512 y4 = _mm512_load_ps(dst + 64);
__m512 y5 = _mm512_load_ps(dst + 80);
__m512 y6 = _mm512_load_ps(dst + 96);
__m512 y7 = _mm512_load_ps(dst + 112);
y0 = _mm512_fmadd_ps(zgain, x0, y0);
y1 = _mm512_fmadd_ps(zgain, x1, y1);
y2 = _mm512_fmadd_ps(zgain, x2, y2);
y3 = _mm512_fmadd_ps(zgain, x3, y3);
y4 = _mm512_fmadd_ps(zgain, x4, y4);
y5 = _mm512_fmadd_ps(zgain, x5, y5);
y6 = _mm512_fmadd_ps(zgain, x6, y6);
y7 = _mm512_fmadd_ps(zgain, x7, y7);
_mm512_store_ps(dst + 0, y0);
_mm512_store_ps(dst + 16, y1);
_mm512_store_ps(dst + 32, y2);
_mm512_store_ps(dst + 48, y3);
_mm512_store_ps(dst + 64, y4);
_mm512_store_ps(dst + 80, y5);
_mm512_store_ps(dst + 96, y6);
_mm512_store_ps(dst + 112, y7);
src += 128;
dst += 128;
frames -= 128;
}
// Process the remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(src);
__m512 y = _mm512_load_ps(dst);
y = _mm512_fmadd_ps(zgain, x, y);
_mm512_store_ps(dst, y);
src += 16;
dst += 16;
frames -= 16;
}
// Process remaining samples x8
while (frames >= 8) {
__m256 x = _mm256_load_ps(src);
__m256 y = _mm256_load_ps(dst);
y = _mm256_fmadd_ps(ygain, x, y);
_mm256_store_ps(dst, y);
src += 8;
dst += 8;
frames -= 8;
}
// Process remaining samples x4
while (frames >= 4) {
__m128 x = _mm_load_ps(src);
__m128 y = _mm_load_ps(dst);
y = _mm_fmadd_ps(xgain, x, y);
_mm_store_ps(dst, y);
src += 4;
dst += 4;
frames -= 4;
}
// Process remaining samples
while (frames > 0) {
__m128 x = _mm_load_ss(src);
__m128 y = _mm_load_ss(dst);
y = _mm_fmadd_ss(xgain, x, y);
_mm_store_ss(dst, y);
++src;
++dst;
--frames;
}
// There's a penalty going from AVX mode to SSE mode. This can
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM registers.
_mm256_zeroupper(); // zeros the upper portion of the YMM registers
}
/**
* @brief x86-64 AVX-512F optimized routine for mixing a buffer with no gain.
*
* This function falls back to AVX or SSE operations while the pointers
* are only 32- or 16-byte aligned, and switches to full 64-byte AVX-512
* loads and stores once both pointers reach 64-byte alignment.
*
* @param[in,out] dst Pointer to destination buffer, which gets updated
* @param[in] src Pointer to source buffer (not updated)
* @param nframes Number of samples to process
*/
C_FUNC void
x86_avx512f_mix_buffers_no_gain(float *dst, const float *src, uint32_t nframes)
{
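// Same alignment strategy as the gain variant above: the ZMM loops require
// both pointers to be 64-byte aligned; otherwise the head loop below
// processes the entire buffer at the widest width both pointers support
// (8, 4, or 1 samples per step).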
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = (int32_t)nframes;
while (frames > 0)
{
if (IS_ALIGNED_TO(src, sizeof(__m512)) &&
IS_ALIGNED_TO(dst, sizeof(__m512))) {
break;
}
if (IS_ALIGNED_TO(src, sizeof(__m256)) &&
IS_ALIGNED_TO(dst, sizeof(__m256))) {
__m256 x = _mm256_load_ps(src);
__m256 y = _mm256_load_ps(dst);
y = _mm256_add_ps(x, y);
_mm256_store_ps(dst, y);
src += 8;
dst += 8;
frames -= 8;
continue;
}
if (IS_ALIGNED_TO(src, sizeof(__m128)) &&
IS_ALIGNED_TO(dst, sizeof(__m128))) {
__m128 x = _mm_load_ps(src);
__m128 y = _mm_load_ps(dst);
y = _mm_add_ps(x, y);
_mm_store_ps(dst, y);
src += 4;
dst += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m128 x = _mm_load_ss(src);
__m128 y = _mm_load_ss(dst);
y = _mm_add_ss(x, y);
_mm_store_ss(dst, y);
++src;
++dst;
--frames;
}
// Process the remaining samples 128 at a time
while (frames >= 128) {
_mm_prefetch(reinterpret_cast<void const *>(src + 128), _mm_hint(0));
_mm_prefetch(reinterpret_cast<void const *>(dst + 128), _mm_hint(0));
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
__m512 y0 = _mm512_load_ps(dst + 0);
__m512 y1 = _mm512_load_ps(dst + 16);
__m512 y2 = _mm512_load_ps(dst + 32);
__m512 y3 = _mm512_load_ps(dst + 48);
__m512 y4 = _mm512_load_ps(dst + 64);
__m512 y5 = _mm512_load_ps(dst + 80);
__m512 y6 = _mm512_load_ps(dst + 96);
__m512 y7 = _mm512_load_ps(dst + 112);
y0 = _mm512_add_ps(x0, y0);
y1 = _mm512_add_ps(x1, y1);
y2 = _mm512_add_ps(x2, y2);
y3 = _mm512_add_ps(x3, y3);
y4 = _mm512_add_ps(x4, y4);
y5 = _mm512_add_ps(x5, y5);
y6 = _mm512_add_ps(x6, y6);
y7 = _mm512_add_ps(x7, y7);
_mm512_store_ps(dst + 0, y0);
_mm512_store_ps(dst + 16, y1);
_mm512_store_ps(dst + 32, y2);
_mm512_store_ps(dst + 48, y3);
_mm512_store_ps(dst + 64, y4);
_mm512_store_ps(dst + 80, y5);
_mm512_store_ps(dst + 96, y6);
_mm512_store_ps(dst + 112, y7);
src += 128;
dst += 128;
frames -= 128;
}
// Process the remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(src);
__m512 y = _mm512_load_ps(dst);
y = _mm512_add_ps(x, y);
_mm512_store_ps(dst, y);
src += 16;
dst += 16;
frames -= 16;
}
// Process remaining samples x8
while (frames >= 8) {
__m256 x = _mm256_load_ps(src);
__m256 y = _mm256_load_ps(dst);
y = _mm256_add_ps(x, y);
_mm256_store_ps(dst, y);
src += 8;
dst += 8;
frames -= 8;
}
// Process remaining samples x4
while (frames >= 4) {
__m128 x = _mm_load_ps(src);
__m128 y = _mm_load_ps(dst);
y = _mm_add_ps(x, y);
_mm_store_ps(dst, y);
src += 4;
dst += 4;
frames -= 4;
}
// Process remaining samples
while (frames > 0) {
__m128 x = _mm_load_ss(src);
__m128 y = _mm_load_ss(dst);
y = _mm_add_ss(x, y);
_mm_store_ss(dst, y);
++src;
++dst;
--frames;
}
// There's a penalty going from AVX mode to SSE mode. This can
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM registers.
_mm256_zeroupper(); // zeros the upper portion of the YMM registers
}
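/**
 * @brief x86-64 AVX-512F optimized routine for copying a vector of floats.
 *
 * Note: only dst is checked for alignment, while aligned loads are used on
 * src throughout, so src is assumed to share dst's alignment.
 *
 * @param[out] dst Pointer to destination buffer
 * @param[in] src Pointer to source buffer
 * @param nframes Number of samples to copy
 */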
C_FUNC void
x86_avx512f_copy_vector(float *dst, const float *src, uint32_t nframes)
{
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = (int32_t)nframes;
while (frames > 0) {
if (LIKELY(IS_ALIGNED_TO(dst, sizeof(__m512)))) {
break;
}
if ((frames >= 8) && IS_ALIGNED_TO(dst, sizeof(__m256))) {
__m256 x = _mm256_load_ps(src);
_mm256_store_ps(dst, x);
src += 8;
dst += 8;
frames -= 8;
continue;
}
if ((frames >= 4) && IS_ALIGNED_TO(dst, sizeof(__m128))) {
__m128 x = _mm_load_ps(src);
_mm_store_ps(dst, x);
src += 4;
dst += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m128 x = _mm_load_ss(src);
_mm_store_ss(dst, x);
++src;
++dst;
--frames;
}
while (frames >= 256) {
_mm_prefetch(reinterpret_cast<void const *>(src + 256), _mm_hint(0));
_mm_prefetch(reinterpret_cast<void const *>(dst + 256), _mm_hint(0));
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
__m512 x8 = _mm512_load_ps(src + 128);
__m512 x9 = _mm512_load_ps(src + 144);
__m512 x10 = _mm512_load_ps(src + 160);
__m512 x11 = _mm512_load_ps(src + 176);
__m512 x12 = _mm512_load_ps(src + 192);
__m512 x13 = _mm512_load_ps(src + 208);
__m512 x14 = _mm512_load_ps(src + 224);
__m512 x15 = _mm512_load_ps(src + 240);
_mm512_store_ps(dst + 0, x0);
_mm512_store_ps(dst + 16, x1);
_mm512_store_ps(dst + 32, x2);
_mm512_store_ps(dst + 48, x3);
_mm512_store_ps(dst + 64, x4);
_mm512_store_ps(dst + 80, x5);
_mm512_store_ps(dst + 96, x6);
_mm512_store_ps(dst + 112, x7);
_mm512_store_ps(dst + 128, x8);
_mm512_store_ps(dst + 144, x9);
_mm512_store_ps(dst + 160, x10);
_mm512_store_ps(dst + 176, x11);
_mm512_store_ps(dst + 192, x12);
_mm512_store_ps(dst + 208, x13);
_mm512_store_ps(dst + 224, x14);
_mm512_store_ps(dst + 240, x15);
src += 256;
dst += 256;
frames -= 256;
}
while (frames >= 128) {
_mm_prefetch(reinterpret_cast<void const *>(src + 128), _mm_hint(0));
_mm_prefetch(reinterpret_cast<void const *>(dst + 128), _mm_hint(0));
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
_mm512_store_ps(dst + 0, x0);
_mm512_store_ps(dst + 16, x1);
_mm512_store_ps(dst + 32, x2);
_mm512_store_ps(dst + 48, x3);
_mm512_store_ps(dst + 64, x4);
_mm512_store_ps(dst + 80, x5);
_mm512_store_ps(dst + 96, x6);
_mm512_store_ps(dst + 112, x7);
src += 128;
dst += 128;
frames -= 128;
}
// Process the remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(src);
_mm512_store_ps(dst, x);
src += 16;
dst += 16;
frames -= 16;
}
// Process remaining samples x8
while (frames >= 8) {
__m256 x = _mm256_load_ps(src);
_mm256_store_ps(dst, x);
src += 8;
dst += 8;
frames -= 8;
}
// Process remaining samples x4
while (frames >= 4) {
__m128 x = _mm_load_ps(src);
_mm_store_ps(dst, x);
src += 4;
dst += 4;
frames -= 4;
}
// Process remaining samples
while (frames > 0) {
__m128 x = _mm_load_ss(src);
_mm_store_ss(dst, x);
++src;
++dst;
--frames;
}
// There's a penalty going from AVX mode to SSE mode. This can
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM registers.
_mm256_zeroupper(); // zeros the upper portion of the YMM registers
}