@ashafq
Last active February 4, 2023 15:59
WIP AVX512F mix function optimization
azureuser@Ardour:~/src2$ g++ -fsanitize=address -O3 -ffast-math -fopenmp -mfma -mavx512f -o test test.cc -lm && ./test 3333 4
compute_peak [def, AVX]: 5.000000e+00, 5.000000e+00
find_peaks [def, AVX]: (-5.000000e+00, 5.000000e+00) (-5.000000e+00, 5.000000e+00)
apply_gain_to_buffer [Error]: 0.000000e+00
mix_buffers_no_gain [Error]: 0.000000e+00
mix_buffers_with_gain [Error]: 0.000000e+00
MICRO_BENCH: default_compute_peak
Average time: 2.86104e-13
MICRO_BENCH: x86_avx512f_compute_peak
Average time: 6.29599e-07
MICRO_BENCH: default_find_peaks
Average time: 6.59345e-06
MICRO_BENCH: x86_avx512f_find_peaks
Average time: 6.56885e-07
MICRO_BENCH: default_apply_gain_to_buffer
Average time: 3.89993e-06
MICRO_BENCH: x86_avx512f_apply_gain_to_buffer
Average time: 5.88325e-07
MICRO_BENCH: default_mix_buffers_no_gain
Average time: 1.04519e-05
MICRO_BENCH: x86_avx512f_mix_buffers_no_gain
Average time: 1.18523e-06
MICRO_BENCH: default_mix_buffers_with_gain
Average time: 1.03626e-05
MICRO_BENCH: x86_avx512f_mix_buffers_with_gain
Average time: 1.79619e-06
MICRO_BENCH: default_copy_vector
Average time: 6.84109e-07
MICRO_BENCH: x86_avx512f_copy_vector
Average time: 1.15288e-06
/*
* Copyright (C) 2023 Ayan Shafqat <ayan@shafq.at>
* Copyright (C) 2023 Paul Davis <paul@linuxaudiosystems.com>
* Copyright (C) 2023 Robin Gareus <robin@gareus.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
// To compile: g++ -fopenmp -fsanitize=address -Os -mavx512f -mfma -o test test.cc -lm && ./test 32768 8
#include <assert.h>
#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
void fill_rand_f32(float *dst, size_t nframes);
float sum_abs_diff_f32(const float *src_a, const float *src_b, uint32_t nframes);
float frandf(void);
/**
* Default "unoptimized" functions
**/
float default_compute_peak(const float *src, uint32_t nframes, float current);
void default_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain);
void default_mix_buffers_with_gain(float *dst, const float *src, uint32_t nframes, float gain);
void default_mix_buffers_no_gain(float *dst, const float *src, uint32_t nframes);
void default_copy_vector(float *dst, const float *src, uint32_t nframes);
void default_find_peaks(const float *buf, uint32_t nsamples, float *minf, float *maxf);
/**
* Optimized AVX-512F functions
**/
extern "C" {
float x86_avx512f_compute_peak(const float *src, uint32_t nframes, float current);
void x86_avx512f_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain);
void x86_avx512f_mix_buffers_with_gain(float *dst, const float *src, uint32_t nframes, float gain);
void x86_avx512f_mix_buffers_no_gain(float *dst, const float *src, uint32_t nframes);
void x86_avx512f_copy_vector(float *dst, const float *src, uint32_t nframes);
void x86_avx512f_find_peaks(const float *buf, uint32_t nsamples, float *minf, float *maxf);
}
#include <omp.h>
#define MICRO_BENCH(__statement, iter) \
do \
{ \
double start = omp_get_wtime(); \
for (long i = 0; i < iter; ++i) \
{ \
do \
{ \
__statement \
} while (0); \
} \
double end = omp_get_wtime(); \
double duration = (end - start) / ((double)iter); \
printf("Average time: %g\n", duration); \
} while (0)
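/*
 * Usage note: MICRO_BENCH runs the given statement block `iter` times and
 * prints the average wall-clock time per iteration, measured with
 * omp_get_wtime(). A minimal sketch of its use:
 *
 *   MICRO_BENCH({
 *       default_copy_vector(dst, src, nframes);
 *   }, 1 << 20);
 *
 * Loop overhead is not subtracted and results are discarded, so a cheap
 * pure function may be optimized away entirely, which may explain the
 * near-zero default_compute_peak time in the captured output above.
 */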
int main(int argc, char **argv)
{
if (argc != 3) {
puts("D [num] [alignment]");
return 1;
}
uint32_t nframes = atoi(argv[1]);
size_t alignment = atoi(argv[2]);
float *src = (float *) aligned_alloc(alignment, nframes * sizeof(float));
float *dst = (float *) aligned_alloc(alignment, nframes * sizeof(float));
float *ref = (float *) aligned_alloc(alignment, nframes * sizeof(float));
srand(((uintptr_t)src) ^ ((uintptr_t)dst));
assert(src && "src is NULL");
assert(dst && "dst is NULL");
assert(ref && "ref is NULL");
fill_rand_f32(src, nframes);
fill_rand_f32(dst, nframes);
/* Compute peak */
{
src[5] = 5.0F;
src[6] = -5.0F;
float peak_d = default_compute_peak(src, nframes, 0.0F);
float peak_a = x86_avx512f_compute_peak(src, nframes, 0.0F);
printf("compute_peak [def, AVX]: %e, %e\n", peak_d, peak_a);
}
/* Find peak */
{
float amin, bmin, amax, bmax;
amin = bmin = __builtin_inf();
amax = bmax = 0.0F;
default_find_peaks(src, nframes, &amin, &amax);
x86_avx512f_find_peaks(src + 1, nframes - 1, &bmin, &bmax);
printf("find_peaks [def, AVX]: (%e, %e) (%e, %e)\n", amin, amax, bmin, bmax);
}
/* Apply gain */
{
float gain = frandf();
fill_rand_f32(src, nframes);
default_copy_vector(ref, src, nframes);
default_apply_gain_to_buffer(src, nframes, gain);
x86_avx512f_apply_gain_to_buffer(ref, nframes, gain);
printf("apply_gain_to_bufer [Error]: %e\n", sum_abs_diff_f32(ref, src, nframes));
}
/* Mix buffer no gain */
{
fill_rand_f32(src, nframes);
fill_rand_f32(dst, nframes);
default_copy_vector(ref, dst, nframes);
default_mix_buffers_no_gain(ref, src, nframes);
x86_avx512f_mix_buffers_no_gain(dst, src, nframes);
printf("mix_buffers_no_gain [Error]: %e\n", sum_abs_diff_f32(ref, dst, nframes));
}
/* Mix buffer with gain */
{
float gain = frandf();
fill_rand_f32(src, nframes);
fill_rand_f32(dst, nframes);
default_copy_vector(ref, dst, nframes);
default_mix_buffers_with_gain(ref, src, nframes, gain);
x86_avx512f_mix_buffers_with_gain(dst, src, nframes, gain);
printf("mix_buffers_with_gain [Error]: %e\n", sum_abs_diff_f32(ref, dst, nframes));
}
#define ITER (1 << 20)
puts("MICRO_BENCH: default_compute_peak");
MICRO_BENCH({
(void)default_compute_peak(src, nframes, 0.0F);
},ITER);
puts("MICRO_BENCH: x86_avx_compute_peak");
MICRO_BENCH({
(void)x86_avx512f_compute_peak(src, nframes, 0.0F);
},ITER);
puts("MICRO_BENCH: default_find_peaks");
float a = 0.0F, b = 0.0F;
MICRO_BENCH({
(void)default_find_peaks(src, nframes, &a, &b);
},ITER);
puts("MICRO_BENCH: x86_avx512f_find_peaks");
MICRO_BENCH({
(void)x86_avx512f_find_peaks(src, nframes, &a, &b);
},ITER);
float gain = frandf();
puts("MICRO_BENCH: default_apply_gain_to_buffer");
MICRO_BENCH({
default_apply_gain_to_buffer(src, nframes, gain);
},ITER);
puts("MICRO_BENCH: x86_avx512f_apply_gain_to_buffer");
MICRO_BENCH({
x86_avx512f_apply_gain_to_buffer(src, nframes, gain);
},ITER);
puts("MICRO_BENCH: default_mix_buffers_no_gain");
MICRO_BENCH({
default_mix_buffers_no_gain(dst, src, nframes);
},ITER);
puts("MICRO_BENCH: x86_avx512f_mix_buffers_no_gain");
MICRO_BENCH({
x86_avx512f_mix_buffers_no_gain(dst, src, nframes);
},ITER);
puts("MICRO_BENCH: default_mix_buffers_with_gain");
MICRO_BENCH({
default_mix_buffers_with_gain(dst, src, nframes, gain);
},ITER);
puts("MICRO_BENCH: x86_avx512f_mix_buffers_with_gain");
MICRO_BENCH({
x86_avx512f_mix_buffers_with_gain(dst, src, nframes, gain);
},ITER);
puts("MICRO_BENCH: default_copy_vector");
MICRO_BENCH({
default_copy_vector(dst, src, nframes);
},ITER);
puts("MICRO_BENCH: x86_avx512f_copy_vector");
MICRO_BENCH({
x86_avx512f_copy_vector(dst, src, nframes);
},ITER);
free(src);
free(dst);
free(ref);
}
float frandf(void)
{
const float scale = 1.0F / ((float)RAND_MAX);
return scale * ((float)(rand()));
}
void fill_rand_f32(float *dst, size_t nframes)
{
const float scale = 2.0F / ((float)RAND_MAX);
for (size_t i = 0; i < nframes; ++i) {
float rval = rand();
dst[i] = rval * scale - 1.0F;
}
}
float sum_abs_diff_f32(const float *src_a, const float *src_b, uint32_t nframes)
{
float sum = 0.0F;
for (uint32_t i = 0; i < nframes; ++i) {
sum += fabsf(src_a[i] - src_b[i]);
}
return sum;
}
/**
* Default "unoptimized" functions
**/
float default_compute_peak(const float *src, uint32_t nframes, float current)
{
for (uint32_t i = 0; i < nframes; ++i) {
current = fmaxf(current, fabsf(src[i]));
}
return current;
}
void default_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain)
{
for (uint32_t i = 0; i < nframes; ++i)
dst[i] *= gain;
}
void default_mix_buffers_with_gain(float *dst, const float *src, uint32_t nframes, float gain)
{
for (uint32_t i = 0; i < nframes; ++i)
dst[i] = dst[i] + (src[i] * gain);
}
void default_mix_buffers_no_gain(float *dst, const float *src, uint32_t nframes)
{
for (uint32_t i = 0; i < nframes; ++i)
dst[i] += src[i];
}
void default_copy_vector(float *dst, const float *src, uint32_t nframes)
{
memcpy(dst, src, nframes * sizeof(float));
}
void default_find_peaks(const float *buf, uint32_t nframes, float *minf, float *maxf)
{
uint32_t i;
float a, b;
a = *maxf;
b = *minf;
for (i = 0; i < nframes; i++)
{
a = fmaxf(buf[i], a);
b = fminf(buf[i], b);
}
*maxf = a;
*minf = b;
}
/**
* Optimized supercharged method!
**/
#include <immintrin.h>
#include <xmmintrin.h>
#ifndef __AVX512F__
#error "__AVX512F__ must be enabled for this module to work"
#endif
#define LIKELY(cond) \
__builtin_expect((cond), 1)
#define UNLIKELY(cond) \
__builtin_expect((cond), 0)
#define IS_ALIGNED_TO(ptr, bytes) (reinterpret_cast<uintptr_t>(ptr) % (bytes) == 0)
#if defined(__GNUC__)
#define IS_NOT_ALIGNED_TO(ptr, bytes) \
__builtin_expect(!!(reinterpret_cast<intptr_t>(ptr) % (bytes)), 0)
#else
#define IS_NOT_ALIGNED_TO(ptr, bytes) \
(!!(reinterpret_cast<intptr_t>(ptr) % (bytes)))
#endif
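/*
 * Example: IS_ALIGNED_TO(ptr, sizeof(__m512)) is true when ptr is a
 * multiple of 64 bytes and therefore safe for aligned ZMM accesses such
 * as _mm512_load_ps(). Likewise, sizeof(__m256) checks 32-byte alignment
 * for YMM accesses and sizeof(__m128) checks 16-byte alignment for XMM.
 */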
#ifdef __cplusplus
#define C_FUNC extern "C"
#else
#define C_FUNC
#endif
/**
* Local functions
*/
static inline __m256 avx_abs_ps(__m256 x);
static inline __m256 avx_getmax_ps(__m256 vmax);
static inline __m256 avx_getmin_ps(__m256 vmin);
static void
x86_avx512f_mix_buffers_with_gain_unaligned(float *dst, const float *src, uint32_t nframes, float gain);
static void
x86_avx512f_mix_buffers_with_gain_aligned(float *dst, const float *src, uint32_t nframes, float gain);
static void
x86_avx512f_mix_buffers_no_gain_unaligned(float *dst, const float *src, uint32_t nframes);
static void
x86_avx512f_mix_buffers_no_gain_aligned(float *dst, const float *src, uint32_t nframes);
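/*
 * Note: the static helpers declared above (avx_abs_ps, avx_getmax_ps,
 * avx_getmin_ps and the *_aligned/*_unaligned mix variants) are neither
 * defined nor referenced in this WIP file; they appear to be carried over
 * from the earlier AVX implementation this code is based on.
 */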
/**
* Module implementation
*/
/**
* @brief x86-64 AVX-512F optimized routine for the compute-peak procedure
* @param src Pointer to source buffer
* @param nframes Number of frames to process
* @param current Current peak value
* @return float New peak value
*/
C_FUNC float
x86_avx512f_compute_peak(const float *src, uint32_t nframes, float current)
{
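// Overall strategy: peel off the unaligned head with scalar/XMM/YMM loads
// until src is 64-byte aligned, stream the bulk through unrolled ZMM loops
// (256/128/64 samples per iteration), finish the tail in 16/8/4/1-sample
// steps, and finally reduce the accumulated ZMM maximum to a scalar.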
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = (int32_t)nframes;
// Broadcast the current max values to all elements of the ZMM register
__m512 zmax = _mm512_set1_ps(current);
// Process the unaligned head in 1/4/8-sample steps until 64-byte alignment is reached
while (frames > 0) {
if (IS_ALIGNED_TO(src, sizeof(__m512))) {
break;
}
if ((frames >= 8) && IS_ALIGNED_TO(src, sizeof(__m256))) {
__m512 x = _mm512_castps256_ps512(_mm256_load_ps(src));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
src += 8;
frames -= 8;
continue;
}
if ((frames >= 4) && IS_ALIGNED_TO(src, sizeof(__m128))) {
__m512 x = _mm512_castps128_ps512(_mm_load_ps(src));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
src += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m512 x = _mm512_castps128_ps512(_mm_load_ss(src));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
++src;
--frames;
}
while (frames >= 256) {
_mm_prefetch(reinterpret_cast<void const *>(src + 256), _mm_hint(0));
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
__m512 x8 = _mm512_load_ps(src + 128);
__m512 x9 = _mm512_load_ps(src + 144);
__m512 x10 = _mm512_load_ps(src + 160);
__m512 x11 = _mm512_load_ps(src + 176);
__m512 x12 = _mm512_load_ps(src + 192);
__m512 x13 = _mm512_load_ps(src + 208);
__m512 x14 = _mm512_load_ps(src + 224);
__m512 x15 = _mm512_load_ps(src + 240);
x0 = _mm512_abs_ps(x0);
x1 = _mm512_abs_ps(x1);
x2 = _mm512_abs_ps(x2);
x3 = _mm512_abs_ps(x3);
x4 = _mm512_abs_ps(x4);
x5 = _mm512_abs_ps(x5);
x6 = _mm512_abs_ps(x6);
x7 = _mm512_abs_ps(x7);
x8 = _mm512_abs_ps(x8);
x9 = _mm512_abs_ps(x9);
x10 = _mm512_abs_ps(x10);
x11 = _mm512_abs_ps(x11);
x12 = _mm512_abs_ps(x12);
x13 = _mm512_abs_ps(x13);
x14 = _mm512_abs_ps(x14);
x15 = _mm512_abs_ps(x15);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
zmax = _mm512_max_ps(zmax, x4);
zmax = _mm512_max_ps(zmax, x5);
zmax = _mm512_max_ps(zmax, x6);
zmax = _mm512_max_ps(zmax, x7);
zmax = _mm512_max_ps(zmax, x8);
zmax = _mm512_max_ps(zmax, x9);
zmax = _mm512_max_ps(zmax, x10);
zmax = _mm512_max_ps(zmax, x11);
zmax = _mm512_max_ps(zmax, x12);
zmax = _mm512_max_ps(zmax, x13);
zmax = _mm512_max_ps(zmax, x14);
zmax = _mm512_max_ps(zmax, x15);
src += 256;
frames -= 256;
}
while (frames >= 128) {
_mm_prefetch(reinterpret_cast<void const *>(src + 128), _mm_hint(0));
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
x0 = _mm512_abs_ps(x0);
x1 = _mm512_abs_ps(x1);
x2 = _mm512_abs_ps(x2);
x3 = _mm512_abs_ps(x3);
x4 = _mm512_abs_ps(x4);
x5 = _mm512_abs_ps(x5);
x6 = _mm512_abs_ps(x6);
x7 = _mm512_abs_ps(x7);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
zmax = _mm512_max_ps(zmax, x4);
zmax = _mm512_max_ps(zmax, x5);
zmax = _mm512_max_ps(zmax, x6);
zmax = _mm512_max_ps(zmax, x7);
src += 128;
frames -= 128;
}
while (frames >= 64) {
_mm_prefetch(reinterpret_cast<void const *>(src + 64), _mm_hint(0));
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
x0 = _mm512_abs_ps(x0);
x1 = _mm512_abs_ps(x1);
x2 = _mm512_abs_ps(x2);
x3 = _mm512_abs_ps(x3);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
src += 64;
frames -= 64;
}
// Process the remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(src);
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
src += 16;
frames -= 16;
}
// Process the remaining samples 8 at a time
while (frames >= 8) {
__m512 x = _mm512_castps256_ps512(_mm256_load_ps(src));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
src += 8;
frames -= 8;
}
// Process the remaining samples 4 at a time
while (frames >= 4) {
__m512 x = _mm512_castps128_ps512(_mm_load_ps(src));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
src += 4;
frames -= 4;
}
// If 1-3 samples are still left, process them one at a time.
while (frames > 0) {
__m512 x = _mm512_castps128_ps512(_mm_load_ss(src));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
++src;
--frames;
}
// Horizontally reduce the ZMM register to a single scalar maximum
current = _mm512_reduce_max_ps(zmax);
// There's a penalty going from AVX mode to SSE mode. This can
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM registers.
_mm256_zeroupper(); // zeros the upper portion of the YMM registers
return current;
}
/**
* @brief x86-64 AVX-512F optimized routine for the find-peaks procedure
* @param src Pointer to source buffer
* @param nframes Number of frames to process
* @param[in,out] minf Current minimum value, updated
* @param[in,out] maxf Current maximum value, updated
*/
C_FUNC void
x86_avx512f_find_peaks(const float *src, uint32_t nframes, float *minf, float *maxf)
{
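// Same structure as x86_avx512f_compute_peak: align the head, run the
// unrolled ZMM loops over the 64-byte-aligned bulk, handle the tail in
// smaller steps, then reduce the accumulated ZMM min/max to scalars.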
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = (int32_t)nframes;
// Broadcast the current min and max values to all elements of the ZMM register
__m512 zmin = _mm512_set1_ps(*minf);
__m512 zmax = _mm512_set1_ps(*maxf);
// Compute min/max of the unaligned head in 1/4/8-sample steps until 64-byte alignment is reached
while (frames > 0) {
if (IS_ALIGNED_TO(src, sizeof(__m512))) {
break;
}
if ((frames >= 8) && IS_ALIGNED_TO(src, sizeof(__m256))) {
__m512 x = _mm512_castps256_ps512(_mm256_load_ps(src));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
src += 8;
frames -= 8;
continue;
}
if ((frames >= 4) && IS_ALIGNED_TO(src, sizeof(__m128))) {
__m512 x = _mm512_castps128_ps512(_mm_load_ps(src));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
src += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m512 x = _mm512_castps128_ps512(_mm_load_ss(src));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
++src;
--frames;
}
while (frames >= 256) {
_mm_prefetch(reinterpret_cast<void const *>(src + 256), _mm_hint(0));
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
__m512 x8 = _mm512_load_ps(src + 128);
__m512 x9 = _mm512_load_ps(src + 144);
__m512 x10 = _mm512_load_ps(src + 160);
__m512 x11 = _mm512_load_ps(src + 176);
__m512 x12 = _mm512_load_ps(src + 192);
__m512 x13 = _mm512_load_ps(src + 208);
__m512 x14 = _mm512_load_ps(src + 224);
__m512 x15 = _mm512_load_ps(src + 240);
zmin = _mm512_min_ps(zmin, x0);
zmin = _mm512_min_ps(zmin, x1);
zmin = _mm512_min_ps(zmin, x2);
zmin = _mm512_min_ps(zmin, x3);
zmin = _mm512_min_ps(zmin, x4);
zmin = _mm512_min_ps(zmin, x5);
zmin = _mm512_min_ps(zmin, x6);
zmin = _mm512_min_ps(zmin, x7);
zmin = _mm512_min_ps(zmin, x8);
zmin = _mm512_min_ps(zmin, x9);
zmin = _mm512_min_ps(zmin, x10);
zmin = _mm512_min_ps(zmin, x11);
zmin = _mm512_min_ps(zmin, x12);
zmin = _mm512_min_ps(zmin, x13);
zmin = _mm512_min_ps(zmin, x14);
zmin = _mm512_min_ps(zmin, x15);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
zmax = _mm512_max_ps(zmax, x4);
zmax = _mm512_max_ps(zmax, x5);
zmax = _mm512_max_ps(zmax, x6);
zmax = _mm512_max_ps(zmax, x7);
zmax = _mm512_max_ps(zmax, x8);
zmax = _mm512_max_ps(zmax, x9);
zmax = _mm512_max_ps(zmax, x10);
zmax = _mm512_max_ps(zmax, x11);
zmax = _mm512_max_ps(zmax, x12);
zmax = _mm512_max_ps(zmax, x13);
zmax = _mm512_max_ps(zmax, x14);
zmax = _mm512_max_ps(zmax, x15);
src += 256;
frames -= 256;
}
while (frames >= 128) {
_mm_prefetch(reinterpret_cast<void const *>(src + 128), _mm_hint(0));
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
zmin = _mm512_min_ps(zmin, x0);
zmin = _mm512_min_ps(zmin, x1);
zmin = _mm512_min_ps(zmin, x2);
zmin = _mm512_min_ps(zmin, x3);
zmin = _mm512_min_ps(zmin, x4);
zmin = _mm512_min_ps(zmin, x5);
zmin = _mm512_min_ps(zmin, x6);
zmin = _mm512_min_ps(zmin, x7);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
zmax = _mm512_max_ps(zmax, x4);
zmax = _mm512_max_ps(zmax, x5);
zmax = _mm512_max_ps(zmax, x6);
zmax = _mm512_max_ps(zmax, x7);
src += 128;
frames -= 128;
}
while (frames >= 64) {
_mm_prefetch(reinterpret_cast<void const *>(src + 64), _mm_hint(0));
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
zmin = _mm512_min_ps(zmin, x0);
zmin = _mm512_min_ps(zmin, x1);
zmin = _mm512_min_ps(zmin, x2);
zmin = _mm512_min_ps(zmin, x3);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
src += 64;
frames -= 64;
}
// Process the remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(src);
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
src += 16;
frames -= 16;
}
// Process the remaining samples 8 at a time
while (frames >= 8) {
__m512 x = _mm512_castps256_ps512(_mm256_load_ps(src));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
src += 8;
frames -= 8;
}
// Process the remaining samples 4 at a time
while (frames >= 4) {
__m512 x = _mm512_castps128_ps512(_mm_load_ps(src));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
src += 4;
frames -= 4;
}
// If 1-3 samples are still left, process them one at a time.
while (frames > 0) {
__m512 x = _mm512_castps128_ps512(_mm_load_ss(src));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
++src;
--frames;
}
// Get min and max of the ZMM registers
*minf = _mm512_reduce_min_ps(zmin);
*maxf = _mm512_reduce_max_ps(zmax);
// There's a penalty going from AVX mode to SSE mode. This can
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM registers.
_mm256_zeroupper(); // zeros the upper portion of the YMM registers
}
/**
* @brief x86-64 AVX-512F optimized routine for applying gain to a buffer
* @param[in,out] dst Pointer to the destination buffer, which gets updated
* @param nframes Number of frames (or samples) to process
* @param gain Gain to apply
*/
C_FUNC void
x86_avx512f_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain)
{
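// In-place scaling, so only dst needs an alignment check: the head is
// processed with scalar/XMM/YMM multiplies until dst is 64-byte aligned,
// the bulk with an unrolled 128-sample ZMM loop, and the tail in
// 16/8/4/1-sample steps.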
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = (int32_t)nframes;
// Load gain vector to all elements of XMM, YMM, and ZMM register
// It's the same register, but used for SSE, AVX, and AVX512 calculation
__m512 zgain = _mm512_set1_ps(gain);
__m256 ygain = _mm512_castps512_ps256(zgain);
__m128 xgain = _mm512_castps512_ps128(zgain);
while (frames > 0) {
if (LIKELY(IS_ALIGNED_TO(dst, sizeof(__m512)))) {
break;
}
if ((frames >= 8) && IS_ALIGNED_TO(dst, sizeof(__m256))) {
__m256 x = _mm256_load_ps(dst);
__m256 y = _mm256_mul_ps(ygain, x);
_mm256_store_ps(dst, y);
dst += 8;
frames -= 8;
continue;
}
if ((frames >= 4) && IS_ALIGNED_TO(dst, sizeof(__m128))) {
__m128 x = _mm_load_ps(dst);
__m128 y = _mm_mul_ps(xgain, x);
_mm_store_ps(dst, y);
dst += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m128 x = _mm_load_ss(dst);
__m128 y = _mm_mul_ss(xgain, x);
_mm_store_ss(dst, y);
++dst;
--frames;
}
// Process the remaining samples 128 at a time
while (frames >= 128) {
_mm_prefetch(reinterpret_cast<void const *>(dst + 128), _mm_hint(0));
__m512 x0 = _mm512_load_ps(dst + 0);
__m512 x1 = _mm512_load_ps(dst + 16);
__m512 x2 = _mm512_load_ps(dst + 32);
__m512 x3 = _mm512_load_ps(dst + 48);
__m512 x4 = _mm512_load_ps(dst + 64);
__m512 x5 = _mm512_load_ps(dst + 80);
__m512 x6 = _mm512_load_ps(dst + 96);
__m512 x7 = _mm512_load_ps(dst + 112);
__m512 y0 = _mm512_mul_ps(zgain, x0);
__m512 y1 = _mm512_mul_ps(zgain, x1);
__m512 y2 = _mm512_mul_ps(zgain, x2);
__m512 y3 = _mm512_mul_ps(zgain, x3);
__m512 y4 = _mm512_mul_ps(zgain, x4);
__m512 y5 = _mm512_mul_ps(zgain, x5);
__m512 y6 = _mm512_mul_ps(zgain, x6);
__m512 y7 = _mm512_mul_ps(zgain, x7);
_mm512_store_ps(dst + 0, y0);
_mm512_store_ps(dst + 16, y1);
_mm512_store_ps(dst + 32, y2);
_mm512_store_ps(dst + 48, y3);
_mm512_store_ps(dst + 64, y4);
_mm512_store_ps(dst + 80, y5);
_mm512_store_ps(dst + 96, y6);
_mm512_store_ps(dst + 112, y7);
dst += 128;
frames -= 128;
}
// Process the remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(dst);
__m512 y = _mm512_mul_ps(zgain, x);
_mm512_store_ps(dst, y);
dst += 16;
frames -= 16;
}
// Process remaining samples x8
while (frames >= 8) {
__m256 x = _mm256_load_ps(dst);
__m256 y = _mm256_mul_ps(ygain, x);
_mm256_store_ps(dst, y);
dst += 8;
frames -= 8;
}
// Process remaining samples x4
while (frames >= 4) {
__m128 x = _mm_load_ps(dst);
__m128 y = _mm_mul_ps(xgain, x);
_mm_store_ps(dst, y);
dst += 4;
frames -= 4;
}
// Process remaining samples
while (frames > 0) {
__m128 x = _mm_load_ss(dst);
__m128 y = _mm_mul_ss(xgain, x);
_mm_store_ss(dst, y);
++dst;
--frames;
}
// There's a penalty going from AVX mode to SSE mode. This can
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM registers.
_mm256_zeroupper(); // zeros the upper portion of the YMM registers
}
/**
* @brief x86-64 AVX-512F optimized routine for mixing a buffer with gain.
*
* This function falls back to AVX or SSE operations while the pointers
* are only 32- or 16-byte aligned, and switches to full 64-byte AVX-512
* loads and stores once both pointers reach 64-byte alignment.
*
* @param[in,out] dst Pointer to destination buffer, which gets updated
* @param[in] src Pointer to source buffer (not updated)
* @param nframes Number of samples to process
* @param gain Gain to apply
*/
C_FUNC void
x86_avx512f_mix_buffers_with_gain(float *dst, const float *src, uint32_t nframes, float gain)
{
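// Overview: both src and dst must reach 64-byte alignment before the ZMM
// loops are entered. If the two pointers never become 64-byte aligned at
// the same time, the whole buffer is handled by the head loop below at
// the widest width both pointers support (8, 4, or 1 samples per step).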
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = (int32_t)nframes;
// Load gain vector to all elements of XMM, YMM, and ZMM register
// It's the same register, but used for SSE, AVX, and AVX512 calculation
__m512 zgain = _mm512_set1_ps(gain);
__m256 ygain = _mm512_castps512_ps256(zgain);
__m128 xgain = _mm512_castps512_ps128(zgain);
while (frames > 0)
{
if (IS_ALIGNED_TO(src, sizeof(__m512)) &&
IS_ALIGNED_TO(dst, sizeof(__m512))) {
break;
}
if (IS_ALIGNED_TO(src, sizeof(__m256)) &&
IS_ALIGNED_TO(dst, sizeof(__m256))) {
__m256 x = _mm256_load_ps(src);
__m256 y = _mm256_load_ps(dst);
y = _mm256_fmadd_ps(ygain, x, y);
_mm256_store_ps(dst, y);
src += 8;
dst += 8;
frames -= 8;
continue;
}
if (IS_ALIGNED_TO(src, sizeof(__m128)) &&
IS_ALIGNED_TO(dst, sizeof(__m128))) {
__m128 x = _mm_load_ps(src);
__m128 y = _mm_load_ps(dst);
y = _mm_fmadd_ps(xgain, x, y);
_mm_store_ps(dst, y);
src += 4;
dst += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m128 x = _mm_load_ss(src);
__m128 y = _mm_load_ss(dst);
y = _mm_fmadd_ss(xgain, x, y);
_mm_store_ss(dst, y);
++src;
++dst;
--frames;
}
// Process the remaining samples 128 at a time
while (frames >= 128) {
_mm_prefetch(reinterpret_cast<void const *>(src + 128), _mm_hint(0));
_mm_prefetch(reinterpret_cast<void const *>(dst + 128), _mm_hint(0));
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
__m512 y0 = _mm512_load_ps(dst + 0);
__m512 y1 = _mm512_load_ps(dst + 16);
__m512 y2 = _mm512_load_ps(dst + 32);
__m512 y3 = _mm512_load_ps(dst + 48);
__m512 y4 = _mm512_load_ps(dst + 64);
__m512 y5 = _mm512_load_ps(dst + 80);
__m512 y6 = _mm512_load_ps(dst + 96);
__m512 y7 = _mm512_load_ps(dst + 112);
y0 = _mm512_fmadd_ps(zgain, x0, y0);
y1 = _mm512_fmadd_ps(zgain, x1, y1);
y2 = _mm512_fmadd_ps(zgain, x2, y2);
y3 = _mm512_fmadd_ps(zgain, x3, y3);
y4 = _mm512_fmadd_ps(zgain, x4, y4);
y5 = _mm512_fmadd_ps(zgain, x5, y5);
y6 = _mm512_fmadd_ps(zgain, x6, y6);
y7 = _mm512_fmadd_ps(zgain, x7, y7);
_mm512_store_ps(dst + 0, y0);
_mm512_store_ps(dst + 16, y1);
_mm512_store_ps(dst + 32, y2);
_mm512_store_ps(dst + 48, y3);
_mm512_store_ps(dst + 64, y4);
_mm512_store_ps(dst + 80, y5);
_mm512_store_ps(dst + 96, y6);
_mm512_store_ps(dst + 112, y7);
src += 128;
dst += 128;
frames -= 128;
}
// Process the remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(src);
__m512 y = _mm512_load_ps(dst);
y = _mm512_fmadd_ps(zgain, x, y);
_mm512_store_ps(dst, y);
src += 16;
dst += 16;
frames -= 16;
}
// Process remaining samples x8
while (frames >= 8) {
__m256 x = _mm256_load_ps(src);
__m256 y = _mm256_load_ps(dst);
y = _mm256_fmadd_ps(ygain, x, y);
_mm256_store_ps(dst, y);
src += 8;
dst += 8;
frames -= 8;
}
// Process remaining samples x4
while (frames >= 4) {
__m128 x = _mm_load_ps(src);
__m128 y = _mm_load_ps(dst);
y = _mm_fmadd_ps(xgain, x, y);
_mm_store_ps(dst, y);
src += 4;
dst += 4;
frames -= 4;
}
// Process remaining samples
while (frames > 0) {
__m128 x = _mm_load_ss(src);
__m128 y = _mm_load_ss(dst);
y = _mm_fmadd_ss(xgain, x, y);
_mm_store_ss(dst, y);
++src;
++dst;
--frames;
}
// There's a penalty going from AVX mode to SSE mode. This can
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM registers.
_mm256_zeroupper(); // zeros the upper portion of the YMM registers
}
/**
* @brief x86-64 AVX-512F optimized routine for mixing a buffer with no gain.
*
* This function falls back to AVX or SSE operations while the pointers
* are only 32- or 16-byte aligned, and switches to full 64-byte AVX-512
* loads and stores once both pointers reach 64-byte alignment.
*
* @param[in,out] dst Pointer to destination buffer, which gets updated
* @param[in] src Pointer to source buffer (not updated)
* @param nframes Number of samples to process
*/
C_FUNC void
x86_avx512f_mix_buffers_no_gain(float *dst, const float *src, uint32_t nframes)
{
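// Same alignment strategy as the gain variant above: the ZMM loops require
// both pointers to be 64-byte aligned; otherwise the head loop below
// processes the entire buffer at the widest width both pointers support
// (8, 4, or 1 samples per step).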
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = (int32_t)nframes;
while (frames > 0)
{
if (IS_ALIGNED_TO(src, sizeof(__m512)) &&
IS_ALIGNED_TO(dst, sizeof(__m512))) {
break;
}
if (IS_ALIGNED_TO(src, sizeof(__m256)) &&
IS_ALIGNED_TO(dst, sizeof(__m256))) {
__m256 x = _mm256_load_ps(src);
__m256 y = _mm256_load_ps(dst);
y = _mm256_add_ps(x, y);
_mm256_store_ps(dst, y);
src += 8;
dst += 8;
frames -= 8;
continue;
}
if (IS_ALIGNED_TO(src, sizeof(__m128)) &&
IS_ALIGNED_TO(dst, sizeof(__m128))) {
__m128 x = _mm_load_ps(src);
__m128 y = _mm_load_ps(dst);
y = _mm_add_ps(x, y);
_mm_store_ps(dst, y);
src += 4;
dst += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m128 x = _mm_load_ss(src);
__m128 y = _mm_load_ss(dst);
y = _mm_add_ss(x, y);
_mm_store_ss(dst, y);
++src;
++dst;
--frames;
}
// Process the remaining samples 128 at a time
while (frames >= 128) {
_mm_prefetch(reinterpret_cast<void const *>(src + 128), _mm_hint(0));
_mm_prefetch(reinterpret_cast<void const *>(dst + 128), _mm_hint(0));
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
__m512 y0 = _mm512_load_ps(dst + 0);
__m512 y1 = _mm512_load_ps(dst + 16);
__m512 y2 = _mm512_load_ps(dst + 32);
__m512 y3 = _mm512_load_ps(dst + 48);
__m512 y4 = _mm512_load_ps(dst + 64);
__m512 y5 = _mm512_load_ps(dst + 80);
__m512 y6 = _mm512_load_ps(dst + 96);
__m512 y7 = _mm512_load_ps(dst + 112);
y0 = _mm512_add_ps(x0, y0);
y1 = _mm512_add_ps(x1, y1);
y2 = _mm512_add_ps(x2, y2);
y3 = _mm512_add_ps(x3, y3);
y4 = _mm512_add_ps(x4, y4);
y5 = _mm512_add_ps(x5, y5);
y6 = _mm512_add_ps(x6, y6);
y7 = _mm512_add_ps(x7, y7);
_mm512_store_ps(dst + 0, y0);
_mm512_store_ps(dst + 16, y1);
_mm512_store_ps(dst + 32, y2);
_mm512_store_ps(dst + 48, y3);
_mm512_store_ps(dst + 64, y4);
_mm512_store_ps(dst + 80, y5);
_mm512_store_ps(dst + 96, y6);
_mm512_store_ps(dst + 112, y7);
src += 128;
dst += 128;
frames -= 128;
}
// Process the remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(src);
__m512 y = _mm512_load_ps(dst);
y = _mm512_add_ps(x, y);
_mm512_store_ps(dst, y);
src += 16;
dst += 16;
frames -= 16;
}
// Process remaining samples x8
while (frames >= 8) {
__m256 x = _mm256_load_ps(src);
__m256 y = _mm256_load_ps(dst);
y = _mm256_add_ps(x, y);
_mm256_store_ps(dst, y);
src += 8;
dst += 8;
frames -= 8;
}
// Process remaining samples x4
while (frames >= 4) {
__m128 x = _mm_load_ps(src);
__m128 y = _mm_load_ps(dst);
y = _mm_add_ps(x, y);
_mm_store_ps(dst, y);
src += 4;
dst += 4;
frames -= 4;
}
// Process remaining samples
while (frames > 0) {
__m128 x = _mm_load_ss(src);
__m128 y = _mm_load_ss(dst);
y = _mm_add_ss(x, y);
_mm_store_ss(dst, y);
++src;
++dst;
--frames;
}
// There's a penalty going from AVX mode to SSE mode. This can
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM registers.
_mm256_zeroupper(); // zeros the upper portion of the YMM registers
}
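/**
 * @brief x86-64 AVX-512F optimized routine for copying a vector of floats.
 *
 * Note: only dst is checked for alignment, while aligned loads are used on
 * src throughout, so src is assumed to share dst's alignment.
 *
 * @param[out] dst Pointer to destination buffer
 * @param[in] src Pointer to source buffer
 * @param nframes Number of samples to copy
 */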
C_FUNC void
x86_avx512f_copy_vector(float *dst, const float *src, uint32_t nframes)
{
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = (int32_t)nframes;
while (frames > 0) {
if (LIKELY(IS_ALIGNED_TO(dst, sizeof(__m512)))) {
break;
}
if ((frames >= 8) && IS_ALIGNED_TO(dst, sizeof(__m256))) {
__m256 x = _mm256_load_ps(src);
_mm256_store_ps(dst, x);
src += 8;
dst += 8;
frames -= 8;
continue;
}
if ((frames >= 4) && IS_ALIGNED_TO(dst, sizeof(__m128))) {
__m128 x = _mm_load_ps(src);
_mm_store_ps(dst, x);
src += 4;
dst += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m128 x = _mm_load_ss(src);
_mm_store_ss(dst, x);
++src;
++dst;
--frames;
}
while (frames >= 256) {
_mm_prefetch(reinterpret_cast<void const *>(src + 256), _mm_hint(0));
_mm_prefetch(reinterpret_cast<void const *>(dst + 256), _mm_hint(0));
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
__m512 x8 = _mm512_load_ps(src + 128);
__m512 x9 = _mm512_load_ps(src + 144);
__m512 x10 = _mm512_load_ps(src + 160);
__m512 x11 = _mm512_load_ps(src + 176);
__m512 x12 = _mm512_load_ps(src + 192);
__m512 x13 = _mm512_load_ps(src + 208);
__m512 x14 = _mm512_load_ps(src + 224);
__m512 x15 = _mm512_load_ps(src + 240);
_mm512_store_ps(dst + 0, x0);
_mm512_store_ps(dst + 16, x1);
_mm512_store_ps(dst + 32, x2);
_mm512_store_ps(dst + 48, x3);
_mm512_store_ps(dst + 64, x4);
_mm512_store_ps(dst + 80, x5);
_mm512_store_ps(dst + 96, x6);
_mm512_store_ps(dst + 112, x7);
_mm512_store_ps(dst + 128, x8);
_mm512_store_ps(dst + 144, x9);
_mm512_store_ps(dst + 160, x10);
_mm512_store_ps(dst + 176, x11);
_mm512_store_ps(dst + 192, x12);
_mm512_store_ps(dst + 208, x13);
_mm512_store_ps(dst + 224, x14);
_mm512_store_ps(dst + 240, x15);
src += 256;
dst += 256;
frames -= 256;
}
while (frames >= 128) {
_mm_prefetch(reinterpret_cast<void const *>(src + 128), _mm_hint(0));
_mm_prefetch(reinterpret_cast<void const *>(dst + 128), _mm_hint(0));
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
_mm512_store_ps(dst + 0, x0);
_mm512_store_ps(dst + 16, x1);
_mm512_store_ps(dst + 32, x2);
_mm512_store_ps(dst + 48, x3);
_mm512_store_ps(dst + 64, x4);
_mm512_store_ps(dst + 80, x5);
_mm512_store_ps(dst + 96, x6);
_mm512_store_ps(dst + 112, x7);
src += 128;
dst += 128;
frames -= 128;
}
// Process the remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(src);
_mm512_store_ps(dst, x);
src += 16;
dst += 16;
frames -= 16;
}
// Process remaining samples x8
while (frames >= 8) {
__m256 x = _mm256_load_ps(src);
_mm256_store_ps(dst, x);
src += 8;
dst += 8;
frames -= 8;
}
// Process remaining samples x4
while (frames >= 4) {
__m128 x = _mm_load_ps(src);
_mm_store_ps(dst, x);
src += 4;
dst += 4;
frames -= 4;
}
// Process remaining samples
while (frames > 0) {
__m128 x = _mm_load_ss(src);
_mm_store_ss(dst, x);
++src;
++dst;
--frames;
}
// There's a penalty going from AVX mode to SSE mode. This can
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM registers.
_mm256_zeroupper(); // zeros the upper portion of the YMM registers
}