WIP AVX512F mix function optimization
azureuser@Ardour:~/src2$ g++ -fsanitize=address -O3 -ffast-math -fopenmp -mfma -mavx512f -o test test.cc -lm && ./test 3333 4
compute_peak [def, AVX]: 5.000000e+00, 5.000000e+00
find_peaks [def, AVX]: (-5.000000e+00, 5.000000e+00) (-5.000000e+00, 5.000000e+00)
apply_gain_to_buffer [Error]: 0.000000e+00
mix_buffers_no_gain [Error]: 0.000000e+00
mix_buffers_with_gain [Error]: 0.000000e+00
MICRO_BENCH: default_compute_peak
Average time: 2.86104e-13
MICRO_BENCH: x86_avx512f_compute_peak
Average time: 6.29599e-07
MICRO_BENCH: default_find_peaks
Average time: 6.59345e-06
MICRO_BENCH: x86_avx512f_find_peaks
Average time: 6.56885e-07
MICRO_BENCH: default_apply_gain_to_buffer
Average time: 3.89993e-06
MICRO_BENCH: x86_avx512f_apply_gain_to_buffer
Average time: 5.88325e-07
MICRO_BENCH: default_mix_buffers_no_gain
Average time: 1.04519e-05
MICRO_BENCH: x86_avx512f_mix_buffers_no_gain
Average time: 1.18523e-06
MICRO_BENCH: default_mix_buffers_with_gain
Average time: 1.03626e-05
MICRO_BENCH: x86_avx512f_mix_buffers_with_gain
Average time: 1.79619e-06
MICRO_BENCH: default_copy_vector
Average time: 6.84109e-07
MICRO_BENCH: x86_avx512f_copy_vector
Average time: 1.15288e-06
/* | |
* Copyright (C) 2023 Ayan Shafqat <ayan@shafq.at> | |
* Copyright (C) 2023 Paul Davis <paul@linuxaudiosystems.com>
* Copyright (C) 2023 Robin Gareus <robin@gareus.org> | |
* | |
* This program is free software; you can redistribute it and/or modify | |
* it under the terms of the GNU General Public License as published by | |
* the Free Software Foundation; either version 2 of the License, or | |
* (at your option) any later version. | |
* | |
* This program is distributed in the hope that it will be useful, | |
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
* GNU General Public License for more details. | |
* | |
* You should have received a copy of the GNU General Public License along | |
* with this program; if not, write to the Free Software Foundation, Inc., | |
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | |
*/ | |
// To compile: g++ -fopenmp -fsanitize=address -Os -mavx512f -mfma -o test test.cc -lm && ./test 32768 8
#include <assert.h> | |
#include <math.h> | |
#include <stddef.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
void fill_rand_f32(float *dst, size_t nframes); | |
float sum_abs_diff_f32(const float *src_a, const float *src_b, uint32_t nframes); | |
float frandf(void); | |
/** | |
* Default "unoptimized" functions | |
**/ | |
float default_compute_peak(const float *src, uint32_t nframes, float current); | |
void default_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain); | |
void default_mix_buffers_with_gain(float *dst, const float *src, uint32_t nframes, float gain); | |
void default_mix_buffers_no_gain(float *dst, const float *src, uint32_t nframes); | |
void default_copy_vector(float *dst, const float *src, uint32_t nframes); | |
void default_find_peaks(const float *buf, uint32_t nsamples, float *minf, float *maxf); | |
/** | |
* Optimized AVX functions | |
**/ | |
extern "C" { | |
float x86_avx512f_compute_peak(const float *src, uint32_t nframes, float current); | |
void x86_avx512f_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain); | |
void x86_avx512f_mix_buffers_with_gain(float *dst, const float *src, uint32_t nframes, float gain); | |
void x86_avx512f_mix_buffers_no_gain(float *dst, const float *src, uint32_t nframes); | |
void x86_avx512f_copy_vector(float *dst, const float *src, uint32_t nframes); | |
void x86_avx512f_find_peaks(const float *buf, uint32_t nsamples, float *minf, float *maxf); | |
} | |
#include <omp.h> | |
#define MICRO_BENCH(__statement, iter) \ | |
do \ | |
{ \ | |
double start = omp_get_wtime(); \ | |
for (long i = 0; i < iter; ++i) \ | |
{ \ | |
do \ | |
{ \ | |
__statement \ | |
} while (0); \ | |
} \ | |
double end = omp_get_wtime(); \ | |
double duration = (end - start) / ((double)iter); \ | |
printf("Average time: %g\n", duration); \ | |
} while (0) | |
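/*
 * Usage sketch (illustrative): MICRO_BENCH times one statement over `iter`
 * iterations with omp_get_wtime() and prints the mean wall-clock time per
 * iteration, so loop and call overhead are included in the figure, e.g.:
 *
 *   MICRO_BENCH({ default_copy_vector(dst, src, nframes); }, 1 << 20);
 */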
int main(int argc, char **argv) | |
{ | |
if (argc != 3) { | |
puts("D [num] [alignment]"); | |
return 1; | |
} | |
uint32_t nframes = atoi(argv[1]); | |
size_t alignment = atoi(argv[2]); | |
float *src = (float *) aligned_alloc(alignment, nframes * sizeof(float)); | |
float *dst = (float *) aligned_alloc(alignment, nframes * sizeof(float)); | |
float *ref = (float *) aligned_alloc(alignment, nframes * sizeof(float)); | |
srand(((uintptr_t)src) ^ ((uintptr_t)dst)); | |
assert(src && "src is NULL"); | |
assert(dst && "dst is NULL"); | |
assert(ref && "ref is NULL"); | |
fill_rand_f32(src, nframes); | |
fill_rand_f32(dst, nframes); | |
/* Compute peak */ | |
{ | |
src[5] = 5.0F; | |
src[6] = -5.0F; | |
float peak_d = default_compute_peak(src, nframes, 0.0F); | |
float peak_a = x86_avx512f_compute_peak(src, nframes, 0.0F); | |
printf("compute_peak [def, AVX]: %e, %e\n", peak_d, peak_a); | |
} | |
/* Find peak */ | |
{ | |
float amin, bmin, amax, bmax; | |
amin = bmin = __builtin_inf(); | |
amax = bmax = 0.0F; | |
default_find_peaks(src, nframes, &amin, &amax); | |
x86_avx512f_find_peaks(src + 1, nframes - 1, &bmin, &bmax); | |
printf("find_peaks [def, AVX]: (%e, %e) (%e, %e)\n", amin, amax, bmin, bmax); | |
} | |
/* Apply gain */ | |
{ | |
float gain = frandf(); | |
fill_rand_f32(src, nframes); | |
default_copy_vector(ref, src, nframes); | |
default_apply_gain_to_buffer(src, nframes, gain); | |
x86_avx512f_apply_gain_to_buffer(ref, nframes, gain); | |
printf("apply_gain_to_bufer [Error]: %e\n", sum_abs_diff_f32(ref, src, nframes)); | |
} | |
/* Mix buffer no gain */ | |
{ | |
fill_rand_f32(src, nframes); | |
fill_rand_f32(dst, nframes); | |
default_copy_vector(ref, dst, nframes); | |
default_mix_buffers_no_gain(ref, src, nframes); | |
x86_avx512f_mix_buffers_no_gain(dst, src, nframes); | |
printf("mix_buffers_no_gain [Error]: %e\n", sum_abs_diff_f32(ref, dst, nframes)); | |
} | |
/* Mix buffer with gain */ | |
{ | |
float gain = frandf(); | |
fill_rand_f32(src, nframes); | |
fill_rand_f32(dst, nframes); | |
default_copy_vector(ref, dst, nframes); | |
default_mix_buffers_with_gain(ref, src, nframes, gain); | |
x86_avx512f_mix_buffers_with_gain(dst, src, nframes, gain); | |
printf("mix_buffers_with_gain [Error]: %e\n", sum_abs_diff_f32(ref, dst, nframes)); | |
} | |
#define ITER (1 << 20) | |
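// Note: the default_* routines are defined in this translation unit, so at
// -O3 the optimizer may inline and then discard calls whose results are
// unused; an implausibly small default_compute_peak timing (as in the sample
// run) is a hint that this happened, so treat the scalar baselines with care.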
puts("MICRO_BENCH: default_compute_peak"); | |
MICRO_BENCH({ | |
(void)default_compute_peak(src, nframes, 0.0F); | |
},ITER); | |
puts("MICRO_BENCH: x86_avx_compute_peak"); | |
MICRO_BENCH({ | |
(void)x86_avx512f_compute_peak(src, nframes, 0.0F); | |
},ITER); | |
puts("MICRO_BENCH: default_find_peaks"); | |
float a = 0.0F, b = 0.0F;
MICRO_BENCH({ | |
(void)default_find_peaks(src, nframes, &a, &b); | |
},ITER); | |
puts("MICRO_BENCH: x86_avx512f_find_peaks"); | |
MICRO_BENCH({ | |
(void)x86_avx512f_find_peaks(src, nframes, &a, &b); | |
},ITER); | |
float gain = frandf(); | |
puts("MICRO_BENCH: default_apply_gain_to_buffer"); | |
MICRO_BENCH({ | |
default_apply_gain_to_buffer(src, nframes, gain); | |
},ITER); | |
puts("MICRO_BENCH: x86_avx512f_apply_gain_to_buffer"); | |
MICRO_BENCH({ | |
x86_avx512f_apply_gain_to_buffer(src, nframes, gain); | |
},ITER); | |
puts("MICRO_BENCH: default_mix_buffers_no_gain"); | |
MICRO_BENCH({ | |
default_mix_buffers_no_gain(dst, src, nframes); | |
},ITER); | |
puts("MICRO_BENCH: x86_avx512f_mix_buffers_no_gain"); | |
MICRO_BENCH({ | |
x86_avx512f_mix_buffers_no_gain(dst, src, nframes); | |
},ITER); | |
puts("MICRO_BENCH: default_mix_buffers_with_gain"); | |
MICRO_BENCH({ | |
default_mix_buffers_with_gain(dst, src, nframes, gain); | |
},ITER); | |
puts("MICRO_BENCH: x86_avx512f_mix_buffers_with_gain"); | |
MICRO_BENCH({ | |
x86_avx512f_mix_buffers_with_gain(dst, src, nframes, gain); | |
},ITER); | |
puts("MICRO_BENCH: default_copy_vector"); | |
MICRO_BENCH({ | |
default_copy_vector(dst, src, nframes); | |
},ITER); | |
puts("MICRO_BENCH: x86_avx512f_copy_vector"); | |
MICRO_BENCH({ | |
x86_avx512f_copy_vector(dst, src, nframes); | |
},ITER); | |
free(src); | |
free(dst); | |
free(ref); | |
} | |
float frandf(void) | |
{ | |
const float scale = 1.0F / ((float)RAND_MAX); | |
return scale * ((float)(rand())); | |
} | |
void fill_rand_f32(float *dst, size_t nframes) | |
{ | |
const float scale = 2.0F / ((float)RAND_MAX); | |
for (size_t i = 0; i < nframes; ++i) { | |
float rval = rand(); | |
dst[i] = rval * scale - 1.0F; | |
} | |
} | |
float sum_abs_diff_f32(const float *src_a, const float *src_b, uint32_t nframes) | |
{ | |
float sum = 0.0F; | |
for (uint32_t i = 0; i < nframes; ++i) { | |
sum += fabsf(src_a[i] - src_b[i]); | |
} | |
return sum; | |
} | |
/** | |
* Default "unoptimized" functions | |
**/ | |
float default_compute_peak(const float *src, uint32_t nframes, float current) | |
{ | |
for (uint32_t i = 0; i < nframes; ++i) { | |
current = fmaxf(current, fabsf(src[i])); | |
} | |
return current; | |
} | |
void default_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain) | |
{ | |
for (uint32_t i = 0; i < nframes; ++i) | |
dst[i] *= gain; | |
} | |
void default_mix_buffers_with_gain(float *dst, const float *src, uint32_t nframes, float gain) | |
{ | |
for (uint32_t i = 0; i < nframes; ++i) | |
dst[i] = dst[i] + (src[i] * gain); | |
} | |
void default_mix_buffers_no_gain(float *dst, const float *src, uint32_t nframes) | |
{ | |
for (uint32_t i = 0; i < nframes; ++i) | |
dst[i] += src[i]; | |
} | |
void default_copy_vector(float *dst, const float *src, uint32_t nframes) | |
{ | |
memcpy(dst, src, nframes * sizeof(float)); | |
} | |
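/*
 * Note: default_find_peaks() below (and the AVX512F version later in this
 * file) use the incoming *minf / *maxf values as running seeds rather than
 * resetting them, so callers must initialize both before the first call, as
 * main() does above.
 */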
void default_find_peaks(const float *buf, uint32_t nframes, float *minf, float *maxf) | |
{ | |
uint32_t i; | |
float a, b; | |
a = *maxf; | |
b = *minf; | |
for (i = 0; i < nframes; i++) | |
{ | |
a = fmaxf(buf[i], a); | |
b = fminf(buf[i], b); | |
} | |
*maxf = a; | |
*minf = b; | |
} | |
/** | |
* Optimized supercharged method! | |
**/ | |
#include <immintrin.h> | |
#include <xmmintrin.h> | |
#ifndef __AVX512F__ | |
#error "__AVX512F__ must be eanbled for this module to work" | |
#endif | |
#define LIKELY(cond) \ | |
__builtin_expect((cond), 1) | |
#define UNLIKELY(cond) \ | |
__builtin_expect((cond), 0) | |
#define IS_ALIGNED_TO(ptr, bytes) (reinterpret_cast<uintptr_t>(ptr) % (bytes) == 0) | |
#if defined(__GNUC__) | |
#define IS_NOT_ALIGNED_TO(ptr, bytes) \ | |
__builtin_expect(!!(reinterpret_cast<intptr_t>(ptr) % (bytes)), 0) | |
#else | |
#define IS_NOT_ALIGNED_TO(ptr, bytes) \ | |
(!!(reinterpret_cast<intptr_t>(ptr) % (bytes))) | |
#endif | |
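/*
 * Example (illustrative): a buffer from aligned_alloc(64, size) satisfies
 * IS_ALIGNED_TO(ptr, sizeof(__m512)), so the routines below hit their
 * 64-byte-aligned ZMM fast path immediately; less strictly aligned pointers
 * first go through the scalar/SSE/AVX prologue loops.
 */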
#ifdef __cplusplus | |
#define C_FUNC extern "C" | |
#else | |
#define C_FUNC | |
#endif | |
/** | |
* Local functions | |
*/ | |
static inline __m256 avx_abs_ps(__m256 x); | |
static inline __m256 avx_getmax_ps(__m256 vmax); | |
static inline __m256 avx_getmin_ps(__m256 vmin); | |
static void | |
x86_avx512f_mix_buffers_with_gain_unaligned(float *dst, const float *src, uint32_t nframes, float gain); | |
static void | |
x86_avx512f_mix_buffers_with_gain_aligned(float *dst, const float *src, uint32_t nframes, float gain); | |
static void | |
x86_avx512f_mix_buffers_no_gain_unaligned(float *dst, const float *src, uint32_t nframes); | |
static void | |
x86_avx512f_mix_buffers_no_gain_aligned(float *dst, const float *src, uint32_t nframes); | |
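/*
 * WIP note: the avx_*_ps helpers and the *_aligned / *_unaligned mix variants
 * declared above are placeholders and are not yet defined or referenced in
 * this file.
 */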
/** | |
* Module implementation | |
*/ | |
/** | |
* @brief x86-64 AVX optimized routine for compute peak procedure | |
* @param src Pointer to source buffer | |
* @param nframes Number of frames to process | |
* @param current Current peak value | |
* @return float New peak value | |
*/ | |
C_FUNC float | |
x86_avx512f_compute_peak(const float *src, uint32_t nframes, float current) | |
{ | |
// Convert to signed integer to prevent any arithmetic overflow errors | |
int32_t frames = (int32_t)nframes; | |
// Broadcast the current max values to all elements of the ZMM register | |
__m512 zmax = _mm512_set1_ps(current); | |
// Process 1/4/8 samples at a time until src reaches 64-byte (ZMM) alignment
while (frames > 0) { | |
if (IS_ALIGNED_TO(src, sizeof(__m512))) { | |
break; | |
} | |
if (IS_ALIGNED_TO(src, sizeof(__m256))) { | |
__m512 x = _mm512_castps256_ps512(_mm256_load_ps(src)); | |
x = _mm512_abs_ps(x); | |
zmax = _mm512_max_ps(zmax, x); | |
src += 8; | |
frames -= 8; | |
continue; | |
} | |
if (IS_ALIGNED_TO(src, sizeof(__m128))) { | |
__m512 x = _mm512_castps128_ps512(_mm_load_ps(src)); | |
x = _mm512_abs_ps(x); | |
zmax = _mm512_max_ps(zmax, x); | |
src += 4; | |
frames -= 4; | |
continue; | |
} | |
// Pointers are aligned to float boundaries (4 bytes) | |
__m512 x = _mm512_castps128_ps512(_mm_load_ss(src)); | |
x = _mm512_abs_ps(x); | |
zmax = _mm512_max_ps(zmax, x); | |
++src; | |
--frames; | |
} | |
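// Main loop: 256 samples (16 aligned ZMM loads) per iteration, prefetching
// the start of the next block. Hint 0 corresponds to _MM_HINT_NTA
// (non-temporal) in the GCC/Clang intrinsics headers.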
while (frames >= 256) { | |
_mm_prefetch(reinterpret_cast<void const *>(src + 256), _mm_hint(0)); | |
__m512 x0 = _mm512_load_ps(src + 0); | |
__m512 x1 = _mm512_load_ps(src + 16); | |
__m512 x2 = _mm512_load_ps(src + 32); | |
__m512 x3 = _mm512_load_ps(src + 48); | |
__m512 x4 = _mm512_load_ps(src + 64); | |
__m512 x5 = _mm512_load_ps(src + 80); | |
__m512 x6 = _mm512_load_ps(src + 96); | |
__m512 x7 = _mm512_load_ps(src + 112); | |
__m512 x8 = _mm512_load_ps(src + 128); | |
__m512 x9 = _mm512_load_ps(src + 144); | |
__m512 x10 = _mm512_load_ps(src + 160); | |
__m512 x11 = _mm512_load_ps(src + 176); | |
__m512 x12 = _mm512_load_ps(src + 192); | |
__m512 x13 = _mm512_load_ps(src + 208); | |
__m512 x14 = _mm512_load_ps(src + 224); | |
__m512 x15 = _mm512_load_ps(src + 240); | |
x0 = _mm512_abs_ps(x0); | |
x1 = _mm512_abs_ps(x1); | |
x2 = _mm512_abs_ps(x2); | |
x3 = _mm512_abs_ps(x3); | |
x4 = _mm512_abs_ps(x4); | |
x5 = _mm512_abs_ps(x5); | |
x6 = _mm512_abs_ps(x6); | |
x7 = _mm512_abs_ps(x7); | |
x8 = _mm512_abs_ps(x8); | |
x9 = _mm512_abs_ps(x9); | |
x10 = _mm512_abs_ps(x10); | |
x11 = _mm512_abs_ps(x11); | |
x12 = _mm512_abs_ps(x12); | |
x13 = _mm512_abs_ps(x13); | |
x14 = _mm512_abs_ps(x14); | |
x15 = _mm512_abs_ps(x15); | |
zmax = _mm512_max_ps(zmax, x0); | |
zmax = _mm512_max_ps(zmax, x1); | |
zmax = _mm512_max_ps(zmax, x2); | |
zmax = _mm512_max_ps(zmax, x3); | |
zmax = _mm512_max_ps(zmax, x4); | |
zmax = _mm512_max_ps(zmax, x5); | |
zmax = _mm512_max_ps(zmax, x6); | |
zmax = _mm512_max_ps(zmax, x7); | |
zmax = _mm512_max_ps(zmax, x8); | |
zmax = _mm512_max_ps(zmax, x9); | |
zmax = _mm512_max_ps(zmax, x10); | |
zmax = _mm512_max_ps(zmax, x11); | |
zmax = _mm512_max_ps(zmax, x12); | |
zmax = _mm512_max_ps(zmax, x13); | |
zmax = _mm512_max_ps(zmax, x14); | |
zmax = _mm512_max_ps(zmax, x15); | |
src += 256; | |
frames -= 256; | |
} | |
while (frames >= 128) { | |
_mm_prefetch(reinterpret_cast<void const *>(src + 128), _mm_hint(0)); | |
__m512 x0 = _mm512_load_ps(src + 0); | |
__m512 x1 = _mm512_load_ps(src + 16); | |
__m512 x2 = _mm512_load_ps(src + 32); | |
__m512 x3 = _mm512_load_ps(src + 48); | |
__m512 x4 = _mm512_load_ps(src + 64); | |
__m512 x5 = _mm512_load_ps(src + 80); | |
__m512 x6 = _mm512_load_ps(src + 96); | |
__m512 x7 = _mm512_load_ps(src + 112); | |
x0 = _mm512_abs_ps(x0); | |
x1 = _mm512_abs_ps(x1); | |
x2 = _mm512_abs_ps(x2); | |
x3 = _mm512_abs_ps(x3); | |
x4 = _mm512_abs_ps(x4); | |
x5 = _mm512_abs_ps(x5); | |
x6 = _mm512_abs_ps(x6); | |
x7 = _mm512_abs_ps(x7); | |
zmax = _mm512_max_ps(zmax, x0); | |
zmax = _mm512_max_ps(zmax, x1); | |
zmax = _mm512_max_ps(zmax, x2); | |
zmax = _mm512_max_ps(zmax, x3); | |
zmax = _mm512_max_ps(zmax, x4); | |
zmax = _mm512_max_ps(zmax, x5); | |
zmax = _mm512_max_ps(zmax, x6); | |
zmax = _mm512_max_ps(zmax, x7); | |
src += 128; | |
frames -= 128; | |
} | |
while (frames >= 64) { | |
_mm_prefetch(reinterpret_cast<void const *>(src + 64), _mm_hint(0)); | |
__m512 x0 = _mm512_load_ps(src + 0); | |
__m512 x1 = _mm512_load_ps(src + 16); | |
__m512 x2 = _mm512_load_ps(src + 32); | |
__m512 x3 = _mm512_load_ps(src + 48); | |
x0 = _mm512_abs_ps(x0); | |
x1 = _mm512_abs_ps(x1); | |
x2 = _mm512_abs_ps(x2); | |
x3 = _mm512_abs_ps(x3); | |
zmax = _mm512_max_ps(zmax, x0); | |
zmax = _mm512_max_ps(zmax, x1); | |
zmax = _mm512_max_ps(zmax, x2); | |
zmax = _mm512_max_ps(zmax, x3); | |
src += 64; | |
frames -= 64; | |
} | |
// Process the remaining samples 16 at a time | |
while (frames >= 16) { | |
__m512 x = _mm512_load_ps(src); | |
x = _mm512_abs_ps(x); | |
zmax = _mm512_max_ps(zmax, x); | |
src += 16; | |
frames -= 16; | |
} | |
// Process the remaining samples 8 at a time | |
while (frames >= 8) { | |
__m512 x = _mm512_castps256_ps512(_mm256_load_ps(src)); | |
x = _mm512_abs_ps(x); | |
zmax = _mm512_max_ps(zmax, x); | |
src += 8; | |
frames -= 8; | |
} | |
// Process the remaining samples 4 at a time | |
while (frames >= 4) { | |
__m512 x = _mm512_castps128_ps512(_mm_load_ps(src)); | |
x = _mm512_abs_ps(x); | |
zmax = _mm512_max_ps(zmax, x); | |
src += 4; | |
frames -= 4; | |
} | |
// If 1-3 samples remain, process them one at a time.
while (frames > 0) { | |
__m512 x = _mm512_castps128_ps512(_mm_load_ss(src)); | |
x = _mm512_abs_ps(x); | |
zmax = _mm512_max_ps(zmax, x); | |
++src; | |
--frames; | |
} | |
// Get the max of the ZMM registers | |
current = _mm512_reduce_max_ps(zmax); | |
// There's a penalty going from AVX mode to SSE mode. This can | |
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM register. | |
_mm256_zeroupper(); // zeros the upper portion of YMM register | |
return current; | |
} | |
/** | |
* @brief x86-64 AVX optimized routine for find peak procedure | |
* @param src Pointer to source buffer | |
* @param nframes Number of frames to process | |
* @param[in,out] minf Current minimum value, updated | |
* @param[in,out] maxf Current maximum value, updated | |
*/ | |
C_FUNC void | |
x86_avx512f_find_peaks(const float *src, uint32_t nframes, float *minf, float *maxf) | |
{ | |
// Convert to signed integer to prevent any arithmetic overflow errors | |
int32_t frames = (int32_t)nframes; | |
// Broadcast the current min and max values to all elements of the ZMM register | |
__m512 zmin = _mm512_set1_ps(*minf); | |
__m512 zmax = _mm512_set1_ps(*maxf); | |
// Process 1/4/8 samples at a time until src reaches 64-byte (ZMM) alignment
while (frames > 0) { | |
if (IS_ALIGNED_TO(src, sizeof(__m512))) { | |
break; | |
} | |
if (IS_ALIGNED_TO(src, sizeof(__m256))) { | |
__m512 x = _mm512_castps256_ps512(_mm256_load_ps(src)); | |
zmin = _mm512_min_ps(zmin, x); | |
zmax = _mm512_max_ps(zmax, x); | |
src += 8; | |
frames -= 8; | |
continue; | |
} | |
if (IS_ALIGNED_TO(src, sizeof(__m128))) { | |
__m512 x = _mm512_castps128_ps512(_mm_load_ps(src)); | |
zmin = _mm512_min_ps(zmin, x); | |
zmax = _mm512_max_ps(zmax, x); | |
src += 4; | |
frames -= 4; | |
continue; | |
} | |
// Pointers are aligned to float boundaries (4 bytes) | |
__m512 x = _mm512_castps128_ps512(_mm_load_ss(src)); | |
zmin = _mm512_min_ps(zmin, x); | |
zmax = _mm512_max_ps(zmax, x); | |
++src; | |
--frames; | |
} | |
while (frames >= 256) { | |
_mm_prefetch(reinterpret_cast<void const *>(src + 256), _mm_hint(0)); | |
__m512 x0 = _mm512_load_ps(src + 0); | |
__m512 x1 = _mm512_load_ps(src + 16); | |
__m512 x2 = _mm512_load_ps(src + 32); | |
__m512 x3 = _mm512_load_ps(src + 48); | |
__m512 x4 = _mm512_load_ps(src + 64); | |
__m512 x5 = _mm512_load_ps(src + 80); | |
__m512 x6 = _mm512_load_ps(src + 96); | |
__m512 x7 = _mm512_load_ps(src + 112); | |
__m512 x8 = _mm512_load_ps(src + 128); | |
__m512 x9 = _mm512_load_ps(src + 144); | |
__m512 x10 = _mm512_load_ps(src + 160); | |
__m512 x11 = _mm512_load_ps(src + 176); | |
__m512 x12 = _mm512_load_ps(src + 192); | |
__m512 x13 = _mm512_load_ps(src + 208); | |
__m512 x14 = _mm512_load_ps(src + 224); | |
__m512 x15 = _mm512_load_ps(src + 240); | |
zmin = _mm512_min_ps(zmin, x0); | |
zmin = _mm512_min_ps(zmin, x1); | |
zmin = _mm512_min_ps(zmin, x2); | |
zmin = _mm512_min_ps(zmin, x3); | |
zmin = _mm512_min_ps(zmin, x4); | |
zmin = _mm512_min_ps(zmin, x5); | |
zmin = _mm512_min_ps(zmin, x6); | |
zmin = _mm512_min_ps(zmin, x7); | |
zmin = _mm512_min_ps(zmin, x8); | |
zmin = _mm512_min_ps(zmin, x9); | |
zmin = _mm512_min_ps(zmin, x10); | |
zmin = _mm512_min_ps(zmin, x11); | |
zmin = _mm512_min_ps(zmin, x12); | |
zmin = _mm512_min_ps(zmin, x13); | |
zmin = _mm512_min_ps(zmin, x14); | |
zmin = _mm512_min_ps(zmin, x15); | |
zmax = _mm512_max_ps(zmax, x0); | |
zmax = _mm512_max_ps(zmax, x1); | |
zmax = _mm512_max_ps(zmax, x2); | |
zmax = _mm512_max_ps(zmax, x3); | |
zmax = _mm512_max_ps(zmax, x4); | |
zmax = _mm512_max_ps(zmax, x5); | |
zmax = _mm512_max_ps(zmax, x6); | |
zmax = _mm512_max_ps(zmax, x7); | |
zmax = _mm512_max_ps(zmax, x8); | |
zmax = _mm512_max_ps(zmax, x9); | |
zmax = _mm512_max_ps(zmax, x10); | |
zmax = _mm512_max_ps(zmax, x11); | |
zmax = _mm512_max_ps(zmax, x12); | |
zmax = _mm512_max_ps(zmax, x13); | |
zmax = _mm512_max_ps(zmax, x14); | |
zmax = _mm512_max_ps(zmax, x15); | |
src += 256; | |
frames -= 256; | |
} | |
while (frames >= 128) { | |
_mm_prefetch(reinterpret_cast<void const *>(src + 128), _mm_hint(0)); | |
__m512 x0 = _mm512_load_ps(src + 0); | |
__m512 x1 = _mm512_load_ps(src + 16); | |
__m512 x2 = _mm512_load_ps(src + 32); | |
__m512 x3 = _mm512_load_ps(src + 48); | |
__m512 x4 = _mm512_load_ps(src + 64); | |
__m512 x5 = _mm512_load_ps(src + 80); | |
__m512 x6 = _mm512_load_ps(src + 96); | |
__m512 x7 = _mm512_load_ps(src + 112); | |
zmin = _mm512_min_ps(zmin, x0); | |
zmin = _mm512_min_ps(zmin, x1); | |
zmin = _mm512_min_ps(zmin, x2); | |
zmin = _mm512_min_ps(zmin, x3); | |
zmin = _mm512_min_ps(zmin, x4); | |
zmin = _mm512_min_ps(zmin, x5); | |
zmin = _mm512_min_ps(zmin, x6); | |
zmin = _mm512_min_ps(zmin, x7); | |
zmax = _mm512_max_ps(zmax, x0); | |
zmax = _mm512_max_ps(zmax, x1); | |
zmax = _mm512_max_ps(zmax, x2); | |
zmax = _mm512_max_ps(zmax, x3); | |
zmax = _mm512_max_ps(zmax, x4); | |
zmax = _mm512_max_ps(zmax, x5); | |
zmax = _mm512_max_ps(zmax, x6); | |
zmax = _mm512_max_ps(zmax, x7); | |
src += 128; | |
frames -= 128; | |
} | |
while (frames >= 64) { | |
_mm_prefetch(reinterpret_cast<void const *>(src + 64), _mm_hint(0)); | |
__m512 x0 = _mm512_load_ps(src + 0); | |
__m512 x1 = _mm512_load_ps(src + 16); | |
__m512 x2 = _mm512_load_ps(src + 32); | |
__m512 x3 = _mm512_load_ps(src + 48); | |
zmin = _mm512_min_ps(zmin, x0); | |
zmin = _mm512_min_ps(zmin, x1); | |
zmin = _mm512_min_ps(zmin, x2); | |
zmin = _mm512_min_ps(zmin, x3); | |
zmax = _mm512_max_ps(zmax, x0); | |
zmax = _mm512_max_ps(zmax, x1); | |
zmax = _mm512_max_ps(zmax, x2); | |
zmax = _mm512_max_ps(zmax, x3); | |
src += 64; | |
frames -= 64; | |
} | |
// Process the remaining samples 16 at a time | |
while (frames >= 16) { | |
__m512 x = _mm512_load_ps(src); | |
zmin = _mm512_min_ps(zmin, x); | |
zmax = _mm512_max_ps(zmax, x); | |
src += 16; | |
frames -= 16; | |
} | |
// Process the remaining samples 8 at a time | |
while (frames >= 8) { | |
__m512 x = _mm512_castps256_ps512(_mm256_load_ps(src)); | |
zmin = _mm512_min_ps(zmin, x); | |
zmax = _mm512_max_ps(zmax, x); | |
src += 8; | |
frames -= 8; | |
} | |
// Process the remaining samples 4 at a time | |
while (frames >= 4) { | |
__m512 x = _mm512_castps128_ps512(_mm_load_ps(src)); | |
zmin = _mm512_min_ps(zmin, x); | |
zmax = _mm512_max_ps(zmax, x); | |
src += 4; | |
frames -= 4; | |
} | |
// If 1-3 samples remain, process them one at a time.
while (frames > 0) { | |
__m512 x = _mm512_castps128_ps512(_mm_load_ss(src)); | |
zmin = _mm512_min_ps(zmin, x); | |
zmax = _mm512_max_ps(zmax, x); | |
++src; | |
--frames; | |
} | |
// Get min and max of the ZMM registers | |
*minf = _mm512_reduce_min_ps(zmin); | |
*maxf = _mm512_reduce_max_ps(zmax); | |
// There's a penalty going from AVX mode to SSE mode. This can | |
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM register. | |
_mm256_zeroupper(); // zeros the upper portion of YMM register | |
} | |
/** | |
* @brief x86-64 AVX optimized routine for apply gain routine | |
* @param[in,out] dst Pointer to the destination buffer, which gets updated | |
* @param nframes Number of frames (or samples) to process | |
* @param gain Gain to apply | |
*/ | |
C_FUNC void | |
x86_avx512f_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain) | |
{ | |
// Convert to signed integer to prevent any arithmetic overflow errors | |
int32_t frames = (int32_t)nframes; | |
// Load gain vector to all elements of XMM, YMM, and ZMM register | |
// It's the same register, but used for SSE, AVX, and AVX512 calculation | |
__m512 zgain = _mm512_set1_ps(gain); | |
__m256 ygain = _mm512_castps512_ps256(zgain); | |
__m128 xgain = _mm512_castps512_ps128(zgain); | |
while (frames > 0) { | |
if (LIKELY(IS_ALIGNED_TO(dst, sizeof(__m512)))) { | |
break; | |
} | |
if (IS_ALIGNED_TO(dst, sizeof(__m256))) { | |
__m256 x = _mm256_load_ps(dst); | |
__m256 y = _mm256_mul_ps(ygain, x); | |
_mm256_store_ps(dst, y); | |
dst += 8; | |
frames -= 8; | |
continue; | |
} | |
if (IS_ALIGNED_TO(dst, sizeof(__m128))) { | |
__m128 x = _mm_load_ps(dst); | |
__m128 y = _mm_mul_ps(xgain, x); | |
_mm_store_ps(dst, y); | |
dst += 4; | |
frames -= 4; | |
continue; | |
} | |
// Pointers are aligned to float boundaries (4 bytes) | |
__m128 x = _mm_load_ss(dst); | |
__m128 y = _mm_mul_ss(xgain, x); | |
_mm_store_ss(dst, y); | |
++dst; | |
--frames; | |
} | |
// Process the remaining samples 128 at a time | |
while (frames >= 128) { | |
_mm_prefetch(reinterpret_cast<void const *>(dst + 128), _mm_hint(0)); | |
__m512 x0 = _mm512_load_ps(dst + 0); | |
__m512 x1 = _mm512_load_ps(dst + 16); | |
__m512 x2 = _mm512_load_ps(dst + 32); | |
__m512 x3 = _mm512_load_ps(dst + 48); | |
__m512 x4 = _mm512_load_ps(dst + 64); | |
__m512 x5 = _mm512_load_ps(dst + 80); | |
__m512 x6 = _mm512_load_ps(dst + 96); | |
__m512 x7 = _mm512_load_ps(dst + 112); | |
__m512 y0 = _mm512_mul_ps(zgain, x0); | |
__m512 y1 = _mm512_mul_ps(zgain, x1); | |
__m512 y2 = _mm512_mul_ps(zgain, x2); | |
__m512 y3 = _mm512_mul_ps(zgain, x3); | |
__m512 y4 = _mm512_mul_ps(zgain, x4); | |
__m512 y5 = _mm512_mul_ps(zgain, x5); | |
__m512 y6 = _mm512_mul_ps(zgain, x6); | |
__m512 y7 = _mm512_mul_ps(zgain, x7); | |
_mm512_store_ps(dst + 0, y0); | |
_mm512_store_ps(dst + 16, y1); | |
_mm512_store_ps(dst + 32, y2); | |
_mm512_store_ps(dst + 48, y3); | |
_mm512_store_ps(dst + 64, y4); | |
_mm512_store_ps(dst + 80, y5); | |
_mm512_store_ps(dst + 96, y6); | |
_mm512_store_ps(dst + 112, y7); | |
dst += 128; | |
frames -= 128; | |
} | |
// Process the remaining samples 16 at a time | |
while (frames >= 16) { | |
__m512 x = _mm512_load_ps(dst); | |
__m512 y = _mm512_mul_ps(zgain, x); | |
_mm512_store_ps(dst, y); | |
dst += 16; | |
frames -= 16; | |
} | |
// Process remaining samples x8 | |
while (frames >= 8) { | |
__m256 x = _mm256_load_ps(dst); | |
__m256 y = _mm256_mul_ps(ygain, x); | |
_mm256_store_ps(dst, y); | |
dst += 8; | |
frames -= 8; | |
} | |
// Process remaining samples x4 | |
while (frames >= 4) { | |
__m128 x = _mm_load_ps(dst); | |
__m128 y = _mm_mul_ps(xgain, x); | |
_mm_store_ps(dst, y); | |
dst += 4; | |
frames -= 4; | |
} | |
// Process remaining samples | |
while (frames > 0) { | |
__m128 x = _mm_load_ss(dst); | |
__m128 y = _mm_mul_ss(xgain, x); | |
_mm_store_ss(dst, y); | |
++dst; | |
--frames; | |
} | |
// There's a penalty going from AVX mode to SSE mode. This can | |
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM register. | |
// | |
_mm256_zeroupper(); // zeros the upper portion of YMM register | |
} | |
/** | |
* @brief x86-64 AVX optimized routine for mixing buffer with gain. | |
* | |
* This function may choose SSE over AVX if the pointers are aligned | |
* to 16 byte boundary instead of 32 byte boundary to reduce time to | |
* process. | |
* | |
* @param[in,out] dst Pointer to destination buffer, which gets updated | |
* @param[in] src Pointer to source buffer (not updated) | |
* @param nframes Number of samples to process | |
* @param gain Gain to apply | |
*/ | |
C_FUNC void | |
x86_avx512f_mix_buffers_with_gain(float *dst, const float *src, uint32_t nframes, float gain) | |
{ | |
// Convert to signed integer to prevent any arithmetic overflow errors | |
int32_t frames = (int32_t)nframes; | |
// Load gain vector to all elements of XMM, YMM, and ZMM register | |
// It's the same register, but used for SSE, AVX, and AVX512 calculation | |
__m512 zgain = _mm512_set1_ps(gain); | |
__m256 ygain = _mm512_castps512_ps256(zgain); | |
__m128 xgain = _mm512_castps512_ps128(zgain); | |
while (frames > 0) | |
{ | |
if (IS_ALIGNED_TO(src, sizeof(__m512)) && | |
IS_ALIGNED_TO(dst, sizeof(__m512))) { | |
break; | |
} | |
if (IS_ALIGNED_TO(src, sizeof(__m256)) && | |
IS_ALIGNED_TO(dst, sizeof(__m256))) { | |
__m256 x = _mm256_load_ps(src); | |
__m256 y = _mm256_load_ps(dst); | |
y = _mm256_fmadd_ps(ygain, x, y); | |
_mm256_store_ps(dst, y); | |
src += 8; | |
dst += 8; | |
frames -= 8; | |
continue; | |
} | |
if (IS_ALIGNED_TO(src, sizeof(__m128)) && | |
IS_ALIGNED_TO(dst, sizeof(__m128))) { | |
__m128 x = _mm_load_ps(src); | |
__m128 y = _mm_load_ps(dst); | |
y = _mm_fmadd_ps(xgain, x, y); | |
_mm_store_ps(dst, y); | |
src += 4; | |
dst += 4; | |
frames -= 4; | |
continue; | |
} | |
// Pointers are aligned to float boundaries (4 bytes) | |
__m128 x = _mm_load_ss(src); | |
__m128 y = _mm_load_ss(dst); | |
y = _mm_fmadd_ss(xgain, x, y); | |
_mm_store_ss(dst, y); | |
++src; | |
++dst; | |
--frames; | |
} | |
// Process the remaining samples 128 at a time | |
while (frames >= 128) { | |
_mm_prefetch(reinterpret_cast<void const *>(src + 128), _mm_hint(0)); | |
_mm_prefetch(reinterpret_cast<void const *>(dst + 128), _mm_hint(0)); | |
__m512 x0 = _mm512_load_ps(src + 0); | |
__m512 x1 = _mm512_load_ps(src + 16); | |
__m512 x2 = _mm512_load_ps(src + 32); | |
__m512 x3 = _mm512_load_ps(src + 48); | |
__m512 x4 = _mm512_load_ps(src + 64); | |
__m512 x5 = _mm512_load_ps(src + 80); | |
__m512 x6 = _mm512_load_ps(src + 96); | |
__m512 x7 = _mm512_load_ps(src + 112); | |
__m512 y0 = _mm512_load_ps(dst + 0); | |
__m512 y1 = _mm512_load_ps(dst + 16); | |
__m512 y2 = _mm512_load_ps(dst + 32); | |
__m512 y3 = _mm512_load_ps(dst + 48); | |
__m512 y4 = _mm512_load_ps(dst + 64); | |
__m512 y5 = _mm512_load_ps(dst + 80); | |
__m512 y6 = _mm512_load_ps(dst + 96); | |
__m512 y7 = _mm512_load_ps(dst + 112); | |
y0 = _mm512_fmadd_ps(zgain, x0, y0); | |
y1 = _mm512_fmadd_ps(zgain, x1, y1); | |
y2 = _mm512_fmadd_ps(zgain, x2, y2); | |
y3 = _mm512_fmadd_ps(zgain, x3, y3); | |
y4 = _mm512_fmadd_ps(zgain, x4, y4); | |
y5 = _mm512_fmadd_ps(zgain, x5, y5); | |
y6 = _mm512_fmadd_ps(zgain, x6, y6); | |
y7 = _mm512_fmadd_ps(zgain, x7, y7); | |
_mm512_store_ps(dst + 0, y0); | |
_mm512_store_ps(dst + 16, y1); | |
_mm512_store_ps(dst + 32, y2); | |
_mm512_store_ps(dst + 48, y3); | |
_mm512_store_ps(dst + 64, y4); | |
_mm512_store_ps(dst + 80, y5); | |
_mm512_store_ps(dst + 96, y6); | |
_mm512_store_ps(dst + 112, y7); | |
src += 128; | |
dst += 128; | |
frames -= 128; | |
} | |
// Process the remaining samples 16 at a time | |
while (frames >= 16) { | |
__m512 x = _mm512_load_ps(src); | |
__m512 y = _mm512_load_ps(dst); | |
y = _mm512_fmadd_ps(zgain, x, y); | |
_mm512_store_ps(dst, y); | |
src += 16; | |
dst += 16; | |
frames -= 16; | |
} | |
// Process remaining samples x8 | |
while (frames >= 8) { | |
__m256 x = _mm256_load_ps(src); | |
__m256 y = _mm256_load_ps(dst); | |
y = _mm256_fmadd_ps(ygain, x, y); | |
_mm256_store_ps(dst, y); | |
src += 8; | |
dst += 8; | |
frames -= 8; | |
} | |
// Process remaining samples x4 | |
while (frames >= 4) { | |
__m128 x = _mm_load_ps(src); | |
__m128 y = _mm_load_ps(dst); | |
y = _mm_fmadd_ps(xgain, x, y); | |
_mm_store_ps(dst, y); | |
src += 4; | |
dst += 4; | |
frames -= 4; | |
} | |
// Process remaining samples | |
while (frames > 0) { | |
__m128 x = _mm_load_ss(src); | |
__m128 y = _mm_load_ss(dst); | |
y = _mm_fmadd_ss(xgain, x, y); | |
_mm_store_ss(dst, y); | |
++src; | |
++dst; | |
--frames; | |
} | |
// There's a penalty going from AVX mode to SSE mode. This can | |
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM register. | |
// | |
_mm256_zeroupper(); // zeros the upper portion of YMM register | |
} | |
/** | |
* @brief x86-64 AVX optimized routine for mixing buffer with no gain. | |
* | |
* This function may choose SSE over AVX if the pointers are aligned | |
* to 16 byte boundary instead of 32 byte boundary to reduce time to | |
* process. | |
* | |
* @param[in,out] dst Pointer to destination buffer, which gets updated | |
* @param[in] src Pointer to source buffer (not updated) | |
* @param nframes Number of samples to process | |
*/ | |
C_FUNC void | |
x86_avx512f_mix_buffers_no_gain(float *dst, const float *src, uint32_t nframes) | |
{ | |
// Convert to signed integer to prevent any arithmetic overflow errors | |
int32_t frames = (int32_t)nframes; | |
while (frames > 0) | |
{ | |
if (IS_ALIGNED_TO(src, sizeof(__m512)) && | |
IS_ALIGNED_TO(dst, sizeof(__m512))) { | |
break; | |
} | |
if (IS_ALIGNED_TO(src, sizeof(__m256)) && | |
IS_ALIGNED_TO(dst, sizeof(__m256))) { | |
__m256 x = _mm256_load_ps(src); | |
__m256 y = _mm256_load_ps(dst); | |
y = _mm256_add_ps(x, y); | |
_mm256_store_ps(dst, y); | |
src += 8; | |
dst += 8; | |
frames -= 8; | |
continue; | |
} | |
if (IS_ALIGNED_TO(src, sizeof(__m128)) && | |
IS_ALIGNED_TO(dst, sizeof(__m128))) { | |
__m128 x = _mm_load_ps(src); | |
__m128 y = _mm_load_ps(dst); | |
y = _mm_add_ps(x, y); | |
_mm_store_ps(dst, y); | |
src += 4; | |
dst += 4; | |
frames -= 4; | |
continue; | |
} | |
// Pointers are aligned to float boundaries (4 bytes) | |
__m128 x = _mm_load_ss(src); | |
__m128 y = _mm_load_ss(dst); | |
y = _mm_add_ss(x, y); | |
_mm_store_ss(dst, y); | |
++src; | |
++dst; | |
--frames; | |
} | |
// Process the remaining samples 128 at a time | |
while (frames >= 128) { | |
_mm_prefetch(reinterpret_cast<void const *>(src + 128), _mm_hint(0)); | |
_mm_prefetch(reinterpret_cast<void const *>(dst + 128), _mm_hint(0)); | |
__m512 x0 = _mm512_load_ps(src + 0); | |
__m512 x1 = _mm512_load_ps(src + 16); | |
__m512 x2 = _mm512_load_ps(src + 32); | |
__m512 x3 = _mm512_load_ps(src + 48); | |
__m512 x4 = _mm512_load_ps(src + 64); | |
__m512 x5 = _mm512_load_ps(src + 80); | |
__m512 x6 = _mm512_load_ps(src + 96); | |
__m512 x7 = _mm512_load_ps(src + 112); | |
__m512 y0 = _mm512_load_ps(dst + 0); | |
__m512 y1 = _mm512_load_ps(dst + 16); | |
__m512 y2 = _mm512_load_ps(dst + 32); | |
__m512 y3 = _mm512_load_ps(dst + 48); | |
__m512 y4 = _mm512_load_ps(dst + 64); | |
__m512 y5 = _mm512_load_ps(dst + 80); | |
__m512 y6 = _mm512_load_ps(dst + 96); | |
__m512 y7 = _mm512_load_ps(dst + 112); | |
y0 = _mm512_add_ps(x0, y0); | |
y1 = _mm512_add_ps(x1, y1); | |
y2 = _mm512_add_ps(x2, y2); | |
y3 = _mm512_add_ps(x3, y3); | |
y4 = _mm512_add_ps(x4, y4); | |
y5 = _mm512_add_ps(x5, y5); | |
y6 = _mm512_add_ps(x6, y6); | |
y7 = _mm512_add_ps(x7, y7); | |
_mm512_store_ps(dst + 0, y0); | |
_mm512_store_ps(dst + 16, y1); | |
_mm512_store_ps(dst + 32, y2); | |
_mm512_store_ps(dst + 48, y3); | |
_mm512_store_ps(dst + 64, y4); | |
_mm512_store_ps(dst + 80, y5); | |
_mm512_store_ps(dst + 96, y6); | |
_mm512_store_ps(dst + 112, y7); | |
src += 128; | |
dst += 128; | |
frames -= 128; | |
} | |
// Process the remaining samples 16 at a time | |
while (frames >= 16) { | |
__m512 x = _mm512_load_ps(src); | |
__m512 y = _mm512_load_ps(dst); | |
y = _mm512_add_ps(x, y); | |
_mm512_store_ps(dst, y); | |
src += 16; | |
dst += 16; | |
frames -= 16; | |
} | |
// Process remaining samples x8 | |
while (frames >= 8) { | |
__m256 x = _mm256_load_ps(src); | |
__m256 y = _mm256_load_ps(dst); | |
y = _mm256_add_ps(x, y); | |
_mm256_store_ps(dst, y); | |
src += 8; | |
dst += 8; | |
frames -= 8; | |
} | |
// Process remaining samples x4 | |
while (frames >= 4) { | |
__m128 x = _mm_load_ps(src); | |
__m128 y = _mm_load_ps(dst); | |
y = _mm_add_ps(x, y); | |
_mm_store_ps(dst, y); | |
src += 4; | |
dst += 4; | |
frames -= 4; | |
} | |
// Process remaining samples | |
while (frames > 0) { | |
__m128 x = _mm_load_ss(src); | |
__m128 y = _mm_load_ss(dst); | |
y = _mm_add_ss(x, y); | |
_mm_store_ss(dst, y); | |
++src; | |
++dst; | |
--frames; | |
} | |
// There's a penalty going from AVX mode to SSE mode. This can | |
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM register. | |
// | |
_mm256_zeroupper(); // zeros the upper portion of YMM register | |
} | |
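/**
 * @brief x86-64 AVX512F optimized routine for copying a vector of floats
 *
 * The prologue advances 1/4/8 samples at a time until dst reaches 64-byte
 * alignment (src is assumed to share the same relative alignment), then the
 * bulk is copied 256/128/16 samples per iteration using aligned 512-bit
 * loads and stores.
 *
 * @param[out] dst Pointer to destination buffer
 * @param[in] src Pointer to source buffer (not updated)
 * @param nframes Number of samples to copy
 */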
C_FUNC void | |
x86_avx512f_copy_vector(float *dst, const float *src, uint32_t nframes) | |
{ | |
// Convert to signed integer to prevent any arithmetic overflow errors | |
int32_t frames = (int32_t)nframes; | |
while (frames > 0) { | |
if (LIKELY(IS_ALIGNED_TO(dst, sizeof(__m512)))) { | |
break; | |
} | |
if (IS_ALIGNED_TO(dst, sizeof(__m256))) { | |
__m256 x = _mm256_load_ps(src); | |
_mm256_store_ps(dst, x); | |
src += 8; | |
dst += 8; | |
frames -= 8; | |
continue; | |
} | |
if (IS_ALIGNED_TO(dst, sizeof(__m128))) { | |
__m128 x = _mm_load_ps(src); | |
_mm_store_ps(dst, x); | |
src += 4; | |
dst += 4; | |
frames -= 4; | |
continue; | |
} | |
// Pointers are aligned to float boundaries (4 bytes) | |
__m128 x = _mm_load_ss(src); | |
_mm_store_ss(dst, x); | |
++src; | |
++dst; | |
--frames; | |
} | |
while (frames >= 256) { | |
_mm_prefetch(reinterpret_cast<void const *>(src + 256), _mm_hint(0)); | |
_mm_prefetch(reinterpret_cast<void const *>(dst + 256), _mm_hint(0)); | |
__m512 x0 = _mm512_load_ps(src + 0); | |
__m512 x1 = _mm512_load_ps(src + 16); | |
__m512 x2 = _mm512_load_ps(src + 32); | |
__m512 x3 = _mm512_load_ps(src + 48); | |
__m512 x4 = _mm512_load_ps(src + 64); | |
__m512 x5 = _mm512_load_ps(src + 80); | |
__m512 x6 = _mm512_load_ps(src + 96); | |
__m512 x7 = _mm512_load_ps(src + 112); | |
__m512 x8 = _mm512_load_ps(src + 128); | |
__m512 x9 = _mm512_load_ps(src + 144); | |
__m512 x10 = _mm512_load_ps(src + 160); | |
__m512 x11 = _mm512_load_ps(src + 176); | |
__m512 x12 = _mm512_load_ps(src + 192); | |
__m512 x13 = _mm512_load_ps(src + 208); | |
__m512 x14 = _mm512_load_ps(src + 224); | |
__m512 x15 = _mm512_load_ps(src + 240); | |
_mm512_store_ps(dst + 0, x0); | |
_mm512_store_ps(dst + 16, x1); | |
_mm512_store_ps(dst + 32, x2); | |
_mm512_store_ps(dst + 48, x3); | |
_mm512_store_ps(dst + 64, x4); | |
_mm512_store_ps(dst + 80, x5); | |
_mm512_store_ps(dst + 96, x6); | |
_mm512_store_ps(dst + 112, x7); | |
_mm512_store_ps(dst + 128, x8); | |
_mm512_store_ps(dst + 144, x9); | |
_mm512_store_ps(dst + 160, x10); | |
_mm512_store_ps(dst + 176, x11); | |
_mm512_store_ps(dst + 192, x12); | |
_mm512_store_ps(dst + 208, x13); | |
_mm512_store_ps(dst + 224, x14); | |
_mm512_store_ps(dst + 240, x15); | |
src += 256; | |
dst += 256; | |
frames -= 256; | |
} | |
while (frames >= 128) { | |
_mm_prefetch(reinterpret_cast<void const *>(src + 128), _mm_hint(0)); | |
_mm_prefetch(reinterpret_cast<void const *>(dst + 128), _mm_hint(0)); | |
__m512 x0 = _mm512_load_ps(src + 0); | |
__m512 x1 = _mm512_load_ps(src + 16); | |
__m512 x2 = _mm512_load_ps(src + 32); | |
__m512 x3 = _mm512_load_ps(src + 48); | |
__m512 x4 = _mm512_load_ps(src + 64); | |
__m512 x5 = _mm512_load_ps(src + 80); | |
__m512 x6 = _mm512_load_ps(src + 96); | |
__m512 x7 = _mm512_load_ps(src + 112); | |
_mm512_store_ps(dst + 0, x0); | |
_mm512_store_ps(dst + 16, x1); | |
_mm512_store_ps(dst + 32, x2); | |
_mm512_store_ps(dst + 48, x3); | |
_mm512_store_ps(dst + 64, x4); | |
_mm512_store_ps(dst + 80, x5); | |
_mm512_store_ps(dst + 96, x6); | |
_mm512_store_ps(dst + 112, x7); | |
src += 128; | |
dst += 128; | |
frames -= 128; | |
} | |
// Process the remaining samples 16 at a time | |
while (frames >= 16) { | |
__m512 x = _mm512_load_ps(src); | |
_mm512_store_ps(dst, x); | |
src += 16; | |
dst += 16; | |
frames -= 16; | |
} | |
// Process remaining samples x8 | |
while (frames >= 8) { | |
__m256 x = _mm256_load_ps(src); | |
_mm256_store_ps(dst, x); | |
src += 8; | |
dst += 8; | |
frames -= 8; | |
} | |
// Process remaining samples x4 | |
while (frames >= 4) { | |
__m128 x = _mm_load_ps(src); | |
_mm_store_ps(dst, x); | |
src += 4; | |
dst += 4; | |
frames -= 4; | |
} | |
// Process remaining samples | |
while (frames > 0) { | |
__m128 x = _mm_load_ss(src); | |
_mm_store_ss(dst, x); | |
++src; | |
++dst; | |
--frames; | |
} | |
// There's a penalty going from AVX mode to SSE mode. This can | |
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM register. | |
_mm256_zeroupper(); // zeros the upper portion of YMM register | |
} |