Created
November 14, 2023 03:59
-
-
Save ashafq/1dce3e54095f6ce75042c04a2c1df313 to your computer and use it in GitHub Desktop.
A5EQ Benchmark code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* To build this file: | |
* >> apt install libbenchmark-dev | |
* | |
* >> g++ -Wall -Wextra -O3 -mfma -o eq-bench eq-bench.cc \ | |
* `pkg-config --cflags --libs benchmark` | |
* >> ./eq-bench | |
*/ | |
/* | |
* Copyright (C) 2023 Ayan Shafqat <ayan.x.shafqat@gmail.com> | |
* | |
* This program is free software; you can redistribute it and/or modify | |
* it under the terms of the GNU General Public License as published by | |
* the Free Software Foundation; either version 2, or (at your option) | |
* any later version. | |
* | |
* This program is distributed in the hope that it will be useful, | |
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
* GNU General Public License for more details. | |
* | |
* You should have received a copy of the GNU General Public License | |
* along with this program. If not, see <http://www.gnu.org/licenses/>. | |
*/ | |
#ifndef A5EQ_DSP_BIQUAD_HPP_ | |
#define A5EQ_DSP_BIQUAD_HPP_ | |
#include <algorithm> | |
#include <cstddef> | |
#define ARM_NEON_OPTIMIZATION (3) | |
#define AVX2_FMA_OPTIMIZATION (2) | |
#define SSE2_OPTIMIZATION (1) | |
#define NO_OPTIMIZATION (0) | |
#if (defined(__aarch64__)) && (defined(__ARM_NEON) || defined(__ARM_NEON__)) | |
// Include NEON intrinsic header file | |
#include <arm_neon.h> | |
#define DSP_OPTIMIZATION (ARM_NEON_OPTIMIZATION) | |
#pragma message "Compiling with ARM NEON optimizations" | |
#elif defined(__x86_64__) && defined(__FMA__) | |
// Include FMA intrinsic header file | |
#include <immintrin.h> | |
#define DSP_OPTIMIZATION (AVX2_FMA_OPTIMIZATION) | |
#pragma message "Compiling with AVX2 FMA optimizations" | |
#elif defined(__x86_64__) && defined(__SSE2__) | |
// Include SSE+ intrinsic header file | |
#include <xmmintrin.h> | |
#define DSP_OPTIMIZATION (SSE2_OPTIMIZATION) | |
#pragma message "Compiling with SSE2 optimizations" | |
#else | |
#define DSP_OPTIMIZATION (NO_OPTIMIZATION) | |
#pragma message "Compiling with no specific optimizations" | |
#endif // Select optimization levels | |
/************************ | |
* Filter implementations | |
************************/ | |
#if (DSP_OPTIMIZATION == ARM_NEON_OPTIMIZATION) | |
// The PROCESS_BIQUAD macro would be redefined for NEON | |
#define PROCESS_BIQUAD(x0, y0, b0, b1, b2, a1, a2, s1, s2) \ | |
do { \ | |
/* y0 = (b0 * x0) + s1 */ \ | |
y0 = vmlaq_f64(s1, b0, x0); \ | |
/* s1 = (b1 * x0) + (a1 * y0) + s2 */ \ | |
s1 = vmlaq_f64(s2, b1, x0); \ | |
s1 = vmlaq_f64(s1, a1, y0); \ | |
/* s2 = (b2 * x0) + (a2 * y0) */ \ | |
s2 = vmlaq_f64(vmulq_f64(a2, y0), b2, x0); \ | |
} while (0) | |
/** | |
* @brief Process biquad filter for a single stage of biquad | |
* | |
* @details | |
* | |
* Inspired from the author's old code with 4 channel biquad here: | |
* | |
* https://gist.github.com/ashafq/0db953125a033b783c6e100acd5e64d9 | |
* | |
* This is optimized for AARCH64 NEON | |
* | |
* @param[in] coeff Filter coefficients | |
* @param[in,out] state Filter state | |
* @param[out] dst Destination/output buffer | |
* @param[in] src Source/input buffer | |
* @param len Number of samples to process | |
* | |
* @note Filter is executed in Direct form II topology | |
* @see https://ccrma.stanford.edu/~jos/fp/Transposed_Direct_Forms.html | |
* | |
* @note Memory segments in @p src and @p dst may overlap | |
*/ | |
void process_biquad_stereo(const double *coeff, double *state, float *out_left, | |
const float *in_left, float *out_right, | |
const float *in_right, size_t frames) { | |
// Copy the same coefficients for both left and right channels | |
float64x2_t b0 = vdupq_n_f64(coeff[0]); | |
float64x2_t b1 = vdupq_n_f64(coeff[1]); | |
float64x2_t b2 = vdupq_n_f64(coeff[2]); | |
float64x2_t a1 = vdupq_n_f64(-coeff[3]); | |
float64x2_t a2 = vdupq_n_f64(-coeff[4]); | |
float64x2_t s1 = vld1q_f64(state + 0); | |
float64x2_t s2 = vld1q_f64(state + 2); | |
size_t simd_frames = frames & (~static_cast<size_t>(3)); | |
size_t count = 0; | |
while (count < simd_frames) { | |
// Load input samples into SSE registers and convert to double | |
float32x4_t x_left = vld1q_f32(in_left + count); // {l0, l1, l2, l3} | |
float32x4_t x_right = vld1q_f32(in_right + count); // {l0, l1, l2, l3} | |
float64x2_t x_left_0 = | |
vcvt_f64_f32(vget_low_f32(x_left)); // {l0, l1} (double) | |
float64x2_t x_right_0 = | |
vcvt_f64_f32(vget_low_f32(x_right)); // {r0, r1} (double) | |
float64x2_t x_left_1 = | |
vcvt_f64_f32(vget_high_f32(x_left)); // {l2, l3} (double) | |
float64x2_t x_right_1 = | |
vcvt_f64_f32(vget_high_f32(x_right)); // {r2, r3} (double) | |
// Interleave left and right inputs for stereo pair | |
float64x2_t x0 = vtrn1q_f64(x_left_0, x_right_0); // {l0, r0} | |
float64x2_t x1 = vtrn2q_f64(x_left_0, x_right_0); // {l1, r1} | |
float64x2_t x2 = vtrn1q_f64(x_left_1, x_right_1); // {l2, r2} | |
float64x2_t x3 = vtrn2q_f64(x_left_1, x_right_1); // {l3, r3} | |
// Outputs | |
float64x2_t y0, y1, y2, y3; | |
PROCESS_BIQUAD(x0, y0, b0, b1, b2, a1, a2, s1, s2); | |
PROCESS_BIQUAD(x1, y1, b0, b1, b2, a1, a2, s1, s2); | |
PROCESS_BIQUAD(x2, y2, b0, b1, b2, a1, a2, s1, s2); | |
PROCESS_BIQUAD(x3, y3, b0, b1, b2, a1, a2, s1, s2); | |
float64x2_t y_left_0 = vuzp1q_f64(y0, y1); // {l0, l1} | |
float64x2_t y_right_0 = vuzp2q_f64(y0, y1); // {r0, r1} | |
float64x2_t y_left_1 = vuzp1q_f64(y2, y3); // {l2, l3} | |
float64x2_t y_right_1 = vuzp2q_f64(y2, y3); // {r2, r3} | |
// Convert the output to float and combine them into two registers | |
float32x4_t y_left = vcombine_f32( | |
vcvt_f32_f64(y_left_0), vcvt_f32_f64(y_left_1)); // {l0, l1, l2, l3} | |
float32x4_t y_right = vcombine_f32( | |
vcvt_f32_f64(y_right_0), vcvt_f32_f64(y_right_1)); // {r0, r1, r2, r3} | |
// Store the results | |
vst1q_f32(out_left + count, y_left); | |
vst1q_f32(out_right + count, y_right); | |
count += 4; | |
} | |
// Process any remaining frames | |
while (count < frames) { | |
// Load single float values and create float32x2_t vectors | |
float64x1_t x_left = vdup_n_f64(static_cast<double>(in_left[count])); | |
float64x1_t x_right = vdup_n_f64(static_cast<double>(in_right[count])); | |
// Interleave left and right inputs for stereo pair | |
float64x2_t x0 = vcombine_f64(x_left, x_right); | |
// Outputs | |
float64x2_t y0; | |
PROCESS_BIQUAD(x0, y0, b0, b1, b2, a1, a2, s1, s2); | |
// y0 = {yl, yr} | |
float64x1_t yl = vget_low_f64(y0); // {yl} | |
float64x1_t yr = vget_high_f64(y0); // {yr} | |
// Store the results | |
out_left[count] = static_cast<float32_t>(vget_lane_f64(yl, 0)); | |
out_right[count] = static_cast<float32_t>(vget_lane_f64(yr, 0)); | |
++count; | |
} | |
// Store states | |
vst1q_f64(state + 0, s1); | |
vst1q_f64(state + 2, s2); | |
} | |
#undef PROCESS_BIQUAD // No longer needed | |
#endif // ARM_NEON_OPTIMIZATION | |
#if (DSP_OPTIMIZATION == AVX2_FMA_OPTIMIZATION) || \ | |
(DSP_OPTIMIZATION == SSE2_OPTIMIZATION) | |
/** | |
* Since FMA and SSE2 are mostly shared code, it's best to consolidate them | |
*/ | |
#if (DSP_OPTIMIZATION == AVX2_FMA_OPTIMIZATION) | |
#define PROCESS_BIQUAD(x0, y0, b0, b1, b2, a1, a2, s1, s2) \ | |
do { \ | |
/* y0 = (b0 * x0) + s1 */ \ | |
y0 = _mm_fmadd_pd(b0, x0, s1); \ | |
/* s1 = (b1 * x0) + (a1 * y0) + s2 */ \ | |
s1 = _mm_fmadd_pd(b1, x0, _mm_fmadd_pd(a1, y0, s2)); \ | |
/* s2 = (b2 * x0) + (a2 * y0) */ \ | |
s2 = _mm_fmadd_pd(b2, x0, _mm_mul_pd(a2, y0)); \ | |
} while (false) | |
#elif (DSP_OPTIMIZATION == SSE2_OPTIMIZATION) | |
#define PROCESS_BIQUAD(x0, y0, b0, b1, b2, a1, a2, s1, s2) \ | |
do { \ | |
/* y0 = (b0 * x0) + s1 */ \ | |
y0 = _mm_add_pd(_mm_mul_pd(b0, x0), s1); \ | |
/* s1 = (b1 * x0) + (a1 * y0) + s2 */ \ | |
s1 = _mm_add_pd(_mm_add_pd(_mm_mul_pd(b1, x0), _mm_mul_pd(a1, y0)), s2); \ | |
/* s2 = (b2 * x0) + (a2 * y0) */ \ | |
s2 = _mm_add_pd(_mm_mul_pd(b2, x0), _mm_mul_pd(a2, y0)); \ | |
} while (false) | |
#endif // DSP_OPTIMIZATION | |
/** | |
* @brief Process biquad filter for a single stage of biquad | |
* | |
* @details | |
* | |
* Inspired from the author's old code with 4 channel biquad here: | |
* | |
* https://gist.github.com/ashafq/0db953125a033b783c6e100acd5e64d9 | |
* | |
* @param[in] coeff Filter coefficients | |
* @param[in,out] state Filter state | |
* @param[out] dst Destination/output buffer | |
* @param[in] src Source/input buffer | |
* @param len Number of samples to process | |
* | |
* @note Filter is executed in Direct form II topology | |
* @see https://ccrma.stanford.edu/~jos/fp/Transposed_Direct_Forms.html | |
* | |
* @note Memory segments in @p src and @p dst may overlap | |
*/ | |
static void process_biquad_stereo(const double *coeff, double *state, | |
float *out_left, const float *in_left, | |
float *out_right, const float *in_right, | |
size_t frames) { | |
// Copy the same coefficients for both left and right channels | |
__m128d b0 = _mm_set1_pd(coeff[0]); | |
__m128d b1 = _mm_set1_pd(coeff[1]); | |
__m128d b2 = _mm_set1_pd(coeff[2]); | |
__m128d a1 = _mm_set1_pd(-coeff[3]); | |
__m128d a2 = _mm_set1_pd(-coeff[4]); | |
__m128d s1 = _mm_loadu_pd(state + 0); | |
__m128d s2 = _mm_loadu_pd(state + 2); | |
size_t simd_frames = frames & (~static_cast<size_t>(3)); | |
size_t count = 0; | |
for (count = 0; count < simd_frames; count += 4) { | |
// Load input samples into SSE registers and convert to double | |
__m128 x_left = _mm_loadu_ps(in_left + count); // {l0, l1, l2, l3} | |
__m128 x_right = _mm_loadu_ps(in_right + count); // {r0, r1, r2, r3} | |
__m128d x_left_0 = _mm_cvtps_pd(x_left); // {l0, l1} (double) | |
__m128d x_right_0 = _mm_cvtps_pd(x_right); // {r0, r1} (double) | |
__m128d x_left_1 = | |
_mm_cvtps_pd(_mm_movehl_ps(x_left, x_left)); // {l2, l3} (double) | |
__m128d x_right_1 = | |
_mm_cvtps_pd(_mm_movehl_ps(x_right, x_right)); // {r2, r3} (double) | |
// Interleave left and right inputs for stereo pair | |
__m128d x0 = _mm_unpacklo_pd(x_left_0, x_right_0); // {l0, r0} | |
__m128d x1 = _mm_unpackhi_pd(x_left_0, x_right_0); // {l1, r1} | |
__m128d x2 = _mm_unpacklo_pd(x_left_1, x_right_1); // {l2, r2} | |
__m128d x3 = _mm_unpackhi_pd(x_left_1, x_right_1); // {l3, r3}) | |
// Outputs | |
__m128d y0, y1, y2, y3; | |
PROCESS_BIQUAD(x0, y0, b0, b1, b2, a1, a2, s1, s2); | |
PROCESS_BIQUAD(x1, y1, b0, b1, b2, a1, a2, s1, s2); | |
PROCESS_BIQUAD(x2, y2, b0, b1, b2, a1, a2, s1, s2); | |
PROCESS_BIQUAD(x3, y3, b0, b1, b2, a1, a2, s1, s2); | |
// De-interleave the pairs | |
__m128d y_left_0 = _mm_unpacklo_pd(y0, y1); // {l0, l1} | |
__m128d y_right_0 = _mm_unpackhi_pd(y0, y1); // {r0, r1} | |
__m128d y_left_1 = _mm_unpacklo_pd(y2, y3); // {l2, l3} | |
__m128d y_right_1 = _mm_unpackhi_pd(y2, y3); // {r2, r3} | |
// Convert the output to float and pack them into two registers | |
__m128 y_left = _mm_movelh_ps(_mm_cvtpd_ps(y_left_0), | |
_mm_cvtpd_ps(y_left_1)); // {l0, l1, l2, l3} | |
__m128 y_right = _mm_movelh_ps(_mm_cvtpd_ps(y_right_0), | |
_mm_cvtpd_ps(y_right_1)); // {r0, r1, r2, r3} | |
_mm_storeu_ps(out_left + count, y_left); | |
_mm_storeu_ps(out_right + count, y_right); | |
} | |
// Process any remaining frames | |
while (count < frames) { | |
// Load input samples into SSE registers and convert to double | |
__m128d x_left = _mm_cvtps_pd(_mm_load_ss(in_left + count)); | |
__m128d x_right = _mm_cvtps_pd(_mm_load_ss(in_right + count)); | |
// Interleave left and right inputs for stereo pair | |
__m128d x0 = _mm_unpacklo_pd(x_left, x_right); | |
// Outputs | |
__m128d y0; | |
PROCESS_BIQUAD(x0, y0, b0, b1, b2, a1, a2, s1, s2); | |
// y0 = {yl, yr} | |
__m128d yl = _mm_unpacklo_pd(y0, y0); // {yl, yl} | |
__m128d yr = _mm_unpackhi_pd(y0, y0); // {yr, yr} | |
// Store the result into out_left and out_right | |
_mm_store_ss(out_left + count, _mm_cvtpd_ps(yl)); | |
_mm_store_ss(out_right + count, _mm_cvtpd_ps(yr)); | |
++count; | |
} | |
// Store states | |
_mm_storeu_pd(state + 0, s1); | |
_mm_storeu_pd(state + 2, s2); | |
} | |
#undef PROCESS_BIQUAD // No longer needed | |
#endif // DSP_OPTIMIZATION == {AVX2_FMA_OPTIMIZATION, SSE2_OPTIMIZATION} | |
#if (DSP_OPTIMIZATION == NO_OPTIMIZATION) | |
/** | |
* @brief Process biquad filter for a single stage of biquad | |
* | |
* @param[in] coeff Filter coefficients | |
* @param[in,out] state Filter state | |
* @param[out] dst Destination/output buffer | |
* @param[in] src Source/input buffer | |
* @param len Number of samples to process | |
* | |
* @note Filter is executed in Direct form II topology | |
* @see https://ccrma.stanford.edu/~jos/fp/Transposed_Direct_Forms.html | |
* | |
* @note Memory segments in @p src and @p dst may overlap | |
*/ | |
static void process_biquad_stereo(const double *coeff, double *state, | |
float *out_left, const float *in_left, | |
float *out_right, const float *in_right, | |
size_t frames) { | |
const auto b0 = *coeff++; | |
const auto b1 = *coeff++; | |
const auto b2 = *coeff++; | |
const auto a1 = *coeff++; | |
const auto a2 = *coeff++; | |
// Process left channel | |
auto s1 = state[0]; | |
auto s2 = state[2]; | |
for (size_t i = 0; i < frames; ++i) { | |
auto in = static_cast<double>(in_left[i]); | |
auto out = (b0 * in) + s1; | |
s1 = (b1 * in) - (a1 * out) + s2; | |
s2 = (b2 * in) - (a2 * out); | |
out_left[i] = static_cast<float>(out); | |
} | |
state[0] = s1; | |
state[2] = s2; | |
// Process right channel | |
s1 = state[1]; | |
s2 = state[3]; | |
for (size_t i = 0; i < frames; ++i) { | |
auto in = static_cast<double>(in_right[i]); | |
auto out = (b0 * in) + s1; | |
s1 = (b1 * in) - (a1 * out) + s2; | |
s2 = (b2 * in) - (a2 * out); | |
out_right[i] = static_cast<float>(out); | |
} | |
state[1] = s1; | |
state[3] = s2; | |
} | |
#endif // (DSP_OPTIMIZATION == NO_OPTIMIZATION) | |
/** | |
* @brief Process biquad filter for a single stage of biquad and transition | |
* | |
* @param[in] old_coeff Old filter coefficients | |
* @param[in] new_coeff New filter coefficients | |
* @param[in,out] state Filter state | |
* @param[out] dst Destination/output buffer | |
* @param[in] src Source/input buffer | |
* @param len Number of samples to process | |
* | |
* @note Memory segments in @p src and @p dst may overlap | |
*/ | |
static void process_biquad_stereo_xfade(const double *old_coeff, | |
const double *new_coeff, double *state, | |
float *out_left, const float *in_left, | |
float *out_right, const float *in_right, | |
size_t frames) { | |
// Implement the xfade utilizing the other biquad routine | |
constexpr size_t MIN_FRAMES = 64; | |
constexpr size_t STATE_SIZE = 4; | |
float out_old_left[MIN_FRAMES]; | |
float out_old_right[MIN_FRAMES]; | |
double temp_state[STATE_SIZE] = {0}; | |
float gain = 0.0F; | |
const float delta = 1.0F / static_cast<float>(frames); | |
size_t count = 0; | |
while (count < frames) { | |
// compute the block size | |
size_t block_size = std::min(MIN_FRAMES, frames - count); | |
// Compute the offset | |
const float *xl = in_left + count; | |
const float *xr = in_right + count; | |
float *yl = out_old_left; | |
float *yr = out_old_right; | |
float *zl = out_left + count; | |
float *zr = out_right + count; | |
// Process the biquad with old coefficients, discard IIR state | |
process_biquad_stereo(old_coeff, temp_state, yl, xl, yr, xr, block_size); | |
// Process biquad with new coefficients | |
process_biquad_stereo(new_coeff, state, zl, xl, zr, xr, block_size); | |
for (size_t i = 0; i < block_size; ++i) { | |
zl[i] = ((1.0F - gain) * yl[i]) + (gain * xl[i]); | |
zr[i] = ((1.0F - gain) * yr[i]) + (gain * xr[i]); | |
gain += delta; // = std::min(delta + gain, 1.0F); | |
} | |
count += block_size; | |
} | |
} | |
#endif // A5EQ_DSP_BIQUAD_HPP_ | |
static void process_biquad_stereo_cpp(const double *coeff, double *state, | |
float *out_left, const float *in_left, | |
float *out_right, const float *in_right, | |
size_t frames) { | |
const auto b0 = *coeff++; | |
const auto b1 = *coeff++; | |
const auto b2 = *coeff++; | |
const auto a1 = *coeff++; | |
const auto a2 = *coeff++; | |
// Process left channel | |
auto s1 = state[0]; | |
auto s2 = state[2]; | |
for (size_t i = 0; i < frames; ++i) { | |
auto in = static_cast<double>(in_left[i]); | |
auto out = (b0 * in) + s1; | |
s1 = (b1 * in) - (a1 * out) + s2; | |
s2 = (b2 * in) - (a2 * out); | |
out_left[i] = static_cast<float>(out); | |
} | |
state[0] = s1; | |
state[2] = s2; | |
// Process right channel | |
s1 = state[1]; | |
s2 = state[3]; | |
for (size_t i = 0; i < frames; ++i) { | |
auto in = static_cast<double>(in_right[i]); | |
auto out = (b0 * in) + s1; | |
s1 = (b1 * in) - (a1 * out) + s2; | |
s2 = (b2 * in) - (a2 * out); | |
out_right[i] = static_cast<float>(out); | |
} | |
state[1] = s1; | |
state[3] = s2; | |
} | |
#include <benchmark/benchmark.h> | |
constexpr size_t BUFFER_SIZE = 256; | |
static void CppBiquad(benchmark::State &state) { | |
// Code inside this loop is measured repeatedly | |
float buffer[BUFFER_SIZE]; | |
double eq_coeff[5] = {1.0, 0.0, 0.0, 0.0, 0.0}; | |
double eq_state[4] = {0}; | |
for (auto _ : state) { | |
process_biquad_stereo_cpp(eq_coeff, eq_state, buffer, buffer, buffer, | |
buffer, BUFFER_SIZE); | |
} | |
} | |
static void AsmBiquad(benchmark::State &state) { | |
// Code inside this loop is measured repeatedly | |
float buffer[BUFFER_SIZE]; | |
double eq_coeff[5] = {1.0, 0.0, 0.0, 0.0, 0.0}; | |
double eq_state[4] = {0}; | |
for (auto _ : state) { | |
process_biquad_stereo(eq_coeff, eq_state, buffer, buffer, buffer, buffer, | |
BUFFER_SIZE); | |
} | |
} | |
BENCHMARK(AsmBiquad); | |
BENCHMARK(CppBiquad); | |
BENCHMARK_MAIN(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
On an Intel Alderlake: