Last active
February 18, 2025 14:11
-
-
Save ashafq/0db953125a033b783c6e100acd5e64d9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Copyright (C) 2019-2025 Ayan Shafqat <ayan.x.shafqat@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
#include <stddef.h> | |
#include <xmmintrin.h> | |
/**
 * Run four independent biquad (second-order IIR) filters in parallel, in
 * place, using SSE. Each SSE lane carries one channel.
 *
 * Per lane, for every frame, the recurrence is (transposed direct form II,
 * a0 taken as 1):
 *
 *     y  = b0 * x + w1
 *     w1 = b1 * x - a1 * y + w2
 *     w2 = b2 * x - a2 * y
 *
 * All three pointers are accessed with aligned SSE loads/stores and are
 * declared 16-byte aligned to the compiler via __builtin_assume_aligned
 * (a GCC/Clang builtin); passing a misaligned pointer is undefined.
 *
 * @param coeff 20 floats, 16-byte aligned: four b0 values (one per lane),
 *              then four b1, four b2, four a1, four a2. Read only.
 * @param state 8 floats, 16-byte aligned: four w1 values followed by four
 *              w2 values. Read on entry and overwritten with the final
 *              filter state on return.
 * @param io    4 * len floats, 16-byte aligned, one 4-float frame per
 *              iteration. Inputs are overwritten with outputs.
 * @param len   Number of 4-float frames to process. With len == 0 the io
 *              buffer is not touched and state is written back unchanged.
 */
void biquad_proc_x4(const float *coeff,
                    float *state,
                    float *io,
                    size_t len)
{
    /* Floats per SSE register, i.e. the number of parallel channels. */
    enum { LANES = 4 };

    /* Alignment hints; const-ness of the coefficient table is preserved. */
    const float *c = __builtin_assume_aligned(coeff, 16);
    float *s = __builtin_assume_aligned(state, 16);
    float *frame = __builtin_assume_aligned(io, 16);

    /* Coefficients are loop-invariant: load each group of four once. */
    const __m128 vb0 = _mm_load_ps(c + 0 * LANES);
    const __m128 vb1 = _mm_load_ps(c + 1 * LANES);
    const __m128 vb2 = _mm_load_ps(c + 2 * LANES);
    const __m128 va1 = _mm_load_ps(c + 3 * LANES);
    const __m128 va2 = _mm_load_ps(c + 4 * LANES);

    /* Filter memory carried over from the previous call. */
    __m128 vw1 = _mm_load_ps(s + 0 * LANES);
    __m128 vw2 = _mm_load_ps(s + 1 * LANES);

    for (size_t i = 0; i < len; i++, frame += LANES) {
        const __m128 vx = _mm_load_ps(frame);

        /* y = b0 * x + w1 */
        const __m128 vy = _mm_add_ps(_mm_mul_ps(vb0, vx), vw1);

        /* w1 = (b1 * x - a1 * y) + w2 -- must use the old w2, so w1 is
         * updated before w2 is overwritten. */
        vw1 = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(vb1, vx),
                                    _mm_mul_ps(va1, vy)),
                         vw2);

        /* w2 = b2 * x - a2 * y */
        vw2 = _mm_sub_ps(_mm_mul_ps(vb2, vx), _mm_mul_ps(va2, vy));

        /* Overwrite the input frame with the output frame. */
        _mm_store_ps(frame, vy);
    }

    /* Persist the filter memory so the next call continues seamlessly. */
    _mm_store_ps(s + 0 * LANES, vw1);
    _mm_store_ps(s + 1 * LANES, vw2);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This
biquad_proc_x4
function implements a biquad (second-order IIR) filter using SSE (Streaming SIMD Extensions). It is specifically designed to process four audio samples at once, one per channel. It takes the following arguments:
- coeff: an array of coefficients for the biquad filter. It must be aligned on a 16-byte boundary because the aligned SSE load operations require it. The length of the coeff array must be 20 (5 coefficients per channel times 4 channels). The memory map of the coefficients is shown in the next section.
- state: an array that keeps the state of the filter. It must also be aligned on a 16-byte boundary. The length of this array must be 8 (2 states x 4 channels).
- io: an array of input and output values, also aligned on a 16-byte boundary. The function takes the input values from this array, processes them, and writes the output back into the same array.
- len: the number of audio frames to process. Each frame holds 4 samples (one per channel), so the io array must hold 4 x len floats.

The memory map of the coefficients should be:
Biquad coefficients are denoted as: b0, b1, b2, a1, a2 (a0 is assumed to be 1.0)
Channels are indexed as: c0, c1, c2, c3
The function first loads the filter coefficients and state into SSE registers. It then loops over the input/output array, applying the filter to each input sample and storing the result in place. After it's done processing the samples, it stores the updated state back into the state array.
Here is a simple usage example (untested code):