Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Fast 1D convolution with AVX
// Fast 1D convolution with AVX
// By Joannes Vermorel, Lokad, January 2018
// Released under MIT license
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <malloc.h>
#include <immintrin.h>
/* Reference scalar implementation of a full 1D convolution.
 *
 * Walks every output sample and accumulates the dot product of the
 * kernel with the matching window of the input.
 *
 * Produces the same result as numpy.convolve(in, kernel, mode='full'):
 * the output has input_length + kernel_length - 1 samples.
 * Returns 0 (callers treat this as success).
 */
extern "C" _declspec(dllexport) int convolve_naive(float* in, int input_length,
float* kernel, int kernel_length, float* out)
{
    const int output_length = input_length + kernel_length - 1;
    for (int n = 0; n < output_length; n++) {
        /* Clamp the kernel index so that (n - k) stays inside the input. */
        const int k_lo = (n >= input_length) ? n - input_length + 1 : 0;
        const int k_hi = (n < kernel_length) ? n : kernel_length - 1;
        float sum = 0.0f;
        for (int k = k_lo; k <= k_hi; k++) {
            sum += in[n - k] * kernel[k];
        }
        out[n] = sum;
    }
    return 0;
}
/* Vectorized 1D convolution: 4 output samples are computed in parallel.
 *
 * Each kernel value is broadcast across a 4-lane SSE vector; one inner
 * iteration multiplies it against 4 consecutive input samples, so each
 * pass over the kernel yields 4 output samples at once.
 *
 * Produces the same result as numpy.convolve(in, kernel, mode='full').
 * Returns 0 (callers treat this as success).
 */
extern "C" _declspec(dllexport) int convolve_sse(float* in, int input_length,
float* kernel, int kernel_length, float* out)
{
    /* Input copy padded with 4 zeros on each side, so vector loads that
     * run past either end of the signal read zeros instead of garbage. */
    float* in_padded = (float*)(_alloca(sizeof(float) * (input_length + 8)));
    /* NOTE(review): the aligned __m128 element stores/loads below assume
     * _alloca returns 16-byte-aligned memory (documented for MSVC) --
     * confirm before porting to another toolchain. */
    __m128* kernel_many = (__m128*)(_alloca(sizeof(__m128) * kernel_length));
    __m128 block;
    __m128 prod;
    __m128 acc;
    // Surrounding zeroes, before and after the input.
    _mm_storeu_ps(in_padded, _mm_set1_ps(0));
    memcpy(&in_padded[4], in, sizeof(float) * input_length);
    _mm_storeu_ps(in_padded + input_length + 4, _mm_set1_ps(0));
    // Broadcast each kernel value across a 4-lane vector.
    int i;
    for (i = 0; i < kernel_length; i++) {
        kernel_many[i] = _mm_set1_ps(kernel[i]);
    }
    for (i = 0; i < input_length + kernel_length - 4; i += 4) {
        // Zero the accumulator.
        acc = _mm_setzero_ps();
        /* Kernel range for this group of 4 outputs; out-of-range windows
         * are harmless because they read the zero padding. */
        int startk = i > (input_length - 1) ? i - (input_length - 1) : 0;
        int endk = (i + 3) < kernel_length ? (i + 3) : kernel_length - 1;
        /* After this loop, we have computed 4 output samples
         * for the price of one pass over the kernel. */
        for (int k = startk; k <= endk; k++) {
            /* This must be an unaligned load (_mm_loadu_ps) as the
             * window steps one sample at a time. */
            block = _mm_loadu_ps(in_padded + 4 + i - k);
            prod = _mm_mul_ps(block, kernel_many[k]);
            // Accumulate the 4 parallel values.
            acc = _mm_add_ps(acc, prod);
        }
        _mm_storeu_ps(out + i, acc);
    }
    // Left-overs that do not fill a whole 4-wide store: scalar path.
    for (; i < input_length + kernel_length - 1; i++) {
        out[i] = 0.0;
        int startk = i >= input_length ? i - input_length + 1 : 0;
        int endk = i < kernel_length ? i : kernel_length - 1;
        for (int k = startk; k <= endk; k++) {
            out[i] += in[i - k] * kernel[k];
        }
    }
    return 0;
}
/* Vectorized 1D convolution: 8 output samples are computed in parallel.
 *
 * Each kernel value is broadcast across an 8-lane AVX vector; one inner
 * iteration multiplies it against 8 consecutive input samples, so each
 * pass over the kernel yields 8 output samples at once.
 *
 * Produces the same result as numpy.convolve(in, kernel, mode='full').
 * Returns 0 (callers treat this as success).
 */
extern "C" _declspec(dllexport) int convolve_avx(float* in, int input_length,
float* kernel, int kernel_length, float* out)
{
    /* Input copy padded with 8 zeros on each side, so vector loads that
     * run past either end of the signal read zeros instead of garbage. */
    float* in_padded = (float*)(_alloca(sizeof(float) * (input_length + 16)));
    /* BUG FIX: the __m256 element accesses below may be compiled to
     * 32-byte-aligned moves, but _alloca only guarantees 16-byte
     * alignment.  Over-allocate by 31 bytes and round the pointer up to
     * the next 32-byte boundary so kernel_many is always safe to use. */
    char* kernel_raw = (char*)_alloca(sizeof(__m256) * kernel_length + 31);
    __m256* kernel_many = (__m256*)(((uintptr_t)kernel_raw + 31) & ~(uintptr_t)31);
    __m256 block;
    __m256 prod;
    __m256 acc;
    // Broadcast each kernel value across an 8-lane vector.
    int i;
    for (i = 0; i < kernel_length; i++) {
        kernel_many[i] = _mm256_broadcast_ss(&kernel[i]);
    }
    // Surrounding zeroes, before and after the input.
    block = _mm256_setzero_ps();
    _mm256_storeu_ps(in_padded, block);
    memcpy(&(in_padded[8]), in, input_length * sizeof(float));
    _mm256_storeu_ps(in_padded + input_length + 8, block);
    for (i = 0; i < input_length + kernel_length - 8; i += 8) {
        // Zero the accumulator.
        acc = _mm256_setzero_ps();
        /* Kernel range for this group of 8 outputs; out-of-range windows
         * are harmless because they read the zero padding. */
        int startk = i > (input_length - 1) ? i - (input_length - 1) : 0;
        int endk = (i + 7) < kernel_length ? (i + 7) : kernel_length - 1;
        int k = startk;
        // Manual 4x unrolling of the loop to trigger pipelining speed-up (x2 perf).
        for (; k + 3 <= endk; k += 4)
        {
            block = _mm256_loadu_ps(in_padded + 8 + i - k);
            prod = _mm256_mul_ps(block, kernel_many[k]);
            acc = _mm256_add_ps(acc, prod);
            block = _mm256_loadu_ps(in_padded + 7 + i - k);      // tap k + 1
            prod = _mm256_mul_ps(block, kernel_many[k + 1]);
            acc = _mm256_add_ps(acc, prod);
            block = _mm256_loadu_ps(in_padded + 6 + i - k);      // tap k + 2
            prod = _mm256_mul_ps(block, kernel_many[k + 2]);
            acc = _mm256_add_ps(acc, prod);
            block = _mm256_loadu_ps(in_padded + 5 + i - k);      // tap k + 3
            prod = _mm256_mul_ps(block, kernel_many[k + 3]);
            acc = _mm256_add_ps(acc, prod);
        }
        // Remaining 0-3 kernel taps.
        for (; k <= endk; k++) {
            block = _mm256_loadu_ps(in_padded + 8 + i - k);
            prod = _mm256_mul_ps(block, kernel_many[k]);
            acc = _mm256_add_ps(acc, prod);
        }
        _mm256_storeu_ps(out + i, acc);
    }
    // Left-overs that do not fill a whole 8-wide store: scalar path.
    for (; i < input_length + kernel_length - 1; i++) {
        out[i] = 0.0;
        int startk = i >= input_length ? i - input_length + 1 : 0;
        int endk = i < kernel_length ? i : kernel_length - 1;
        for (int k = startk; k <= endk; k++) {
            out[i] += in[i - k] * kernel[k];
        }
    }
    return 0;
}
@not-availabIe

This comment has been minimized.

Copy link

@not-availabIe not-availabIe commented Dec 17, 2019

although convolve_avx doesn't have a description, it works with 8 inputs in parallel right?

@not-availabIe

This comment has been minimized.

Copy link

@not-availabIe not-availabIe commented Dec 17, 2019

awesome!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.