Fast 1D convolution with AVX
// Fast 1D convolution with AXV | |
// By Joannes Vermorel, Lokad, January 2018 | |
// Released under MIT license | |
#include <string.h> | |
#include <stdio.h> | |
#include <malloc.h> | |
/* A simple implementation of a 1D convolution that just iterates over | |
* scalar values of the input array. | |
* | |
* Returns the same as numpy.convolve(in, kernel, mode='full') | |
* */ | |
extern "C" _declspec(dllexport) int convolve_naive(float* in, int input_length, | |
float* kernel, int kernel_length, float* out) | |
{ | |
for (int i = 0; i < input_length + kernel_length - 1; i++) { | |
out[i] = 0.0; | |
int startk = i >= input_length ? i - input_length + 1 : 0; | |
int endk = i < kernel_length ? i : kernel_length - 1; | |
for (int k = startk; k <= endk; k++) { | |
out[i] += in[i - k] * kernel[k]; | |
} | |
} | |
return 0; | |
} | |
/* Vectorize the algorithm to compute 4 output samples in parallel. | |
* | |
* Each kernel value is repeated 4 times, which can then be used on | |
* 4 input samples in parallel. Stepping over these as in naive | |
* means that we get 4 output samples for each inner kernel loop. | |
* | |
*/ | |
extern "C" _declspec(dllexport) int convolve_sse(float* in, int input_length, | |
float* kernel, int kernel_length, float* out) | |
{ | |
float* in_padded = (float*)(_alloca(sizeof(float) * (input_length + 8))); | |
__m128* kernel_many = (__m128*)(_alloca(16 * kernel_length)); | |
__m128 block; | |
__m128 prod; | |
__m128 acc; | |
// surrounding zeroes, before and after | |
_mm_storeu_ps(in_padded, _mm_set1_ps(0)); | |
memcpy(&in_padded[4], in, sizeof(float) * input_length); | |
_mm_storeu_ps(in_padded + input_length + 4, _mm_set1_ps(0)); | |
// Repeat each kernal value across a 4-vector | |
int i; | |
for (i = 0; i < kernel_length; i++) { | |
kernel_many[i] = _mm_set1_ps(kernel[i]); // broadcast | |
} | |
for (i = 0; i < input_length + kernel_length - 4; i += 4) { | |
// Zero the accumulator | |
acc = _mm_setzero_ps(); | |
int startk = i > (input_length - 1) ? i - (input_length - 1) : 0; | |
int endk = (i + 3) < kernel_length ? (i + 3) : kernel_length - 1; | |
/* After this loop, we have computed 4 output samples | |
* for the price of one. | |
* */ | |
for (int k = startk; k <= endk; k++) { | |
// Load 4-float data block. These needs to be an unaliged | |
// load (_mm_loadu_ps) as we step one sample at a time. | |
block = _mm_loadu_ps(in_padded + 4 + i - k); | |
prod = _mm_mul_ps(block, kernel_many[k]); | |
// Accumulate the 4 parallel values | |
acc = _mm_add_ps(acc, prod); | |
} | |
_mm_storeu_ps(out + i, acc); | |
} | |
// Left-overs | |
for (; i < input_length + kernel_length - 1; i++) { | |
out[i] = 0.0; | |
int startk = i >= input_length ? i - input_length + 1 : 0; | |
int endk = i < kernel_length ? i : kernel_length - 1; | |
for (int k = startk; k <= endk; k++) { | |
out[i] += in[i - k] * kernel[k]; | |
} | |
} | |
return 0; | |
} | |
/* Vectorize the algorithm to compute 8 output samples in parallel. | |
* | |
* Each kernel value is repeated 8 times, which can then be used on | |
* 8 input samples in parallel. Stepping over these as in naive | |
* means that we get 8 output samples for each inner kernel loop. | |
* | |
*/ | |
extern "C" _declspec(dllexport) int convolve_avx(float* in, int input_length, | |
float* kernel, int kernel_length, float* out) | |
{ | |
float* in_padded = (float*)(_alloca(sizeof(float) * (input_length + 16))); | |
__m256* kernel_many = (__m256*)(_alloca(sizeof(__m256) * kernel_length)); | |
__m256 block; | |
__m256 prod; | |
__m256 acc; | |
// Repeat the kernel across the vector | |
int i; | |
for (i = 0; i < kernel_length; i++) { | |
kernel_many[i] = _mm256_broadcast_ss(&kernel[i]); // broadcast | |
} | |
/* Create a set of 4 aligned arrays | |
* Each array is offset by one sample from the one before | |
*/ | |
block = _mm256_setzero_ps(); | |
_mm256_storeu_ps(in_padded, block); | |
memcpy(&(in_padded[8]), in, input_length * sizeof(float)); | |
_mm256_storeu_ps(in_padded + input_length + 8, block); | |
for (i = 0; i < input_length + kernel_length - 8; i += 8) { | |
// Zero the accumulator | |
acc = _mm256_setzero_ps(); | |
int startk = i >(input_length - 1) ? i - (input_length - 1) : 0; | |
int endk = (i + 7) < kernel_length ? (i + 7) : kernel_length - 1; | |
int k = startk; | |
// Manual unrolling of the loop to trigger pipelining speed-up (x2 perf) | |
for (; k + 3 <= endk; k += 4) | |
{ | |
block = _mm256_loadu_ps(in_padded + 8 + i - k); | |
prod = _mm256_mul_ps(block, kernel_many[k]); | |
acc = _mm256_add_ps(acc, prod); | |
block = _mm256_loadu_ps(in_padded + 7 + i - k); | |
prod = _mm256_mul_ps(block, kernel_many[k + 1]); | |
acc = _mm256_add_ps(acc, prod); | |
block = _mm256_loadu_ps(in_padded + 6 + i - k); | |
prod = _mm256_mul_ps(block, kernel_many[k + 2]); | |
acc = _mm256_add_ps(acc, prod); | |
block = _mm256_loadu_ps(in_padded + 5 + i - k); | |
prod = _mm256_mul_ps(block, kernel_many[k + 3]); | |
acc = _mm256_add_ps(acc, prod); | |
} | |
for (; k <= endk; k++) { | |
block = _mm256_loadu_ps(in_padded + 8 + i - k); | |
prod = _mm256_mul_ps(block, kernel_many[k]); | |
acc = _mm256_add_ps(acc, prod); | |
} | |
_mm256_storeu_ps(out + i, acc); | |
} | |
// Left-overs | |
for (; i < input_length + kernel_length - 1; i++) { | |
out[i] = 0.0; | |
int startk = i >= input_length ? i - input_length + 1 : 0; | |
int endk = i < kernel_length ? i : kernel_length - 1; | |
for (int k = startk; k <= endk; k++) { | |
out[i] += in[i - k] * kernel[k]; | |
} | |
} | |
return 0; | |
} |
This comment has been minimized.
This comment has been minimized.
awesome! |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This comment has been minimized.
although convolve_avx doesn't have a description, it works with 8 inputs in parallel right?