Last active
October 28, 2023 21:11
-
-
Save hansemro/81f7cd5330c92e53b37c69829d78e4e4 to your computer and use it in GitHub Desktop.
[Benchmark] Convert 1-bit digital samples to bool array via AVX2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// SPDX-License-Identifier: MIT | |
// Copyright (c) 2023 Hansem Ro | |
// Benchmark to compare AVX2 and generic implementations of converting 1-bit | |
// array to bool array. | |
// Build on x86_64 machine with AVX2 with: | |
// g++ -mavx2 -o ./dsconvert dsconvert.cpp | |
// Usage: ./dsconvert [-n <NUM_SAMPLES>] [-m <0|1|2|3|4>] | |
// Options: | |
// -m <0|1|2|3|4> 0 for Generic, | |
// 1 for AVX2, | |
// 2 for alternative AVX2, | |
// 3 for LUT, | |
// 4 for AVX2-Gather with LUT | |
// -n <NUM> Number of samples | |
#include <cassert> | |
#include <chrono> | |
#include <cstdio> | |
#include <cstdint> | |
#include <unistd.h> | |
#include <stdlib.h> | |
#ifdef __x86_64__ | |
#include <immintrin.h> | |
#endif | |
/** | |
Generic implementation of ConvertDigitalSamples | |
*/ | |
static void ConvertDigitalSamplesGeneric(bool* pout, uint8_t* pin, size_t count) | |
{ | |
for(size_t i = 0; i < count; i++) | |
{ | |
for(int j = 0; j < 8; j++) | |
{ | |
pout[8*i + j] = (pin[i] >> j) & 0x1; | |
} | |
} | |
} | |
//LUT for mapping 8x1-bit digital sample pattern to 8xbool value | |
static uint64_t DigitalSamplesToBoolArrayLUT[256] = { | |
0x0000000000000000, 0x0000000000000001, 0x0000000000000100, 0x0000000000000101, | |
0x0000000000010000, 0x0000000000010001, 0x0000000000010100, 0x0000000000010101, | |
0x0000000001000000, 0x0000000001000001, 0x0000000001000100, 0x0000000001000101, | |
0x0000000001010000, 0x0000000001010001, 0x0000000001010100, 0x0000000001010101, | |
0x0000000100000000, 0x0000000100000001, 0x0000000100000100, 0x0000000100000101, | |
0x0000000100010000, 0x0000000100010001, 0x0000000100010100, 0x0000000100010101, | |
0x0000000101000000, 0x0000000101000001, 0x0000000101000100, 0x0000000101000101, | |
0x0000000101010000, 0x0000000101010001, 0x0000000101010100, 0x0000000101010101, | |
0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000000101, | |
0x0000010000010000, 0x0000010000010001, 0x0000010000010100, 0x0000010000010101, | |
0x0000010001000000, 0x0000010001000001, 0x0000010001000100, 0x0000010001000101, | |
0x0000010001010000, 0x0000010001010001, 0x0000010001010100, 0x0000010001010101, | |
0x0000010100000000, 0x0000010100000001, 0x0000010100000100, 0x0000010100000101, | |
0x0000010100010000, 0x0000010100010001, 0x0000010100010100, 0x0000010100010101, | |
0x0000010101000000, 0x0000010101000001, 0x0000010101000100, 0x0000010101000101, | |
0x0000010101010000, 0x0000010101010001, 0x0000010101010100, 0x0000010101010101, | |
0x0001000000000000, 0x0001000000000001, 0x0001000000000100, 0x0001000000000101, | |
0x0001000000010000, 0x0001000000010001, 0x0001000000010100, 0x0001000000010101, | |
0x0001000001000000, 0x0001000001000001, 0x0001000001000100, 0x0001000001000101, | |
0x0001000001010000, 0x0001000001010001, 0x0001000001010100, 0x0001000001010101, | |
0x0001000100000000, 0x0001000100000001, 0x0001000100000100, 0x0001000100000101, | |
0x0001000100010000, 0x0001000100010001, 0x0001000100010100, 0x0001000100010101, | |
0x0001000101000000, 0x0001000101000001, 0x0001000101000100, 0x0001000101000101, | |
0x0001000101010000, 0x0001000101010001, 0x0001000101010100, 0x0001000101010101, | |
0x0001010000000000, 0x0001010000000001, 0x0001010000000100, 0x0001010000000101, | |
0x0001010000010000, 0x0001010000010001, 0x0001010000010100, 0x0001010000010101, | |
0x0001010001000000, 0x0001010001000001, 0x0001010001000100, 0x0001010001000101, | |
0x0001010001010000, 0x0001010001010001, 0x0001010001010100, 0x0001010001010101, | |
0x0001010100000000, 0x0001010100000001, 0x0001010100000100, 0x0001010100000101, | |
0x0001010100010000, 0x0001010100010001, 0x0001010100010100, 0x0001010100010101, | |
0x0001010101000000, 0x0001010101000001, 0x0001010101000100, 0x0001010101000101, | |
0x0001010101010000, 0x0001010101010001, 0x0001010101010100, 0x0001010101010101, | |
0x0100000000000000, 0x0100000000000001, 0x0100000000000100, 0x0100000000000101, | |
0x0100000000010000, 0x0100000000010001, 0x0100000000010100, 0x0100000000010101, | |
0x0100000001000000, 0x0100000001000001, 0x0100000001000100, 0x0100000001000101, | |
0x0100000001010000, 0x0100000001010001, 0x0100000001010100, 0x0100000001010101, | |
0x0100000100000000, 0x0100000100000001, 0x0100000100000100, 0x0100000100000101, | |
0x0100000100010000, 0x0100000100010001, 0x0100000100010100, 0x0100000100010101, | |
0x0100000101000000, 0x0100000101000001, 0x0100000101000100, 0x0100000101000101, | |
0x0100000101010000, 0x0100000101010001, 0x0100000101010100, 0x0100000101010101, | |
0x0100010000000000, 0x0100010000000001, 0x0100010000000100, 0x0100010000000101, | |
0x0100010000010000, 0x0100010000010001, 0x0100010000010100, 0x0100010000010101, | |
0x0100010001000000, 0x0100010001000001, 0x0100010001000100, 0x0100010001000101, | |
0x0100010001010000, 0x0100010001010001, 0x0100010001010100, 0x0100010001010101, | |
0x0100010100000000, 0x0100010100000001, 0x0100010100000100, 0x0100010100000101, | |
0x0100010100010000, 0x0100010100010001, 0x0100010100010100, 0x0100010100010101, | |
0x0100010101000000, 0x0100010101000001, 0x0100010101000100, 0x0100010101000101, | |
0x0100010101010000, 0x0100010101010001, 0x0100010101010100, 0x0100010101010101, | |
0x0101000000000000, 0x0101000000000001, 0x0101000000000100, 0x0101000000000101, | |
0x0101000000010000, 0x0101000000010001, 0x0101000000010100, 0x0101000000010101, | |
0x0101000001000000, 0x0101000001000001, 0x0101000001000100, 0x0101000001000101, | |
0x0101000001010000, 0x0101000001010001, 0x0101000001010100, 0x0101000001010101, | |
0x0101000100000000, 0x0101000100000001, 0x0101000100000100, 0x0101000100000101, | |
0x0101000100010000, 0x0101000100010001, 0x0101000100010100, 0x0101000100010101, | |
0x0101000101000000, 0x0101000101000001, 0x0101000101000100, 0x0101000101000101, | |
0x0101000101010000, 0x0101000101010001, 0x0101000101010100, 0x0101000101010101, | |
0x0101010000000000, 0x0101010000000001, 0x0101010000000100, 0x0101010000000101, | |
0x0101010000010000, 0x0101010000010001, 0x0101010000010100, 0x0101010000010101, | |
0x0101010001000000, 0x0101010001000001, 0x0101010001000100, 0x0101010001000101, | |
0x0101010001010000, 0x0101010001010001, 0x0101010001010100, 0x0101010001010101, | |
0x0101010100000000, 0x0101010100000001, 0x0101010100000100, 0x0101010100000101, | |
0x0101010100010000, 0x0101010100010001, 0x0101010100010100, 0x0101010100010101, | |
0x0101010101000000, 0x0101010101000001, 0x0101010101000100, 0x0101010101000101, | |
0x0101010101010000, 0x0101010101010001, 0x0101010101010100, 0x0101010101010101 | |
}; | |
/** | |
Generic implementation of ConvertDigitalSamples using a lookup table. | |
*/ | |
static void ConvertDigitalSamplesLUT(bool* pout, uint8_t* pin, size_t count) | |
{ | |
for(size_t i = 0; i < count; i++) | |
{ | |
reinterpret_cast<uint64_t*>(pout)[i] = DigitalSamplesToBoolArrayLUT[pin[i]]; | |
} | |
} | |
#ifdef __x86_64__ | |
static void print_m256i_epi64(__m256i d) | |
{ | |
printf("0x%016lx 0x%016lx 0x%016lx 0x%016lx\n\n", | |
static_cast<uint64_t>(_mm256_extract_epi64(d, 0)), | |
static_cast<uint64_t>(_mm256_extract_epi64(d, 1)), | |
static_cast<uint64_t>(_mm256_extract_epi64(d, 2)), | |
static_cast<uint64_t>(_mm256_extract_epi64(d, 3))); | |
} | |
static void print_m256i_epi8(__m256i d) | |
{ | |
printf("0x%02x 0x%02x 0x%02x 0x%02x\n", | |
_mm256_extract_epi8(d, 0), | |
_mm256_extract_epi8(d, 1), | |
_mm256_extract_epi8(d, 2), | |
_mm256_extract_epi8(d, 3)); | |
printf("0x%02x 0x%02x 0x%02x 0x%02x\n", | |
_mm256_extract_epi8(d, 4), | |
_mm256_extract_epi8(d, 5), | |
_mm256_extract_epi8(d, 6), | |
_mm256_extract_epi8(d, 7)); | |
printf("0x%02x 0x%02x 0x%02x 0x%02x\n", | |
_mm256_extract_epi8(d, 8), | |
_mm256_extract_epi8(d, 9), | |
_mm256_extract_epi8(d, 10), | |
_mm256_extract_epi8(d, 11)); | |
printf("0x%02x 0x%02x 0x%02x 0x%02x\n", | |
_mm256_extract_epi8(d, 12), | |
_mm256_extract_epi8(d, 13), | |
_mm256_extract_epi8(d, 14), | |
_mm256_extract_epi8(d, 15)); | |
printf("0x%02x 0x%02x 0x%02x 0x%02x\n", | |
_mm256_extract_epi8(d, 16), | |
_mm256_extract_epi8(d, 17), | |
_mm256_extract_epi8(d, 18), | |
_mm256_extract_epi8(d, 19)); | |
printf("0x%02x 0x%02x 0x%02x 0x%02x\n", | |
_mm256_extract_epi8(d, 20), | |
_mm256_extract_epi8(d, 21), | |
_mm256_extract_epi8(d, 22), | |
_mm256_extract_epi8(d, 23)); | |
printf("0x%02x 0x%02x 0x%02x 0x%02x\n", | |
_mm256_extract_epi8(d, 24), | |
_mm256_extract_epi8(d, 25), | |
_mm256_extract_epi8(d, 26), | |
_mm256_extract_epi8(d, 27)); | |
printf("0x%02x 0x%02x 0x%02x 0x%02x\n\n", | |
_mm256_extract_epi8(d, 28), | |
_mm256_extract_epi8(d, 29), | |
_mm256_extract_epi8(d, 30), | |
_mm256_extract_epi8(d, 31)); | |
} | |
/** | |
AVX2 implementation of ConvertDigitalSamples | |
*/ | |
__attribute__((target("avx2"))) | |
static void ConvertDigitalSamplesAVX2(bool* pout, uint8_t* pin, size_t count) | |
{ | |
unsigned int end = count - (count % 4); | |
//Mask to get n-th bit in n-th byte, where n is in 0..7. | |
//Broadcast mask for 4 blocks (with 32 samples); | |
const __m256i bitmask = _mm256_set1_epi64x(0x8040201008040201); | |
//Mask to extract first bit of each byte. | |
const __m256i ones = _mm256_set1_epi8(0x1); | |
for(unsigned int k = 0; k < end; k += 4) | |
{ | |
uint8_t block0 = pin[k]; | |
uint8_t block1 = pin[k + 1]; | |
uint8_t block2 = pin[k + 2]; | |
uint8_t block3 = pin[k + 3]; | |
//Broadcast each block 8 times (such that each sample occupies its own byte) | |
__m256i b_sample = _mm256_set_epi8( block3,block3,block3,block3, | |
block3,block3,block3,block3, | |
block2,block2,block2,block2, | |
block2,block2,block2,block2, | |
block1,block1,block1,block1, | |
block1,block1,block1,block1, | |
block0,block0,block0,block0, | |
block0,block0,block0,block0); | |
//Extract nth bit of nth byte for each block | |
__m256i result = _mm256_and_si256(b_sample, bitmask); | |
//Fills each byte with 1s if it matches bitmask | |
result = _mm256_cmpeq_epi8(result, bitmask); | |
//Mask to get first bit of each byte. This gives us our clean bool array! | |
result = _mm256_and_si256(result, ones); | |
//Store results | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(pout + (k*8)), result); | |
} | |
//Get any extras we didn't get in the SIMD loop | |
for(size_t i = end; i < count; i++) | |
{ | |
for(size_t j = 0; j < 8; j++) | |
{ | |
pout[8*i + j] = (pin[i] >> j) & 0x1; | |
} | |
} | |
} | |
/** | |
Alternative AVX2 implementation of ConvertDigitalSamples | |
*/ | |
__attribute__((target("avx2"))) | |
static void ConvertDigitalSamplesAVX2_Alt(bool* pout, uint8_t* pin, size_t count) | |
{ | |
unsigned int end = count - (count % 4); | |
//Mask to extract first bit of each byte. | |
const __m256i ones = _mm256_set1_epi8(0x1); | |
for(unsigned int k = 0; k < end; k += 4) | |
{ | |
uint8_t block0 = pin[k]; | |
uint8_t block1 = pin[k + 1]; | |
uint8_t block2 = pin[k + 2]; | |
uint8_t block3 = pin[k + 3]; | |
//Prepare for sampling on first bit of each sample byte | |
__m256i b_sample = _mm256_set_epi8( block3>>7,block3>>6,block3>>5,block3>>4, | |
block3>>3,block3>>2,block3>>1,block3, | |
block2>>7,block2>>6,block2>>5,block2>>4, | |
block2>>3,block2>>2,block2>>1,block2, | |
block1>>7,block1>>6,block1>>5,block1>>4, | |
block1>>3,block1>>2,block1>>1,block1, | |
block0>>7,block0>>6,block0>>5,block0>>4, | |
block0>>3,block0>>2,block0>>1,block0); | |
//Mask to get first bit of each byte. This gives us our clean bool array! | |
__m256i result = _mm256_and_si256(b_sample, ones); | |
//Store results | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(pout + (k*8)), result); | |
} | |
//Get any extras we didn't get in the SIMD loop | |
for(size_t i = end; i < count; i++) | |
{ | |
for(size_t j = 0; j < 8; j++) | |
{ | |
pout[8*i + j] = (pin[i] >> j) & 0x1; | |
} | |
} | |
} | |
/** | |
Alternative AVX2 implementation of ConvertDigitalSamples using a gather | |
operation on a LUT | |
*/ | |
__attribute__((target("avx2"))) | |
static void ConvertDigitalSamplesAVX2_Gather(bool* pout, uint8_t* pin, size_t count) | |
{ | |
unsigned int end = count - (count % 4); | |
for(unsigned int k = 0; k < end; k += 4) | |
{ | |
//Gather values from LUT | |
__m256i vindex = _mm256_set_epi64x(pin[k+3], pin[k+2], pin[k+1], pin[k]); | |
__m256i result = _mm256_i64gather_epi64(DigitalSamplesToBoolArrayLUT, vindex, 8); | |
//Store results | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(pout + (k*8)), result); | |
} | |
//Get any extras we didn't get in the SIMD loop | |
for(size_t i = end; i < count; i++) | |
{ | |
for(size_t j = 0; j < 8; j++) | |
{ | |
pout[8*i + j] = (pin[i] >> j) & 0x1; | |
} | |
} | |
} | |
#endif | |
/** | |
Convert array of 8x1-bit digital samples to bool array. Note that `count` | |
is the number of 8x1-bit elements in the input array. | |
*/ | |
static void ConvertDigitalSamples(int method, bool* pout, uint8_t* pin, size_t count) | |
{ | |
switch(method) | |
{ | |
case 3: | |
ConvertDigitalSamplesLUT(pout, pin, count); | |
break; | |
#ifdef __x86_64__ | |
case 4: | |
ConvertDigitalSamplesAVX2_Gather(pout, pin, count); | |
break; | |
case 2: | |
ConvertDigitalSamplesAVX2_Alt(pout, pin, count); | |
break; | |
case 1: | |
ConvertDigitalSamplesAVX2(pout, pin, count); | |
break; | |
#endif | |
case 0: | |
default: | |
ConvertDigitalSamplesGeneric(pout, pin, count); | |
} | |
} | |
int main(int argc, char** argv) | |
{ | |
using namespace std::literals; | |
int c; | |
bool print_en = false; | |
long int num_samples = 0; | |
int method = 0; | |
while ((c = getopt(argc, argv, "pn:m:")) != -1) | |
{ | |
switch(c) | |
{ | |
case 'p': | |
print_en = true; | |
break; | |
case 'n': | |
num_samples = strtol(optarg, NULL, 10); | |
break; | |
case 'm': | |
method = atoi(optarg); | |
break; | |
case ':': | |
printf("Option -%c requires an operand\n", optopt); | |
break; | |
case '?': | |
printf("Unrecognized argument -%c given\n", optopt); | |
return -1; | |
} | |
} | |
printf("Num Samples: %ld\n", num_samples); | |
switch(method) | |
{ | |
case 3: | |
printf("Method: Generic with LUT\n"); | |
break; | |
#ifdef __x86_64__ | |
case 4: | |
printf("Method: AVX2_Gather\n"); | |
break; | |
case 2: | |
printf("Method: AVX2_Alt\n"); | |
break; | |
case 1: | |
printf("Method: AVX2\n"); | |
break; | |
#endif | |
case 0: | |
default: | |
printf("Method: Generic (%d)\n", method); | |
} | |
srand(time(NULL)); | |
//Digital signal stored as packed 1-bit samples | |
uint8_t* data = reinterpret_cast<uint8_t*>(calloc(num_samples/8 + 1, sizeof(uint8_t))); | |
for(size_t i = 0; i < num_samples/8 + 1; i++) | |
{ | |
data[i] = rand(); | |
} | |
//Sample Destinations | |
bool* samples = reinterpret_cast<bool*>(calloc(num_samples, sizeof(bool))); | |
bool* golden = reinterpret_cast<bool*>(calloc(num_samples, sizeof(bool))); | |
//Golden Reference Conversion | |
const auto g_start = std::chrono::system_clock::now(); | |
ConvertDigitalSamples(0, golden, (uint8_t*)data, num_samples/8); | |
const auto g_diff = std::chrono::system_clock::now() - g_start; | |
//Sample Conversion | |
const auto start = std::chrono::system_clock::now(); | |
ConvertDigitalSamples(method, samples, (uint8_t*)data, num_samples/8); | |
const auto diff = std::chrono::system_clock::now() - start; | |
printf("Time elapsed for conversion: %ld ns; generic: %ld ns\n", diff / 1ns, g_diff / 1ns); | |
for(size_t i = 0; i < num_samples; i++) | |
{ | |
if(print_en) | |
printf("samples[%ld]: %d; golden: %d\n", i, samples[i], golden[i]); | |
//Compare to Golden Reference | |
assert(samples[i] == golden[i]); | |
} | |
free(data); | |
free(samples); | |
free(golden); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Fastest AVX2 implementation does parallel lookup using a gather operation, but does not always yield a faster solution than non-AVX LUT implementation.