Last active
October 28, 2023 21:11
-
-
Save hansemro/81f7cd5330c92e53b37c69829d78e4e4 to your computer and use it in GitHub Desktop.
[Benchmark] Convert 1-bit digital samples to bool array via AVX2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// SPDX-License-Identifier: MIT | |
// Copyright (c) 2023 Hansem Ro | |
// Benchmark to compare AVX2 and generic implementations of converting 1-bit | |
// array to bool array. | |
// Build on x86_64 machine with AVX2 with: | |
// g++ -mavx2 -o ./dsconvert dsconvert.cpp | |
// Usage: ./dsconvert [-n <NUM_SAMPLES>] [-m <0|1|2|3|4>] | |
// Options: | |
// -m <0|1|2|3|4> 0 for Generic, | |
// 1 for AVX2, | |
// 2 for alternative AVX2, | |
// 3 for LUT, | |
// 4 for AVX2-Gather with LUT | |
// -n <NUM> Number of samples | |
#include <cassert> | |
#include <chrono> | |
#include <cstdio> | |
#include <cstdint> | |
#include <unistd.h> | |
#include <stdlib.h> | |
#ifdef __x86_64__ | |
#include <immintrin.h> | |
#endif | |
/** | |
Generic implementation of ConvertDigitalSamples | |
*/ | |
static void ConvertDigitalSamplesGeneric(bool* pout, uint8_t* pin, size_t count) | |
{ | |
for(size_t i = 0; i < count; i++) | |
{ | |
for(int j = 0; j < 8; j++) | |
{ | |
pout[8*i + j] = (pin[i] >> j) & 0x1; | |
} | |
} | |
} | |
//LUT for mapping 8x1-bit digital sample pattern to 8xbool value | |
static uint64_t DigitalSamplesToBoolArrayLUT[256] = { | |
0x0000000000000000, 0x0000000000000001, 0x0000000000000100, 0x0000000000000101, | |
0x0000000000010000, 0x0000000000010001, 0x0000000000010100, 0x0000000000010101, | |
0x0000000001000000, 0x0000000001000001, 0x0000000001000100, 0x0000000001000101, | |
0x0000000001010000, 0x0000000001010001, 0x0000000001010100, 0x0000000001010101, | |
0x0000000100000000, 0x0000000100000001, 0x0000000100000100, 0x0000000100000101, | |
0x0000000100010000, 0x0000000100010001, 0x0000000100010100, 0x0000000100010101, | |
0x0000000101000000, 0x0000000101000001, 0x0000000101000100, 0x0000000101000101, | |
0x0000000101010000, 0x0000000101010001, 0x0000000101010100, 0x0000000101010101, | |
0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000000101, | |
0x0000010000010000, 0x0000010000010001, 0x0000010000010100, 0x0000010000010101, | |
0x0000010001000000, 0x0000010001000001, 0x0000010001000100, 0x0000010001000101, | |
0x0000010001010000, 0x0000010001010001, 0x0000010001010100, 0x0000010001010101, | |
0x0000010100000000, 0x0000010100000001, 0x0000010100000100, 0x0000010100000101, | |
0x0000010100010000, 0x0000010100010001, 0x0000010100010100, 0x0000010100010101, | |
0x0000010101000000, 0x0000010101000001, 0x0000010101000100, 0x0000010101000101, | |
0x0000010101010000, 0x0000010101010001, 0x0000010101010100, 0x0000010101010101, | |
0x0001000000000000, 0x0001000000000001, 0x0001000000000100, 0x0001000000000101, | |
0x0001000000010000, 0x0001000000010001, 0x0001000000010100, 0x0001000000010101, | |
0x0001000001000000, 0x0001000001000001, 0x0001000001000100, 0x0001000001000101, | |
0x0001000001010000, 0x0001000001010001, 0x0001000001010100, 0x0001000001010101, | |
0x0001000100000000, 0x0001000100000001, 0x0001000100000100, 0x0001000100000101, | |
0x0001000100010000, 0x0001000100010001, 0x0001000100010100, 0x0001000100010101, | |
0x0001000101000000, 0x0001000101000001, 0x0001000101000100, 0x0001000101000101, | |
0x0001000101010000, 0x0001000101010001, 0x0001000101010100, 0x0001000101010101, | |
0x0001010000000000, 0x0001010000000001, 0x0001010000000100, 0x0001010000000101, | |
0x0001010000010000, 0x0001010000010001, 0x0001010000010100, 0x0001010000010101, | |
0x0001010001000000, 0x0001010001000001, 0x0001010001000100, 0x0001010001000101, | |
0x0001010001010000, 0x0001010001010001, 0x0001010001010100, 0x0001010001010101, | |
0x0001010100000000, 0x0001010100000001, 0x0001010100000100, 0x0001010100000101, | |
0x0001010100010000, 0x0001010100010001, 0x0001010100010100, 0x0001010100010101, | |
0x0001010101000000, 0x0001010101000001, 0x0001010101000100, 0x0001010101000101, | |
0x0001010101010000, 0x0001010101010001, 0x0001010101010100, 0x0001010101010101, | |
0x0100000000000000, 0x0100000000000001, 0x0100000000000100, 0x0100000000000101, | |
0x0100000000010000, 0x0100000000010001, 0x0100000000010100, 0x0100000000010101, | |
0x0100000001000000, 0x0100000001000001, 0x0100000001000100, 0x0100000001000101, | |
0x0100000001010000, 0x0100000001010001, 0x0100000001010100, 0x0100000001010101, | |
0x0100000100000000, 0x0100000100000001, 0x0100000100000100, 0x0100000100000101, | |
0x0100000100010000, 0x0100000100010001, 0x0100000100010100, 0x0100000100010101, | |
0x0100000101000000, 0x0100000101000001, 0x0100000101000100, 0x0100000101000101, | |
0x0100000101010000, 0x0100000101010001, 0x0100000101010100, 0x0100000101010101, | |
0x0100010000000000, 0x0100010000000001, 0x0100010000000100, 0x0100010000000101, | |
0x0100010000010000, 0x0100010000010001, 0x0100010000010100, 0x0100010000010101, | |
0x0100010001000000, 0x0100010001000001, 0x0100010001000100, 0x0100010001000101, | |
0x0100010001010000, 0x0100010001010001, 0x0100010001010100, 0x0100010001010101, | |
0x0100010100000000, 0x0100010100000001, 0x0100010100000100, 0x0100010100000101, | |
0x0100010100010000, 0x0100010100010001, 0x0100010100010100, 0x0100010100010101, | |
0x0100010101000000, 0x0100010101000001, 0x0100010101000100, 0x0100010101000101, | |
0x0100010101010000, 0x0100010101010001, 0x0100010101010100, 0x0100010101010101, | |
0x0101000000000000, 0x0101000000000001, 0x0101000000000100, 0x0101000000000101, | |
0x0101000000010000, 0x0101000000010001, 0x0101000000010100, 0x0101000000010101, | |
0x0101000001000000, 0x0101000001000001, 0x0101000001000100, 0x0101000001000101, | |
0x0101000001010000, 0x0101000001010001, 0x0101000001010100, 0x0101000001010101, | |
0x0101000100000000, 0x0101000100000001, 0x0101000100000100, 0x0101000100000101, | |
0x0101000100010000, 0x0101000100010001, 0x0101000100010100, 0x0101000100010101, | |
0x0101000101000000, 0x0101000101000001, 0x0101000101000100, 0x0101000101000101, | |
0x0101000101010000, 0x0101000101010001, 0x0101000101010100, 0x0101000101010101, | |
0x0101010000000000, 0x0101010000000001, 0x0101010000000100, 0x0101010000000101, | |
0x0101010000010000, 0x0101010000010001, 0x0101010000010100, 0x0101010000010101, | |
0x0101010001000000, 0x0101010001000001, 0x0101010001000100, 0x0101010001000101, | |
0x0101010001010000, 0x0101010001010001, 0x0101010001010100, 0x0101010001010101, | |
0x0101010100000000, 0x0101010100000001, 0x0101010100000100, 0x0101010100000101, | |
0x0101010100010000, 0x0101010100010001, 0x0101010100010100, 0x0101010100010101, | |
0x0101010101000000, 0x0101010101000001, 0x0101010101000100, 0x0101010101000101, | |
0x0101010101010000, 0x0101010101010001, 0x0101010101010100, 0x0101010101010101 | |
}; | |
/** | |
Generic implementation of ConvertDigitalSamples using a lookup table. | |
*/ | |
static void ConvertDigitalSamplesLUT(bool* pout, uint8_t* pin, size_t count) | |
{ | |
for(size_t i = 0; i < count; i++) | |
{ | |
reinterpret_cast<uint64_t*>(pout)[i] = DigitalSamplesToBoolArrayLUT[pin[i]]; | |
} | |
} | |
#ifdef __x86_64__ | |
static void print_m256i_epi64(__m256i d) | |
{ | |
printf("0x%016lx 0x%016lx 0x%016lx 0x%016lx\n\n", | |
static_cast<uint64_t>(_mm256_extract_epi64(d, 0)), | |
static_cast<uint64_t>(_mm256_extract_epi64(d, 1)), | |
static_cast<uint64_t>(_mm256_extract_epi64(d, 2)), | |
static_cast<uint64_t>(_mm256_extract_epi64(d, 3))); | |
} | |
static void print_m256i_epi8(__m256i d) | |
{ | |
printf("0x%02x 0x%02x 0x%02x 0x%02x\n", | |
_mm256_extract_epi8(d, 0), | |
_mm256_extract_epi8(d, 1), | |
_mm256_extract_epi8(d, 2), | |
_mm256_extract_epi8(d, 3)); | |
printf("0x%02x 0x%02x 0x%02x 0x%02x\n", | |
_mm256_extract_epi8(d, 4), | |
_mm256_extract_epi8(d, 5), | |
_mm256_extract_epi8(d, 6), | |
_mm256_extract_epi8(d, 7)); | |
printf("0x%02x 0x%02x 0x%02x 0x%02x\n", | |
_mm256_extract_epi8(d, 8), | |
_mm256_extract_epi8(d, 9), | |
_mm256_extract_epi8(d, 10), | |
_mm256_extract_epi8(d, 11)); | |
printf("0x%02x 0x%02x 0x%02x 0x%02x\n", | |
_mm256_extract_epi8(d, 12), | |
_mm256_extract_epi8(d, 13), | |
_mm256_extract_epi8(d, 14), | |
_mm256_extract_epi8(d, 15)); | |
printf("0x%02x 0x%02x 0x%02x 0x%02x\n", | |
_mm256_extract_epi8(d, 16), | |
_mm256_extract_epi8(d, 17), | |
_mm256_extract_epi8(d, 18), | |
_mm256_extract_epi8(d, 19)); | |
printf("0x%02x 0x%02x 0x%02x 0x%02x\n", | |
_mm256_extract_epi8(d, 20), | |
_mm256_extract_epi8(d, 21), | |
_mm256_extract_epi8(d, 22), | |
_mm256_extract_epi8(d, 23)); | |
printf("0x%02x 0x%02x 0x%02x 0x%02x\n", | |
_mm256_extract_epi8(d, 24), | |
_mm256_extract_epi8(d, 25), | |
_mm256_extract_epi8(d, 26), | |
_mm256_extract_epi8(d, 27)); | |
printf("0x%02x 0x%02x 0x%02x 0x%02x\n\n", | |
_mm256_extract_epi8(d, 28), | |
_mm256_extract_epi8(d, 29), | |
_mm256_extract_epi8(d, 30), | |
_mm256_extract_epi8(d, 31)); | |
} | |
/** | |
AVX2 implementation of ConvertDigitalSamples | |
*/ | |
__attribute__((target("avx2"))) | |
static void ConvertDigitalSamplesAVX2(bool* pout, uint8_t* pin, size_t count) | |
{ | |
unsigned int end = count - (count % 4); | |
//Mask to get n-th bit in n-th byte, where n is in 0..7. | |
//Broadcast mask for 4 blocks (with 32 samples); | |
const __m256i bitmask = _mm256_set1_epi64x(0x8040201008040201); | |
//Mask to extract first bit of each byte. | |
const __m256i ones = _mm256_set1_epi8(0x1); | |
for(unsigned int k = 0; k < end; k += 4) | |
{ | |
uint8_t block0 = pin[k]; | |
uint8_t block1 = pin[k + 1]; | |
uint8_t block2 = pin[k + 2]; | |
uint8_t block3 = pin[k + 3]; | |
//Broadcast each block 8 times (such that each sample occupies its own byte) | |
__m256i b_sample = _mm256_set_epi8( block3,block3,block3,block3, | |
block3,block3,block3,block3, | |
block2,block2,block2,block2, | |
block2,block2,block2,block2, | |
block1,block1,block1,block1, | |
block1,block1,block1,block1, | |
block0,block0,block0,block0, | |
block0,block0,block0,block0); | |
//Extract nth bit of nth byte for each block | |
__m256i result = _mm256_and_si256(b_sample, bitmask); | |
//Fills each byte with 1s if it matches bitmask | |
result = _mm256_cmpeq_epi8(result, bitmask); | |
//Mask to get first bit of each byte. This gives us our clean bool array! | |
result = _mm256_and_si256(result, ones); | |
//Store results | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(pout + (k*8)), result); | |
} | |
//Get any extras we didn't get in the SIMD loop | |
for(size_t i = end; i < count; i++) | |
{ | |
for(size_t j = 0; j < 8; j++) | |
{ | |
pout[8*i + j] = (pin[i] >> j) & 0x1; | |
} | |
} | |
} | |
/** | |
Alternative AVX2 implementation of ConvertDigitalSamples | |
*/ | |
__attribute__((target("avx2"))) | |
static void ConvertDigitalSamplesAVX2_Alt(bool* pout, uint8_t* pin, size_t count) | |
{ | |
unsigned int end = count - (count % 4); | |
//Mask to extract first bit of each byte. | |
const __m256i ones = _mm256_set1_epi8(0x1); | |
for(unsigned int k = 0; k < end; k += 4) | |
{ | |
uint8_t block0 = pin[k]; | |
uint8_t block1 = pin[k + 1]; | |
uint8_t block2 = pin[k + 2]; | |
uint8_t block3 = pin[k + 3]; | |
//Prepare for sampling on first bit of each sample byte | |
__m256i b_sample = _mm256_set_epi8( block3>>7,block3>>6,block3>>5,block3>>4, | |
block3>>3,block3>>2,block3>>1,block3, | |
block2>>7,block2>>6,block2>>5,block2>>4, | |
block2>>3,block2>>2,block2>>1,block2, | |
block1>>7,block1>>6,block1>>5,block1>>4, | |
block1>>3,block1>>2,block1>>1,block1, | |
block0>>7,block0>>6,block0>>5,block0>>4, | |
block0>>3,block0>>2,block0>>1,block0); | |
//Mask to get first bit of each byte. This gives us our clean bool array! | |
__m256i result = _mm256_and_si256(b_sample, ones); | |
//Store results | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(pout + (k*8)), result); | |
} | |
//Get any extras we didn't get in the SIMD loop | |
for(size_t i = end; i < count; i++) | |
{ | |
for(size_t j = 0; j < 8; j++) | |
{ | |
pout[8*i + j] = (pin[i] >> j) & 0x1; | |
} | |
} | |
} | |
/** | |
Alternative AVX2 implementation of ConvertDigitalSamples using a gather | |
operation on a LUT | |
*/ | |
__attribute__((target("avx2"))) | |
static void ConvertDigitalSamplesAVX2_Gather(bool* pout, uint8_t* pin, size_t count) | |
{ | |
unsigned int end = count - (count % 4); | |
for(unsigned int k = 0; k < end; k += 4) | |
{ | |
//Gather values from LUT | |
__m256i vindex = _mm256_set_epi64x(pin[k+3], pin[k+2], pin[k+1], pin[k]); | |
__m256i result = _mm256_i64gather_epi64(DigitalSamplesToBoolArrayLUT, vindex, 8); | |
//Store results | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(pout + (k*8)), result); | |
} | |
//Get any extras we didn't get in the SIMD loop | |
for(size_t i = end; i < count; i++) | |
{ | |
for(size_t j = 0; j < 8; j++) | |
{ | |
pout[8*i + j] = (pin[i] >> j) & 0x1; | |
} | |
} | |
} | |
#endif | |
/** | |
Convert array of 8x1-bit digital samples to bool array. Note that `count` | |
is the number of 8x1-bit elements in the input array. | |
*/ | |
static void ConvertDigitalSamples(int method, bool* pout, uint8_t* pin, size_t count) | |
{ | |
switch(method) | |
{ | |
case 3: | |
ConvertDigitalSamplesLUT(pout, pin, count); | |
break; | |
#ifdef __x86_64__ | |
case 4: | |
ConvertDigitalSamplesAVX2_Gather(pout, pin, count); | |
break; | |
case 2: | |
ConvertDigitalSamplesAVX2_Alt(pout, pin, count); | |
break; | |
case 1: | |
ConvertDigitalSamplesAVX2(pout, pin, count); | |
break; | |
#endif | |
case 0: | |
default: | |
ConvertDigitalSamplesGeneric(pout, pin, count); | |
} | |
} | |
int main(int argc, char** argv) | |
{ | |
using namespace std::literals; | |
int c; | |
bool print_en = false; | |
long int num_samples = 0; | |
int method = 0; | |
while ((c = getopt(argc, argv, "pn:m:")) != -1) | |
{ | |
switch(c) | |
{ | |
case 'p': | |
print_en = true; | |
break; | |
case 'n': | |
num_samples = strtol(optarg, NULL, 10); | |
break; | |
case 'm': | |
method = atoi(optarg); | |
break; | |
case ':': | |
printf("Option -%c requires an operand\n", optopt); | |
break; | |
case '?': | |
printf("Unrecognized argument -%c given\n", optopt); | |
return -1; | |
} | |
} | |
printf("Num Samples: %ld\n", num_samples); | |
switch(method) | |
{ | |
case 3: | |
printf("Method: Generic with LUT\n"); | |
break; | |
#ifdef __x86_64__ | |
case 4: | |
printf("Method: AVX2_Gather\n"); | |
break; | |
case 2: | |
printf("Method: AVX2_Alt\n"); | |
break; | |
case 1: | |
printf("Method: AVX2\n"); | |
break; | |
#endif | |
case 0: | |
default: | |
printf("Method: Generic (%d)\n", method); | |
} | |
srand(time(NULL)); | |
//Digital signal stored as packed 1-bit samples | |
uint8_t* data = reinterpret_cast<uint8_t*>(calloc(num_samples/8 + 1, sizeof(uint8_t))); | |
for(size_t i = 0; i < num_samples/8 + 1; i++) | |
{ | |
data[i] = rand(); | |
} | |
//Sample Destinations | |
bool* samples = reinterpret_cast<bool*>(calloc(num_samples, sizeof(bool))); | |
bool* golden = reinterpret_cast<bool*>(calloc(num_samples, sizeof(bool))); | |
//Golden Reference Conversion | |
const auto g_start = std::chrono::system_clock::now(); | |
ConvertDigitalSamples(0, golden, (uint8_t*)data, num_samples/8); | |
const auto g_diff = std::chrono::system_clock::now() - g_start; | |
//Sample Conversion | |
const auto start = std::chrono::system_clock::now(); | |
ConvertDigitalSamples(method, samples, (uint8_t*)data, num_samples/8); | |
const auto diff = std::chrono::system_clock::now() - start; | |
printf("Time elapsed for conversion: %ld ns; generic: %ld ns\n", diff / 1ns, g_diff / 1ns); | |
for(size_t i = 0; i < num_samples; i++) | |
{ | |
if(print_en) | |
printf("samples[%ld]: %d; golden: %d\n", i, samples[i], golden[i]); | |
//Compare to Golden Reference | |
assert(samples[i] == golden[i]); | |
} | |
free(data); | |
free(samples); | |
free(golden); | |
return 0; | |
} |
AMD Ryzen 7 3750H Benchmark results:
$ ./dsconvert -n 320000000 -m 0
Num Samples: 320000000
Method: Generic (0)
Time elapsed for conversion: 1457322614 ns
$ ./dsconvert -n 320000000 -m 0
Num Samples: 320000000
Method: Generic (0)
Time elapsed for conversion: 1457238744 ns
$ ./dsconvert -n 320000000 -m 0
Num Samples: 320000000
Method: Generic (0)
Time elapsed for conversion: 1479732995 ns
$ ./dsconvert -n 320000000 -m 1
Num Samples: 320000000
Method: AVX2
Time elapsed for conversion: 496241561 ns
$ ./dsconvert -n 320000000 -m 1
Num Samples: 320000000
Method: AVX2
Time elapsed for conversion: 497851702 ns
$ ./dsconvert -n 320000000 -m 1
Num Samples: 320000000
Method: AVX2
Time elapsed for conversion: 501259858 ns
$ ./dsconvert -n 320000000 -m 2
Num Samples: 320000000
Method: AVX2_Alt
Time elapsed for conversion: 386648209 ns
$ ./dsconvert -n 320000000 -m 2
Num Samples: 320000000
Method: AVX2_Alt
Time elapsed for conversion: 386676497 ns
$ ./dsconvert -n 320000000 -m 2
Num Samples: 320000000
Method: AVX2_Alt
Time elapsed for conversion: 393613003 ns
Added architecture-agnostic/generic lookup table implementation that outperforms AVX2 implementations.
Fastest AVX2 implementation does parallel lookup using a gather operation, but does not always yield a faster solution than non-AVX LUT implementation.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
AMD Ryzen 5 5600X Benchmark results: