Skip to content

Instantly share code, notes, and snippets.

@hansemro
Last active October 28, 2023 21:11
Show Gist options
  • Save hansemro/81f7cd5330c92e53b37c69829d78e4e4 to your computer and use it in GitHub Desktop.
Save hansemro/81f7cd5330c92e53b37c69829d78e4e4 to your computer and use it in GitHub Desktop.
[Benchmark] Convert 1-bit digital samples to bool array via AVX2
// SPDX-License-Identifier: MIT
// Copyright (c) 2023 Hansem Ro
// Benchmark to compare AVX2 and generic implementations of converting 1-bit
// array to bool array.
// Build on x86_64 machine with AVX2 with:
// g++ -mavx2 -o ./dsconvert dsconvert.cpp
// Usage: ./dsconvert [-n <NUM_SAMPLES>] [-m <0|1|2|3|4>]
// Options:
// -m <0|1|2|3|4> 0 for Generic,
// 1 for AVX2,
// 2 for alternative AVX2,
// 3 for LUT,
// 4 for AVX2-Gather with LUT
// -n <NUM> Number of samples
#include <cassert>
#include <chrono>
#include <cstdio>
#include <cstdint>
#include <unistd.h>
#include <stdlib.h>
#ifdef __x86_64__
#include <immintrin.h>
#endif
/**
Generic implementation of ConvertDigitalSamples
*/
static void ConvertDigitalSamplesGeneric(bool* pout, uint8_t* pin, size_t count)
{
for(size_t i = 0; i < count; i++)
{
for(int j = 0; j < 8; j++)
{
pout[8*i + j] = (pin[i] >> j) & 0x1;
}
}
}
//LUT for mapping 8x1-bit digital sample pattern to 8xbool value
static uint64_t DigitalSamplesToBoolArrayLUT[256] = {
0x0000000000000000, 0x0000000000000001, 0x0000000000000100, 0x0000000000000101,
0x0000000000010000, 0x0000000000010001, 0x0000000000010100, 0x0000000000010101,
0x0000000001000000, 0x0000000001000001, 0x0000000001000100, 0x0000000001000101,
0x0000000001010000, 0x0000000001010001, 0x0000000001010100, 0x0000000001010101,
0x0000000100000000, 0x0000000100000001, 0x0000000100000100, 0x0000000100000101,
0x0000000100010000, 0x0000000100010001, 0x0000000100010100, 0x0000000100010101,
0x0000000101000000, 0x0000000101000001, 0x0000000101000100, 0x0000000101000101,
0x0000000101010000, 0x0000000101010001, 0x0000000101010100, 0x0000000101010101,
0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000000101,
0x0000010000010000, 0x0000010000010001, 0x0000010000010100, 0x0000010000010101,
0x0000010001000000, 0x0000010001000001, 0x0000010001000100, 0x0000010001000101,
0x0000010001010000, 0x0000010001010001, 0x0000010001010100, 0x0000010001010101,
0x0000010100000000, 0x0000010100000001, 0x0000010100000100, 0x0000010100000101,
0x0000010100010000, 0x0000010100010001, 0x0000010100010100, 0x0000010100010101,
0x0000010101000000, 0x0000010101000001, 0x0000010101000100, 0x0000010101000101,
0x0000010101010000, 0x0000010101010001, 0x0000010101010100, 0x0000010101010101,
0x0001000000000000, 0x0001000000000001, 0x0001000000000100, 0x0001000000000101,
0x0001000000010000, 0x0001000000010001, 0x0001000000010100, 0x0001000000010101,
0x0001000001000000, 0x0001000001000001, 0x0001000001000100, 0x0001000001000101,
0x0001000001010000, 0x0001000001010001, 0x0001000001010100, 0x0001000001010101,
0x0001000100000000, 0x0001000100000001, 0x0001000100000100, 0x0001000100000101,
0x0001000100010000, 0x0001000100010001, 0x0001000100010100, 0x0001000100010101,
0x0001000101000000, 0x0001000101000001, 0x0001000101000100, 0x0001000101000101,
0x0001000101010000, 0x0001000101010001, 0x0001000101010100, 0x0001000101010101,
0x0001010000000000, 0x0001010000000001, 0x0001010000000100, 0x0001010000000101,
0x0001010000010000, 0x0001010000010001, 0x0001010000010100, 0x0001010000010101,
0x0001010001000000, 0x0001010001000001, 0x0001010001000100, 0x0001010001000101,
0x0001010001010000, 0x0001010001010001, 0x0001010001010100, 0x0001010001010101,
0x0001010100000000, 0x0001010100000001, 0x0001010100000100, 0x0001010100000101,
0x0001010100010000, 0x0001010100010001, 0x0001010100010100, 0x0001010100010101,
0x0001010101000000, 0x0001010101000001, 0x0001010101000100, 0x0001010101000101,
0x0001010101010000, 0x0001010101010001, 0x0001010101010100, 0x0001010101010101,
0x0100000000000000, 0x0100000000000001, 0x0100000000000100, 0x0100000000000101,
0x0100000000010000, 0x0100000000010001, 0x0100000000010100, 0x0100000000010101,
0x0100000001000000, 0x0100000001000001, 0x0100000001000100, 0x0100000001000101,
0x0100000001010000, 0x0100000001010001, 0x0100000001010100, 0x0100000001010101,
0x0100000100000000, 0x0100000100000001, 0x0100000100000100, 0x0100000100000101,
0x0100000100010000, 0x0100000100010001, 0x0100000100010100, 0x0100000100010101,
0x0100000101000000, 0x0100000101000001, 0x0100000101000100, 0x0100000101000101,
0x0100000101010000, 0x0100000101010001, 0x0100000101010100, 0x0100000101010101,
0x0100010000000000, 0x0100010000000001, 0x0100010000000100, 0x0100010000000101,
0x0100010000010000, 0x0100010000010001, 0x0100010000010100, 0x0100010000010101,
0x0100010001000000, 0x0100010001000001, 0x0100010001000100, 0x0100010001000101,
0x0100010001010000, 0x0100010001010001, 0x0100010001010100, 0x0100010001010101,
0x0100010100000000, 0x0100010100000001, 0x0100010100000100, 0x0100010100000101,
0x0100010100010000, 0x0100010100010001, 0x0100010100010100, 0x0100010100010101,
0x0100010101000000, 0x0100010101000001, 0x0100010101000100, 0x0100010101000101,
0x0100010101010000, 0x0100010101010001, 0x0100010101010100, 0x0100010101010101,
0x0101000000000000, 0x0101000000000001, 0x0101000000000100, 0x0101000000000101,
0x0101000000010000, 0x0101000000010001, 0x0101000000010100, 0x0101000000010101,
0x0101000001000000, 0x0101000001000001, 0x0101000001000100, 0x0101000001000101,
0x0101000001010000, 0x0101000001010001, 0x0101000001010100, 0x0101000001010101,
0x0101000100000000, 0x0101000100000001, 0x0101000100000100, 0x0101000100000101,
0x0101000100010000, 0x0101000100010001, 0x0101000100010100, 0x0101000100010101,
0x0101000101000000, 0x0101000101000001, 0x0101000101000100, 0x0101000101000101,
0x0101000101010000, 0x0101000101010001, 0x0101000101010100, 0x0101000101010101,
0x0101010000000000, 0x0101010000000001, 0x0101010000000100, 0x0101010000000101,
0x0101010000010000, 0x0101010000010001, 0x0101010000010100, 0x0101010000010101,
0x0101010001000000, 0x0101010001000001, 0x0101010001000100, 0x0101010001000101,
0x0101010001010000, 0x0101010001010001, 0x0101010001010100, 0x0101010001010101,
0x0101010100000000, 0x0101010100000001, 0x0101010100000100, 0x0101010100000101,
0x0101010100010000, 0x0101010100010001, 0x0101010100010100, 0x0101010100010101,
0x0101010101000000, 0x0101010101000001, 0x0101010101000100, 0x0101010101000101,
0x0101010101010000, 0x0101010101010001, 0x0101010101010100, 0x0101010101010101
};
/**
Generic implementation of ConvertDigitalSamples using a lookup table.
*/
static void ConvertDigitalSamplesLUT(bool* pout, uint8_t* pin, size_t count)
{
for(size_t i = 0; i < count; i++)
{
reinterpret_cast<uint64_t*>(pout)[i] = DigitalSamplesToBoolArrayLUT[pin[i]];
}
}
#ifdef __x86_64__
static void print_m256i_epi64(__m256i d)
{
printf("0x%016lx 0x%016lx 0x%016lx 0x%016lx\n\n",
static_cast<uint64_t>(_mm256_extract_epi64(d, 0)),
static_cast<uint64_t>(_mm256_extract_epi64(d, 1)),
static_cast<uint64_t>(_mm256_extract_epi64(d, 2)),
static_cast<uint64_t>(_mm256_extract_epi64(d, 3)));
}
static void print_m256i_epi8(__m256i d)
{
printf("0x%02x 0x%02x 0x%02x 0x%02x\n",
_mm256_extract_epi8(d, 0),
_mm256_extract_epi8(d, 1),
_mm256_extract_epi8(d, 2),
_mm256_extract_epi8(d, 3));
printf("0x%02x 0x%02x 0x%02x 0x%02x\n",
_mm256_extract_epi8(d, 4),
_mm256_extract_epi8(d, 5),
_mm256_extract_epi8(d, 6),
_mm256_extract_epi8(d, 7));
printf("0x%02x 0x%02x 0x%02x 0x%02x\n",
_mm256_extract_epi8(d, 8),
_mm256_extract_epi8(d, 9),
_mm256_extract_epi8(d, 10),
_mm256_extract_epi8(d, 11));
printf("0x%02x 0x%02x 0x%02x 0x%02x\n",
_mm256_extract_epi8(d, 12),
_mm256_extract_epi8(d, 13),
_mm256_extract_epi8(d, 14),
_mm256_extract_epi8(d, 15));
printf("0x%02x 0x%02x 0x%02x 0x%02x\n",
_mm256_extract_epi8(d, 16),
_mm256_extract_epi8(d, 17),
_mm256_extract_epi8(d, 18),
_mm256_extract_epi8(d, 19));
printf("0x%02x 0x%02x 0x%02x 0x%02x\n",
_mm256_extract_epi8(d, 20),
_mm256_extract_epi8(d, 21),
_mm256_extract_epi8(d, 22),
_mm256_extract_epi8(d, 23));
printf("0x%02x 0x%02x 0x%02x 0x%02x\n",
_mm256_extract_epi8(d, 24),
_mm256_extract_epi8(d, 25),
_mm256_extract_epi8(d, 26),
_mm256_extract_epi8(d, 27));
printf("0x%02x 0x%02x 0x%02x 0x%02x\n\n",
_mm256_extract_epi8(d, 28),
_mm256_extract_epi8(d, 29),
_mm256_extract_epi8(d, 30),
_mm256_extract_epi8(d, 31));
}
/**
AVX2 implementation of ConvertDigitalSamples
*/
__attribute__((target("avx2")))
static void ConvertDigitalSamplesAVX2(bool* pout, uint8_t* pin, size_t count)
{
unsigned int end = count - (count % 4);
//Mask to get n-th bit in n-th byte, where n is in 0..7.
//Broadcast mask for 4 blocks (with 32 samples);
const __m256i bitmask = _mm256_set1_epi64x(0x8040201008040201);
//Mask to extract first bit of each byte.
const __m256i ones = _mm256_set1_epi8(0x1);
for(unsigned int k = 0; k < end; k += 4)
{
uint8_t block0 = pin[k];
uint8_t block1 = pin[k + 1];
uint8_t block2 = pin[k + 2];
uint8_t block3 = pin[k + 3];
//Broadcast each block 8 times (such that each sample occupies its own byte)
__m256i b_sample = _mm256_set_epi8( block3,block3,block3,block3,
block3,block3,block3,block3,
block2,block2,block2,block2,
block2,block2,block2,block2,
block1,block1,block1,block1,
block1,block1,block1,block1,
block0,block0,block0,block0,
block0,block0,block0,block0);
//Extract nth bit of nth byte for each block
__m256i result = _mm256_and_si256(b_sample, bitmask);
//Fills each byte with 1s if it matches bitmask
result = _mm256_cmpeq_epi8(result, bitmask);
//Mask to get first bit of each byte. This gives us our clean bool array!
result = _mm256_and_si256(result, ones);
//Store results
_mm256_storeu_si256(reinterpret_cast<__m256i*>(pout + (k*8)), result);
}
//Get any extras we didn't get in the SIMD loop
for(size_t i = end; i < count; i++)
{
for(size_t j = 0; j < 8; j++)
{
pout[8*i + j] = (pin[i] >> j) & 0x1;
}
}
}
/**
Alternative AVX2 implementation of ConvertDigitalSamples
*/
__attribute__((target("avx2")))
static void ConvertDigitalSamplesAVX2_Alt(bool* pout, uint8_t* pin, size_t count)
{
unsigned int end = count - (count % 4);
//Mask to extract first bit of each byte.
const __m256i ones = _mm256_set1_epi8(0x1);
for(unsigned int k = 0; k < end; k += 4)
{
uint8_t block0 = pin[k];
uint8_t block1 = pin[k + 1];
uint8_t block2 = pin[k + 2];
uint8_t block3 = pin[k + 3];
//Prepare for sampling on first bit of each sample byte
__m256i b_sample = _mm256_set_epi8( block3>>7,block3>>6,block3>>5,block3>>4,
block3>>3,block3>>2,block3>>1,block3,
block2>>7,block2>>6,block2>>5,block2>>4,
block2>>3,block2>>2,block2>>1,block2,
block1>>7,block1>>6,block1>>5,block1>>4,
block1>>3,block1>>2,block1>>1,block1,
block0>>7,block0>>6,block0>>5,block0>>4,
block0>>3,block0>>2,block0>>1,block0);
//Mask to get first bit of each byte. This gives us our clean bool array!
__m256i result = _mm256_and_si256(b_sample, ones);
//Store results
_mm256_storeu_si256(reinterpret_cast<__m256i*>(pout + (k*8)), result);
}
//Get any extras we didn't get in the SIMD loop
for(size_t i = end; i < count; i++)
{
for(size_t j = 0; j < 8; j++)
{
pout[8*i + j] = (pin[i] >> j) & 0x1;
}
}
}
/**
Alternative AVX2 implementation of ConvertDigitalSamples using a gather
operation on a LUT
*/
__attribute__((target("avx2")))
static void ConvertDigitalSamplesAVX2_Gather(bool* pout, uint8_t* pin, size_t count)
{
unsigned int end = count - (count % 4);
for(unsigned int k = 0; k < end; k += 4)
{
//Gather values from LUT
__m256i vindex = _mm256_set_epi64x(pin[k+3], pin[k+2], pin[k+1], pin[k]);
__m256i result = _mm256_i64gather_epi64(DigitalSamplesToBoolArrayLUT, vindex, 8);
//Store results
_mm256_storeu_si256(reinterpret_cast<__m256i*>(pout + (k*8)), result);
}
//Get any extras we didn't get in the SIMD loop
for(size_t i = end; i < count; i++)
{
for(size_t j = 0; j < 8; j++)
{
pout[8*i + j] = (pin[i] >> j) & 0x1;
}
}
}
#endif
/**
Convert array of 8x1-bit digital samples to bool array. Note that `count`
is the number of 8x1-bit elements in the input array.
*/
static void ConvertDigitalSamples(int method, bool* pout, uint8_t* pin, size_t count)
{
switch(method)
{
case 3:
ConvertDigitalSamplesLUT(pout, pin, count);
break;
#ifdef __x86_64__
case 4:
ConvertDigitalSamplesAVX2_Gather(pout, pin, count);
break;
case 2:
ConvertDigitalSamplesAVX2_Alt(pout, pin, count);
break;
case 1:
ConvertDigitalSamplesAVX2(pout, pin, count);
break;
#endif
case 0:
default:
ConvertDigitalSamplesGeneric(pout, pin, count);
}
}
int main(int argc, char** argv)
{
using namespace std::literals;
int c;
bool print_en = false;
long int num_samples = 0;
int method = 0;
while ((c = getopt(argc, argv, "pn:m:")) != -1)
{
switch(c)
{
case 'p':
print_en = true;
break;
case 'n':
num_samples = strtol(optarg, NULL, 10);
break;
case 'm':
method = atoi(optarg);
break;
case ':':
printf("Option -%c requires an operand\n", optopt);
break;
case '?':
printf("Unrecognized argument -%c given\n", optopt);
return -1;
}
}
printf("Num Samples: %ld\n", num_samples);
switch(method)
{
case 3:
printf("Method: Generic with LUT\n");
break;
#ifdef __x86_64__
case 4:
printf("Method: AVX2_Gather\n");
break;
case 2:
printf("Method: AVX2_Alt\n");
break;
case 1:
printf("Method: AVX2\n");
break;
#endif
case 0:
default:
printf("Method: Generic (%d)\n", method);
}
srand(time(NULL));
//Digital signal stored as packed 1-bit samples
uint8_t* data = reinterpret_cast<uint8_t*>(calloc(num_samples/8 + 1, sizeof(uint8_t)));
for(size_t i = 0; i < num_samples/8 + 1; i++)
{
data[i] = rand();
}
//Sample Destinations
bool* samples = reinterpret_cast<bool*>(calloc(num_samples, sizeof(bool)));
bool* golden = reinterpret_cast<bool*>(calloc(num_samples, sizeof(bool)));
//Golden Reference Conversion
const auto g_start = std::chrono::system_clock::now();
ConvertDigitalSamples(0, golden, (uint8_t*)data, num_samples/8);
const auto g_diff = std::chrono::system_clock::now() - g_start;
//Sample Conversion
const auto start = std::chrono::system_clock::now();
ConvertDigitalSamples(method, samples, (uint8_t*)data, num_samples/8);
const auto diff = std::chrono::system_clock::now() - start;
printf("Time elapsed for conversion: %ld ns; generic: %ld ns\n", diff / 1ns, g_diff / 1ns);
for(size_t i = 0; i < num_samples; i++)
{
if(print_en)
printf("samples[%ld]: %d; golden: %d\n", i, samples[i], golden[i]);
//Compare to Golden Reference
assert(samples[i] == golden[i]);
}
free(data);
free(samples);
free(golden);
return 0;
}
@hansemro
Copy link
Author

AMD Ryzen 5 5600X Benchmark results:

$ ./dsconvert -n 320000000 -m 0
Num Samples: 320000000
Method: Generic (0)
Time elapsed for conversion: 639841211 ns
$ ./dsconvert -n 320000000 -m 0
Num Samples: 320000000
Method: Generic (0)
Time elapsed for conversion: 637728094 ns
$ ./dsconvert -n 320000000 -m 0
Num Samples: 320000000
Method: Generic (0)
Time elapsed for conversion: 631098053 ns
$ ./dsconvert -n 320000000 -m 1
Num Samples: 320000000
Method: AVX2
Time elapsed for conversion: 413110758 ns
$ ./dsconvert -n 320000000 -m 1
Num Samples: 320000000
Method: AVX2
Time elapsed for conversion: 415102210 ns
$ ./dsconvert -n 320000000 -m 1
Num Samples: 320000000
Method: AVX2
Time elapsed for conversion: 415215967 ns
$ ./dsconvert -n 320000000 -m 2
Num Samples: 320000000
Method: AVX2_Alt
Time elapsed for conversion: 320781818 ns
$ ./dsconvert -n 320000000 -m 2
Num Samples: 320000000
Method: AVX2_Alt
Time elapsed for conversion: 319345026 ns
$ ./dsconvert -n 320000000 -m 2
Num Samples: 320000000
Method: AVX2_Alt
Time elapsed for conversion: 322985653 ns

@hansemro
Copy link
Author

AMD Ryzen 7 3750H Benchmark results:

$ ./dsconvert -n 320000000 -m 0
Num Samples: 320000000
Method: Generic (0)
Time elapsed for conversion: 1457322614 ns
$ ./dsconvert -n 320000000 -m 0
Num Samples: 320000000
Method: Generic (0)
Time elapsed for conversion: 1457238744 ns
$ ./dsconvert -n 320000000 -m 0
Num Samples: 320000000
Method: Generic (0)
Time elapsed for conversion: 1479732995 ns
$ ./dsconvert -n 320000000 -m 1
Num Samples: 320000000
Method: AVX2
Time elapsed for conversion: 496241561 ns
$ ./dsconvert -n 320000000 -m 1
Num Samples: 320000000
Method: AVX2
Time elapsed for conversion: 497851702 ns
$ ./dsconvert -n 320000000 -m 1
Num Samples: 320000000
Method: AVX2
Time elapsed for conversion: 501259858 ns
$ ./dsconvert -n 320000000 -m 2
Num Samples: 320000000
Method: AVX2_Alt
Time elapsed for conversion: 386648209 ns
$ ./dsconvert -n 320000000 -m 2
Num Samples: 320000000
Method: AVX2_Alt
Time elapsed for conversion: 386676497 ns
$ ./dsconvert -n 320000000 -m 2
Num Samples: 320000000
Method: AVX2_Alt
Time elapsed for conversion: 393613003 ns

@hansemro
Copy link
Author

hansemro commented Sep 2, 2023

Added architecture-agnostic/generic lookup table implementation that outperforms AVX2 implementations.

@hansemro
Copy link
Author

hansemro commented Sep 2, 2023

Fastest AVX2 implementation does parallel lookup using a gather operation, but does not always yield a faster solution than non-AVX LUT implementation.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment