Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
/* Test Aarch64 SIMD implementation of popcount against
__builtin_popcountll() function.
Copyright 2015 (C) Pavel Odvody
License MIT
(all examples run under QEMU emulation)
$ ./test 100m
Test: -1044375694
Test: -1044375694
Time taken SIMD: 1780763 (1.780763 seconds)
Time taken built-in: 4569113 (4.569113 seconds)
Buffer size: 100m
(note that for small buffers ...)
$ ./test 512
Test: 15924
Test: 15924
Time taken SIMD: 2739 (0.002739 seconds)
Time taken built-in: 104 (0.000104 seconds)
Buffer size: 512
(SIMD takes over around 96k)
$ ./test 96k
Test: 3046567
Test: 3046567
Time taken SIMD: 4654 (0.004654 seconds)
Time taken built-in: 4704 (0.004704 seconds)
Buffer size: 96k
Need to test on real hardware, the __builtin function
implements this:
https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
Which is a real breeze for ARM due to its barrel-shifter:
__popcountdi2:
0x0000000000400bd8 <+0>: lsr x1, x0, #1
0x0000000000400bdc <+4>: and x1, x1, #0x5555555555555555
0x0000000000400be0 <+8>: sub x1, x0, x1
0x0000000000400be4 <+12>: and x0, x1, #0x3333333333333333
0x0000000000400be8 <+16>: lsr x1, x1, #2
0x0000000000400bec <+20>: and x1, x1, #0x3333333333333333
0x0000000000400bf0 <+24>: add x0, x0, x1
0x0000000000400bf4 <+28>: add x0, x0, x0, lsr #4
0x0000000000400bf8 <+32>: and x0, x0, #0xf0f0f0f0f0f0f0f
0x0000000000400bfc <+36>: add x0, x0, x0, lsl #8
0x0000000000400c00 <+40>: add x0, x0, x0, lsl #16
0x0000000000400c04 <+44>: add x0, x0, x0, lsl #32
0x0000000000400c08 <+48>: lsr x0, x0, #56
0x0000000000400c0c <+52>: ret
*/
int count_multiple_bits(unsigned long long *b, unsigned int size) {
unsigned long long *d = b;
unsigned int masked = 0, i = 0;
int c = 0;
masked = size & ~3;
for (; i < masked; i += 4)
__asm__("LD1 {v0.2D, v1.2D}, [%1], #32 \n\t"
"CNT v0.16b, v0.16b \n\t"
"CNT v1.16b, v1.16b \n\t"
"UADDLV h2, v0.16b \n\t"
"UADDLV h3, v1.16b \n\t"
"ADD d2, d3, d2 \n\t"
"UMOV x0, v2.d[0] \n\t"
"ADD %0, x0, %0 \n\t"
: "+r"(c), "+r"(d)
:: "x0", "v0", "v1", "v2", "v3");
for (; i < size; ++i)
__asm__("LD1 {v0.D}[0], [%1], #8 \n\t"
"CNT v0.8b, v0.8b \n\t"
"UADDLV h1, v0.8b \n\t"
"UMOV x0, v1.d[0] \n\t"
"ADD %0, x0, %0 \n\t"
: "+r"(c), "+r"(d)
:: "x0", "v0", "v1");
return c;
}
int count_bits_naive(unsigned long long *p, unsigned int size) {
unsigned int masked = 0, i = 0;
int c = 0;
masked = size & ~3;
for (; i < masked; i += 4) {
c += __builtin_popcountll(p[i + 0]);
c += __builtin_popcountll(p[i + 1]);
c += __builtin_popcountll(p[i + 2]);
c += __builtin_popcountll(p[i + 3]);
}
for (; i < size; ++i)
c += __builtin_popcountll(p[i]);
return c;
}
int main(int argc, const char *argv[]) {
char *endptr = NULL;
long int size = 0;
size_t c1 = 0, c2 = 0;
if (argc < 2) {
puts("Please provide buffer size argument");
return 1;
}
size = strtol(argv[1], &endptr, 10);
if (size < 0) {
puts("Negative buffer size? Cowardly flipping sign");
size = -size;
}
if (*endptr) {
switch (*endptr) {
case 'm':
case 'M':
size *= 1024 * 1024; break;
case 'k':
case 'K':
size *= 1024; break;
default: break;
}
}
srand(clock());
unsigned long long *b = malloc(sizeof(unsigned long long) * size);
for (unsigned long long i = 0; i < size; ++i)
b[i] = (unsigned long long)random() << 32 | (unsigned long long)random();
clock_t start = clock();
c1 = count_multiple_bits(b, size);
clock_t end = clock();
c2 = count_bits_naive(b, size);
clock_t end2 = clock();
printf("Test: %zu\n", c1);
printf("Test: %zu\n", c2);
printf("Time taken SIMD: %d (%f seconds) \nTime taken built-in: %d (%f seconds)\n",
end - start, (float)(end-start) / CLOCKS_PER_SEC, end2 - end, (float)(end2-end) / CLOCKS_PER_SEC);
printf("Buffer size: %s\n", argv[1]);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.