Skip to content

Instantly share code, notes, and snippets.

@clausecker
Last active August 4, 2020 16:00
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save clausecker/3de220b146f51e5f7bc9f4500c62847f to your computer and use it in GitHub Desktop.
Save clausecker/3de220b146f51e5f7bc9f4500c62847f to your computer and use it in GitHub Desktop.
8 bit positional popcount with AVX2 without too much code
#define _XOPEN_SOURCE 700
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
extern void pospopcnt_reg(int accum[8], const char *buf, size_t len);
extern void pospopcnt_mem(int accum[8], const char *buf, size_t len);
extern void
pospopcnt_naive(int accum[8], const char *buf, size_t len)
{
size_t i;
int j;
for (i = 0; i < len; i++)
for (j = 0; j < 8; j++)
accum[j] += !!(buf[i] & 1 << j);
}
/*
* Compute the difference of two struct timespec.
*/
static struct timespec
tsdiff(struct timespec a, struct timespec b)
{
a.tv_sec -= b.tv_sec;
a.tv_nsec -= b.tv_nsec;
if (a.tv_nsec < 0) {
a.tv_sec -= 1;
a.tv_nsec += 1000000000;
}
return (a);
}
/* perform a benchmark */
static void benchmark(const char *buf, size_t len, const char *name, void (*pospopcnt)(int[8], const char *, size_t))
{
struct timespec diff, start, end;
double dur;
long i, n = 1;
int accum[8] = {0};
do {
clock_gettime(CLOCK_REALTIME, &start);
for (i = 0; i < n; i++)
pospopcnt(accum, buf, len);
clock_gettime(CLOCK_REALTIME, &end);
diff = tsdiff(end, start);
n <<= 1;
} while (diff.tv_sec == 0);
n >>= 1;
dur = diff.tv_sec + diff.tv_nsec / 1000000000.0;
dur /= n;
printf("%s\t%g B/s\n", name, len / dur);
}
extern int
main(int argc, char *argv[])
{
size_t len = 8192;
int naive_accum[8], asm_accum[8];
FILE *random;
char *buf;
if (argc > 1)
len = atoll(argv[1]) + 31 & ~31LL;
buf = malloc(len);
if (buf == NULL) {
perror("malloc");
return (EXIT_FAILURE);
}
random = fopen("/dev/urandom", "rb");
if (random == NULL) {
perror("/dev/urandom");
return (EXIT_FAILURE);
}
fread(buf, 1, len, random);
memset(naive_accum, 0, sizeof naive_accum);
memset(asm_accum, 0, sizeof asm_accum);
pospopcnt_reg(asm_accum, buf, len);
pospopcnt_naive(naive_accum, buf, len);
if (memcmp(asm_accum, naive_accum, 8*4) != 0)
printf("mismatch\n");
benchmark(buf, len, "naive", pospopcnt_naive);
benchmark(buf, len, "addmem", pospopcnt_mem);
benchmark(buf, len, "addreg", pospopcnt_reg);
}
# pospopcnt(accum, buf, len)
# len must be a multiple of 32
.globl pospopcnt_mem
.type pospopcnt_mem, @function
pospopcnt_mem:
# rdi: accum
# rsi: buf
# rdx: len
test %rdx, %rdx # any work to do?
jnz 0f
ret
.align 16
0: vmovdqu (%rsi), %ymm0
add $32, %rsi
vpmovmskb %ymm0, %eax
popcnt %eax, %eax
add %eax, 7*4(%rdi)
vpaddd %ymm0, %ymm0, %ymm0
vpmovmskb %ymm0, %eax
popcnt %eax, %eax
add %eax, 6*4(%rdi)
vpaddd %ymm0, %ymm0, %ymm0
vpmovmskb %ymm0, %eax
popcnt %eax, %eax
add %eax, 5*4(%rdi)
vpaddd %ymm0, %ymm0, %ymm0
vpmovmskb %ymm0, %eax
popcnt %eax, %eax
add %eax, 4*4(%rdi)
vpaddd %ymm0, %ymm0, %ymm0
vpmovmskb %ymm0, %eax
popcnt %eax, %eax
add %eax, 3*4(%rdi)
vpaddd %ymm0, %ymm0, %ymm0
vpmovmskb %ymm0, %eax
popcnt %eax, %eax
add %eax, 2*4(%rdi)
vpaddd %ymm0, %ymm0, %ymm0
vpmovmskb %ymm0, %eax
popcnt %eax, %eax
add %eax, 1*4(%rdi)
vpaddd %ymm0, %ymm0, %ymm0
vpmovmskb %ymm0, %eax
popcnt %eax, %eax
add %eax, 0*4(%rdi)
sub $32, %rdx
jnz 0b
vzeroupper
ret
.globl pospopcnt_reg
.type pospopcnt_reg, @function
pospopcnt_reg:
# rdi: accum
# rsi: buf
# rdx: len
test %rdx, %rdx # any work to do?
jnz 1f
ret
1: push %rbp
push %rbx
push %r12
mov 0*4(%rdi), %r8d
mov 1*4(%rdi), %r9d
mov 2*4(%rdi), %r10d
mov 3*4(%rdi), %r11d
mov 4*4(%rdi), %r12d
mov 5*4(%rdi), %ebx
mov 6*4(%rdi), %ecx
mov 7*4(%rdi), %ebp
.align 16
0: vmovdqu (%rsi), %ymm0
add $32, %rsi
vpmovmskb %ymm0, %eax
popcnt %eax, %eax
add %eax, %ebp
vpaddd %ymm0, %ymm0, %ymm0
vpmovmskb %ymm0, %eax
popcnt %eax, %eax
add %eax, %ecx
vpaddd %ymm0, %ymm0, %ymm0
vpmovmskb %ymm0, %eax
popcnt %eax, %eax
add %eax, %ebx
vpaddd %ymm0, %ymm0, %ymm0
vpmovmskb %ymm0, %eax
popcnt %eax, %eax
add %eax, %r12d
vpaddd %ymm0, %ymm0, %ymm0
vpmovmskb %ymm0, %eax
popcnt %eax, %eax
add %eax, %r11d
vpaddd %ymm0, %ymm0, %ymm0
vpmovmskb %ymm0, %eax
popcnt %eax, %eax
add %eax, %r10d
vpaddd %ymm0, %ymm0, %ymm0
vpmovmskb %ymm0, %eax
popcnt %eax, %eax
add %eax, %r9d
vpaddd %ymm0, %ymm0, %ymm0
vpmovmskb %ymm0, %eax
popcnt %eax, %eax
add %eax, %r8d
sub $32, %rdx
jnz 0b
mov %r8d, 0*4(%rdi)
mov %r9d, 1*4(%rdi)
mov %r10d, 2*4(%rdi)
mov %r11d, 3*4(%rdi)
mov %r12d, 4*4(%rdi)
mov %ebx, 5*4(%rdi)
mov %ecx, 6*4(%rdi)
mov %ebp, 7*4(%rdi)
vzeroupper
pop %r12
pop %rbx
pop %rbp
ret
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment