Skip to content

Instantly share code, notes, and snippets.

@dubek
Created September 4, 2015 16:41
Show Gist options
  • Save dubek/51da1f5c3259289fa446 to your computer and use it in GitHub Desktop.
Save dubek/51da1f5c3259289fa446 to your computer and use it in GitHub Desktop.
Proof-of-concept faster redisPopcount() with SSE4.2 popcntl and popcntq CPU instructions
/*
* Proof-of-concept faster redisPopcount() with SSE4.2 popcntl and popcntq CPU instructions:
* https://gcc.gnu.org/onlinedocs/gcc/x86-Built-in-Functions.html
*
* Compile with -mpopcnt (or -msse4.2) to enable __builtin_popcount and
* __builtin_popcountll functions:
*
* gcc -mpopcnt -O2 -Wall -Wextra redis_popcount_benchmark.c -o redis_popcount_benchmark
*
* Usage:
*
* redis_popcount_bechmark <string_size> <repeat_times>
*
* Example (100MB string, repeat 10 times):
*
* ./redis_popcount_benchmark 100000000 10
*
*
* Note:
*
* For a production-ready implementation we'll probably need to wrap the
* relevant code with `#ifdef __POPCNT__` and `#if (__SIZEOF_POINTER__ == 8)`
* etc. to make sure we choose an optimized version according to the compiler
* switches (-mpopcnt and -m32). Also need to instruct users to compile with
* CFLAGS=-mpopcnt if they want this extra performance boost.
*
*
*
* Results on Intel(R) Core(TM) i5-3320M CPU @ 2.60GHz (Linux 2.6.32):
*
* $ gcc -mpopcnt -O2 -Wall -Wextra redis_popcount_benchmark.c -o redis_popcount_benchmark
* $ ./redis_popcount_benchmark 250000000 50
* orig_redisPopcount: reps=50 size=250000000 bitcount=1000000000 usecs=4306633 usecs/op=86132
* new32_redisPopcount: reps=50 size=250000000 bitcount=1000000000 usecs=2518808 usecs/op=50376
* new64_redisPopcount: reps=50 size=250000000 bitcount=1000000000 usecs=1143227 usecs/op=22864
*
*
*
* Note: benchmarks depend on the workload, machine, blah, blah, YMMV.
*/
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
/* ---------------------------------------------------------------------- */
/* This is from redis/src/bitop.c: */
/* Count number of bits set in the binary array pointed by 's' and long
* 'count' bytes. The implementation of this function is required to
* work with a input string length up to 512 MB. */
static size_t orig_redisPopcount(void *s, long count) {
size_t bits = 0;
unsigned char *p = s;
uint32_t *p4;
static const unsigned char bitsinbyte[256] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8};
/* Count initial bytes not aligned to 32 bit. */
while((unsigned long)p & 3 && count) {
bits += bitsinbyte[*p++];
count--;
}
/* Count bits 28 bytes at a time */
p4 = (uint32_t*)p;
while(count>=28) {
uint32_t aux1, aux2, aux3, aux4, aux5, aux6, aux7;
aux1 = *p4++;
aux2 = *p4++;
aux3 = *p4++;
aux4 = *p4++;
aux5 = *p4++;
aux6 = *p4++;
aux7 = *p4++;
count -= 28;
aux1 = aux1 - ((aux1 >> 1) & 0x55555555);
aux1 = (aux1 & 0x33333333) + ((aux1 >> 2) & 0x33333333);
aux2 = aux2 - ((aux2 >> 1) & 0x55555555);
aux2 = (aux2 & 0x33333333) + ((aux2 >> 2) & 0x33333333);
aux3 = aux3 - ((aux3 >> 1) & 0x55555555);
aux3 = (aux3 & 0x33333333) + ((aux3 >> 2) & 0x33333333);
aux4 = aux4 - ((aux4 >> 1) & 0x55555555);
aux4 = (aux4 & 0x33333333) + ((aux4 >> 2) & 0x33333333);
aux5 = aux5 - ((aux5 >> 1) & 0x55555555);
aux5 = (aux5 & 0x33333333) + ((aux5 >> 2) & 0x33333333);
aux6 = aux6 - ((aux6 >> 1) & 0x55555555);
aux6 = (aux6 & 0x33333333) + ((aux6 >> 2) & 0x33333333);
aux7 = aux7 - ((aux7 >> 1) & 0x55555555);
aux7 = (aux7 & 0x33333333) + ((aux7 >> 2) & 0x33333333);
bits += ((((aux1 + (aux1 >> 4)) & 0x0F0F0F0F) +
((aux2 + (aux2 >> 4)) & 0x0F0F0F0F) +
((aux3 + (aux3 >> 4)) & 0x0F0F0F0F) +
((aux4 + (aux4 >> 4)) & 0x0F0F0F0F) +
((aux5 + (aux5 >> 4)) & 0x0F0F0F0F) +
((aux6 + (aux6 >> 4)) & 0x0F0F0F0F) +
((aux7 + (aux7 >> 4)) & 0x0F0F0F0F))* 0x01010101) >> 24;
}
/* Count the remaining bytes. */
p = (unsigned char*)p4;
while(count--) bits += bitsinbyte[*p++];
return bits;
}
/* ---------------------------------------------------------------------- */
static size_t new32_redisPopcount(void *s, long count) {
size_t bits = 0;
unsigned char *p = s;
uint32_t *p4;
static const unsigned char bitsinbyte[256] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8};
/* Count initial bytes not aligned to 32 bit. */
while((unsigned long)p & 3 && count) {
bits += bitsinbyte[*p++];
count--;
}
/* Count bits 4 bytes at a time */
p4 = (uint32_t*)p;
while(count>=4) {
bits += __builtin_popcount(*p4++);
count -= 4;
}
/* Count the remaining bytes. */
p = (unsigned char*)p4;
while(count--) bits += bitsinbyte[*p++];
return bits;
}
static size_t new64_redisPopcount(void *s, long count) {
size_t bits = 0;
unsigned char *p = s;
uint64_t *p8;
static const unsigned char bitsinbyte[256] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8};
/* Count initial bytes not aligned to 64 bit. */
while((unsigned long)p & 7 && count) {
bits += bitsinbyte[*p++];
count--;
}
/* Count bits 8 bytes at a time */
p8 = (uint64_t*)p;
while(count>=8) {
bits += __builtin_popcountll(*p8++);
count -= 8;
}
/* Count the remaining bytes. */
p = (unsigned char*)p8;
while(count--) bits += bitsinbyte[*p++];
return bits;
}
/* This is from redis/src/redis-benchmark.c: */
static long long ustime(void) {
struct timeval tv;
long long ust;
gettimeofday(&tv, NULL);
ust = ((long)tv.tv_sec)*1000000;
ust += tv.tv_usec;
return ust;
}
int main(int argc, char **argv) {
long size, reps, left;
long long t0, t1;
char *buf;
size_t bitcount = 0;
if (argc < 3) {
printf("Usage: %s stringsize reps\n", argv[0]);
exit(1);
} else {
size = atol(argv[1]);
reps = atol(argv[2]);
}
buf = malloc(size);
memset(buf, 0x55, size);
t0 = ustime();
for (left = reps; left > 0; left--) {
bitcount = orig_redisPopcount(buf, size);
}
t1 = ustime();
printf(" orig_redisPopcount: reps=%ld size=%ld bitcount=%zu usecs=%lld usecs/op=%lld\n", reps, size, bitcount, t1-t0, (t1-t0)/reps);
t0 = ustime();
for (left = reps; left > 0; left--) {
bitcount = new32_redisPopcount(buf, size);
}
t1 = ustime();
printf("new32_redisPopcount: reps=%ld size=%ld bitcount=%zu usecs=%lld usecs/op=%lld\n", reps, size, bitcount, t1-t0, (t1-t0)/reps);
t0 = ustime();
for (left = reps; left > 0; left--) {
bitcount = new64_redisPopcount(buf, size);
}
t1 = ustime();
printf("new64_redisPopcount: reps=%ld size=%ld bitcount=%zu usecs=%lld usecs/op=%lld\n", reps, size, bitcount, t1-t0, (t1-t0)/reps);
free(buf);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment