Created
September 4, 2015 16:41
-
-
Save dubek/51da1f5c3259289fa446 to your computer and use it in GitHub Desktop.
Proof-of-concept faster redisPopcount() with SSE4.2 popcntl and popcntq CPU instructions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* Proof-of-concept faster redisPopcount() with SSE4.2 popcntl and popcntq CPU instructions: | |
* https://gcc.gnu.org/onlinedocs/gcc/x86-Built-in-Functions.html | |
* | |
* Compile with -mpopcnt (or -msse4.2) to enable __builtin_popcount and | |
* __builtin_popcountll functions: | |
* | |
* gcc -mpopcnt -O2 -Wall -Wextra redis_popcount_benchmark.c -o redis_popcount_benchmark | |
* | |
* Usage: | |
* | |
* redis_popcount_bechmark <string_size> <repeat_times> | |
* | |
* Example (100MB string, repeat 10 times): | |
* | |
* ./redis_popcount_benchmark 100000000 10 | |
* | |
* | |
* Note: | |
* | |
* For a production-ready implementation we'll probably need to wrap the | |
* relevant code with `#ifdef __POPCNT__` and `#if (__SIZEOF_POINTER__ == 8)` | |
* etc. to make sure we choose an optimized version according to the compiler | |
* switches (-mpopcnt and -m32). Also need to instruct users to compile with | |
* CFLAGS=-mpopcnt if they want this extra performance boost. | |
* | |
* | |
* | |
* Results on Intel(R) Core(TM) i5-3320M CPU @ 2.60GHz (Linux 2.6.32): | |
* | |
* $ gcc -mpopcnt -O2 -Wall -Wextra redis_popcount_benchmark.c -o redis_popcount_benchmark | |
* $ ./redis_popcount_benchmark 250000000 50 | |
* orig_redisPopcount: reps=50 size=250000000 bitcount=1000000000 usecs=4306633 usecs/op=86132 | |
* new32_redisPopcount: reps=50 size=250000000 bitcount=1000000000 usecs=2518808 usecs/op=50376 | |
* new64_redisPopcount: reps=50 size=250000000 bitcount=1000000000 usecs=1143227 usecs/op=22864 | |
* | |
* | |
* | |
* Note: benchmarks depend on the workload, machine, blah, blah, YMMV. | |
*/ | |
#include <inttypes.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <sys/time.h> | |
/* ---------------------------------------------------------------------- */ | |
/* This is from redis/src/bitop.c: */ | |
/* Count number of bits set in the binary array pointed by 's' and long | |
* 'count' bytes. The implementation of this function is required to | |
* work with a input string length up to 512 MB. */ | |
static size_t orig_redisPopcount(void *s, long count) { | |
size_t bits = 0; | |
unsigned char *p = s; | |
uint32_t *p4; | |
static const unsigned char bitsinbyte[256] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8}; | |
/* Count initial bytes not aligned to 32 bit. */ | |
while((unsigned long)p & 3 && count) { | |
bits += bitsinbyte[*p++]; | |
count--; | |
} | |
/* Count bits 28 bytes at a time */ | |
p4 = (uint32_t*)p; | |
while(count>=28) { | |
uint32_t aux1, aux2, aux3, aux4, aux5, aux6, aux7; | |
aux1 = *p4++; | |
aux2 = *p4++; | |
aux3 = *p4++; | |
aux4 = *p4++; | |
aux5 = *p4++; | |
aux6 = *p4++; | |
aux7 = *p4++; | |
count -= 28; | |
aux1 = aux1 - ((aux1 >> 1) & 0x55555555); | |
aux1 = (aux1 & 0x33333333) + ((aux1 >> 2) & 0x33333333); | |
aux2 = aux2 - ((aux2 >> 1) & 0x55555555); | |
aux2 = (aux2 & 0x33333333) + ((aux2 >> 2) & 0x33333333); | |
aux3 = aux3 - ((aux3 >> 1) & 0x55555555); | |
aux3 = (aux3 & 0x33333333) + ((aux3 >> 2) & 0x33333333); | |
aux4 = aux4 - ((aux4 >> 1) & 0x55555555); | |
aux4 = (aux4 & 0x33333333) + ((aux4 >> 2) & 0x33333333); | |
aux5 = aux5 - ((aux5 >> 1) & 0x55555555); | |
aux5 = (aux5 & 0x33333333) + ((aux5 >> 2) & 0x33333333); | |
aux6 = aux6 - ((aux6 >> 1) & 0x55555555); | |
aux6 = (aux6 & 0x33333333) + ((aux6 >> 2) & 0x33333333); | |
aux7 = aux7 - ((aux7 >> 1) & 0x55555555); | |
aux7 = (aux7 & 0x33333333) + ((aux7 >> 2) & 0x33333333); | |
bits += ((((aux1 + (aux1 >> 4)) & 0x0F0F0F0F) + | |
((aux2 + (aux2 >> 4)) & 0x0F0F0F0F) + | |
((aux3 + (aux3 >> 4)) & 0x0F0F0F0F) + | |
((aux4 + (aux4 >> 4)) & 0x0F0F0F0F) + | |
((aux5 + (aux5 >> 4)) & 0x0F0F0F0F) + | |
((aux6 + (aux6 >> 4)) & 0x0F0F0F0F) + | |
((aux7 + (aux7 >> 4)) & 0x0F0F0F0F))* 0x01010101) >> 24; | |
} | |
/* Count the remaining bytes. */ | |
p = (unsigned char*)p4; | |
while(count--) bits += bitsinbyte[*p++]; | |
return bits; | |
} | |
/* ---------------------------------------------------------------------- */ | |
static size_t new32_redisPopcount(void *s, long count) { | |
size_t bits = 0; | |
unsigned char *p = s; | |
uint32_t *p4; | |
static const unsigned char bitsinbyte[256] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8}; | |
/* Count initial bytes not aligned to 32 bit. */ | |
while((unsigned long)p & 3 && count) { | |
bits += bitsinbyte[*p++]; | |
count--; | |
} | |
/* Count bits 4 bytes at a time */ | |
p4 = (uint32_t*)p; | |
while(count>=4) { | |
bits += __builtin_popcount(*p4++); | |
count -= 4; | |
} | |
/* Count the remaining bytes. */ | |
p = (unsigned char*)p4; | |
while(count--) bits += bitsinbyte[*p++]; | |
return bits; | |
} | |
static size_t new64_redisPopcount(void *s, long count) { | |
size_t bits = 0; | |
unsigned char *p = s; | |
uint64_t *p8; | |
static const unsigned char bitsinbyte[256] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8}; | |
/* Count initial bytes not aligned to 64 bit. */ | |
while((unsigned long)p & 7 && count) { | |
bits += bitsinbyte[*p++]; | |
count--; | |
} | |
/* Count bits 8 bytes at a time */ | |
p8 = (uint64_t*)p; | |
while(count>=8) { | |
bits += __builtin_popcountll(*p8++); | |
count -= 8; | |
} | |
/* Count the remaining bytes. */ | |
p = (unsigned char*)p8; | |
while(count--) bits += bitsinbyte[*p++]; | |
return bits; | |
} | |
/* This is from redis/src/redis-benchmark.c: */ | |
static long long ustime(void) { | |
struct timeval tv; | |
long long ust; | |
gettimeofday(&tv, NULL); | |
ust = ((long)tv.tv_sec)*1000000; | |
ust += tv.tv_usec; | |
return ust; | |
} | |
int main(int argc, char **argv) { | |
long size, reps, left; | |
long long t0, t1; | |
char *buf; | |
size_t bitcount = 0; | |
if (argc < 3) { | |
printf("Usage: %s stringsize reps\n", argv[0]); | |
exit(1); | |
} else { | |
size = atol(argv[1]); | |
reps = atol(argv[2]); | |
} | |
buf = malloc(size); | |
memset(buf, 0x55, size); | |
t0 = ustime(); | |
for (left = reps; left > 0; left--) { | |
bitcount = orig_redisPopcount(buf, size); | |
} | |
t1 = ustime(); | |
printf(" orig_redisPopcount: reps=%ld size=%ld bitcount=%zu usecs=%lld usecs/op=%lld\n", reps, size, bitcount, t1-t0, (t1-t0)/reps); | |
t0 = ustime(); | |
for (left = reps; left > 0; left--) { | |
bitcount = new32_redisPopcount(buf, size); | |
} | |
t1 = ustime(); | |
printf("new32_redisPopcount: reps=%ld size=%ld bitcount=%zu usecs=%lld usecs/op=%lld\n", reps, size, bitcount, t1-t0, (t1-t0)/reps); | |
t0 = ustime(); | |
for (left = reps; left > 0; left--) { | |
bitcount = new64_redisPopcount(buf, size); | |
} | |
t1 = ustime(); | |
printf("new64_redisPopcount: reps=%ld size=%ld bitcount=%zu usecs=%lld usecs/op=%lld\n", reps, size, bitcount, t1-t0, (t1-t0)/reps); | |
free(buf); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment