Created
June 8, 2020 17:19
-
-
Save himanshugoel2797/4e043d8936f6c862b29b3ab6d3331cc9 to your computer and use it in GitHub Desktop.
Additional benchmarks based on https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/blob/master/2016/06/25/fastrange.c
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Demonstrates how can map a 32-bit integer to a range faster than | |
* a modulo reduction. | |
* Assumes x64 processor. | |
* gcc -O3 -o fastrange fastrange.c | |
*/ | |
#include <stdint.h> | |
#include <stdlib.h> | |
#include <stdio.h> | |
#define RDTSC_START(cycles) \ | |
do { \ | |
register unsigned cyc_high, cyc_low; \ | |
__asm volatile( \ | |
"cpuid\n\t" \ | |
"rdtsc\n\t" \ | |
"mov %%edx, %0\n\t" \ | |
"mov %%eax, %1\n\t" \ | |
: "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \ | |
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \ | |
} while (0) | |
#define RDTSC_FINAL(cycles) \ | |
do { \ | |
register unsigned cyc_high, cyc_low; \ | |
__asm volatile( \ | |
"rdtscp\n\t" \ | |
"mov %%edx, %0\n\t" \ | |
"mov %%eax, %1\n\t" \ | |
"cpuid\n\t" \ | |
: "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \ | |
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \ | |
} while (0) | |
/* | |
* Prints the best number of operations per cycle where | |
* test is the function call, answer is the expected answer generated by | |
* test, repeat is the number of times we should repeat and size is the | |
* number of operations represented by test. | |
*/ | |
#define BEST_TIME(test, answer, repeat, size) \ | |
do { \ | |
printf("%s: ", #test); \ | |
fflush(NULL); \ | |
uint64_t cycles_start, cycles_final, cycles_diff; \ | |
uint64_t min_diff = (uint64_t)-1; \ | |
int wrong_answer = 0; \ | |
for (int i = 0; i < repeat; i++) { \ | |
__asm volatile("" ::: /* pretend to clobber */ "memory"); \ | |
RDTSC_START(cycles_start); \ | |
if (test != answer) wrong_answer = 1; \ | |
RDTSC_FINAL(cycles_final); \ | |
cycles_diff = (cycles_final - cycles_start); \ | |
if (cycles_diff < min_diff) min_diff = cycles_diff; \ | |
} \ | |
uint64_t S = (uint64_t)size; \ | |
float cycle_per_op = (min_diff) / (float)S; \ | |
printf(" %.2f cycles per operation", cycle_per_op); \ | |
if (wrong_answer) printf(" [ERROR]"); \ | |
printf("\n"); \ | |
fflush(NULL); \ | |
} while (0) | |
#define MODSUM_OPT(N) \ | |
uint32_t modsum_##N (uint32_t *z, uint32_t n, uint32_t *accesses, uint32_t nmbr) { \ | |
uint32_t sum = 0; \ | |
for (uint32_t j = 0; j < nmbr; ++j) { \ | |
sum += z[accesses[j] % (1 << N)]; \ | |
} \ | |
return sum; \ | |
} | |
MODSUM_OPT(24) | |
MODSUM_OPT(16) | |
MODSUM_OPT(8) | |
uint32_t modsum(uint32_t * z, uint32_t N, uint32_t * accesses, uint32_t nmbr) { | |
uint32_t sum = 0; | |
for(uint32_t j = 0; j < nmbr ; ++j ) { | |
sum += z[accesses[j] % N]; | |
} | |
return sum; | |
} | |
uint32_t fastsum(uint32_t * z, uint32_t N, uint32_t * accesses, uint32_t nmbr) { | |
uint32_t sum = 0; | |
uint64_t N64 = (uint64_t) N; | |
for(uint32_t j = 0; j < nmbr ; ++j ) { | |
sum += z[(accesses[j] * N64)>> 32] ; | |
} | |
return sum; | |
} | |
// N is a power of two | |
uint32_t maskedsum(uint32_t * z, uint32_t N, uint32_t * accesses, uint32_t nmbr) { | |
uint32_t sum = 0; | |
for(uint32_t j = 0; j < nmbr ; ++j ) { | |
sum += z[accesses[j] & (N-1)] ; | |
} | |
return sum; | |
} | |
void demo(uint32_t N) { | |
printf("N = %d\n", N); | |
uint32_t * z = malloc(N * sizeof(uint32_t)); | |
for(uint32_t i = 0 ; i < N; ++i) z[i] = rand(); // some rand. number | |
uint32_t nmbr = 500; | |
uint32_t * accesses = malloc(nmbr * sizeof(uint32_t)); | |
for(uint32_t i = 0 ; i < nmbr; ++i) accesses[i] = rand(); // some rand. number | |
uint32_t expected1 = modsum(z,N,accesses,nmbr); | |
uint32_t expected2 = fastsum(z,N,accesses,nmbr); | |
BEST_TIME(modsum(z,N,accesses,nmbr), expected1, 1000, nmbr); | |
BEST_TIME(fastsum(z,N,accesses,nmbr), expected2, 1000, nmbr); | |
free(z); | |
free(accesses); | |
} | |
void demopoweroftwo(uint32_t N) { | |
printf("N = %d\n", N); | |
uint32_t * z = malloc(N * sizeof(uint32_t)); | |
for(uint32_t i = 0 ; i < N; ++i) z[i] = rand(); // some rand. number | |
uint32_t nmbr = 500; | |
uint32_t * accesses = malloc(nmbr * sizeof(uint32_t)); | |
for(uint32_t i = 0 ; i < nmbr; ++i) accesses[i] = rand(); // some rand. number | |
uint32_t expected1 = modsum(z,N,accesses,nmbr); | |
uint32_t expected2 = fastsum(z,N,accesses,nmbr); | |
BEST_TIME(modsum(z,N,accesses,nmbr), expected1, 1000, nmbr); | |
BEST_TIME(fastsum(z,N,accesses,nmbr), expected2, 1000, nmbr); | |
BEST_TIME(maskedsum(z,N,accesses,nmbr), expected1, 1000, nmbr); | |
switch(N){ | |
case 1<<8: | |
BEST_TIME(modsum_8(z,N,accesses,nmbr),expected1, 1000, nmbr); | |
break; | |
case 1<<16: | |
BEST_TIME(modsum_16(z,N,accesses,nmbr), expected1, 1000, nmbr); | |
break; | |
case 1<<24: | |
BEST_TIME(modsum_24(z,N,accesses,nmbr), expected1, 1000, nmbr); | |
break; | |
} | |
free(z); | |
free(accesses); | |
} | |
int main() { | |
demo(1 << 24); | |
demo(1 << 8); | |
demo(1 << 16); | |
demo(1 << 24); | |
demopoweroftwo(1 << 8); | |
demopoweroftwo(1 << 16); | |
demopoweroftwo(1 << 24); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment