Skip to content

Instantly share code, notes, and snippets.

@himanshugoel2797
Created June 8, 2020 17:19
Show Gist options
  • Save himanshugoel2797/4e043d8936f6c862b29b3ab6d3331cc9 to your computer and use it in GitHub Desktop.
Save himanshugoel2797/4e043d8936f6c862b29b3ab6d3331cc9 to your computer and use it in GitHub Desktop.
/**
* Demonstrates how can map a 32-bit integer to a range faster than
* a modulo reduction.
* Assumes x64 processor.
* gcc -O3 -o fastrange fastrange.c
*/
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#define RDTSC_START(cycles) \
do { \
register unsigned cyc_high, cyc_low; \
__asm volatile( \
"cpuid\n\t" \
"rdtsc\n\t" \
"mov %%edx, %0\n\t" \
"mov %%eax, %1\n\t" \
: "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
} while (0)
#define RDTSC_FINAL(cycles) \
do { \
register unsigned cyc_high, cyc_low; \
__asm volatile( \
"rdtscp\n\t" \
"mov %%edx, %0\n\t" \
"mov %%eax, %1\n\t" \
"cpuid\n\t" \
: "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
} while (0)
/*
* Prints the best number of operations per cycle where
* test is the function call, answer is the expected answer generated by
* test, repeat is the number of times we should repeat and size is the
* number of operations represented by test.
*/
#define BEST_TIME(test, answer, repeat, size) \
do { \
printf("%s: ", #test); \
fflush(NULL); \
uint64_t cycles_start, cycles_final, cycles_diff; \
uint64_t min_diff = (uint64_t)-1; \
int wrong_answer = 0; \
for (int i = 0; i < repeat; i++) { \
__asm volatile("" ::: /* pretend to clobber */ "memory"); \
RDTSC_START(cycles_start); \
if (test != answer) wrong_answer = 1; \
RDTSC_FINAL(cycles_final); \
cycles_diff = (cycles_final - cycles_start); \
if (cycles_diff < min_diff) min_diff = cycles_diff; \
} \
uint64_t S = (uint64_t)size; \
float cycle_per_op = (min_diff) / (float)S; \
printf(" %.2f cycles per operation", cycle_per_op); \
if (wrong_answer) printf(" [ERROR]"); \
printf("\n"); \
fflush(NULL); \
} while (0)
#define MODSUM_OPT(N) \
uint32_t modsum_##N (uint32_t *z, uint32_t n, uint32_t *accesses, uint32_t nmbr) { \
uint32_t sum = 0; \
for (uint32_t j = 0; j < nmbr; ++j) { \
sum += z[accesses[j] % (1 << N)]; \
} \
return sum; \
}
MODSUM_OPT(24)
MODSUM_OPT(16)
MODSUM_OPT(8)
uint32_t modsum(uint32_t * z, uint32_t N, uint32_t * accesses, uint32_t nmbr) {
uint32_t sum = 0;
for(uint32_t j = 0; j < nmbr ; ++j ) {
sum += z[accesses[j] % N];
}
return sum;
}
uint32_t fastsum(uint32_t * z, uint32_t N, uint32_t * accesses, uint32_t nmbr) {
uint32_t sum = 0;
uint64_t N64 = (uint64_t) N;
for(uint32_t j = 0; j < nmbr ; ++j ) {
sum += z[(accesses[j] * N64)>> 32] ;
}
return sum;
}
// N is a power of two
uint32_t maskedsum(uint32_t * z, uint32_t N, uint32_t * accesses, uint32_t nmbr) {
uint32_t sum = 0;
for(uint32_t j = 0; j < nmbr ; ++j ) {
sum += z[accesses[j] & (N-1)] ;
}
return sum;
}
void demo(uint32_t N) {
printf("N = %d\n", N);
uint32_t * z = malloc(N * sizeof(uint32_t));
for(uint32_t i = 0 ; i < N; ++i) z[i] = rand(); // some rand. number
uint32_t nmbr = 500;
uint32_t * accesses = malloc(nmbr * sizeof(uint32_t));
for(uint32_t i = 0 ; i < nmbr; ++i) accesses[i] = rand(); // some rand. number
uint32_t expected1 = modsum(z,N,accesses,nmbr);
uint32_t expected2 = fastsum(z,N,accesses,nmbr);
BEST_TIME(modsum(z,N,accesses,nmbr), expected1, 1000, nmbr);
BEST_TIME(fastsum(z,N,accesses,nmbr), expected2, 1000, nmbr);
free(z);
free(accesses);
}
void demopoweroftwo(uint32_t N) {
printf("N = %d\n", N);
uint32_t * z = malloc(N * sizeof(uint32_t));
for(uint32_t i = 0 ; i < N; ++i) z[i] = rand(); // some rand. number
uint32_t nmbr = 500;
uint32_t * accesses = malloc(nmbr * sizeof(uint32_t));
for(uint32_t i = 0 ; i < nmbr; ++i) accesses[i] = rand(); // some rand. number
uint32_t expected1 = modsum(z,N,accesses,nmbr);
uint32_t expected2 = fastsum(z,N,accesses,nmbr);
BEST_TIME(modsum(z,N,accesses,nmbr), expected1, 1000, nmbr);
BEST_TIME(fastsum(z,N,accesses,nmbr), expected2, 1000, nmbr);
BEST_TIME(maskedsum(z,N,accesses,nmbr), expected1, 1000, nmbr);
switch(N){
case 1<<8:
BEST_TIME(modsum_8(z,N,accesses,nmbr),expected1, 1000, nmbr);
break;
case 1<<16:
BEST_TIME(modsum_16(z,N,accesses,nmbr), expected1, 1000, nmbr);
break;
case 1<<24:
BEST_TIME(modsum_24(z,N,accesses,nmbr), expected1, 1000, nmbr);
break;
}
free(z);
free(accesses);
}
int main() {
demo(1 << 24);
demo(1 << 8);
demo(1 << 16);
demo(1 << 24);
demopoweroftwo(1 << 8);
demopoweroftwo(1 << 16);
demopoweroftwo(1 << 24);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment