Skip to content

Instantly share code, notes, and snippets.

@slembcke
Created April 11, 2019 16:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save slembcke/bc7ad818f1d5df627d040c83e13b7d1e to your computer and use it in GitHub Desktop.
Save slembcke/bc7ad818f1d5df627d040c83e13b7d1e to your computer and use it in GitHub Desktop.
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// Number of elements in each benchmark array (64 * 2^20).
// This is an enum constant rather than a `static const` object because a
// const-qualified object is not an integer constant expression in C, so it
// cannot portably size the file-scope arrays below.
enum { COUNT = 64*(1 << 20) };

// Element type of the source/destination benchmark arrays.
typedef float FLOAT_T;

// Input and output arrays for the timed loops.
static FLOAT_T SRC[COUNT];
static FLOAT_T DST[COUNT];
// Index array filled with 0..COUNT-1 by main().
static uint32_t INDEX[COUNT];
// Scratch buffers copied by flush_cache().
static uint8_t FLUSH_IN[COUNT];
static uint8_t FLUSH_OUT[COUNT];
// Attempt to flush the CPU cache by copying one large buffer into another.
static void flush_cache(void){
    memcpy(FLUSH_OUT, FLUSH_IN, sizeof(FLUSH_IN));
    // FLUSH_OUT is never read, so without this barrier the optimizer is free
    // to treat the copy as dead and delete it. The asm template is empty; it
    // only tells the compiler that memory may be observed at this point.
    __asm__ __volatile__("" : : "r"(FLUSH_OUT) : "memory");
}
// Begin a timed region.
// Calls flush_cache() and then stores the current cycle-counter reading in
// *cycles; profile_end() later subtracts that value from a fresh reading.
static inline void profile_start(uint64_t *cycles){
    // Flush before sampling the counter so the flush is not part of the
    // measured interval.
    flush_cache();
    const uint64_t begin = __builtin_readcyclecounter();
    *cycles = begin;
}
// End a timed region and print the average cost per unit of work.
//   cycles - start value written by profile_start()
//   msg    - label printed in front of the result
//   count  - number of work units the elapsed cycles are divided by
//   unit   - unit name printed after "cycles/"
static inline void profile_end(uint64_t *cycles, const char *msg, size_t count, const char *unit){
    // Make SRC and DST observable before the counter is sampled, so the timed
    // stores cannot be treated as dead. This replaces a printf("", SRC, DST)
    // call, which prints nothing and ignores its extra arguments. The asm
    // template is empty and acts only as a compiler barrier.
    __asm__ __volatile__("" : : "r"(SRC), "r"(DST) : "memory");
    const uint64_t elapsed = __builtin_readcyclecounter() - *cycles;
    const double rate = (double)elapsed/count;
    printf("%s: %.2f cycles/%s\n", msg, rate, unit);
}
// Fast approximate reciprocal square root (the "Quake III" bit trick).
// Returns an approximation of 1/sqrt(number), refined by two Newton-Raphson
// steps. Intended for positive inputs; assumes float is a 32-bit type.
//
// The bit pattern is moved between float and integer with memcpy instead of
// pointer casts. The original `*(long *)&y` broke strict aliasing, and where
// long is 64 bits wide it also read past the end of the float and let a
// garbage bit shift into bit 31 of the result.
static inline float Q_rsqrt( float number )
{
    const float threehalfs = 1.5F;
    const float x2 = number * 0.5F;
    float y = number;
    uint32_t bits;

    memcpy(&bits, &y, sizeof bits);     // reinterpret the float's bits as an integer
    bits = 0x5f3759dfu - (bits >> 1);   // magic-constant initial guess
    memcpy(&y, &bits, sizeof y);        // reinterpret the guess as a float

    y = y * ( threehalfs - ( x2 * y * y ) ); // 1st iteration
    y = y * ( threehalfs - ( x2 * y * y ) ); // 2nd iteration, this can be removed
    return y;
}
// This is a totally wrong implementation,
// but the performance should be comparable.
//
// Number of TABLE entries (64 * 1024 * 2^10). It must stay a power of two
// because table_sqrt() wraps indices with `& (TABLE_SIZE - 1)`.
// This is an enum constant rather than a `static const` object because a
// const-qualified object is not an integer constant expression in C and
// cannot portably size a file-scope array.
enum { TABLE_SIZE = 64*1024*(1 << 10) };
static float TABLE[TABLE_SIZE];

// Linearly interpolate between two adjacent TABLE entries.
// The integral part of f (wrapped to the table size) selects the entries and
// the fractional part is the blend weight.
static inline float table_sqrt(float f){
    // Get a random index value to encourage cache misses.
    float ipart = 0;
    float t = modff(f, &ipart);
    // Convert through a 64-bit integer first. A direct float -> unsigned
    // conversion is undefined once the integral part reaches 2^32, which a
    // value near UINT_MAX can round up to when stored as a float.
    uint32_t index = (uint32_t)(int64_t)ipart;
    const uint32_t mask = TABLE_SIZE - 1;
    // Grab the two nearby values.
    float s0 = TABLE[(index + 0) & mask];
    float s1 = TABLE[(index + 1) & mask];
    return (1 - t)*s0 + t*s1;
}
// Benchmark driver.
// Zeroes the working arrays, runs a warm-up pass, then times a series of
// memory-copy and math loops, printing one "cycles per unit" line for each.
// SRC stays all zeros for the whole run, so every math loop is evaluated on
// an input of 0.
int main(void){
    uint64_t cycles = 0;

    // memset() is used instead of the legacy bzero(), which is declared in
    // <strings.h> (not included by this file) rather than <string.h>.
    memset(SRC, 0, sizeof(SRC));
    memset(DST, 0, sizeof(DST));
    memset(TABLE, 0, sizeof(TABLE));
    for(unsigned i = 0; i < COUNT; i++) INDEX[i] = i;

    printf("Warming up the CPU\n");
    for(unsigned j = 0; j < 10; j++){
        for(unsigned i = 0; i < COUNT; i++){
            DST[i] = sin(SRC[i]) + cos(SRC[i]) + sqrt(SRC[i]) + exp(SRC[i]) + log(SRC[i]);
        }
    }
    flush_cache();

    // ---- Bulk memory operations ----
    profile_start(&cycles);
    memset(DST, 0, sizeof(DST));
    profile_end(&cycles, "memset()", sizeof(SRC), "byte");

    profile_start(&cycles);
    memcpy(DST, SRC, sizeof(SRC));
    profile_end(&cycles, "memcpy()", sizeof(SRC), "byte");

    // ---- Strided single-byte copies ----
    // Each rate is divided by sizeof(SRC), i.e. by the bytes spanned,
    // not by the number of bytes actually copied.
    profile_start(&cycles);
    for(unsigned i = 0; i < sizeof(SRC); i += 16){
        ((uint8_t *)DST)[i] = ((uint8_t *)SRC)[i];
    }
    profile_end(&cycles, "Copy every 16th byte", sizeof(SRC), "byte");

    profile_start(&cycles);
    for(unsigned i = 0; i < sizeof(SRC); i += 32){
        ((uint8_t *)DST)[i] = ((uint8_t *)SRC)[i];
    }
    profile_end(&cycles, "Copy every 32nd byte", sizeof(SRC), "byte");

    profile_start(&cycles);
    for(unsigned i = 0; i < sizeof(SRC); i += 64){
        ((uint8_t *)DST)[i] = ((uint8_t *)SRC)[i];
    }
    profile_end(&cycles, "Copy every 64th byte", sizeof(SRC), "byte");

    profile_start(&cycles);
    for(unsigned i = 0; i < sizeof(SRC); i += 128){
        ((uint8_t *)DST)[i] = ((uint8_t *)SRC)[i];
    }
    profile_end(&cycles, "Copy every 128th byte", sizeof(SRC), "byte");

    profile_start(&cycles);
    for(unsigned i = 0; i < sizeof(SRC); i += 256){
        ((uint8_t *)DST)[i] = ((uint8_t *)SRC)[i];
    }
    profile_end(&cycles, "Copy every 256th byte", sizeof(SRC), "byte");

    // Sequential writes, reads scattered by an odd multiplier and wrapped to
    // the array size with a power-of-two mask.
    profile_start(&cycles);
    for(unsigned i = 0; i < sizeof(SRC); i++){
        ((uint8_t *)DST)[i] = ((uint8_t *)SRC)[(i*1572869) & (sizeof(SRC) - 1)];
    }
    profile_end(&cycles, "Copy bytes scrambled", sizeof(SRC), "byte");

    // Disabled element-wise copy benchmarks, kept for reference.
    // profile_start(&cycles);
    // for(unsigned i = 0; i < COUNT; i++){
    //     DST[i] = SRC[INDEX[i]];
    // }
    // profile_end(&cycles, "Copy indirect", COUNT, "loop");
    // profile_start(&cycles);
    // for(unsigned i = 0; i < COUNT; i++){
    //     DST[i] = SRC[(i*1572869) & (COUNT - 1)];
    // }
    // profile_end(&cycles, "Copy scrambled", COUNT, "loop");

    // ---- Arithmetic ----
    profile_start(&cycles);
    for(unsigned i = 0; i < COUNT; i++){
        DST[i] = SRC[i] + 1;
    }
    profile_end(&cycles, "x + 1", COUNT, "loop");

    // Note: this loop computes 1/x + 1, although the label says "1 / x".
    profile_start(&cycles);
    for(unsigned i = 0; i < COUNT; i++){
        DST[i] = 1/SRC[i] + 1;
    }
    profile_end(&cycles, "1 / x", COUNT, "loop");

    // ---- Square root variants ----
    profile_start(&cycles);
    #pragma clang loop vectorize(disable)
    for(unsigned i = 0; i < COUNT; i++){
        DST[i] = sqrtf(SRC[i]);
    }
    profile_end(&cycles, "sqrt()", COUNT, "loop");

    profile_start(&cycles);
    #pragma clang loop vectorize(enable)
    for(unsigned i = 0; i < COUNT; i++){
        DST[i] = sqrtf(SRC[i]);
    }
    profile_end(&cycles, "sqrt() vec", COUNT, "loop");

    profile_start(&cycles);
    #pragma clang loop vectorize(disable)
    for(unsigned i = 0; i < COUNT; i++){
        DST[i] = 1/Q_rsqrt(SRC[i]);
    }
    profile_end(&cycles, "q_sqrt()", COUNT, "loop");

    // The i*1572869 term scatters the table index from one iteration to the
    // next (see the cache-miss comment in table_sqrt()).
    profile_start(&cycles);
    #pragma clang loop vectorize(disable)
    for(unsigned i = 0; i < COUNT; i++){
        DST[i] = table_sqrt(SRC[i] + i*1572869);
    }
    profile_end(&cycles, "table_sqrt()", COUNT, "loop");

    // ---- Reciprocal square root variants ----
    profile_start(&cycles);
    #pragma clang loop vectorize(disable)
    for(unsigned i = 0; i < COUNT; i++){
        DST[i] = 1/sqrtf(SRC[i]);
    }
    profile_end(&cycles, "rsqrt()", COUNT, "loop");

    profile_start(&cycles);
    #pragma clang loop vectorize(disable)
    for(unsigned i = 0; i < COUNT; i++){
        DST[i] = Q_rsqrt(SRC[i]);
    }
    profile_end(&cycles, "q_rsqrt()", COUNT, "loop");

    // ---- libm functions (double precision; float input is promoted) ----
    profile_start(&cycles);
    for(unsigned i = 0; i < COUNT; i++){
        DST[i] = log(SRC[i]);
    }
    profile_end(&cycles, "log()", COUNT, "loop");

    profile_start(&cycles);
    for(unsigned i = 0; i < COUNT; i++){
        DST[i] = exp(SRC[i]);
    }
    profile_end(&cycles, "exp()", COUNT, "loop");

    profile_start(&cycles);
    for(unsigned i = 0; i < COUNT; i++){
        DST[i] = pow(SRC[i], SRC[i]);
    }
    profile_end(&cycles, "pow()", COUNT, "loop");

    profile_start(&cycles);
    for(unsigned i = 0; i < COUNT; i++){
        DST[i] = cos(SRC[i]);
    }
    profile_end(&cycles, "cos()", COUNT, "loop");

    profile_start(&cycles);
    for(unsigned i = 0; i < COUNT; i++){
        DST[i] = acos(SRC[i]);
    }
    profile_end(&cycles, "acos()", COUNT, "loop");

    profile_start(&cycles);
    for(unsigned i = 0; i < COUNT; i++){
        DST[i] = atan2(SRC[i], SRC[i]);
    }
    profile_end(&cycles, "atan2()", COUNT, "loop");

    return EXIT_SUCCESS;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment