Last active
August 29, 2015 14:11
-
-
Save rcls/95a6c881684197a09cbf to your computer and use it in GitHub Desktop.
Looking at how x86 atomic increment timings change depending on surrounding memory usage.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// On my Haswell 2.9GHz plus boost.
// With the atomic increment, the 2**29 iterations take just under 30 seconds.
// Without, about 11 seconds.
//
// I.e., the overhead for 0.5e9 atomic increments is approx 15 seconds, or
// 30 nanoseconds each.
//
// If we take the movntps operations out, the 2**29 atomic increments take
// approx 2.6 seconds, or 5 ns each.
#include <stdio.h> | |
#include <sys/time.h> | |
/* Number of floats in the scratch buffer: 2**24 floats, i.e. 64 MiB. */
#define SIZE 16777216

/*
 * Scratch buffer that main() streams zeros into via storef64().
 *
 * storef64() writes it with movntps, which faults on an address that is not
 * 16-byte aligned.  Request that alignment explicitly instead of relying on
 * whatever the ABI or compiler happens to pick for a large float array.
 */
float buffer[SIZE] __attribute__((aligned(16)));

/* Four packed floats (16 bytes), using the GCC vector extension. */
typedef float v4sf __attribute__((vector_size(16)));

/*
 * Counter that main() bumps with an atomic add once per inner-loop
 * iteration (unless NO_XINC is defined).
 */
volatile long n;
static inline void storef64(float * p) | |
{ | |
const v4sf zeros = (v4sf) { 0, 0, 0, 0 }; | |
__builtin_ia32_movntps(p + 0, zeros); | |
__builtin_ia32_movntps(p + 4, zeros); | |
__builtin_ia32_movntps(p + 8, zeros); | |
__builtin_ia32_movntps(p + 12, zeros); | |
__builtin_ia32_movntps(p + 16, zeros); | |
__builtin_ia32_movntps(p + 20, zeros); | |
__builtin_ia32_movntps(p + 24, zeros); | |
__builtin_ia32_movntps(p + 28, zeros); | |
__builtin_ia32_movntps(p + 32, zeros); | |
__builtin_ia32_movntps(p + 36, zeros); | |
__builtin_ia32_movntps(p + 40, zeros); | |
__builtin_ia32_movntps(p + 44, zeros); | |
__builtin_ia32_movntps(p + 48, zeros); | |
__builtin_ia32_movntps(p + 52, zeros); | |
__builtin_ia32_movntps(p + 56, zeros); | |
__builtin_ia32_movntps(p + 60, zeros); | |
} | |
int main() | |
{ | |
struct timeval start; | |
gettimeofday(&start, NULL); | |
for (int i = 0; i != 4096; ++i) { | |
for (int base = 0; base < SIZE; base += 128) { | |
__asm__ volatile ("" ::: "memory"); | |
#ifndef NO_STORE | |
storef64(buffer + base); | |
storef64(buffer + base + 64); | |
#endif | |
#ifndef NO_XINC | |
__atomic_add_fetch(&n, 1, __ATOMIC_RELAXED); | |
#endif | |
} | |
} | |
struct timeval finish; | |
gettimeofday(&finish, NULL); | |
double d = finish.tv_sec - start.tv_sec | |
+ 1e-6 * (finish.tv_usec - start.tv_usec); | |
fprintf(stderr, "%g seconds for %li iterations\n", d, n); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment