Skip to content

Instantly share code, notes, and snippets.

@rcls
Last active August 29, 2015 14:11
Show Gist options
  • Save rcls/95a6c881684197a09cbf to your computer and use it in GitHub Desktop.
Save rcls/95a6c881684197a09cbf to your computer and use it in GitHub Desktop.
Looking at how x86 atomic increment timings change depending on surrounding memory usage.
// On my Haswell 2.9GHz plus boost.
// With the atomic increment, the 2**29 iterations take just under 30 seconds.
// Without, about 11 seconds.
//
// I.e., the overhead for 0.5e9 atomic increments is approx 15 seconds, or
// 30 nanoseconds each.
//
// If we take the movntps operations out, the 2**29 atomic increments take
// approx 2.6 seconds, or 5ns each.
#include <stdio.h>
#include <sys/time.h>
// Number of floats in the benchmark buffer: 16777216 = 2**24 elements.
#define SIZE 16777216
// Buffer swept by the benchmark's non-temporal stores.  The alignment is
// stated explicitly instead of being left to the compiler: movntps requires
// its destination to be 16-byte aligned, and every store address used by the
// benchmark is buffer plus a multiple of 4 floats.  64 bytes is chosen so
// that each 64-float store group also starts on a 64-byte boundary
// (NOTE(review): 64 is assumed to be the cache-line size of the target CPU).
float buffer[SIZE] __attribute__((aligned(64)));
// 16-byte vector of floats (four lanes); this is the value type passed to
// __builtin_ia32_movntps in storef64().
typedef float v4sf __attribute__((vector_size(16)));
// Global counter: incremented by the relaxed atomic add in main()'s inner
// loop (unless NO_XINC is defined) and printed in the final report.
// NOTE(review): volatile presumably forces every access to be performed —
// confirm that is the intent, since the increment already uses an atomic
// builtin.
volatile long n;
// Zero-fill the 64 floats starting at p using non-temporal stores.
// Sixteen 4-float vectors of zeros are written through
// __builtin_ia32_movntps at float offsets 0, 4, ..., 60, in ascending order.
// NOTE(review): movntps presumably requires p to be 16-byte aligned —
// confirm against the instruction reference.
// NOTE(review): the loop has a constant trip count of 16; inspect the
// generated code if a fully unrolled store sequence matters to the timing.
static inline void storef64(float * p)
{
    const v4sf zeros = { 0, 0, 0, 0 };

    for (int offset = 0; offset != 64; offset += 4) {
        __builtin_ia32_movntps(p + offset, zeros);
    }
}
// Benchmark driver.
// Sweeps the whole of buffer PASSES times in steps of STEP floats.  Each
// step issues a compiler barrier, then (unless NO_STORE is defined) two
// storef64() calls covering the step, then (unless NO_XINC is defined) one
// relaxed atomic increment of n.  The elapsed wall-clock time and the number
// of steps performed are written to stderr.
// Returns 0 on success, 1 if the clock could not be read.
int main(void)
{
    enum {
        PASSES = 4096,          // complete sweeps over buffer
        STEP = 128,             // floats advanced per inner iteration
        HALF_STEP = STEP / 2    // offset of the second storef64() call
    };

    struct timeval start;
    if (gettimeofday(&start, NULL) != 0) {
        fprintf(stderr, "gettimeofday failed\n");
        return 1;
    }

    for (int pass = 0; pass != PASSES; ++pass) {
        for (int base = 0; base < SIZE; base += STEP) {
            // Empty asm with a "memory" clobber: emits no instruction but
            // stops the compiler from moving or merging memory accesses
            // across iterations.
            __asm__ volatile ("" ::: "memory");
#ifndef NO_STORE
            storef64(buffer + base);
            storef64(buffer + base + HALF_STEP);
#endif
#ifndef NO_XINC
            __atomic_add_fetch(&n, 1, __ATOMIC_RELAXED);
#endif
        }
    }

    struct timeval finish;
    if (gettimeofday(&finish, NULL) != 0) {
        fprintf(stderr, "gettimeofday failed\n");
        return 1;
    }

#ifndef NO_XINC
    // Read the counter back so the report reflects the increments performed.
    const long iterations = n;
#else
    // n is never incremented in this configuration; report the loop trip
    // count rather than a misleading 0.
    const long iterations = (long) PASSES * (SIZE / STEP);
#endif

    // Seconds plus the microsecond remainder (which may be negative).
    double d = finish.tv_sec - start.tv_sec
        + 1e-6 * (finish.tv_usec - start.tv_usec);
    fprintf(stderr, "%g seconds for %li iterations\n", d, iterations);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment