Skip to content

Instantly share code, notes, and snippets.

@rcls
Created July 15, 2014 06:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rcls/c855e3e782253e58e046 to your computer and use it in GitHub Desktop.
Save rcls/c855e3e782253e58e046 to your computer and use it in GitHub Desktop.
// Program to compare pthread locking v. x86 weakly ordered stores.
//
// One thread does writes protected by a lock, the main thread does
// reads protected by the lock and checks that it sees consistent
// values.
// Invoke with the command line parameter "normal" to use normal
// memory writes, or with the parameter "weak" to use SSE2
// weakly-ordered memory writes.
// Use the taskset command to place the two threads on different
// physical cores, especially if you are running on a hyper-threaded
// processor. If the two threads run on the same core (even if
// different hyper-threads) you will not see any inconsistency.
// On both an Intel "Ivy Bridge" i7-3770 and an AMD Phenom K8, I see
// the "weak" case producing inconsistent results: i.e., the locking
// is not synchronising the weakly ordered instructions. But only
// when a pthread_spinlock_t is used; I don't see the inconsistent
// results with pthread_mutex_t. I don't know if that is because the mutex
// does a memory-barrier, or just that it is slower.
#include <err.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Lock selection: the experiment can be built with either a POSIX
// spin-lock or a POSIX mutex. Spin-locks are the default; build with
// -DUSE_SPINLOCK=0 to use a mutex instead.
//
// LOCK_INIT(), LOCK() and UNLOCK() each expand to the matching pthread
// call and therefore evaluate to its return value (0 on success, an
// error number otherwise).
#ifndef USE_SPINLOCK
#define USE_SPINLOCK 1
#endif
#if USE_SPINLOCK
static pthread_spinlock_t mutex;
// The lock is only shared between threads of this process.
#define LOCK_INIT() pthread_spin_init(&mutex, PTHREAD_PROCESS_PRIVATE)
#define LOCK() pthread_spin_lock(&mutex)
#define UNLOCK() pthread_spin_unlock(&mutex)
#else
static pthread_mutex_t mutex;
#define LOCK_INIT() pthread_mutex_init(&mutex, NULL)
#define LOCK() pthread_mutex_lock(&mutex)
#define UNLOCK() pthread_mutex_unlock(&mutex)
#endif
// Size in bytes of one cache line (and of one DRAM burst) on the
// machines this experiment targets.
enum { CACHE_LINE_BYTES = 64 };
// One cache line's worth of data: v1 is the first 32-bit word, v2 is the
// last one, and dummy pads the space in between so that
// sizeof(struct line_t) == CACHE_LINE_BYTES.
struct line_t {
    int32_t v1;
    int32_t dummy[CACHE_LINE_BYTES / sizeof(int32_t) - 2];
    int32_t v2;
};
// The cache line and DRAM burst size are both 64 bytes. block_t
// actually covers 2 bursts, so hopefully we can pick up a single
// isolated one. I see the behaviour with line_t only covering a
// single cache line, but it is easier to reproduce with pairs.
//
// While holding the lock, the writer thread stores the same counter
// value to beg.v1, beg.v2, end.v1 and end.v2, in that order. The reader
// in main() compares beg.v1 against end.v2, i.e. the first and the last
// word written for the block.
struct block_t {
struct line_t beg; // first line_t; beg.v1 is the first word the writer stores
struct line_t end; // second line_t; end.v2 is the last word the writer stores
};
// Number of block_t elements allocated in main() and visited by both the
// writer loop and the reader loop.
#define NUM_BLOCKS 1024
// Shared body of the writer thread functions. It expects the enclosing
// function to have a `void * p` parameter pointing at an array of
// NUM_BLOCKS block_t elements, and a STORE(lvalue, value) macro to be
// defined at the point of use.
//
// Forever: for every block, take the lock, store the current pass
// counter to beg.v1, beg.v2, end.v1 and end.v2 (in that order), and
// release the lock. The counter goes up by one after each full pass over
// the array. The trailing `return NULL;` only exists to give the
// enclosing function a return statement; the outer loop never ends.
#define STORE_BODY() \
    struct block_t * const base = p; \
    for (unsigned seq = 0; ; ++seq) { \
        for (int idx = 0; idx != NUM_BLOCKS; ++idx) { \
            struct block_t * const blk = base + idx; \
            LOCK(); \
            STORE(blk->beg.v1, seq); \
            STORE(blk->beg.v2, seq); \
            STORE(blk->end.v1, seq); \
            STORE(blk->end.v2, seq); \
            UNLOCK(); \
        } \
    } \
    return NULL;
// Writer thread entry point for the "normal" mode: runs STORE_BODY with
// STORE defined as a plain C assignment.
// p is the block array handed to pthread_create() by main().
static void * thread_normal(void * p)
{
#define STORE(dst, val) ((dst) = (val))
    STORE_BODY()
#undef STORE
}
// Writer thread entry point for the "weak" mode: runs STORE_BODY with
// STORE defined as the GCC builtin __builtin_ia32_movnti, i.e. every
// store is issued as an SSE2 weakly-ordered (MOVNTI) write.
// p is the block array handed to pthread_create() by main().
//
// No store fence is issued before the unlock on purpose: the point of
// the program is to observe whether the lock alone orders these stores.
static void * thread_nt(void * p)
{
// No trailing semicolon here: STORE_BODY terminates each STORE itself,
// and this keeps the macro usable as an ordinary expression, like the
// STORE used by thread_normal.
#define STORE(v, n) __builtin_ia32_movnti(&(v), (n))
    STORE_BODY()
#undef STORE
}
// Run the experiment.
//
// argv[1] selects the writer: "weak" starts thread_nt, "normal" starts
// thread_normal. Once the writer is running, the main thread loops
// forever over the blocks (highest index first), reads end.v2 and
// beg.v1 of each block under the lock, and prints a line to stderr
// whenever the two differ.
//
// Never returns normally. Exits with status 1 on a usage error or when
// the lock, the block array or the writer thread cannot be set up.
int main(int argc, char * argv[])
{
    // Pick the writer first so that a usage error is reported before
    // anything is allocated or started.
    void * (*writer)(void *) = NULL;
    if (argc == 2 && strcmp(argv[1], "weak") == 0) {
        writer = thread_nt;
    } else if (argc == 2 && strcmp(argv[1], "normal") == 0) {
        writer = thread_normal;
    } else {
        errx(1, "'weak' or 'normal' please");
    }

    // pthread functions return an error number instead of setting errno,
    // so report failures through strerror() + errx() rather than err().
    int rc = LOCK_INIT();
    if (rc != 0) {
        errx(1, "lock initialisation failed: %s", strerror(rc));
    }

    // NOTE(review): calloc only promises alignment suitable for ordinary
    // objects, not 64-byte alignment, so a line_t may straddle two cache
    // lines. Confirm whether that matters for the measurement.
    struct block_t * blocks = calloc(NUM_BLOCKS, sizeof *blocks);
    if (blocks == NULL) {
        err(1, "Out of memory");
    }

    // Without this check a failed thread start would leave the reader
    // below spinning forever over all-zero blocks with no diagnostic.
    pthread_t thread;
    rc = pthread_create(&thread, NULL, writer, blocks);
    if (rc != 0) {
        errx(1, "pthread_create failed: %s", strerror(rc));
    }

    while (1) {
        for (int j = NUM_BLOCKS; j--;) {
            // Read the last word the writer stores before the first one,
            // both inside the same critical section.
            LOCK();
            int end = blocks[j].end.v2;
            int beg = blocks[j].beg.v1;
            UNLOCK();
            if (__builtin_expect(beg != end, 0)) {
                fprintf(stderr, "At %i got %i v %i\n", j, beg, end);
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment