Created
July 15, 2014 06:34
-
-
Save rcls/c855e3e782253e58e046 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Program to compare pthread locking vs. x86 weakly ordered stores.
//
// One thread does writes protected by a lock, the main thread does
// reads protected by the lock and checks that it sees consistent
// values.
//
// Invoke with the command line parameter "normal" to use normal
// memory writes, or with the parameter "weak" to use SSE2
// weakly-ordered memory writes.
//
// Use the taskset command to place the two threads on different
// physical cores, especially if you are running on a hyper-threaded
// processor. If the two threads run on the same core (even if
// different hyper-threads) you will not see any inconsistency.
//
// On both an Intel "Ivy Bridge" i7-3770 and an AMD Phenom K8, I see
// the "weak" case producing inconsistent results: i.e., the locking
// is not synchronising the weakly ordered instructions. But only
// when a pthread_spinlock_t is used; I don't see the inconsistent
// results with pthread_mutex_t. I don't know if that is because the mutex
// does a memory-barrier, or just that it is slower.
#include <err.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// We can either compile to use mutexes or else spin-locks. | |
#if 1 | |
static pthread_spinlock_t mutex; | |
#define LOCK_INIT() pthread_spin_init(&mutex, 0) | |
#define LOCK() pthread_spin_lock(&mutex) | |
#define UNLOCK() pthread_spin_unlock(&mutex) | |
#else | |
static pthread_mutex_t mutex; | |
#define LOCK_INIT() pthread_mutex_init(&mutex, NULL) | |
#define LOCK() pthread_mutex_lock(&mutex) | |
#define UNLOCK() pthread_mutex_unlock(&mutex) | |
#endif | |
// One 64-byte record: a 32-bit marker in the first word and another in the
// last word, with padding in between so the two markers sit at opposite
// ends of the record.
struct line_t {
    int32_t v1;                                 // first word of the record
    int32_t dummy[64 / sizeof(int32_t) - 2];    // 14 words of padding, never accessed
    int32_t v2;                                 // last word of the record
};
// The cache line and DRAM burst size are both 64 bytes. block_t | |
// actually covers 2 bursts, so hopefully we can pick up a single | |
// isolated one. I see the behaviour with line_t only covering a | |
// single cache line, but it is easier to reproduce with pairs. | |
struct block_t { | |
struct line_t beg; | |
struct line_t end; | |
}; | |
// Number of block_t records in the array shared by the two threads.
#define NUM_BLOCKS 1024

// Common body of the writer thread functions.
//
// Must be expanded inside a function that has a `void * p` parameter
// pointing at NUM_BLOCKS block_t records, with a STORE(lvalue, value) macro
// defined at the point of expansion.  It loops forever: on each pass over
// the array it takes the lock once per block and stores the current pass
// number into all four marker fields, first word to last.  The trailing
// `return NULL;` is never reached; it only completes the function body.
#define STORE_BODY()                                        \
    struct block_t * blocks = p;                            \
    unsigned pass = 0;                                      \
    for (;;) {                                              \
        for (int idx = 0; idx < NUM_BLOCKS; ++idx) {        \
            struct block_t * blk = &blocks[idx];            \
            LOCK();                                         \
            STORE(blk->beg.v1, pass);                       \
            STORE(blk->beg.v2, pass);                       \
            STORE(blk->end.v1, pass);                       \
            STORE(blk->end.v2, pass);                       \
            UNLOCK();                                       \
        }                                                   \
        ++pass;                                             \
    }                                                       \
    return NULL;
static void * thread_normal(void * p) | |
{ | |
#define STORE(v,n) v = n | |
STORE_BODY() | |
#undef STORE | |
} | |
static void * thread_nt(void * p) | |
{ | |
#define STORE(v,n) __builtin_ia32_movnti(&v, n); | |
STORE_BODY() | |
#undef STORE | |
} | |
int main(int argc, char * argv[]) | |
{ | |
LOCK_INIT(); | |
struct block_t * blocks = calloc(NUM_BLOCKS, sizeof(struct block_t)); | |
if (blocks == NULL) | |
err(1, "Out of memory"); | |
pthread_t thread; | |
if (argc == 2 && strcmp(argv[1], "weak") == 0) | |
pthread_create(&thread, NULL, thread_nt, blocks); | |
else if (argc == 2 && strcmp(argv[1], "normal") == 0) | |
pthread_create(&thread, NULL, thread_normal, blocks); | |
else | |
errx(1, "'weak' or 'normal' please"); | |
while (1) { | |
for (int j = NUM_BLOCKS; j--;) { | |
LOCK(); | |
int end = blocks[j].end.v2; | |
int beg = blocks[j].beg.v1; | |
UNLOCK(); | |
if (__builtin_expect(beg != end, 0)) | |
fprintf(stderr, "At %i got %i v %i\n", j, beg, end); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment