rcls/weak.c

## weak.c
// Program to compare pthread locking v. x86 weakly ordered stores.
//
// One thread does writes protected by a lock, the main thread does
// reads protected by the lock and checks that it sees consistent
// values.

// Invoke with the command line parameter "normal" to use normal
// memory writes, or with the parameter "weak" to use SSE2
// weakly-ordered (non-temporal, MOVNTI) memory writes.

// Use the taskset command to place the two threads on different
// physical cores, especially if you are running on a hyper-threaded
// processor.  If the two threads run on the same core (even if
// different hyper-threads) you will not see any inconsistency.

// On both an Intel "Ivy Bridge" i7-3770 and an AMD Phenom K8, I see
// the "weak" case producing inconsistent results: i.e., the locking
// is not synchronising the weakly ordered instructions.  But only
// when a pthread_spinlock_t is used; I don't see the inconsistent
// results with pthread_mutex_t.  I don't know if that is because the mutex
// does a memory-barrier, or just that it is slower.

#include <err.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Lock selection.  The experiment uses a pthread spin-lock by default;
// build with -DUSE_MUTEX to use a pthread mutex instead.
//
//   LOCK_INIT()  initialises the single global lock and evaluates to the
//                pthread return code (0 on success).
//   LOCK()       acquires the lock.
//   UNLOCK()     releases the lock.
#ifndef USE_MUTEX

static pthread_spinlock_t mutex;
// The lock is only shared between threads of this process.
#define LOCK_INIT() pthread_spin_init(&mutex, PTHREAD_PROCESS_PRIVATE)
#define LOCK() pthread_spin_lock(&mutex)
#define UNLOCK() pthread_spin_unlock(&mutex)

#else

static pthread_mutex_t mutex;
// NULL attributes request the default mutex attributes.
#define LOCK_INIT() pthread_mutex_init(&mutex, NULL)
#define LOCK() pthread_mutex_lock(&mutex)
#define UNLOCK() pthread_mutex_unlock(&mutex)

#endif

// Sixteen 32-bit words (64 bytes): v1 is the first word and v2 the last,
// with filler in between.
struct line_t {
    int32_t v1;         // first word of the record
    int32_t dummy[14];  // filler between v1 and v2; not accessed in this file
    int32_t v2;         // last word of the record
};

// The cache line and DRAM burst size are both 64 bytes.  block_t
// actually covers 2 bursts, so hopefully we can pick up a single
// isolated one.  I see the behaviour with line_t only covering a
// single cache line, but it is easier to reproduce with pairs.
// Two consecutive line_t records.  The writer sets all four v1/v2 words of
// a block to the same value under the lock; the reader compares beg.v1
// with end.v2.
struct block_t {
    struct line_t beg;  // first record of the pair
    struct line_t end;  // second record of the pair
};

// Number of block_t entries in the shared array.
#define NUM_BLOCKS 1024

// Body shared by the writer thread functions.  At the point of expansion it
// expects a `void * p` pointing at NUM_BLOCKS block_t entries and a
// STORE(lvalue, value) macro.  It sweeps the array forever; on each sweep
// every block has its four v1/v2 words set to the sweep number while the
// lock is held.  The trailing return is never reached and only satisfies
// the enclosing function's return type.
#define STORE_BODY()                                          \
    struct block_t * const first = p;                         \
    struct block_t * const last = first + NUM_BLOCKS;         \
    unsigned sweep = 0;                                       \
    for (;;) {                                                \
        for (struct block_t * b = first; b != last; ++b) {    \
            LOCK();                                           \
            STORE(b->beg.v1, sweep);                          \
            STORE(b->beg.v2, sweep);                          \
            STORE(b->end.v1, sweep);                          \
            STORE(b->end.v2, sweep);                          \
            UNLOCK();                                         \
        }                                                     \
        ++sweep;                                              \
    }                                                         \
    return NULL;


// Writer thread using ordinary stores.
//
// p: pointer to the shared array of NUM_BLOCKS block_t entries, as handed
//    to pthread_create.
// STORE_BODY() loops forever, so the function does not return in practice.
static void * thread_normal(void * p)
{
// Plain assignment.  The arguments and the whole expansion are parenthesised
// so the macro is a single well-formed expression whatever is passed in.
#define STORE(v,n) ((v) = (n))
    STORE_BODY()
#undef STORE
}

// Writer thread using x86 non-temporal (weakly ordered) MOVNTI stores.
//
// p: pointer to the shared array of NUM_BLOCKS block_t entries, as handed
//    to pthread_create.
// STORE_BODY() loops forever, so the function does not return in practice.
//
// No store fence is issued before UNLOCK(); that is deliberate, because the
// program exists to observe whether the lock alone orders these stores.
static void * thread_nt(void * p)
{
// The macro expands to one expression with no trailing semicolon; the
// caller supplies the semicolon, so no stray empty statements are produced.
#define STORE(v,n) __builtin_ia32_movnti(&(v), (int)(n))
    STORE_BODY()
#undef STORE
}

// Entry point: start one writer thread, then check forever that reads made
// under the lock see a consistent block.
//
// argv[1] must be "weak" (non-temporal stores) or "normal" (plain stores);
// anything else exits with status 1.  Every mismatch between beg.v1 and
// end.v2 of a block is reported on stderr.  Does not return normally.
int main(int argc, char * argv[])
{
    // Choose the writer first so that bad usage fails before any set-up.
    void * (*writer)(void *) = NULL;
    if (argc == 2 && strcmp(argv[1], "weak") == 0)
        writer = thread_nt;
    else if (argc == 2 && strcmp(argv[1], "normal") == 0)
        writer = thread_normal;
    else
        errx(1, "'weak' or 'normal' please");

    // pthread functions report failure through their return value, not
    // through errno, hence errx() + strerror() rather than err().
    int rc = LOCK_INIT();
    if (rc != 0)
        errx(1, "lock initialisation failed: %s", strerror(rc));

    struct block_t * blocks = calloc(NUM_BLOCKS, sizeof *blocks);
    if (blocks == NULL)
        err(1, "Out of memory");

    // Without this check a failed thread start would leave the reader
    // looping over zeroed memory and reporting nothing.
    pthread_t thread;
    rc = pthread_create(&thread, NULL, writer, blocks);
    if (rc != 0)
        errx(1, "pthread_create failed: %s", strerror(rc));

    while (1) {
        // The array is walked from the last block down to the first, and
        // within a block the last word is read before the first.
        for (int j = NUM_BLOCKS; j--;) {
            LOCK();
            int end = blocks[j].end.v2;
            int beg = blocks[j].beg.v1;
            UNLOCK();
            if (__builtin_expect(beg != end, 0))
                fprintf(stderr, "At %i got %i v %i\n", j, beg, end);
        }
    }
}
	// Program to compare pthread locking v. x86 weakly ordered stores.
	//
	// One thread does writes protected by a lock, the main thread does
	// reads protected by the lock and checks that it sees consistent
	// values.

	// Invoke with the command line parameter "normal" to use normal
	// memory writes, or with the parameter "weak" to use SSE2
	// weakly-ordered (non-temporal, MOVNTI) memory writes.

	// Use the taskset command to place the two threads on different
	// physical cores, especially if you are running on a hyper-threaded
	// processor. If the two threads run on the same core (even if
	// different hyper-threads) you will not see any inconsistency.

	// On both an Intel "Ivy Bridge" i7-3770 and an AMD Phenom K8, I see
	// the "weak" case producing inconsistent results: i.e., the locking
	// is not synchronising the weakly ordered instructions. But only
	// when a pthread_spinlock_t is used; I don't see the inconsistent
	// results with pthread_mutex_t. I don't know if that is because the mutex
	// does a memory-barrier, or just that it is slower.

	#include <err.h>
	#include <pthread.h>
	#include <stdint.h>
	#include <string.h>
	#include <stdio.h>

	// Lock selection.  The experiment uses a pthread spin-lock by default;
	// build with -DUSE_MUTEX to use a pthread mutex instead.
	//
	//   LOCK_INIT()  initialises the single global lock and evaluates to the
	//                pthread return code (0 on success).
	//   LOCK()       acquires the lock.
	//   UNLOCK()     releases the lock.
	#ifndef USE_MUTEX

	static pthread_spinlock_t mutex;
	// The lock is only shared between threads of this process.
	#define LOCK_INIT() pthread_spin_init(&mutex, PTHREAD_PROCESS_PRIVATE)
	#define LOCK() pthread_spin_lock(&mutex)
	#define UNLOCK() pthread_spin_unlock(&mutex)

	#else

	static pthread_mutex_t mutex;
	// NULL attributes request the default mutex attributes.
	#define LOCK_INIT() pthread_mutex_init(&mutex, NULL)
	#define LOCK() pthread_mutex_lock(&mutex)
	#define UNLOCK() pthread_mutex_unlock(&mutex)

	#endif

	// Sixteen 32-bit words (64 bytes): v1 is the first word and v2 the last,
	// with filler in between.
	struct line_t {
	// first word of the record
	int32_t v1;
	// filler between v1 and v2; not accessed in this file
	int32_t dummy[14];
	// last word of the record
	int32_t v2;
	};

	// The cache line and DRAM burst size are both 64 bytes. block_t
	// actually covers 2 bursts, so hopefully we can pick up a single
	// isolated one. I see the behaviour with line_t only covering a
	// single cache line, but it is easier to reproduce with pairs.
	// Two consecutive line_t records.  The writer sets all four v1/v2 words of
	// a block to the same value under the lock; the reader compares beg.v1
	// with end.v2.
	struct block_t {
	// first record of the pair
	struct line_t beg;
	// second record of the pair
	struct line_t end;
	};

	// Number of block_t entries in the shared array.
	#define NUM_BLOCKS 1024

	// Body shared by the writer thread functions.  At the point of expansion it
	// expects a `void * p` pointing at NUM_BLOCKS block_t entries and a
	// STORE(lvalue, value) macro.  It sweeps the array forever; on each sweep
	// every block has its four v1/v2 words set to the sweep number while the
	// lock is held.  The trailing return is never reached and only satisfies
	// the enclosing function's return type.
	#define STORE_BODY()                                          \
	    struct block_t * const first = p;                         \
	    struct block_t * const last = first + NUM_BLOCKS;         \
	    unsigned sweep = 0;                                       \
	    for (;;) {                                                \
	        for (struct block_t * b = first; b != last; ++b) {    \
	            LOCK();                                           \
	            STORE(b->beg.v1, sweep);                          \
	            STORE(b->beg.v2, sweep);                          \
	            STORE(b->end.v1, sweep);                          \
	            STORE(b->end.v2, sweep);                          \
	            UNLOCK();                                         \
	        }                                                     \
	        ++sweep;                                              \
	    }                                                         \
	    return NULL;


	// Writer thread using ordinary stores.
	//
	// p: pointer to the shared array of NUM_BLOCKS block_t entries, as handed
	//    to pthread_create.
	// STORE_BODY() loops forever, so the function does not return in practice.
	static void * thread_normal(void * p)
	{
	// Plain assignment.  The arguments and the whole expansion are parenthesised
	// so the macro is a single well-formed expression whatever is passed in.
	#define STORE(v,n) ((v) = (n))
	    STORE_BODY()
	#undef STORE
	}

	// Writer thread using x86 non-temporal (weakly ordered) MOVNTI stores.
	//
	// p: pointer to the shared array of NUM_BLOCKS block_t entries, as handed
	//    to pthread_create.
	// STORE_BODY() loops forever, so the function does not return in practice.
	//
	// No store fence is issued before UNLOCK(); that is deliberate, because the
	// program exists to observe whether the lock alone orders these stores.
	static void * thread_nt(void * p)
	{
	// The macro expands to one expression with no trailing semicolon; the
	// caller supplies the semicolon, so no stray empty statements are produced.
	#define STORE(v,n) __builtin_ia32_movnti(&(v), (int)(n))
	    STORE_BODY()
	#undef STORE
	}

	// Entry point: start one writer thread, then check forever that reads made
	// under the lock see a consistent block.
	//
	// argv[1] must be "weak" (non-temporal stores) or "normal" (plain stores);
	// anything else exits with status 1.  Every mismatch between beg.v1 and
	// end.v2 of a block is reported on stderr.  Does not return normally.
	int main(int argc, char * argv[])
	{
	    // Choose the writer first so that bad usage fails before any set-up.
	    void * (*writer)(void *) = NULL;
	    if (argc == 2 && strcmp(argv[1], "weak") == 0)
	        writer = thread_nt;
	    else if (argc == 2 && strcmp(argv[1], "normal") == 0)
	        writer = thread_normal;
	    else
	        errx(1, "'weak' or 'normal' please");

	    // pthread functions report failure through their return value, not
	    // through errno, hence errx() + strerror() rather than err().
	    int rc = LOCK_INIT();
	    if (rc != 0)
	        errx(1, "lock initialisation failed: %s", strerror(rc));

	    struct block_t * blocks = calloc(NUM_BLOCKS, sizeof *blocks);
	    if (blocks == NULL)
	        err(1, "Out of memory");

	    // Without this check a failed thread start would leave the reader
	    // looping over zeroed memory and reporting nothing.
	    pthread_t thread;
	    rc = pthread_create(&thread, NULL, writer, blocks);
	    if (rc != 0)
	        errx(1, "pthread_create failed: %s", strerror(rc));

	    while (1) {
	        // The array is walked from the last block down to the first, and
	        // within a block the last word is read before the first.
	        for (int j = NUM_BLOCKS; j--;) {
	            LOCK();
	            int end = blocks[j].end.v2;
	            int beg = blocks[j].beg.v1;
	            UNLOCK();
	            if (__builtin_expect(beg != end, 0))
	                fprintf(stderr, "At %i got %i v %i\n", j, beg, end);
	        }
	    }
	}