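/*
 * Microbenchmark: NTHREADS threads increment a single shared counter for
 * SECS seconds, using one of several methods selected at compile time:
 * plain racy increments, a pthread mutex, inline-asm lock add/inc,
 * __atomic_fetch_add, or cmpxchg retry loops (with and without the lock
 * prefix). The inline-asm variants are x86-specific. Define exactly one
 * of the method macros listed below, e.g. (file name arbitrary):
 *
 *   cc -O2 -pthread -DATOMIC_METHOD_XADD counter-bench.c -o counter-bench
 */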
#include <pthread.h>
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#ifndef SECS
#define SECS 5
#endif
#ifndef NTHREADS
#define NTHREADS 40
#endif
#ifndef ATOMIC_WIDTH
#define ATOMIC_WIDTH 64
#endif
//#define ATOMIC_METHOD_RACY
//#define ATOMIC_METHOD_MUTEX
//#define ATOMIC_METHOD_ADD
//#define ATOMIC_METHOD_INC
//#define ATOMIC_METHOD_XADD
//#define ATOMIC_METHOD_CMPXCHG
//#define ATOMIC_METHOD_CMPXCHG_ASM
//#define NON_ATOMIC_METHOD_CMPXCHG_ASM
static pthread_barrier_t barrier;
#if ATOMIC_WIDTH==32
typedef uint32_t atomic_width_t;
#else
typedef uint64_t atomic_width_t;
#endif
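/* shared counter, padded so it sits on a cache line of its own */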
struct shared_data {
    char pad0[64];
    atomic_width_t count;
#if defined(ATOMIC_METHOD_MUTEX)
    pthread_mutex_t mutex;
#endif
    char pad1[64];
} shared_data;
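/* per-thread statistics; allocated one page per thread in main() */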
typedef struct per_thread_data
{
    uint64_t count;
    uint64_t retries;
} per_thread_data;
#if ATOMIC_WIDTH==32
static inline void
atomic_add(uint32_t *val, uint32_t add_)
{
    __asm__ __volatile__(
        "   lock        \n"
        "   add %1,%0   \n"
        : "+m"(*val)
        : "r" (add_)
        : "memory", "cc");
}

static inline void
atomic_inc(uint32_t *val)
{
    __asm__ __volatile__(
        "   lock        \n"
        "   inc %0      \n"
        : "+m"(*val)
        :
        : "memory", "cc");
}

static inline bool
atomic_cmpxchg(volatile uint32_t *ptr,
               uint32_t *expected, uint32_t newval)
{
    char ret;

    /*
     * Perform cmpxchg and use the zero flag which it implicitly sets when
     * equal to measure the success.
     */
    __asm__ __volatile__(
        "   lock                \n"
        "   cmpxchgl %4,%5      \n"
        "   setz %2             \n"
        : "=a" (*expected), "=m"(*ptr), "=q" (ret)
        : "a" (*expected), "r" (newval), "m"(*ptr)
        : "memory", "cc");
    return (bool) ret;
}
static inline bool
non_atomic_cmpxchg(volatile uint32_t *ptr,
                   uint32_t *expected, uint32_t newval)
{
    char ret;

    /*
     * Like atomic_cmpxchg(), but without the lock prefix: the cmpxchg is
     * atomic on the executing core, but not across processors.
     */
    __asm__ __volatile__(
        "   cmpxchgl %4,%5      \n"
        "   setz %2             \n"
        : "=a" (*expected), "=m"(*ptr), "=q" (ret)
        : "a" (*expected), "r" (newval), "m"(*ptr)
        : "memory", "cc");
    return (bool) ret;
}
#else
static inline void
atomic_add(uint64_t *val, uint64_t add_)
{
    __asm__ __volatile__(
        "   lock            \n"
        "   addq %1,%0      \n"
        : "+m"(*val)
        : "r" (add_)
        : "memory", "cc");
}

static inline void
atomic_inc(uint64_t *val)
{
    __asm__ __volatile__(
        "   lock            \n"
        "   incq %0         \n"
        : "+m"(*val)
        :
        : "memory", "cc");
}

/*
 * 64-bit counterparts of the 32-bit cmpxchg helpers above; without these
 * the *_CMPXCHG_ASM methods do not compile at the default ATOMIC_WIDTH=64.
 */
static inline bool
atomic_cmpxchg(volatile uint64_t *ptr,
               uint64_t *expected, uint64_t newval)
{
    char ret;

    __asm__ __volatile__(
        "   lock                \n"
        "   cmpxchgq %4,%5      \n"
        "   setz %2             \n"
        : "=a" (*expected), "=m"(*ptr), "=q" (ret)
        : "a" (*expected), "r" (newval), "m"(*ptr)
        : "memory", "cc");
    return (bool) ret;
}

static inline bool
non_atomic_cmpxchg(volatile uint64_t *ptr,
                   uint64_t *expected, uint64_t newval)
{
    char ret;

    __asm__ __volatile__(
        "   cmpxchgq %4,%5      \n"
        "   setz %2             \n"
        : "=a" (*expected), "=m"(*ptr), "=q" (ret)
        : "a" (*expected), "r" (newval), "m"(*ptr)
        : "memory", "cc");
    return (bool) ret;
}
#endif
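/*
 * Worker loop: bump the thread-local counter and the shared one with the
 * configured method, until main() sends SIGINT.
 */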
void *
thread_main(void *p)
{
    per_thread_data *this_thread = (per_thread_data *) p;

    pthread_barrier_wait(&barrier);

    while (1)
    {
#if defined(ATOMIC_METHOD_RACY)
        this_thread->count++;
        shared_data.count++;
        /* compiler barrier, so the unsynchronized increment stays in the loop */
        __asm__ __volatile__("" :::"memory");
#elif defined(ATOMIC_METHOD_MUTEX)
        pthread_mutex_lock(&shared_data.mutex);
        this_thread->count++;
        shared_data.count++;
        pthread_mutex_unlock(&shared_data.mutex);
#elif defined(ATOMIC_METHOD_ADD)
        this_thread->count++;
        atomic_add(&shared_data.count, 1);
#elif defined(ATOMIC_METHOD_INC)
        this_thread->count++;
        atomic_inc(&shared_data.count);
#elif defined(ATOMIC_METHOD_XADD)
        this_thread->count++;
        __atomic_fetch_add(&shared_data.count, 1, __ATOMIC_SEQ_CST);
#elif defined(ATOMIC_METHOD_CMPXCHG)
        atomic_width_t cur = shared_data.count;

        this_thread->count++;
        while (!__atomic_compare_exchange_n(&shared_data.count, &cur, cur + 1,
                                            0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
        {
            this_thread->retries++;
        }
#elif defined(ATOMIC_METHOD_CMPXCHG_ASM)
        atomic_width_t cur = shared_data.count;

        this_thread->count++;
        while (!atomic_cmpxchg(&shared_data.count, &cur, cur + 1))
        {
            this_thread->retries++;
        }
#elif defined(NON_ATOMIC_METHOD_CMPXCHG_ASM)
        atomic_width_t cur = shared_data.count;

        this_thread->count++;
        while (!non_atomic_cmpxchg(&shared_data.count, &cur, cur + 1))
        {
            this_thread->retries++;
        }
#else
#error gotta tell me what to do
#endif
    }
}
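/*
 * Worker threads are stopped by signalling them; note that pthread_exit()
 * is not on POSIX's async-signal-safe list, but it suffices for tearing
 * down this benchmark.
 */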
static void
handle_sigint(int sig)
{
    pthread_exit(0);
}
int
main(int argc, char **argv)
{
    pthread_t threads[NTHREADS];
    per_thread_data *thread_data[NTHREADS];
    int secs = SECS;

    signal(SIGINT, handle_sigint);

    pthread_barrier_init(&barrier, NULL, NTHREADS + 1);

#if defined(ATOMIC_METHOD_MUTEX)
    pthread_mutex_init(&shared_data.mutex, NULL);
#endif
    shared_data.count = 0;

    for (int i = 0; i < NTHREADS; i++)
    {
        /* one page per thread; C11 requires size be a multiple of alignment */
        thread_data[i] = aligned_alloc(4096, 4096);
        memset(thread_data[i], 0, sizeof(per_thread_data));
        pthread_create(&threads[i], NULL, thread_main, thread_data[i]);
    }

    pthread_barrier_wait(&barrier);

    sleep(secs);

    fprintf(stderr, "shared counter after %ds (before killing) is: %llu\n",
            secs, (long long unsigned) shared_data.count);

    for (int i = 0; i < NTHREADS; i++)
    {
        pthread_kill(threads[i], SIGINT);
    }

    for (int i = 0; i < NTHREADS; i++)
    {
        pthread_join(threads[i], NULL);
    }

    uint64_t thread_count_sum = 0;
    uint64_t thread_retries_sum = 0;

    for (int i = 0; i < NTHREADS; i++)
    {
        thread_count_sum += thread_data[i]->count;
        thread_retries_sum += thread_data[i]->retries;

        // XXX: compute stddev instead
        fprintf(stderr, "thread %d: %llu\n",
                i, (long long unsigned) thread_data[i]->count);
    }

    fprintf(stderr, "final counters after killing are: %llu, per-thread counters sum %llu, diff %lld (allowed >= %d), retries %llu\n",
            (long long unsigned) shared_data.count,
            (long long unsigned) thread_count_sum,
            (long long) shared_data.count - (long long) thread_count_sum,
            -NTHREADS,
            (long long unsigned) thread_retries_sum);
    fprintf(stderr, "throughput per thread: %.2fM/s, total: %.2fM/s\n",
            ((double) thread_count_sum / NTHREADS) / secs / 1000000,
            ((double) thread_count_sum) / secs / 1000000);

    return 0;
}