duarten/false_sharing.c

## false_sharing.c
/*
 * This is an example program to demonstrate false sharing between threads.
 *
 * It can be compiled two ways:
 *    gcc -g false_sharing.c -Wall -pthread -lnuma -o false_sharing
 *    gcc -g false_sharing.c -Wall -pthread -lnuma -DNO_FALSE_SHARING -o no_false_sharing
 *
 * Defining NO_FALSE_SHARING (-DNO_FALSE_SHARING) aligns each counter to 128 bytes, which reduces the false sharing.
 *
 * The usage is:
 *     ./false_sharing <number of threads in a NUMA node>
 *     ./no_false_sharing <number of threads in a NUMA node>
 */

#define _MULTI_THREADED
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <sched.h>
#include <pthread.h>
#include <sys/types.h>
#include <assert.h>
#include <numa.h>

#define LOOP_CNT (100 * 1024 * 1024)

/*
 * Read the x86 time-stamp counter.
 *
 * The "=a" and "=d" output constraints bind the A and D registers, where the
 * rdtsc instruction leaves the low and high 32 bits of the counter, to `lo`
 * and `hi`; the two halves are then combined into one 64-bit value.
 *
 * Returns the counter value in ticks.
 */
static __inline__ uint64_t rdtsc(void) {
    uint64_t hi, lo;
    __asm__ __volatile__ ( "rdtsc" : "=a"(lo), "=d"(hi));
    return lo | (hi << 32);
}

/*
 * ALIGN is the alignment requested for struct padded_long, and main() asserts
 * that it is also the distance between consecutive array elements.
 * With NO_FALSE_SHARING defined each counter is aligned to 128 bytes;
 * otherwise counters are packed back to back at sizeof(uint64_t).
 * NOTE(review): 128 is presumably chosen to cover at least one cache line on
 * the target machine -- confirm.
 */
#ifdef NO_FALSE_SHARING
#define ALIGN 128
#else
#define ALIGN sizeof(uint64_t)
#endif

/*
 * One per-thread counter. The aligned(ALIGN) type attribute sets the
 * alignment of the whole struct, so ALIGN controls how far apart the
 * counters of neighbouring array elements are.
 */
struct __attribute__((aligned (ALIGN))) padded_long {
    uint64_t value;   /* the counter a worker thread increments */
};

/* Per-worker bookkeeping; main() allocates one entry per thread. */
struct thread_data {
    /* Thread handle written by pthread_create() and used by pthread_join(). */
    pthread_t tid;
    /* NUMA node the worker passes to numa_run_on_node(). */
    int numa_node;
    /* Points at the counter this worker increments in its timed loop. */
    volatile uint64_t* idx;
    /* Set to 1 by the worker once it has started; polled by main(). */
    volatile int running;
};

/*
 * Start flag: every worker spins in read_write_func() until main() sets this
 * to 1, which main() does only after each worker has set its own `running`.
 */
volatile int all_running = 0;

/*
 * check_result(string, val): terminate the program when `val` is non-zero.
 *
 * string - text naming the failed call; printed verbatim after the code.
 * val    - integer status code; 0 means success.
 *
 * On failure prints "Failed with <val> at <string>" and calls exit(1).
 * The do { } while (0) wrapper makes the macro expand to a single statement,
 * so it is safe in an unbraced if/else, and `val` is evaluated exactly once.
 */
#define check_result(string, val) do {                                \
    int check_result_rc_ = (val);                                     \
    if (check_result_rc_) {                                           \
        printf("Failed with %d at %s", check_result_rc_, (string));   \
        exit(1);                                                      \
    }                                                                 \
} while (0)

/*
 * Worker thread body.
 *
 * arg - pointer to this thread's struct thread_data.
 *
 * Asks libnuma to run the thread on its NUMA node, flags itself as running,
 * spins until main() sets all_running, then adds 0..LOOP_CNT-1 to its counter
 * and prints the elapsed rdtsc ticks (in millions) together with the node and
 * the CPU it is currently on.
 *
 * Always returns NULL.
 */
void* read_write_func(void* arg) {
    struct thread_data* t = (struct thread_data*)arg;

    /* Binding is best effort: report a failure but still run the loop. */
    if (numa_run_on_node(t->numa_node) != 0) {
        perror("numa_run_on_node");
    }
    t->running = 1;

    while (!all_running) ;

    uint64_t start = rdtsc();
    for (int i = 0; i < LOOP_CNT; ++i) {
        *t->idx += i;
    }
    uint64_t stop = rdtsc();

    /* uint64_t need not be unsigned long, so cast to the type %llu expects. */
    printf("%llu mticks, node %d, cpu %d\n",
           (unsigned long long)((stop - start) / 1000000), t->numa_node, sched_getcpu());

    return NULL;
}

/*
 * Entry point: start the worker threads, release them together and time the
 * whole run.
 *
 * argv[1] is the number of threads per NUMA node. A value of 1 or less
 * (including non-numeric input, for which atoi() yields 0) runs 2 threads in
 * total; otherwise argv[1] * <number of nodes> threads are started and
 * assigned to the nodes round-robin.
 *
 * Returns 0 on success. Exits with status 1 on a usage error, when libnuma is
 * unavailable, or when an allocation or pthread call fails.
 */
int main(int argc, char *argv[]) {
    if (argc != 2) {
        printf( "usage: %s <n>\n", argv[0] );
        printf( "where \"n\" is the number of threads per NUMA node\n");
        exit(1);
    }

    /* libnuma requires this check before any of its other functions is used. */
    if (numa_available() < 0) {
        fprintf(stderr, "NUMA is not available on this system\n");
        exit(1);
    }

    int num_nodes = numa_max_node() + 1;
    int num_threads = atoi(argv[1]);
    num_threads = num_threads > 1 ? num_threads * num_nodes : 2;

    struct thread_data* threads = calloc((size_t)num_threads, sizeof *threads);
    if (threads == NULL) {
        perror("calloc");
        exit(1);
    }

    /*
     * malloc() does not promise ALIGN-byte alignment, so request it
     * explicitly. The counters are zeroed because the workers
     * read-modify-write them.
     */
    void* raw_longs = NULL;
    size_t longs_size = sizeof(struct padded_long) * (size_t)num_threads;
    int alloc_rc = posix_memalign(&raw_longs, ALIGN, longs_size);
    check_result("posix_memalign()\n", alloc_rc);
    memset(raw_longs, 0, longs_size);
    struct padded_long* longs = raw_longs;

    /* The array stride must equal ALIGN for the experiment to be meaningful. */
    assert(sizeof(struct padded_long) == ALIGN);

    for (int i = 0; i < num_threads; ++i) {
        threads[i].idx = &longs[i].value;
        threads[i].numa_node = i % num_nodes;
        threads[i].running = 0;
        int rc = pthread_create(&threads[i].tid, NULL, read_write_func, &threads[i]);
        check_result("pthread_create()\n", rc);
    }

    /* Wait until every worker has started, then release them all at once. */
    for (int i = 0; i < num_threads; ++i) {
        while (!threads[i].running) ;
    }
    all_running = 1;

    uint64_t start = rdtsc();
    for (int i = 0; i < num_threads; i++) {
        int rc = pthread_join(threads[i].tid, NULL);
        check_result("pthread_join()\n", rc);
    }
    uint64_t stop = rdtsc();
    int cpu = sched_getcpu();
    int node = numa_node_of_cpu(cpu);
    /* uint64_t need not be unsigned long, so cast to the type %llu expects. */
    printf("main %llu mticks, node %d, cpu %d\n",
           (unsigned long long)((stop - start) / 1000000), node, cpu);

    free(threads);
    free(longs);
    return 0;
}
	/*
	* This is an example program to demonstrate false sharing between threads.
	*
	* It can be compiled two ways:
	* gcc -g false_sharing.c -Wall -pthread -lnuma -o false_sharing
	* gcc -g false_sharing.c -Wall -pthread -lnuma -DNO_FALSE_SHARING -o no_false_sharing
	*
	* The -DNO_FALSE_SHARING macro reduces the false sharing.
	*
	* The usage is:
	* ./false_sharing <number of threads in a NUMA node>
	* ./no_false_sharing <number of threads in a NUMA node>
	*/

	#define _MULTI_THREADED
	#define _GNU_SOURCE
	#include <stdio.h>
	#include <string.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <unistd.h>
	#include <sched.h>
	#include <pthread.h>
	#include <sys/types.h>
	#include <assert.h>
	#include <numa.h>

	#define LOOP_CNT (100 * 1024 * 1024)

	/*
	 * Read the x86 time-stamp counter.
	 *
	 * The "=a" and "=d" output constraints bind the A and D registers, where
	 * the rdtsc instruction leaves the low and high 32 bits of the counter,
	 * to `lo` and `hi`; the halves are then combined into one 64-bit value.
	 *
	 * Returns the counter value in ticks.
	 */
	static __inline__ uint64_t rdtsc(void) {
	    uint64_t hi, lo;
	    __asm__ __volatile__ ( "rdtsc" : "=a"(lo), "=d"(hi));
	    return lo | (hi << 32);
	}

	/*
	 * ALIGN is the alignment requested for struct padded_long, and main()
	 * asserts that it is also the distance between consecutive array elements.
	 * With NO_FALSE_SHARING defined each counter is aligned to 128 bytes;
	 * otherwise counters are packed back to back at sizeof(uint64_t).
	 * NOTE(review): 128 is presumably chosen to cover at least one cache line
	 * on the target machine -- confirm.
	 */
	#ifdef NO_FALSE_SHARING
	#define ALIGN 128
	#else
	#define ALIGN sizeof(uint64_t)
	#endif

	/*
	 * One per-thread counter. The aligned(ALIGN) type attribute sets the
	 * alignment of the whole struct, so ALIGN controls how far apart the
	 * counters of neighbouring array elements are.
	 */
	struct __attribute__((aligned (ALIGN))) padded_long {
	    uint64_t value;   /* the counter a worker thread increments */
	};

	/* Per-worker bookkeeping; main() allocates one entry per thread. */
	struct thread_data {
	/* Thread handle written by pthread_create() and used by pthread_join(). */
	pthread_t tid;
	/* NUMA node the worker passes to numa_run_on_node(). */
	int numa_node;
	/* Points at the counter this worker increments in its timed loop. */
	volatile uint64_t* idx;
	/* Set to 1 by the worker once it has started; polled by main(). */
	volatile int running;
	};

	/*
	 * Start flag: every worker spins in read_write_func() until main() sets
	 * this to 1, which main() does only after each worker has set its own
	 * `running` flag.
	 */
	volatile int all_running = 0;

	/*
	 * check_result(string, val): terminate the program when `val` is non-zero.
	 *
	 * string - text naming the failed call; printed verbatim after the code.
	 * val    - integer status code; 0 means success.
	 *
	 * On failure prints "Failed with <val> at <string>" and calls exit(1).
	 * The do { } while (0) wrapper makes the macro expand to a single
	 * statement, so it is safe in an unbraced if/else, and `val` is evaluated
	 * exactly once.
	 */
	#define check_result(string, val) do {                                \
	    int check_result_rc_ = (val);                                     \
	    if (check_result_rc_) {                                           \
	        printf("Failed with %d at %s", check_result_rc_, (string));   \
	        exit(1);                                                      \
	    }                                                                 \
	} while (0)

	/*
	 * Worker thread body.
	 *
	 * arg - pointer to this thread's struct thread_data.
	 *
	 * Asks libnuma to run the thread on its NUMA node, flags itself as
	 * running, spins until main() sets all_running, then adds 0..LOOP_CNT-1 to
	 * its counter and prints the elapsed rdtsc ticks (in millions) together
	 * with the node and the CPU it is currently on.
	 *
	 * Always returns NULL.
	 */
	void* read_write_func(void* arg) {
	    struct thread_data* t = (struct thread_data*)arg;

	    /* Binding is best effort: report a failure but still run the loop. */
	    if (numa_run_on_node(t->numa_node) != 0) {
	        perror("numa_run_on_node");
	    }
	    t->running = 1;

	    while (!all_running) ;

	    uint64_t start = rdtsc();
	    for (int i = 0; i < LOOP_CNT; ++i) {
	        *t->idx += i;
	    }
	    uint64_t stop = rdtsc();

	    /* uint64_t need not be unsigned long, so cast to what %llu expects. */
	    printf("%llu mticks, node %d, cpu %d\n",
	           (unsigned long long)((stop - start) / 1000000), t->numa_node, sched_getcpu());

	    return NULL;
	}

	/*
	 * Entry point: start the worker threads, release them together and time
	 * the whole run.
	 *
	 * argv[1] is the number of threads per NUMA node. A value of 1 or less
	 * (including non-numeric input, for which atoi() yields 0) runs 2 threads
	 * in total; otherwise argv[1] * <number of nodes> threads are started and
	 * assigned to the nodes round-robin.
	 *
	 * Returns 0 on success. Exits with status 1 on a usage error, when libnuma
	 * is unavailable, or when an allocation or pthread call fails.
	 */
	int main(int argc, char *argv[]) {
	    if (argc != 2) {
	        printf( "usage: %s <n>\n", argv[0] );
	        printf( "where \"n\" is the number of threads per NUMA node\n");
	        exit(1);
	    }

	    /* libnuma requires this check before any other function is used. */
	    if (numa_available() < 0) {
	        fprintf(stderr, "NUMA is not available on this system\n");
	        exit(1);
	    }

	    int num_nodes = numa_max_node() + 1;
	    int num_threads = atoi(argv[1]);
	    num_threads = num_threads > 1 ? num_threads * num_nodes : 2;

	    struct thread_data* threads = calloc((size_t)num_threads, sizeof *threads);
	    if (threads == NULL) {
	        perror("calloc");
	        exit(1);
	    }

	    /*
	     * malloc() does not promise ALIGN-byte alignment, so request it
	     * explicitly. The counters are zeroed because the workers
	     * read-modify-write them.
	     */
	    void* raw_longs = NULL;
	    size_t longs_size = sizeof(struct padded_long) * (size_t)num_threads;
	    int alloc_rc = posix_memalign(&raw_longs, ALIGN, longs_size);
	    check_result("posix_memalign()\n", alloc_rc);
	    memset(raw_longs, 0, longs_size);
	    struct padded_long* longs = raw_longs;

	    /* The array stride must equal ALIGN for the experiment to be meaningful. */
	    assert(sizeof(struct padded_long) == ALIGN);

	    for (int i = 0; i < num_threads; ++i) {
	        threads[i].idx = &longs[i].value;
	        threads[i].numa_node = i % num_nodes;
	        threads[i].running = 0;
	        int rc = pthread_create(&threads[i].tid, NULL, read_write_func, &threads[i]);
	        check_result("pthread_create()\n", rc);
	    }

	    /* Wait until every worker has started, then release them all at once. */
	    for (int i = 0; i < num_threads; ++i) {
	        while (!threads[i].running) ;
	    }
	    all_running = 1;

	    uint64_t start = rdtsc();
	    for (int i = 0; i < num_threads; i++) {
	        int rc = pthread_join(threads[i].tid, NULL);
	        check_result("pthread_join()\n", rc);
	    }
	    uint64_t stop = rdtsc();
	    int cpu = sched_getcpu();
	    int node = numa_node_of_cpu(cpu);
	    /* uint64_t need not be unsigned long, so cast to what %llu expects. */
	    printf("main %llu mticks, node %d, cpu %d\n",
	           (unsigned long long)((stop - start) / 1000000), node, cpu);

	    free(threads);
	    free(longs);
	    return 0;
	}