Last active
August 29, 2015 14:09
-
-
Save jigsawecho/6a2e78d65f0fe67adf1b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* gcc -Wall -g -O3 a.c -pthread */
/* perf stat -e LLC-store-misses,LLC-store,LLC-load-misses,LLC-load ./a.out */
#define _GNU_SOURCE
#include <errno.h>
#include <pthread.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Upper bound on worker threads; also the number of slots in the distributor. */
#define MAX_WORKER 64
/* Assumed cache-line size in bytes; each work slot is padded to this size. */
#define CACHELINE_SIZE 64
union work { | |
volatile int64_t buf; | |
char pad[CACHELINE_SIZE]; | |
}; | |
struct distributor { | |
union work wrk[MAX_WORKER]; | |
}; | |
/*
 * Spin-wait hint for busy loops.
 *
 * The "memory" clobber makes every variant a compiler barrier as well.
 * `__asm__` is used instead of `asm` so the code also builds in strict ISO
 * modes (-std=c99 / -std=c11), where the plain keyword is not available.
 */
static inline void
my_pause(void)
{
#if defined(__i386__) || defined(__x86_64__)
	/* "rep;nop" is the encoding of the x86 PAUSE instruction. */
	__asm__ __volatile__ ("rep;nop" ::: "memory");
#elif defined(__aarch64__)
	__asm__ __volatile__ ("yield" ::: "memory");
#else
	/* No known hint instruction: keep only the compiler barrier. */
	__asm__ __volatile__ ("" ::: "memory");
#endif
}
static inline int64_t | |
get_work(struct distributor *d, int id) | |
{ | |
union work *w = &d->wrk[id]; | |
while (w->buf) | |
my_pause(); | |
w->buf = 1; | |
while (w->buf) | |
my_pause(); | |
return w->buf; | |
} | |
static void | |
process(struct distributor *d, unsigned num) | |
{ | |
static unsigned id = 0; | |
unsigned i; | |
int64_t data; | |
i = 0; | |
while (i < num) { | |
if (id == MAX_WORKER) | |
id = 0; | |
data = d->wrk[id].buf; | |
if (data) { | |
d->wrk[id].buf = 0; | |
++i; | |
} | |
++id; | |
} | |
} | |
/* The single distributor shared by the master thread and all workers. */
static struct distributor dist;
/* Shutdown flag: workers leave their loop once they read a non-zero value. */
static volatile uint64_t stop = 0;
static void * | |
worker_entry(void *arg) | |
{ | |
int id = (int)(uintptr_t)arg; | |
int cnt = 0; | |
while (!stop) { | |
get_work(&dist, id); | |
cnt++; | |
} | |
printf("kid %d get %d work\n", id + 1, cnt); | |
return NULL; | |
} | |
/*
 * The topology is
 *   NUMA node0 CPU(s): 0-7,16-23
 *   NUMA node1 CPU(s): 8-15,24-31
 * The master core is always 0.
 * If the number of kids is within [1, 7], then all can stay on node0.
 * If the number of kids is within [8, 15], then we should use [1, 7] U [16, 23]
 * for kids.
 * If the number of kids is larger than 15, then we cannot avoid spanning over 2
 * nodes.
 */
/*
 * Pin each of the `cnt` threads in `kids` to a single CPU.
 *
 * naive != 0: kid i goes to CPU i + 1.
 * naive == 0: kid i goes to top[i], a placement order that lists CPUs 1-7 and
 *             16-23 before CPUs 8-15 and 24-31.
 *
 * The table has 31 entries; kids beyond it fall back to CPU i + 1 instead of
 * indexing past the end of the table.  A failed pthread_setaffinity_np() is
 * reported on stderr and the remaining kids are still processed.
 */
static void
set_kids_affinity(pthread_t *kids, int cnt, int naive)
{
	static const int top[] = {
		 1,  2,  3,  4,  5,  6,  7,
		16, 17, 18, 19, 20, 21, 22, 23,
		 8,  9, 10, 11, 12, 13, 14, 15,
		24, 25, 26, 27, 28, 29, 30, 31
	};
	const int ntop = (int)(sizeof(top) / sizeof(top[0]));
	int i;

	for (i = 0; i < cnt; i++) {
		/* One set reused per iteration: no per-kid array whose size cnt could exceed. */
		cpu_set_t cpuset;
		int cpu = (naive || i >= ntop) ? i + 1 : top[i];
		int err;

		CPU_ZERO(&cpuset);
		CPU_SET(cpu, &cpuset);

		/* pthread_setaffinity_np() returns an error number, it does not set errno. */
		err = pthread_setaffinity_np(kids[i], sizeof(cpuset), &cpuset);
		if (err != 0)
			fprintf(stderr, "kid %d: cannot pin to CPU %d: %s\n",
				i + 1, cpu, strerror(err));
	}
}
int | |
main(int argc, char **argv) | |
{ | |
#define DEFAULT_WORKER_CNT 15 | |
int cnt = DEFAULT_WORKER_CNT; | |
int i; | |
cpu_set_t master; | |
pthread_t kids[MAX_WORKER]; | |
if (argc > 1) | |
cnt = atoi(argv[1]); | |
bzero(&dist, sizeof(dist)); | |
CPU_ZERO(&master); | |
CPU_SET(0, &master); | |
pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &master); | |
for (i = 0; i < cnt; i++) | |
pthread_create(&kids[i], NULL, worker_entry, (void *)(uintptr_t)i); | |
/* XXX | |
* Use default worker num, 15. | |
* Change 0 to 1 to see difference of LLC cache miss. | |
*/ | |
set_kids_affinity(kids, cnt, 0); | |
process(&dist, 1000000); | |
stop = 1; | |
/* clean up */ | |
while (1) { | |
int dirty = 0; | |
for (i = 0; i < MAX_WORKER; i++) { | |
if (dist.wrk[i].buf) { | |
dist.wrk[i].buf = 0; | |
++dirty; | |
} | |
} | |
if (!dirty) | |
break; | |
} | |
for (i = 0; i < cnt; i++) | |
pthread_join(kids[i], NULL); | |
printf("OK\n"); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment