Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
/* gcc -Wall -g -O3 a.c -pthread */
/* perf stat -e LLC-store-misses,LLC-store,LLC-load-misses,LLC-load ./a.out */
#define _GNU_SOURCE
#include <errno.h>
#include <pthread.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Upper bound on the number of worker slots in a distributor. */
#define MAX_WORKER 64
/* Size in bytes that each worker slot is padded and aligned to. */
#define CACHELINE_SIZE 64

/*
 * One mailbox slot.  `buf` is the only live field; `pad` stretches the slot
 * to CACHELINE_SIZE bytes.  Padding alone does not stop a slot from
 * straddling two cache lines, so the union is also aligned to
 * CACHELINE_SIZE: every slot then starts on its own line and two workers
 * never share one.
 */
union work {
	volatile int64_t buf;
	char pad[CACHELINE_SIZE];
} __attribute__((aligned(CACHELINE_SIZE)));

_Static_assert(sizeof(union work) == CACHELINE_SIZE,
	       "union work must occupy exactly one cache line");

/* The array of per-worker slots, indexed by worker id. */
struct distributor {
	union work wrk[MAX_WORKER];
};
/*
 * Spin-wait hint for busy loops.
 *
 * Always acts as a compiler barrier (the "memory" clobber), and additionally
 * issues the CPU's spin-loop hint where one is known.  `__asm__` is used
 * instead of the plain `asm` keyword so the file also builds in ISO modes
 * such as -std=c11, where `asm` is not a keyword.
 */
static inline void
my_pause(void)
{
#if defined(__x86_64__) || defined(__i386__)
	/* "pause" is the mnemonic for the same encoding as "rep; nop". */
	__asm__ __volatile__ ("pause" ::: "memory");
#elif defined(__aarch64__)
	__asm__ __volatile__ ("yield" ::: "memory");
#else
	/* No known hint instruction: compiler barrier only. */
	__asm__ __volatile__ ("" ::: "memory");
#endif
}
/* Busy-wait, pausing between polls, until *flag reads as zero. */
static inline void
spin_until_zero(volatile int64_t *flag)
{
	for (;;) {
		if (!*flag)
			break;
		my_pause();
	}
}

/*
 * Worker side of the handshake on slot `id` of distributor `d`.
 *
 * Waits for the slot to read zero, writes 1 into it, then waits for it to
 * read zero again (process() is what writes the zero).  The return value is
 * a fresh read of the slot taken after that second wait.
 */
static inline int64_t
get_work(struct distributor *d, int id)
{
	volatile int64_t *flag = &d->wrk[id].buf;

	/* Do not post on top of a request that is still pending. */
	spin_until_zero(flag);

	*flag = 1;

	/* Block until the request has been cleared. */
	spin_until_zero(flag);

	return *flag;
}
/*
 * Master side of the handshake: walk the slots of `d` in index order,
 * wrapping at MAX_WORKER, and reset every non-zero slot to zero until `num`
 * slots have been reset.
 *
 * The scan position is kept in a function-local static, so a later call
 * resumes where the previous one stopped.
 */
static void
process(struct distributor *d, unsigned num)
{
	static unsigned cursor = 0;
	unsigned served;

	for (served = 0; served < num; ++cursor) {
		volatile int64_t *flag;

		if (cursor == MAX_WORKER)
			cursor = 0;

		flag = &d->wrk[cursor].buf;
		if (*flag == 0)
			continue;	/* nothing posted in this slot */

		*flag = 0;
		++served;
	}
}
/* The one distributor shared by main/process() and every worker thread. */
static struct distributor dist;
/* Written to 1 by main after process() returns; polled by worker_entry(). */
static volatile uint64_t stop = 0;
/*
 * Thread body for one worker.
 *
 * `arg` carries the worker's slot index packed into the pointer value.  The
 * worker calls get_work() on its slot for as long as `stop` reads zero,
 * then prints how many calls completed (worker numbers are printed 1-based).
 * Always returns NULL.
 */
static void *
worker_entry(void *arg)
{
	const int slot = (int)(uintptr_t)arg;
	int done;

	for (done = 0; !stop; done++)
		get_work(&dist, slot);

	printf("kid %d get %d work\n", slot + 1, done);
	return NULL;
}
/*
 * Pin each of the first `cnt` threads in `kids` to a single CPU.
 *
 * The placement table below was written for this topology:
 *   NUMA node0 CPU(s): 0-7,16-23
 *   NUMA node1 CPU(s): 8-15,24-31
 * The master always runs on CPU 0, so:
 *   - 1..7 kids all fit on node0 (CPUs 1-7);
 *   - 8..15 kids use CPUs 1-7 and 16-23, still node0;
 *   - more than 15 kids unavoidably span both nodes.
 *
 * naive != 0: kid i is pinned to CPU i + 1, ignoring the topology.
 * naive == 0: kid i is pinned to top[i].  Kids beyond the end of the table
 *             are left unpinned rather than indexing past it.
 *
 * A failure to pin is reported on stderr and otherwise ignored, since
 * pinning only affects where the benchmark runs, not whether it works.
 */
static void
set_kids_affinity(pthread_t *kids, int cnt, int naive)
{
	static const int top[] = {
		1, 2, 3, 4, 5, 6, 7,
		16, 17, 18, 19, 20, 21, 22, 23,
		8, 9, 10, 11, 12, 13, 14, 15,
		24, 25, 26, 27, 28, 29, 30, 31
	};
	const int ntop = (int)(sizeof(top) / sizeof(top[0]));
	cpu_set_t cpuset;	/* reused for every kid */
	int i;

	for (i = 0; i < cnt; i++) {
		int cpu;
		int rc;

		if (naive) {
			cpu = i + 1;
		} else if (i < ntop) {
			cpu = top[i];
		} else {
			fprintf(stderr,
				"kid %d: no entry in placement table, left unpinned\n",
				i + 1);
			continue;
		}

		CPU_ZERO(&cpuset);
		CPU_SET(cpu, &cpuset);
		rc = pthread_setaffinity_np(kids[i], sizeof(cpuset), &cpuset);
		if (rc != 0)
			fprintf(stderr, "kid %d: cannot pin to CPU %d: %s\n",
				i + 1, cpu, strerror(rc));
	}
}
/*
 * Benchmark driver.
 *
 * Usage: a.out [workers]
 *   workers  number of worker threads, 1..MAX_WORKER
 *            (default DEFAULT_WORKER_CNT)
 *
 * Pins the master to CPU 0, starts the workers, pins them, serves one
 * million requests via process(), then shuts the workers down and joins
 * them.  Returns 0 on success, EXIT_FAILURE on a bad argument or when no
 * worker could be started.
 */
int
main(int argc, char **argv)
{
#define DEFAULT_WORKER_CNT 15
	int cnt = DEFAULT_WORKER_CNT;
	int i;
	int rc;
	cpu_set_t master;
	pthread_t kids[MAX_WORKER];

	if (argc > 1) {
		char *end;
		long val = strtol(argv[1], &end, 10);

		/*
		 * kids[] holds MAX_WORKER entries, and with no worker at all
		 * process() would wait forever, so reject anything outside
		 * 1..MAX_WORKER as well as non-numeric input.
		 */
		if (end == argv[1] || *end != '\0' ||
		    val < 1 || val > MAX_WORKER) {
			fprintf(stderr, "usage: %s [workers: 1..%d]\n",
				argv[0], MAX_WORKER);
			return EXIT_FAILURE;
		}
		cnt = (int)val;
	}

	memset(&dist, 0, sizeof(dist));

	CPU_ZERO(&master);
	CPU_SET(0, &master);
	rc = pthread_setaffinity_np(pthread_self(), sizeof(master), &master);
	if (rc != 0)
		fprintf(stderr, "cannot pin master to CPU 0: %s\n",
			strerror(rc));

	for (i = 0; i < cnt; i++) {
		rc = pthread_create(&kids[i], NULL, worker_entry,
				    (void *)(uintptr_t)i);
		if (rc != 0) {
			fprintf(stderr, "cannot create kid %d: %s\n",
				i + 1, strerror(rc));
			break;
		}
	}
	/* Carry on with however many workers were actually started. */
	cnt = i;
	if (cnt == 0)
		return EXIT_FAILURE;

	/* XXX
	 * Use the default worker count, 15.
	 * Change the last argument from 0 to 1 (naive placement) to see the
	 * difference in LLC cache misses.
	 */
	set_kids_affinity(kids, cnt, 0);
	process(&dist, 1000000);
	stop = 1;

	/*
	 * Clean up.  A worker may have read `stop` as 0 just before it was
	 * set and then posted one more request, which it will wait on
	 * forever unless somebody clears it.  So keep clearing each worker's
	 * slot until that worker has really exited, instead of sweeping the
	 * slots once and then blocking in pthread_join().
	 */
	for (i = 0; i < cnt; i++) {
		for (;;) {
			if (dist.wrk[i].buf)
				dist.wrk[i].buf = 0;
			rc = pthread_tryjoin_np(kids[i], NULL);
			if (rc != EBUSY)
				break;
			sched_yield();
		}
		if (rc != 0)
			fprintf(stderr, "cannot join kid %d: %s\n",
				i + 1, strerror(rc));
	}

	printf("OK\n");
	return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.