Skip to content

Instantly share code, notes, and snippets.

@kumagi
Created July 22, 2014 12:55
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kumagi/2e5f7318b262d1f85ead to your computer and use it in GitHub Desktop.
Save kumagi/2e5f7318b262d1f85ead to your computer and use it in GitHub Desktop.
byte_lockが速いとかいうので試してみた。やっつけ実装なのに60倍速い。
// Feature-test macros must be defined before the first #include, otherwise
// the headers have already been processed without the GNU extensions.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
#include <pthread.h> // pthread_create/join, barriers, rwlocks
#include <sched.h> // sched_setaffinity, sched_yield, CPU_SET
#include <stdint.h> // fixed-width integer types
#include <stdio.h> // printf, perror
#include <stdlib.h> // exit
#include <sys/time.h> // gettimeofday
// Reader/writer lock in which every reader announces itself in its own byte.
//
// Readers set slot[tid] and then check that no writer owns the lock; a writer
// first claims owner_tid and then waits until every reader byte is clear.
// Both sides therefore perform "store my flag, then load the other side's
// flag", which is only correct if the store is globally visible before the
// load executes. A compiler-only barrier does not guarantee that, so all
// accesses below go through sequentially consistent / release atomics.
//
// Precondition for every method taking `tid`: 0 <= tid < sizeof(slot).
struct byte_lock {
  byte_lock()
    : owner_tid(0) {
    for (size_t i = 0; i < sizeof(slot); ++i) {
      slot[i] = 0;
    }
  }

  // Acquire the lock in shared mode for reader `tid`.
  void read_lock(int tid) {
    for (;;) {
      // Announce this reader; the seq_cst store acts as the store->load
      // fence that must separate it from the owner check below.
      __atomic_store_n(&slot[tid], (char)1, __ATOMIC_SEQ_CST);
      if (__atomic_load_n(&owner_tid, __ATOMIC_SEQ_CST) == 0) {
        return;
      }
      // A writer holds (or is acquiring) the lock: retract and retry later.
      __atomic_store_n(&slot[tid], (char)0, __ATOMIC_RELEASE);
      sched_yield();
    }
  }

  // Release a shared hold taken with read_lock(tid).
  void read_unlock(int tid) {
    // Release ordering keeps the critical section's accesses before the clear.
    __atomic_store_n(&slot[tid], (char)0, __ATOMIC_RELEASE);
  }

  // Acquire the lock exclusively for writer `tid`.
  void write_lock(int tid) {
    // owner_tid == 0 means "no writer", so store tid + 1: otherwise a writer
    // whose tid is 0 would leave the lock looking free.
    const int owner_mark = tid + 1;
    for (;;) {
      int expected = 0;
      if (__atomic_load_n(&owner_tid, __ATOMIC_RELAXED) == 0 &&
          __atomic_compare_exchange_n(&owner_tid, &expected, owner_mark, false,
                                      __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) {
        break;
      }
      sched_yield();
    }
    // Wait for every announced reader to leave. All slots are inspected one
    // byte at a time (no word-sized type punning, no skipped tail bytes).
    // One pass is enough: a reader that sets its byte after the CAS above
    // will observe owner_tid != 0 and retract without entering.
    for (size_t i = 0; i < sizeof(slot); ++i) {
      while (__atomic_load_n(&slot[i], __ATOMIC_SEQ_CST) != 0) {
        sched_yield();
      }
    }
  }

  // Release the exclusive hold taken with write_lock().
  void write_unlock() {
    __atomic_store_n(&owner_tid, 0, __ATOMIC_RELEASE);
  }

  // 0 when no writer holds the lock, otherwise (writer tid + 1).
  int owner_tid __attribute__((aligned(64)));
  // One byte per reader; sized so the lock fills one 64-byte aligned unit.
  char slot[64 - sizeof(int)];
};
/// ------- benchmark ---------
// Per-thread argument bundle; main() fills one per thread and passes its
// address to work() through pthread_create.
struct working_set {
// Thread index: used as the core number for affinity, as the byte_lock slot,
// and to pick the writer (0) versus reader (non-zero) role.
int tid;
// Barrier shared by all workers and main(); waited on at start and at finish.
pthread_barrier_t* bar;
// Shared byte_lock under test.
byte_lock* bl;
// Shared pthread rwlock used as the comparison baseline.
pthread_rwlock_t* lk;
};
// Benchmark thread body.
//
// `w` points to this thread's working_set. The thread pins itself to core
// `tid`, waits on the shared barrier, then repeatedly takes and releases the
// lock: thread 0 in write mode (10,000,000 times), every other thread in read
// mode (1,000,000,000 times). It waits on the barrier once more when done.
//
// The lock under test is chosen at compile time: build with -DUSE_BYTE_LOCK
// to measure byte_lock, otherwise pthread_rwlock is measured.
//
// Always returns NULL; exits the process if the affinity cannot be set.
void* work(void* w) {
  working_set* ws = (working_set*)w;
  const int tid = ws->tid;
  // Pin this thread to the tid-th core.
  cpu_set_t mask;
  CPU_ZERO(&mask);
  CPU_SET(tid, &mask);
  if (sched_setaffinity(0, sizeof(mask), &mask) == -1) {
    perror("sched_setaffinity");  // perror appends ": <reason>" itself
    exit(1);
  }
  // Wait for the measurement to start.
  pthread_barrier_wait(ws->bar);
  printf("thread[%d] started\n", tid);
#ifdef USE_BYTE_LOCK
  byte_lock* bl = ws->bl;
#else
  pthread_rwlock_t* lk = ws->lk;
#endif
  if (tid == 0) {
    // Writer.
    for (int i = 0; i < 10000000; ++i) {
#ifdef USE_BYTE_LOCK
      bl->write_lock(tid);
      bl->write_unlock();
#else
      pthread_rwlock_wrlock(lk);
      pthread_rwlock_unlock(lk);
#endif
    }
  } else {
    // Reader.
    for (int i = 0; i < 1000000000; ++i) {
#ifdef USE_BYTE_LOCK
      bl->read_lock(tid);
      bl->read_unlock(tid);
#else
      pthread_rwlock_rdlock(lk);
      pthread_rwlock_unlock(lk);
#endif
    }
  }
  printf("thread[%d] finished\n", tid);
  // Signal main() that this thread is done.
  pthread_barrier_wait(ws->bar);
  // pthread start routines must return a value; flowing off the end of a
  // non-void function is undefined behavior.
  return NULL;
}
// Returns the current wall-clock time in seconds as a double, combining the
// whole seconds and the microsecond part reported by gettimeofday.
double now() {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  const double whole_seconds = (double)tv.tv_sec;
  const double microseconds = (double)tv.tv_usec;
  return whole_seconds + microseconds * 1e-6;
}
// Benchmark driver.
//
// Usage: ./a.out <threads>   (e.g. "./a.out 4" runs four worker threads)
//
// Starts <threads> workers running work(), releases them through a barrier,
// waits on the barrier again until all of them have finished, and prints the
// elapsed wall-clock time. Returns 0 on success and 1 on bad usage or when a
// synchronization object cannot be initialized; exits with status 1 if a
// thread cannot be created.
int main(int argc, char** argv) {
  if (argc < 2) {
    printf("set thread num\n");
    return 1;  // thread count was not given
  }
  // Parse strictly: a zero, negative or non-numeric count would produce
  // zero/negative-length arrays below, and thread indices are used as CPU
  // numbers in a cpu_set_t, which only holds CPU_SETSIZE entries.
  char* end = NULL;
  const long requested = strtol(argv[1], &end, 10);
  if (end == argv[1] || *end != '\0' || requested < 1 ||
      requested > CPU_SETSIZE) {
    fprintf(stderr, "thread num must be an integer in [1, %d]\n",
            (int)CPU_SETSIZE);
    return 1;
  }
  const int threads = (int)requested;

  // Set up the threads and the shared data structures.
  pthread_t th[threads];
  working_set ws[threads];
  pthread_barrier_t barrier;
  byte_lock bl;
  pthread_rwlock_t lk;
  if (pthread_rwlock_init(&lk, NULL) != 0) {
    fprintf(stderr, "pthread_rwlock_init failed\n");
    return 1;
  }
  // threads + 1 participants: every worker plus this main thread.
  if (pthread_barrier_init(&barrier, NULL, threads + 1) != 0) {
    fprintf(stderr, "pthread_barrier_init failed\n");
    pthread_rwlock_destroy(&lk);
    return 1;
  }
  for (int i = 0; i < threads; ++i) {
    ws[i].tid = i;
    ws[i].bar = &barrier;
    ws[i].bl = &bl;
    ws[i].lk = &lk;
    const int rc = pthread_create(&th[i], NULL, work, &ws[i]);
    if (rc != 0) {
      // The barrier could never be satisfied with a missing worker, so
      // terminate the whole process instead of hanging.
      fprintf(stderr, "pthread_create failed for thread %d (error %d)\n",
              i, rc);
      exit(1);
    }
  }
  // Start of the measurement.
  const double start = now();
  pthread_barrier_wait(&barrier);
  // End of the measurement: returns once every worker has finished.
  pthread_barrier_wait(&barrier);
  const double finish = now();
  for (int i = 0; i < threads; ++i) {
    pthread_join(th[i], NULL);
  }
  pthread_barrier_destroy(&barrier);
  pthread_rwlock_destroy(&lk);
  // Report the elapsed time.
  printf("thread[%d] time: %lf sec\n", threads, finish - start);
  return 0;
}
@kumagi
Copy link
Author

kumagi commented Jul 22, 2014

byte_lock使った方

$ ./a.out 4
thread[0] started
thread[2] started
thread[3] started
thread[1] started
thread[1] finished
thread[3] finished
thread[2] finished
thread[0] finished
thread[4] time: 8.872833 sec

@kumagi
Copy link
Author

kumagi commented Jul 22, 2014

pthread_rwlock使った方

$ ./a.out 4
thread[2] started
thread[1] started
thread[3] started
thread[0] started
thread[0] finished
thread[1] finished
thread[2] finished
thread[3] finished
thread[4] time: 494.172233 sec

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment