Skip to content

Instantly share code, notes, and snippets.

@sile
Last active August 29, 2015 14:15
Show Gist options
  • Save sile/995bc284251577b0c258 to your computer and use it in GitHub Desktop.
Save sile/995bc284251577b0c258 to your computer and use it in GitHub Desktop.
『The Art of Multiprocessor Programming』の12章に出てくるBitonic Counting Networkの実装
// g++ -O3 -pthread -std=c++11 -o bench bench.cc
//
// Usage: bench type(0..6) thread_count loop_count
#include <iostream>
#include <cassert>
#include <cstdlib>
#include <thread>
#include <vector>
#include <functional>
#include "counter.hh"
template <class T>
void worker(T * c, int begin, int end) {
unsigned int tid = std::hash<std::thread::id>()(std::this_thread::get_id());
for(int i=begin; i < end; i++) {
c->next(tid);
}
}
template <class T>
void do_bench(T * counter, const char * type, const int thread_count, const int loop_count) {
std::vector<std::thread*> threads;
const int per_work = loop_count / thread_count;
std::cout << "type=" << type << ", thread=" << thread_count << ", count=" << loop_count << std::endl;
for(int i = 0; i < loop_count; i += per_work) {
threads.push_back(new std::thread(worker<T>, counter, i, i + per_work));
}
for(int i = 0; i < threads.size(); i++) {
threads[i]->join();
}
std::cout << "result: " << counter->next(0) << std::endl;
}
int main(int argc, char ** argv) {
UnsafeCounter c0;
AtomicCounter c1;
BitonicCounter<2> c2;
BitonicCounter<4> c3;
BitonicCounter<8> c4;
BitonicCounter<16> c5;
BitonicCounter<32> c6;
int type = atoi(argv[1]);
int thread_count = atoi(argv[2]);
int loop_count = atoi(argv[3]);
switch (type) {
case 0: do_bench(&c0, "unsafe", thread_count, loop_count); break;
case 1: do_bench(&c1, "atomic", thread_count, loop_count); break;
case 2: do_bench(&c2, "bitonic<2>", thread_count, loop_count); break;
case 3: do_bench(&c3, "bitonic<4>", thread_count, loop_count); break;
case 4: do_bench(&c4, "bitonic<8>", thread_count, loop_count); break;
case 5: do_bench(&c5, "bitonic<16>", thread_count, loop_count); break;
case 6: do_bench(&c6, "bitonic<32>", thread_count, loop_count); break;
}
return 0;
}
#ifndef __BITONIC_HH__
#define __BITONIC_HH__
#include <atomic>
#include <cassert>
class Balancer {
public:
Balancer() : toggle(0) { assert(toggle.is_lock_free()); }
int traverse() { return toggle.fetch_add(1) % 2; }
private:
std::atomic<int> toggle;
};
template <int SIZE>
class Merger {
public:
int traverse(int input) {
int i = (input < SIZE / 2) ? (input % 2) : (1 - input % 2);
int output = half[i].traverse(input / 2);
return 2 * output + layer[output].traverse();
}
private:
Merger<SIZE/2> half[2];
Balancer layer[SIZE / 2];
};
template <>
class Merger<2> {
public:
int traverse(int input) { return layer.traverse(); }
private:
Balancer layer;
};
template <int SIZE>
class Bitonic {
public:
int traverse(int input) {
int output = half[input / (SIZE / 2)].traverse(input / 2);
return merger.traverse((input >= (SIZE / 2) ? (SIZE / 2) : 0) + output);
}
private:
Bitonic<SIZE/2> half[2];
Merger<SIZE> merger;
};
template <>
class Bitonic <2> {
public:
int traverse(int input) { return merger.traverse(input); }
private:
Merger<2> merger;
};
#endif
#ifndef __COUNTER_HH__
#define __COUNTER_HH__
#include "bitonic.hh"
#include <atomic>
class UnsafeCounter {
public:
UnsafeCounter() : counter(0) {}
int next(unsigned int thread_id) { return counter++; }
private:
volatile int counter;
};
class AtomicCounter {
public:
AtomicCounter() : counter(0) {}
int next(unsigned int thread_id) { return counter.fetch_add(1); }
private:
std::atomic<int> counter;
};
template <int SIZE>
class BitonicCounter {
public:
BitonicCounter() {
for(int i = 0; i < SIZE; i++) {
table[i] = 0;
}
}
int next(unsigned int thread_id) {
int i = bitonic.traverse(thread_id % SIZE);
int c = table[i].fetch_add(1);
return c * SIZE + i;
}
private:
Bitonic<SIZE> bitonic;
std::atomic<int> table[SIZE];
};
#endif

測定結果

環境

  • EC2
    • c4.8xlarge
    • Intel(R) Xeon(R) CPU E5-2666 v3 @ 2.90GHz (x36)
  • Ubuntu-14.04
  • gcc version 4.8.2 (Ubuntu 4.8.2-19ubuntu1)

計測方法

三回実行して中央値を採用

$ time ./bench 0..6 1..10000 100000000

結果

縦軸はスレッド数、横軸はカウンタの種類

カウンタ:

  • unsafe: 同期なし (並列時には正しい結果を返さない。参考までに記載)
  • atomic: アトミック命令と使った単一整数カウンタ
  • bitonic(N): BitonicCountingNetworkを使ったカウンタ

real time

timeコマンドの実行時間

unsafe (欠損率) atomic bitonic(2) bitonic(8) bitonic(32)
1 0.193s ( 0.0%) 0.580s 1.143s 4.822s 11.483s
10 0.057s (86.8%) 1.559s 3.191s 14.757s 19.754s
100 0.076s (96.0%) 1.655s 3.336s 16.153s 17.108s
1000 0.084s (88.2%) 1.698s 3.347s 15.842s 17.437s
10000 0.451s (37.2%) 2.005s 3.557s 18.084s 17.464s
100000 4.347s (00.1%) 4.888s 6.482s 19.454s 17.108s

user time

全スレッドの実行時間の合計 (システム時間は除く ※除かない方が良かったかも...)

unsafe atomic bitonic(2) bitonic(8) bitonic(32)
1 0.193s 0.579s 1.142s 4.821s 11.481s
10 0.415s 12.534s 25.281s 132.889s 163.568s
100 2.257s 55.549s 111.133s 571.614s 603.764s
1000 1.212s 59.785s 119.084s 568.242s 626.111s
10000 0.509s 52.009s 108.870s 642.781s 624.420s
100000 1.009s 8.436s 77.382s 555.895s 489.713s

並列度(?)

user time / real time

unsafe atomic bitonic(2) bitonic(8) bitonic(32)
1 1.00 0.99 0.99 0.99 0.99
10 7.28 8.03 7.92 9.00 8.28
100 29.69 33.56 33.31 35.43 35.29
1000 14.42 35.20 35.57 35.86 35.90
10000 1.12 25.93 30.60 35.54 35.75
100000 0.23 1.72 11.93 28.57 28.62

所感

  • 36コア(or 16コア32スレッド?)程度なら単純なatomicカウンタの方が高速
    • 同一プロセス上でのカウンタなら(ロックモデルでいう)クリティカルセクションが極めて短いので並列化する動機は低い
    • bitnicはオーバーヘッドが大きすぎる
    • それを打ち消すほどの並列度はない
  • スレッド数が多くなるにつれて、atomicカウンタでは、直列化によるスループットの低下が見て取れる
    • Nをある程度大きくとったbitnicでは、スレッド数を多くしても、スループットは下がらない (下がりにくい)
  • 要約
    • atomicカウンタ: 本質的なスケールアップ性は低いが、二桁程度のコア数なら現実的にはこちらの方が高速
    • bitonicカウンタ: 本質的なスケールアップ性は高いが、基礎オーバヘッドが大きすぎて、現実的ではない
      • ※ スケールアップというよりは、スレッド増加時の性能劣化が少ない、といった方が適切 (いずれのケースでもシングルスレッドが最速)
      • どういったケースでbitonic(or 他のスケールアップする)カウンタが利用可能か?
      • コア数が圧倒的に多い環境
      • 一つ当たりのコア性能がかなり低い環境 (クリティカルセクションの時間が相対的に長くなり、直列化の影響を受けやすくなる)
      • 同期コストが高い環境(ex. 分散環境)は逆に不利かもしれない (同期回数自体は増えるので)

生データ

$ time ./bench 0 1 100000000
type=unsafe, thread=1, count=100000000
result: 100000000

real	0m0.193s
user	0m0.193s
sys	0m0.005s

$ time ./bench 0 10 100000000
type=unsafe, thread=10, count=100000000
result: 13175260

real	0m0.057s
user	0m0.415s
sys	0m0.004s

$ time ./bench 0 100 100000000
type=unsafe, thread=100, count=100000000
result: 3930586

real	0m0.076s
user	0m2.257s
sys	0m0.000s

$ time ./bench 0 1000 100000000
type=unsafe, thread=1000, count=100000000
result: 11782362

real	0m0.084s
user	0m1.212s
sys	0m0.097s

$ time ./bench 0 10000 100000000
type=unsafe, thread=10000, count=100000000
result: 62775562

real	0m0.451s
user	0m0.509s
sys	0m0.718s

$ time ./bench 0 100000 100000000
type=unsafe, thread=100000, count=100000000
result: 99893306

real	0m4.347s
user	0m1.009s
sys	0m6.178s

$ time ./bench 1 1 100000000
type=atomic, thread=1, count=100000000
result: 100000000

real	0m0.580s
user	0m0.579s
sys	0m0.000s

$ time ./bench 1 10 100000000
type=atomic, thread=10, count=100000000
result: 100000000

real	0m1.559s
user	0m12.534s
sys	0m0.004s

$ time ./bench 1 100 100000000
type=atomic, thread=100, count=100000000
result: 100000000

real	0m1.655s
user	0m55.549s
sys	0m0.012s

$ time ./bench 1 1000 100000000
type=atomic, thread=1000, count=100000000
result: 100000000

real	0m1.698s
user	0m59.785s
sys	0m0.216s

$ time ./bench 1 10000 100000000
type=atomic, thread=10000, count=100000000
result: 100000000

real	0m2.005s
user	0m52.009s
sys	0m1.694s

$ time ./bench 1 100000 100000000
type=atomic, thread=100000, count=100000000
result: 100000000

real	0m4.888s
user	0m8.436s
sys	0m3.029s

$ time ./bench 2 1 100000000
type=bitonic<2>, thread=1, count=100000000
result: 100000000

real	0m1.143s
user	0m1.142s
sys	0m0.000s

$ time ./bench 2 10 100000000
type=bitonic<2>, thread=10, count=100000000
result: 100000000

real	0m3.191s
user	0m25.281s
sys	0m0.000s

$ time ./bench 2 100 100000000
type=bitonic<2>, thread=100, count=100000000
result: 100000000

real	0m3.336s
user	1m51.133s
sys	0m0.128s

$ time ./bench 2 1000 100000000
type=bitonic<2>, thread=1000, count=100000000
result: 100000000

real	0m3.347s
user	1m59.084s
sys	0m0.219s

$ time ./bench 2 10000 100000000
type=bitonic<2>, thread=10000, count=100000000
result: 100000000

real	0m3.557s
user	1m48.870s
sys	0m2.054s

$ time ./bench 2 100000 100000000
type=bitonic<2>, thread=100000, count=100000000
result: 100000000

real	0m6.482s
user	1m17.382s
sys	0m6.306s

$ time ./bench 4 1 100000000
type=bitonic<8>, thread=1, count=100000000
result: 100000000

real	0m4.822s
user	0m4.821s
sys	0m0.004s

$ time ./bench 4 10 100000000
type=bitonic<8>, thread=10, count=100000000
result: 100000000

real	0m14.757s
user	2m12.889s
sys	0m0.000s

$ time ./bench 4 100 100000000
type=bitonic<8>, thread=100, count=100000000
result: 100000000

real	0m16.153s
user	9m31.614s
sys	0m0.140s

$ time ./bench 4 1000 100000000
type=bitonic<8>, thread=1000, count=100000000
result: 100000000

real	0m15.842s
user	9m28.242s
sys	0m0.228s

$ time ./bench 4 10000 100000000
type=bitonic<8>, thread=10000, count=100000000
result: 100000000

real	0m18.084s
user	10m42.781s
sys	0m1.664s

$ time ./bench 4 100000 100000000
type=bitonic<8>, thread=100000, count=100000000
result: 100000000

real	0m19.454s
user	9m15.895s
sys	0m12.347s

$ time ./bench 6 1 100000000
type=bitonic<32>, thread=1, count=100000000
result: 100000000

real	0m11.483s
user	0m11.481s
sys	0m0.000s

$ time ./bench 6 10 100000000
type=bitonic<32>, thread=10, count=100000000
result: 100000000

real	0m19.754s
user	2m43.568s
sys	0m0.000s

$ time ./bench 6 100 100000000
type=bitonic<32>, thread=100, count=100000000
result: 100000000

real	0m17.108s
user	10m3.764s
sys	0m0.056s

$ time ./bench 6 1000 100000000
type=bitonic<32>, thread=1000, count=100000000
result: 100000000

real	0m17.437s
user	10m26.111s
sys	0m0.126s

$ time ./bench 6 10000 100000000
type=bitonic<32>, thread=10000, count=100000000
result: 100000000

real	0m17.464s
user	10m24.420s
sys	0m0.707s

$ time ./bench 6 100000 100000000
type=bitonic<32>, thread=100000, count=100000000
result: 100000000

real	0m17.108s
user	8m9.713s
sys	0m7.057s
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment