sile/bench.cc

## bench.cc
// g++ -O3 -pthread -std=c++11 -o bench bench.cc
//
// Usage: bench type(0..6) thread_count loop_count
#include <iostream>
#include <cassert>
#include <cstdlib>
#include <thread>
#include <vector>
#include <functional>
#include "counter.hh"

template <class T>
void worker(T * c, int begin, int end) {
  unsigned int tid = std::hash<std::thread::id>()(std::this_thread::get_id());
  for(int i=begin; i < end; i++) {
    c->next(tid);
  }
}


template <class T>
void do_bench(T * counter, const char * type, const int thread_count, const int loop_count) {
  std::vector<std::thread*> threads;
  const int per_work = loop_count / thread_count;
  std::cout << "type=" << type << ", thread=" << thread_count << ", count=" << loop_count << std::endl;
  for(int i = 0; i < loop_count; i += per_work) {
    threads.push_back(new std::thread(worker<T>, counter, i, i + per_work));
  }
  for(int i = 0; i < threads.size(); i++) {
    threads[i]->join();
  }
  std::cout << "result: " << counter->next(0) << std::endl;
}

int main(int argc, char ** argv) {
  UnsafeCounter c0;
  AtomicCounter c1;
  BitonicCounter<2> c2;
  BitonicCounter<4> c3;
  BitonicCounter<8> c4;
  BitonicCounter<16> c5;
  BitonicCounter<32> c6;

  int type = atoi(argv[1]);
  int thread_count = atoi(argv[2]);
  int loop_count = atoi(argv[3]);

  switch (type) {
  case 0: do_bench(&c0, "unsafe", thread_count, loop_count); break;
  case 1: do_bench(&c1, "atomic", thread_count, loop_count); break;
  case 2: do_bench(&c2, "bitonic<2>", thread_count, loop_count); break;
  case 3: do_bench(&c3, "bitonic<4>", thread_count, loop_count); break;
  case 4: do_bench(&c4, "bitonic<8>", thread_count, loop_count); break;
  case 5: do_bench(&c5, "bitonic<16>", thread_count, loop_count); break;
  case 6: do_bench(&c6, "bitonic<32>", thread_count, loop_count); break;
  }

  return 0;
}

## bitonic.hh
#ifndef __BITONIC_HH__
#define __BITONIC_HH__

#include <atomic>
#include <cassert>

class Balancer {
public:
  Balancer() : toggle(0) { assert(toggle.is_lock_free()); }
  int traverse() { return toggle.fetch_add(1) % 2; }

private:
  std::atomic<int> toggle;
};

template <int SIZE>
class Merger {
public:
  int traverse(int input) {
    int i = (input < SIZE / 2) ? (input % 2) : (1 - input % 2);
    int output = half[i].traverse(input / 2);
    return 2 * output + layer[output].traverse();
  }

private:
  Merger<SIZE/2> half[2];
  Balancer layer[SIZE / 2];
};

template <>
class Merger<2> {
public:
  int traverse(int input) { return layer.traverse(); }

private:
  Balancer layer;
};

template <int SIZE>
class Bitonic {
public:
  int traverse(int input) {
    int output = half[input / (SIZE / 2)].traverse(input / 2);
    return merger.traverse((input >= (SIZE / 2) ? (SIZE / 2) : 0) + output);
  }

private:
  Bitonic<SIZE/2> half[2];
  Merger<SIZE> merger;
};

template <>
class Bitonic <2> {
public:
  int traverse(int input) { return merger.traverse(input); }
private:
  Merger<2> merger;
};

#endif

## counter.hh
#ifndef __COUNTER_HH__
#define __COUNTER_HH__

#include "bitonic.hh"
#include <atomic>

class UnsafeCounter {
public:
  UnsafeCounter() : counter(0) {}
  int next(unsigned int thread_id) { return counter++; }

private:
  volatile int counter;
};

class AtomicCounter {
public:
  AtomicCounter() : counter(0) {}
  int next(unsigned int thread_id) { return counter.fetch_add(1); }

private:
  std::atomic<int> counter;
};

template <int SIZE>
class BitonicCounter {
public:
  BitonicCounter() {
    for(int i = 0; i < SIZE; i++) {
      table[i] = 0;
    }
  }

  int next(unsigned int thread_id) {
    int i = bitonic.traverse(thread_id % SIZE);
    int c = table[i].fetch_add(1);
    return c * SIZE + i;
  }

private:
  Bitonic<SIZE> bitonic;
  std::atomic<int> table[SIZE];
};

#endif

## result.md

      
    Raw
  

              result.md
            
          
    測定結果

環境


EC2

c4.8xlarge
Intel(R) Xeon(R) CPU E5-2666 v3 @ 2.90GHz (x36)


Ubuntu-14.04
gcc version 4.8.2 (Ubuntu 4.8.2-19ubuntu1)

計測方法

三回実行して中央値を採用
$ time ./bench 0..6 1..10000 100000000
結果

縦軸はスレッド数、横軸はカウンタの種類
カウンタ:

unsafe: 同期なし (並列時には正しい結果を返さない。参考までに記載)
atomic: アトミック命令と使った単一整数カウンタ
bitonic(N): BitonicCountingNetworkを使ったカウンタ

real time

timeコマンドの実行時間


unsafe (欠損率)
atomic
bitonic(2)
bitonic(8)
bitonic(32)


1
0.193s ( 0.0%)
0.580s
1.143s
4.822s
11.483s


10
0.057s (86.8%)
1.559s
3.191s
14.757s
19.754s


100
0.076s (96.0%)
1.655s
3.336s
16.153s
17.108s


1000
0.084s (88.2%)
1.698s
3.347s
15.842s
17.437s


10000
0.451s (37.2%)
2.005s
3.557s
18.084s
17.464s


100000
4.347s (00.1%)
4.888s
6.482s
19.454s
17.108s


user time

全スレッドの実行時間の合計 (システム時間は除く ※除かない方が良かったかも...)


unsafe
atomic
bitonic(2)
bitonic(8)
bitonic(32)


1
0.193s
0.579s
1.142s
4.821s
11.481s


10
0.415s
12.534s
25.281s
132.889s
163.568s


100
2.257s
55.549s
111.133s
571.614s
603.764s


1000
1.212s
59.785s
119.084s
568.242s
626.111s


10000
0.509s
52.009s
108.870s
642.781s
624.420s


100000
1.009s
8.436s
77.382s
555.895s
489.713s


並列度(?)

user time / real time


unsafe
atomic
bitonic(2)
bitonic(8)
bitonic(32)


1
1.00
0.99
0.99
0.99
0.99


10
7.28
8.03
7.92
9.00
8.28


100
29.69
33.56
33.31
35.43
35.29


1000
14.42
35.20
35.57
35.86
35.90


10000
1.12
25.93
30.60
35.54
35.75


100000
0.23
1.72
11.93
28.57
28.62


所感


36コア(or 16コア32スレッド?)程度なら単純なatomicカウンタの方が高速

同一プロセス上でのカウンタなら(ロックモデルでいう)クリティカルセクションが極めて短いので並列化する動機は低い
bitnicはオーバーヘッドが大きすぎる
それを打ち消すほどの並列度はない


スレッド数が多くなるにつれて、atomicカウンタでは、直列化によるスループットの低下が見て取れる

Nをある程度大きくとったbitnicでは、スレッド数を多くしても、スループットは下がらない (下がりにくい)


要約

atomicカウンタ: 本質的なスケールアップ性は低いが、二桁程度のコア数なら現実的にはこちらの方が高速
bitonicカウンタ: 本質的なスケールアップ性は高いが、基礎オーバヘッドが大きすぎて、現実的ではない

※ スケールアップというよりは、スレッド増加時の性能劣化が少ない、といった方が適切 (いずれのケースでもシングルスレッドが最速)
どういったケースでbitonic(or 他のスケールアップする)カウンタが利用可能か？
コア数が圧倒的に多い環境
一つ当たりのコア性能がかなり低い環境 (クリティカルセクションの時間が相対的に長くなり、直列化の影響を受けやすくなる)
同期コストが高い環境(ex. 分散環境)は逆に不利かもしれない (同期回数自体は増えるので)


生データ

$ time ./bench 0 1 100000000
type=unsafe, thread=1, count=100000000
result: 100000000

real	0m0.193s
user	0m0.193s
sys	0m0.005s

$ time ./bench 0 10 100000000
type=unsafe, thread=10, count=100000000
result: 13175260

real	0m0.057s
user	0m0.415s
sys	0m0.004s

$ time ./bench 0 100 100000000
type=unsafe, thread=100, count=100000000
result: 3930586

real	0m0.076s
user	0m2.257s
sys	0m0.000s

$ time ./bench 0 1000 100000000
type=unsafe, thread=1000, count=100000000
result: 11782362

real	0m0.084s
user	0m1.212s
sys	0m0.097s

$ time ./bench 0 10000 100000000
type=unsafe, thread=10000, count=100000000
result: 62775562

real	0m0.451s
user	0m0.509s
sys	0m0.718s

$ time ./bench 0 100000 100000000
type=unsafe, thread=100000, count=100000000
result: 99893306

real	0m4.347s
user	0m1.009s
sys	0m6.178s

$ time ./bench 1 1 100000000
type=atomic, thread=1, count=100000000
result: 100000000

real	0m0.580s
user	0m0.579s
sys	0m0.000s

$ time ./bench 1 10 100000000
type=atomic, thread=10, count=100000000
result: 100000000

real	0m1.559s
user	0m12.534s
sys	0m0.004s

$ time ./bench 1 100 100000000
type=atomic, thread=100, count=100000000
result: 100000000

real	0m1.655s
user	0m55.549s
sys	0m0.012s

$ time ./bench 1 1000 100000000
type=atomic, thread=1000, count=100000000
result: 100000000

real	0m1.698s
user	0m59.785s
sys	0m0.216s

$ time ./bench 1 10000 100000000
type=atomic, thread=10000, count=100000000
result: 100000000

real	0m2.005s
user	0m52.009s
sys	0m1.694s

$ time ./bench 1 100000 100000000
type=atomic, thread=100000, count=100000000
result: 100000000

real	0m4.888s
user	0m8.436s
sys	0m3.029s

$ time ./bench 2 1 100000000
type=bitonic<2>, thread=1, count=100000000
result: 100000000

real	0m1.143s
user	0m1.142s
sys	0m0.000s

$ time ./bench 2 10 100000000
type=bitonic<2>, thread=10, count=100000000
result: 100000000

real	0m3.191s
user	0m25.281s
sys	0m0.000s

$ time ./bench 2 100 100000000
type=bitonic<2>, thread=100, count=100000000
result: 100000000

real	0m3.336s
user	1m51.133s
sys	0m0.128s

$ time ./bench 2 1000 100000000
type=bitonic<2>, thread=1000, count=100000000
result: 100000000

real	0m3.347s
user	1m59.084s
sys	0m0.219s

$ time ./bench 2 10000 100000000
type=bitonic<2>, thread=10000, count=100000000
result: 100000000

real	0m3.557s
user	1m48.870s
sys	0m2.054s

$ time ./bench 2 100000 100000000
type=bitonic<2>, thread=100000, count=100000000
result: 100000000

real	0m6.482s
user	1m17.382s
sys	0m6.306s

$ time ./bench 4 1 100000000
type=bitonic<8>, thread=1, count=100000000
result: 100000000

real	0m4.822s
user	0m4.821s
sys	0m0.004s

$ time ./bench 4 10 100000000
type=bitonic<8>, thread=10, count=100000000
result: 100000000

real	0m14.757s
user	2m12.889s
sys	0m0.000s

$ time ./bench 4 100 100000000
type=bitonic<8>, thread=100, count=100000000
result: 100000000

real	0m16.153s
user	9m31.614s
sys	0m0.140s

$ time ./bench 4 1000 100000000
type=bitonic<8>, thread=1000, count=100000000
result: 100000000

real	0m15.842s
user	9m28.242s
sys	0m0.228s

$ time ./bench 4 10000 100000000
type=bitonic<8>, thread=10000, count=100000000
result: 100000000

real	0m18.084s
user	10m42.781s
sys	0m1.664s

$ time ./bench 4 100000 100000000
type=bitonic<8>, thread=100000, count=100000000
result: 100000000

real	0m19.454s
user	9m15.895s
sys	0m12.347s

$ time ./bench 6 1 100000000
type=bitonic<32>, thread=1, count=100000000
result: 100000000

real	0m11.483s
user	0m11.481s
sys	0m0.000s

$ time ./bench 6 10 100000000
type=bitonic<32>, thread=10, count=100000000
result: 100000000

real	0m19.754s
user	2m43.568s
sys	0m0.000s

$ time ./bench 6 100 100000000
type=bitonic<32>, thread=100, count=100000000
result: 100000000

real	0m17.108s
user	10m3.764s
sys	0m0.056s

$ time ./bench 6 1000 100000000
type=bitonic<32>, thread=1000, count=100000000
result: 100000000

real	0m17.437s
user	10m26.111s
sys	0m0.126s

$ time ./bench 6 10000 100000000
type=bitonic<32>, thread=10000, count=100000000
result: 100000000

real	0m17.464s
user	10m24.420s
sys	0m0.707s

$ time ./bench 6 100000 100000000
type=bitonic<32>, thread=100000, count=100000000
result: 100000000

real	0m17.108s
user	8m9.713s
sys	0m7.057s
	// g++ -O3 -pthread -std=c++11 -o bench bench.cc
	//
	// Usage: bench type(0..6) thread_count loop_count
	#include <iostream>
	#include <cassert>
	#include <cstdlib>
	#include <thread>
	#include <vector>
	#include <functional>
	#include "counter.hh"

	template <class T>
	void worker(T * c, int begin, int end) {
	unsigned int tid = std::hash<std::thread::id>()(std::this_thread::get_id());
	for(int i=begin; i < end; i++) {
	c->next(tid);
	}
	}


	template <class T>
	void do_bench(T * counter, const char * type, const int thread_count, const int loop_count) {
	std::vector<std::thread*> threads;
	const int per_work = loop_count / thread_count;
	std::cout << "type=" << type << ", thread=" << thread_count << ", count=" << loop_count << std::endl;
	for(int i = 0; i < loop_count; i += per_work) {
	threads.push_back(new std::thread(worker<T>, counter, i, i + per_work));
	}
	for(int i = 0; i < threads.size(); i++) {
	threads[i]->join();
	}
	std::cout << "result: " << counter->next(0) << std::endl;
	}

	int main(int argc, char ** argv) {
	UnsafeCounter c0;
	AtomicCounter c1;
	BitonicCounter<2> c2;
	BitonicCounter<4> c3;
	BitonicCounter<8> c4;
	BitonicCounter<16> c5;
	BitonicCounter<32> c6;

	int type = atoi(argv[1]);
	int thread_count = atoi(argv[2]);
	int loop_count = atoi(argv[3]);

	switch (type) {
	case 0: do_bench(&c0, "unsafe", thread_count, loop_count); break;
	case 1: do_bench(&c1, "atomic", thread_count, loop_count); break;
	case 2: do_bench(&c2, "bitonic<2>", thread_count, loop_count); break;
	case 3: do_bench(&c3, "bitonic<4>", thread_count, loop_count); break;
	case 4: do_bench(&c4, "bitonic<8>", thread_count, loop_count); break;
	case 5: do_bench(&c5, "bitonic<16>", thread_count, loop_count); break;
	case 6: do_bench(&c6, "bitonic<32>", thread_count, loop_count); break;
	}

	return 0;
	}
	#ifndef __BITONIC_HH__
	#define __BITONIC_HH__

	#include <atomic>
	#include <cassert>

	class Balancer {
	public:
	Balancer() : toggle(0) { assert(toggle.is_lock_free()); }
	int traverse() { return toggle.fetch_add(1) % 2; }

	private:
	std::atomic<int> toggle;
	};

	template <int SIZE>
	class Merger {
	public:
	int traverse(int input) {
	int i = (input < SIZE / 2) ? (input % 2) : (1 - input % 2);
	int output = half[i].traverse(input / 2);
	return 2 * output + layer[output].traverse();
	}

	private:
	Merger<SIZE/2> half[2];
	Balancer layer[SIZE / 2];
	};

	template <>
	class Merger<2> {
	public:
	int traverse(int input) { return layer.traverse(); }

	private:
	Balancer layer;
	};

	template <int SIZE>
	class Bitonic {
	public:
	int traverse(int input) {
	int output = half[input / (SIZE / 2)].traverse(input / 2);
	return merger.traverse((input >= (SIZE / 2) ? (SIZE / 2) : 0) + output);
	}

	private:
	Bitonic<SIZE/2> half[2];
	Merger<SIZE> merger;
	};

	template <>
	class Bitonic <2> {
	public:
	int traverse(int input) { return merger.traverse(input); }
	private:
	Merger<2> merger;
	};

	#endif
	#ifndef __COUNTER_HH__
	#define __COUNTER_HH__

	#include "bitonic.hh"
	#include <atomic>

	class UnsafeCounter {
	public:
	UnsafeCounter() : counter(0) {}
	int next(unsigned int thread_id) { return counter++; }

	private:
	volatile int counter;
	};

	class AtomicCounter {
	public:
	AtomicCounter() : counter(0) {}
	int next(unsigned int thread_id) { return counter.fetch_add(1); }

	private:
	std::atomic<int> counter;
	};

	template <int SIZE>
	class BitonicCounter {
	public:
	BitonicCounter() {
	for(int i = 0; i < SIZE; i++) {
	table[i] = 0;
	}
	}

	int next(unsigned int thread_id) {
	int i = bitonic.traverse(thread_id % SIZE);
	int c = table[i].fetch_add(1);
	return c * SIZE + i;
	}

	private:
	Bitonic<SIZE> bitonic;
	std::atomic<int> table[SIZE];
	};

	#endif
	unsafe (欠損率)	atomic	bitonic(2)	bitonic(8)	bitonic(32)
1	0.193s ( 0.0%)	0.580s	1.143s	4.822s	11.483s
10	0.057s (86.8%)	1.559s	3.191s	14.757s	19.754s
100	0.076s (96.0%)	1.655s	3.336s	16.153s	17.108s
1000	0.084s (88.2%)	1.698s	3.347s	15.842s	17.437s
10000	0.451s (37.2%)	2.005s	3.557s	18.084s	17.464s
100000	4.347s (00.1%)	4.888s	6.482s	19.454s	17.108s
	unsafe	atomic	bitonic(2)	bitonic(8)	bitonic(32)
1	0.193s	0.579s	1.142s	4.821s	11.481s
10	0.415s	12.534s	25.281s	132.889s	163.568s
100	2.257s	55.549s	111.133s	571.614s	603.764s
1000	1.212s	59.785s	119.084s	568.242s	626.111s
10000	0.509s	52.009s	108.870s	642.781s	624.420s
100000	1.009s	8.436s	77.382s	555.895s	489.713s
	unsafe	atomic	bitonic(2)	bitonic(8)	bitonic(32)
1	1.00	0.99	0.99	0.99	0.99
10	7.28	8.03	7.92	9.00	8.28
100	29.69	33.56	33.31	35.43	35.29
1000	14.42	35.20	35.57	35.86	35.90
10000	1.12	25.93	30.60	35.54	35.75
100000	0.23	1.72	11.93	28.57	28.62