azat/atomic_benchmark.cpp

## atomic_benchmark.cpp
/// https://github.com/aosp-mirror/platform_bionic/blob/master/benchmarks/atomic_benchmark.cpp

/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Our goal is to measure the cost of various C++ atomic operations.
// Android doesn't really control those. But since some of these operations can be quite
// expensive, this may be useful input for development of higher level code.
// Expected mappings from C++ atomics to hardware primitives can be found at
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html .

#include <atomic>
#include <mutex>

#include <benchmark/benchmark.h>
#include "util.h"

// We time atomic operations separated by a volatile (not atomic!) increment.  This ensures
// that the compiler emits memory instructions (e.g. load or store) prior to any fence or the
// like.  That in turn ensures that the CPU has outstanding memory operations when the fence
// is executed.

// In most respects, we compute best case values. Since there is only one thread, there are no
// coherence misses.

// We assume that the compiler is not smart enough to optimize away fences in a single-threaded
// program. If that changes, we'll need to add a second thread.

// Plain volatile (deliberately not atomic) counter incremented inside the benchmark loops.
static volatile unsigned counter;

// The atomic variable targeted by the atomic operations in the benchmarks below.
std::atomic<int> test_loc(0);

// Volatile destination: benchmarks that accumulate a result store it here once their loop ends.
static volatile unsigned sink;

// Mutex taken by BM_atomic_fetch_add_cs around its counter increment.
static std::mutex mtx;

// Baseline: each iteration does nothing but the volatile `counter` increment that the other
// benchmarks in this file also perform next to their atomic operation.
void BM_atomic_empty(benchmark::State& state) {
  for (; state.KeepRunning();) {
    ++counter;
  }
}
BIONIC_BENCHMARK(BM_atomic_empty);

// Per iteration: one relaxed load of test_loc, then the volatile counter increment.
// The loaded values are summed and the sum is stored to the volatile `sink` after the loop.
static void BM_atomic_load_relaxed(benchmark::State& state) {
  unsigned total = 0;
  for (; state.KeepRunning();) {
    const int observed = test_loc.load(std::memory_order_relaxed);
    total += observed;
    ++counter;
  }
  sink = total;
}
BIONIC_BENCHMARK(BM_atomic_load_relaxed);

// Per iteration: one acquire load of test_loc, then the volatile counter increment.
// The loaded values are summed and the sum is stored to the volatile `sink` after the loop.
static void BM_atomic_load_acquire(benchmark::State& state) {
  unsigned total = 0;
  for (; state.KeepRunning();) {
    const int observed = test_loc.load(std::memory_order_acquire);
    total += observed;
    ++counter;
  }
  sink = total;
}
BIONIC_BENCHMARK(BM_atomic_load_acquire);

// Per iteration: one release store of a fresh value to test_loc, then the volatile counter
// increment.
//
// The value being stored is tracked as `unsigned`: it starts from `counter`, which nothing in
// this file resets between benchmarks, and is bumped once per iteration, so it can run past
// INT_MAX. Unsigned arithmetic wraps; the previous `int` version would have been signed
// overflow, which is undefined behaviour. The cast back to int happens only at the store.
static void BM_atomic_store_release(benchmark::State& state) {
  unsigned i = counter;
  while (state.KeepRunning()) {
    test_loc.store(static_cast<int>(++i), std::memory_order_release);
    ++counter;
  }
}
BIONIC_BENCHMARK(BM_atomic_store_release);

// Per iteration: one sequentially consistent store of a fresh value to test_loc, then the
// volatile counter increment.
//
// The value being stored is tracked as `unsigned`: it starts from `counter`, which nothing in
// this file resets between benchmarks, and is bumped once per iteration, so it can run past
// INT_MAX. Unsigned arithmetic wraps; the previous `int` version would have been signed
// overflow, which is undefined behaviour. The cast back to int happens only at the store.
static void BM_atomic_store_seq_cst(benchmark::State& state) {
  unsigned i = counter;
  while (state.KeepRunning()) {
    test_loc.store(static_cast<int>(++i), std::memory_order_seq_cst);
    ++counter;
  }
}
BIONIC_BENCHMARK(BM_atomic_store_seq_cst);

// Per iteration: one relaxed fetch_add(1) on test_loc, then the volatile counter increment.
// The values returned by fetch_add are summed and the sum is stored to `sink` after the loop.
static void BM_atomic_fetch_add_relaxed(benchmark::State& state) {
  unsigned total = 0;
  for (; state.KeepRunning();) {
    const int previous = test_loc.fetch_add(1, std::memory_order_relaxed);
    total += previous;
    ++counter;
  }
  sink = total;
}
BIONIC_BENCHMARK(BM_atomic_fetch_add_relaxed);

// Per iteration: one sequentially consistent fetch_add(1) on test_loc, then the volatile
// counter increment. The values returned by fetch_add are summed and the sum is stored to
// `sink` after the loop.
static void BM_atomic_fetch_add_seq_cst(benchmark::State& state) {
  unsigned total = 0;
  for (; state.KeepRunning();) {
    const int previous = test_loc.fetch_add(1, std::memory_order_seq_cst);
    total += previous;
    ++counter;
  }
  sink = total;
}
BIONIC_BENCHMARK(BM_atomic_fetch_add_seq_cst);

// The fence benchmarks include a relaxed load to make it much harder to optimize away
// the fence.

// Per iteration: a relaxed load of test_loc, an acquire thread fence, then the volatile
// counter increment. The loaded values are summed and the sum is stored to `sink` afterwards.
static void BM_atomic_acquire_fence(benchmark::State& state) {
  unsigned total = 0;
  for (; state.KeepRunning();) {
    const int observed = test_loc.load(std::memory_order_relaxed);
    total += observed;
    std::atomic_thread_fence(std::memory_order_acquire);
    ++counter;
  }
  sink = total;
}
BIONIC_BENCHMARK(BM_atomic_acquire_fence);

// Per iteration: a relaxed load of test_loc, a sequentially consistent thread fence, then the
// volatile counter increment. The loaded values are summed and the sum is stored to `sink`
// afterwards.
static void BM_atomic_seq_cst_fence(benchmark::State& state) {
  unsigned total = 0;
  for (; state.KeepRunning();) {
    const int observed = test_loc.load(std::memory_order_relaxed);
    total += observed;
    std::atomic_thread_fence(std::memory_order_seq_cst);
    ++counter;
  }
  sink = total;
}
BIONIC_BENCHMARK(BM_atomic_seq_cst_fence);

// For comparison, also throw in a critical section version:

// Critical-section counterpart of the fetch_add benchmarks: every iteration takes `mtx`,
// increments the volatile counter while holding it and adds the incremented value to a running
// sum, which is stored to `sink` after the loop.
static void BM_atomic_fetch_add_cs(benchmark::State& state) {
  unsigned total = 0;
  for (; state.KeepRunning();) {
    // The guard goes out of scope at the end of each iteration, releasing the mutex.
    std::lock_guard<std::mutex> guard(mtx);
    total += ++counter;
  }
  sink = total;
}
BIONIC_BENCHMARK(BM_atomic_fetch_add_cs);

## atomic_benchmark.txt
$ src/Common/benchmarks/atomic_benchmark
2020-11-23T22:44:06+03:00
Running src/Common/benchmarks/atomic_benchmark
Run on (12 X 4500 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x6)
  L1 Instruction 32 KiB (x6)
  L2 Unified 256 KiB (x6)
  L3 Unified 12288 KiB (x1)
Load Average: 0.08, 1.15, 1.08
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
----------------------------------------------------------------------
Benchmark                            Time             CPU   Iterations
----------------------------------------------------------------------
BM_atomic_empty                   2.05 ns         2.05 ns    293076586
BM_atomic_load_relaxed            2.07 ns         2.07 ns    338530084
BM_atomic_load_acquire            2.10 ns         2.10 ns    333778097
BM_atomic_store_release           2.02 ns         2.02 ns    347426790
BM_atomic_store_seq_cst           9.64 ns         9.64 ns     72618247
BM_atomic_fetch_add_relaxed       9.64 ns         9.64 ns     72614481
BM_atomic_fetch_add_seq_cst       9.71 ns         9.70 ns     72455484
BM_atomic_acquire_fence           2.10 ns         2.10 ns    333774984
BM_atomic_seq_cst_fence           27.0 ns         27.0 ns     25935192
BM_atomic_fetch_add_cs            39.1 ns         39.0 ns     17922906

## run.sh
clang++ -O2 -o atomic_benchmark atomic_benchmark.cpp -lbenchmark -lbenchmark_main
# -D_LGPL_SOURCE is significant: the RCU benchmark source refuses to build without it.
# Each flavour gets its own output binary so the second build does not overwrite the first.
# NOTE(review): the RCU source shown alongside this script is userspace_rcu_benchmark.cpp, which
# needs RCU_BP or RCU_MEMB defined - confirm the per-flavour .cpp files below exist and set it.
clang++ -O2 -o userspace_rcu_bp_benchmark userspace_rcu_bp_benchmark.cpp -lurcu-bp -lbenchmark -lbenchmark_main -D_LGPL_SOURCE=1
clang++ -O2 -o userspace_rcu_memb_benchmark userspace_rcu_memb_benchmark.cpp -lurcu-memb -lbenchmark -lbenchmark_main -D_LGPL_SOURCE=1

## summary.txt
------------------------------------------------------------------------------
Benchmark                                    Time             CPU   Iterations
------------------------------------------------------------------------------
BM_userspace_rcu                          3.55 ns         3.55 ns    180008939 # bp version
BM_userspace_rcu_uatomic_add              6.94 ns         6.94 ns    100860254 # bp version
BM_userspace_rcu_uatomic_add_return       9.64 ns         9.64 ns     72620501 # bp version
BM_atomic_empty                           2.05 ns         2.05 ns    293076586
BM_atomic_load_relaxed                    2.07 ns         2.07 ns    338530084
BM_atomic_load_acquire                    2.10 ns         2.10 ns    333778097
BM_atomic_store_release                   2.02 ns         2.02 ns    347426790
BM_atomic_store_seq_cst                   9.64 ns         9.64 ns     72618247
BM_atomic_fetch_add_relaxed               9.64 ns         9.64 ns     72614481
BM_atomic_fetch_add_seq_cst               9.71 ns         9.70 ns     72455484
BM_atomic_acquire_fence                   2.10 ns         2.10 ns    333774984
BM_atomic_seq_cst_fence                   27.0 ns         27.0 ns     25935192
BM_atomic_fetch_add_cs                    39.1 ns         39.0 ns     17922906

## userspace_rcu_benchmark.cpp
#include <benchmark/benchmark.h>
#include <urcu/uatomic.h>
#if defined(RCU_MEMB)
#include <urcu.h>
#elif defined(RCU_BP)
#include <urcu-bp.h>
#else
#error None of RCU_* defined
#endif

#if !defined(_LGPL_SOURCE)
#error URCU is very slow w/o _LGPL_SOURCE
#endif

/// Measures one rcu_read_lock() / rcu_read_unlock() pair per iteration, with a plain local
/// increment inside the read-side critical section (despite its name, `atomic` is an ordinary
/// `unsigned`).
///
/// The thread is registered with RCU before entering any read-side critical section and
/// unregistered afterwards: liburcu requires this for the non-bp flavours, while the bp flavour
/// provides the same calls for compatibility. Registration is paired within the function so
/// repeated invocations by the benchmark runner stay balanced.
static void BM_userspace_rcu(benchmark::State& state)
{
    rcu_init();
    rcu_register_thread();

    unsigned atomic = 0;
    while (state.KeepRunning())
    {
        rcu_read_lock();
        ++atomic;
        rcu_read_unlock();
    }

    rcu_unregister_thread();
}
BENCHMARK(BM_userspace_rcu);

/// Measures one uatomic_add(&value, 1) per iteration on a function-local unsigned.
static void BM_userspace_rcu_uatomic_add(benchmark::State& state)
{
    unsigned value = 0;
    for (; state.KeepRunning();)
    {
        uatomic_add(&value, 1);
    }
}
BENCHMARK(BM_userspace_rcu_uatomic_add);

/// Measures one uatomic_add_return(&value, 1) per iteration on a function-local unsigned.
/// The value returned by the call is discarded.
static void BM_userspace_rcu_uatomic_add_return(benchmark::State& state)
{
    unsigned value = 0;
    for (; state.KeepRunning();)
    {
        uatomic_add_return(&value, 1);
    }
}
BENCHMARK(BM_userspace_rcu_uatomic_add_return);

## userspace_rcu_bp_benchmark.txt
$ src/Common/benchmarks/userspace_rcu_bp_benchmark
2020-11-23T22:37:49+03:00
Running src/Common/benchmarks/userspace_rcu_bp_benchmark
Run on (12 X 4500 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x6)
  L1 Instruction 32 KiB (x6)
  L2 Unified 256 KiB (x6)
  L3 Unified 12288 KiB (x1)
Load Average: 6.48, 3.69, 1.56
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
------------------------------------------------------------------------------
Benchmark                                    Time             CPU   Iterations
------------------------------------------------------------------------------
BM_userspace_rcu                          3.55 ns         3.55 ns    180008939
BM_userspace_rcu_uatomic_add              6.94 ns         6.94 ns    100860254
BM_userspace_rcu_uatomic_add_return       9.64 ns         9.64 ns     72620501

## userspace_rcu_memb_benchmark.txt
$ src/Common/benchmarks/userspace_rcu_memb_benchmark
2020-11-23T22:38:01+03:00
Running src/Common/benchmarks/userspace_rcu_memb_benchmark
Run on (12 X 4500 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x6)
  L1 Instruction 32 KiB (x6)
  L2 Unified 256 KiB (x6)
  L3 Unified 12288 KiB (x1)
Load Average: 5.11, 3.52, 1.54
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
------------------------------------------------------------------------------
Benchmark                                    Time             CPU   Iterations
------------------------------------------------------------------------------
BM_userspace_rcu                          4.38 ns         4.38 ns    155844656
BM_userspace_rcu_uatomic_add              6.96 ns         6.95 ns    100851261
BM_userspace_rcu_uatomic_add_return       9.64 ns         9.64 ns     72623295
	/// https://github.com/aosp-mirror/platform_bionic/blob/master/benchmarks/atomic_benchmark.cpp

	/*
	* Copyright (C) 2017 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	// Our goal is to measure the cost of various C++ atomic operations.
	// Android doesn't really control those. But since some of these operations can be quite
	// expensive, this may be useful input for development of higher level code.
	// Expected mappings from C++ atomics to hardware primitives can be found at
	// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html .

	#include <atomic>
	#include <mutex>

	#include <benchmark/benchmark.h>
	#include "util.h"

	// We time atomic operations separated by a volatile (not atomic!) increment. This ensures
	// that the compiler emits memory instructions (e.g. load or store) prior to any fence or the
	// like. That in turn ensures that the CPU has outstanding memory operations when the fence
	// is executed.

	// In most respects, we compute best case values. Since there is only one thread, there are no
	// coherence misses.

	// We assume that the compiler is not smart enough to optimize away fences in a single-threaded
	// program. If that changes, we'll need to add a second thread.

	// Plain volatile (deliberately not atomic) counter incremented inside the benchmark loops.
	static volatile unsigned counter;

	// The atomic variable targeted by the atomic operations in the benchmarks below.
	std::atomic<int> test_loc(0);

	// Volatile destination: benchmarks that accumulate a result store it here once their loop ends.
	static volatile unsigned sink;

	// Mutex taken by BM_atomic_fetch_add_cs around its counter increment.
	static std::mutex mtx;

	// Baseline: each iteration does nothing but the volatile `counter` increment that the other
	// benchmarks in this file also perform next to their atomic operation.
	void BM_atomic_empty(benchmark::State& state) {
	  for (; state.KeepRunning();) {
	    ++counter;
	  }
	}
	BIONIC_BENCHMARK(BM_atomic_empty);

	// Per iteration: one relaxed load of test_loc, then the volatile counter increment.
	// The loaded values are summed and the sum is stored to the volatile `sink` after the loop.
	static void BM_atomic_load_relaxed(benchmark::State& state) {
	  unsigned total = 0;
	  for (; state.KeepRunning();) {
	    const int observed = test_loc.load(std::memory_order_relaxed);
	    total += observed;
	    ++counter;
	  }
	  sink = total;
	}
	BIONIC_BENCHMARK(BM_atomic_load_relaxed);

	// Per iteration: one acquire load of test_loc, then the volatile counter increment.
	// The loaded values are summed and the sum is stored to the volatile `sink` after the loop.
	static void BM_atomic_load_acquire(benchmark::State& state) {
	  unsigned total = 0;
	  for (; state.KeepRunning();) {
	    const int observed = test_loc.load(std::memory_order_acquire);
	    total += observed;
	    ++counter;
	  }
	  sink = total;
	}
	BIONIC_BENCHMARK(BM_atomic_load_acquire);

	// Per iteration: one release store of a fresh value to test_loc, then the volatile counter
	// increment.
	//
	// The value being stored is tracked as `unsigned`: it starts from `counter`, which nothing in
	// this file resets between benchmarks, and is bumped once per iteration, so it can run past
	// INT_MAX. Unsigned arithmetic wraps; the previous `int` version would have been signed
	// overflow, which is undefined behaviour. The cast back to int happens only at the store.
	static void BM_atomic_store_release(benchmark::State& state) {
	  unsigned i = counter;
	  while (state.KeepRunning()) {
	    test_loc.store(static_cast<int>(++i), std::memory_order_release);
	    ++counter;
	  }
	}
	BIONIC_BENCHMARK(BM_atomic_store_release);

	// Per iteration: one sequentially consistent store of a fresh value to test_loc, then the
	// volatile counter increment.
	//
	// The value being stored is tracked as `unsigned`: it starts from `counter`, which nothing in
	// this file resets between benchmarks, and is bumped once per iteration, so it can run past
	// INT_MAX. Unsigned arithmetic wraps; the previous `int` version would have been signed
	// overflow, which is undefined behaviour. The cast back to int happens only at the store.
	static void BM_atomic_store_seq_cst(benchmark::State& state) {
	  unsigned i = counter;
	  while (state.KeepRunning()) {
	    test_loc.store(static_cast<int>(++i), std::memory_order_seq_cst);
	    ++counter;
	  }
	}
	BIONIC_BENCHMARK(BM_atomic_store_seq_cst);

	// Per iteration: one relaxed fetch_add(1) on test_loc, then the volatile counter increment.
	// The values returned by fetch_add are summed and the sum is stored to `sink` after the loop.
	static void BM_atomic_fetch_add_relaxed(benchmark::State& state) {
	  unsigned total = 0;
	  for (; state.KeepRunning();) {
	    const int previous = test_loc.fetch_add(1, std::memory_order_relaxed);
	    total += previous;
	    ++counter;
	  }
	  sink = total;
	}
	BIONIC_BENCHMARK(BM_atomic_fetch_add_relaxed);

	// Per iteration: one sequentially consistent fetch_add(1) on test_loc, then the volatile
	// counter increment. The values returned by fetch_add are summed and the sum is stored to
	// `sink` after the loop.
	static void BM_atomic_fetch_add_seq_cst(benchmark::State& state) {
	  unsigned total = 0;
	  for (; state.KeepRunning();) {
	    const int previous = test_loc.fetch_add(1, std::memory_order_seq_cst);
	    total += previous;
	    ++counter;
	  }
	  sink = total;
	}
	BIONIC_BENCHMARK(BM_atomic_fetch_add_seq_cst);

	// The fence benchmarks include a relaxed load to make it much harder to optimize away
	// the fence.

	// Per iteration: a relaxed load of test_loc, an acquire thread fence, then the volatile
	// counter increment. The loaded values are summed and the sum is stored to `sink` afterwards.
	static void BM_atomic_acquire_fence(benchmark::State& state) {
	  unsigned total = 0;
	  for (; state.KeepRunning();) {
	    const int observed = test_loc.load(std::memory_order_relaxed);
	    total += observed;
	    std::atomic_thread_fence(std::memory_order_acquire);
	    ++counter;
	  }
	  sink = total;
	}
	BIONIC_BENCHMARK(BM_atomic_acquire_fence);

	// Per iteration: a relaxed load of test_loc, a sequentially consistent thread fence, then the
	// volatile counter increment. The loaded values are summed and the sum is stored to `sink`
	// afterwards.
	static void BM_atomic_seq_cst_fence(benchmark::State& state) {
	  unsigned total = 0;
	  for (; state.KeepRunning();) {
	    const int observed = test_loc.load(std::memory_order_relaxed);
	    total += observed;
	    std::atomic_thread_fence(std::memory_order_seq_cst);
	    ++counter;
	  }
	  sink = total;
	}
	BIONIC_BENCHMARK(BM_atomic_seq_cst_fence);

	// For comparison, also throw in a critical section version:

	// Critical-section counterpart of the fetch_add benchmarks: every iteration takes `mtx`,
	// increments the volatile counter while holding it and adds the incremented value to a running
	// sum, which is stored to `sink` after the loop.
	static void BM_atomic_fetch_add_cs(benchmark::State& state) {
	  unsigned total = 0;
	  for (; state.KeepRunning();) {
	    // The guard goes out of scope at the end of each iteration, releasing the mutex.
	    std::lock_guard<std::mutex> guard(mtx);
	    total += ++counter;
	  }
	  sink = total;
	}
	BIONIC_BENCHMARK(BM_atomic_fetch_add_cs);
	$ src/Common/benchmarks/atomic_benchmark
	2020-11-23T22:44:06+03:00
	Running src/Common/benchmarks/atomic_benchmark
	Run on (12 X 4500 MHz CPU s)
	CPU Caches:
	L1 Data 32 KiB (x6)
	L1 Instruction 32 KiB (x6)
	L2 Unified 256 KiB (x6)
	L3 Unified 12288 KiB (x1)
	Load Average: 0.08, 1.15, 1.08
	***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
	----------------------------------------------------------------------
	Benchmark Time CPU Iterations
	----------------------------------------------------------------------
	BM_atomic_empty 2.05 ns 2.05 ns 293076586
	BM_atomic_load_relaxed 2.07 ns 2.07 ns 338530084
	BM_atomic_load_acquire 2.10 ns 2.10 ns 333778097
	BM_atomic_store_release 2.02 ns 2.02 ns 347426790
	BM_atomic_store_seq_cst 9.64 ns 9.64 ns 72618247
	BM_atomic_fetch_add_relaxed 9.64 ns 9.64 ns 72614481
	BM_atomic_fetch_add_seq_cst 9.71 ns 9.70 ns 72455484
	BM_atomic_acquire_fence 2.10 ns 2.10 ns 333774984
	BM_atomic_seq_cst_fence 27.0 ns 27.0 ns 25935192
	BM_atomic_fetch_add_cs 39.1 ns 39.0 ns 17922906
	clang++ -O2 -o atomic_benchmark atomic_benchmark.cpp -lbenchmark -lbenchmark_main
	# -D_LGPL_SOURCE is significant: the RCU benchmark source refuses to build without it.
	# Each flavour gets its own output binary so the second build does not overwrite the first.
	# NOTE(review): the RCU source shown alongside this script is userspace_rcu_benchmark.cpp, which
	# needs RCU_BP or RCU_MEMB defined - confirm the per-flavour .cpp files below exist and set it.
	clang++ -O2 -o userspace_rcu_bp_benchmark userspace_rcu_bp_benchmark.cpp -lurcu-bp -lbenchmark -lbenchmark_main -D_LGPL_SOURCE=1
	clang++ -O2 -o userspace_rcu_memb_benchmark userspace_rcu_memb_benchmark.cpp -lurcu-memb -lbenchmark -lbenchmark_main -D_LGPL_SOURCE=1
	#include <benchmark/benchmark.h>
	#include <urcu/uatomic.h>
	#if defined(RCU_MEMB)
	#include <urcu.h>
	#elif defined(RCU_BP)
	#include <urcu-bp.h>
	#else
	#error None of RCU_* defined
	#endif

	#if !defined(_LGPL_SOURCE)
	#error URCU is very slow w/o _LGPL_SOURCE
	#endif

	/// Measures one rcu_read_lock() / rcu_read_unlock() pair per iteration, with a plain local
	/// increment inside the read-side critical section (despite its name, `atomic` is an ordinary
	/// `unsigned`).
	///
	/// The thread is registered with RCU before entering any read-side critical section and
	/// unregistered afterwards: liburcu requires this for the non-bp flavours, while the bp flavour
	/// provides the same calls for compatibility. Registration is paired within the function so
	/// repeated invocations by the benchmark runner stay balanced.
	static void BM_userspace_rcu(benchmark::State& state)
	{
	    rcu_init();
	    rcu_register_thread();

	    unsigned atomic = 0;
	    while (state.KeepRunning())
	    {
	        rcu_read_lock();
	        ++atomic;
	        rcu_read_unlock();
	    }

	    rcu_unregister_thread();
	}
	BENCHMARK(BM_userspace_rcu);

	/// Measures one uatomic_add(&value, 1) per iteration on a function-local unsigned.
	static void BM_userspace_rcu_uatomic_add(benchmark::State& state)
	{
	    unsigned value = 0;
	    for (; state.KeepRunning();)
	    {
	        uatomic_add(&value, 1);
	    }
	}
	BENCHMARK(BM_userspace_rcu_uatomic_add);

	/// Measures one uatomic_add_return(&value, 1) per iteration on a function-local unsigned.
	/// The value returned by the call is discarded.
	static void BM_userspace_rcu_uatomic_add_return(benchmark::State& state)
	{
	    unsigned value = 0;
	    for (; state.KeepRunning();)
	    {
	        uatomic_add_return(&value, 1);
	    }
	}
	BENCHMARK(BM_userspace_rcu_uatomic_add_return);
	$ src/Common/benchmarks/userspace_rcu_bp_benchmark
	2020-11-23T22:37:49+03:00
	Running src/Common/benchmarks/userspace_rcu_bp_benchmark
	Run on (12 X 4500 MHz CPU s)
	CPU Caches:
	L1 Data 32 KiB (x6)
	L1 Instruction 32 KiB (x6)
	L2 Unified 256 KiB (x6)
	L3 Unified 12288 KiB (x1)
	Load Average: 6.48, 3.69, 1.56
	***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
	------------------------------------------------------------------------------
	Benchmark Time CPU Iterations
	------------------------------------------------------------------------------
	BM_userspace_rcu 3.55 ns 3.55 ns 180008939
	BM_userspace_rcu_uatomic_add 6.94 ns 6.94 ns 100860254
	BM_userspace_rcu_uatomic_add_return 9.64 ns 9.64 ns 72620501
	$ src/Common/benchmarks/userspace_rcu_memb_benchmark
	2020-11-23T22:38:01+03:00
	Running src/Common/benchmarks/userspace_rcu_memb_benchmark
	Run on (12 X 4500 MHz CPU s)
	CPU Caches:
	L1 Data 32 KiB (x6)
	L1 Instruction 32 KiB (x6)
	L2 Unified 256 KiB (x6)
	L3 Unified 12288 KiB (x1)
	Load Average: 5.11, 3.52, 1.54
	***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
	------------------------------------------------------------------------------
	Benchmark Time CPU Iterations
	------------------------------------------------------------------------------
	BM_userspace_rcu 4.38 ns 4.38 ns 155844656
	BM_userspace_rcu_uatomic_add 6.96 ns 6.95 ns 100851261
	BM_userspace_rcu_uatomic_add_return 9.64 ns 9.64 ns 72623295