userspace-RCU-vs-std::atomic
/// https://github.com/aosp-mirror/platform_bionic/blob/master/benchmarks/atomic_benchmark.cpp | |
/* | |
* Copyright (C) 2017 The Android Open Source Project | |
* | |
* Licensed under the Apache License, Version 2.0 (the "License"); | |
* you may not use this file except in compliance with the License. | |
* You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, software | |
* distributed under the License is distributed on an "AS IS" BASIS, | |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
* See the License for the specific language governing permissions and | |
* limitations under the License. | |
*/ | |
// Our goal is to measure the cost of various C++ atomic operations. | |
// Android doesn't really control those. But since some of these operations can be quite | |
// expensive, this may be useful input for development of higher level code. | |
// Expected mappings from C++ atomics to hardware primitives can be found at | |
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html . | |
#include <atomic> | |
#include <mutex> | |
#include <benchmark/benchmark.h> | |
#include "util.h" | |
// We time atomic operations separated by a volatile (not atomic!) increment. This ensures | |
// that the compiler emits memory instructions (e.g. load or store) prior to any fence or the | |
// like. That in turn ensures that the CPU has outstanding memory operations when the fence | |
// is executed. | |
// In most respects, we compute best case values. Since there is only one thread, there are no | |
// coherence misses. | |
// We assume that the compiler is not smart enough to optimize away fences in a single-threaded | |
// program. If that changes, we'll need to add a second thread. | |
// Volatile (deliberately not atomic) counter bumped once per benchmark iteration so that
// an ordinary memory operation is always emitted next to the atomic operation being timed.
static volatile unsigned counter;
// The atomic location that the load/store/fetch_add benchmarks operate on.
std::atomic<int> test_loc{0};
// Volatile destination for accumulated results, so the values read in a loop stay live.
static volatile unsigned sink;
// Mutex used by the critical-section comparison benchmark.
static std::mutex mtx;
void BM_atomic_empty(benchmark::State& state) { | |
while (state.KeepRunning()) { | |
++counter; | |
} | |
} | |
BIONIC_BENCHMARK(BM_atomic_empty); | |
static void BM_atomic_load_relaxed(benchmark::State& state) { | |
unsigned result = 0; | |
while (state.KeepRunning()) { | |
result += test_loc.load(std::memory_order_relaxed); | |
++counter; | |
} | |
sink = result; | |
} | |
BIONIC_BENCHMARK(BM_atomic_load_relaxed); | |
static void BM_atomic_load_acquire(benchmark::State& state) { | |
unsigned result = 0; | |
while (state.KeepRunning()) { | |
result += test_loc.load(std::memory_order_acquire); | |
++counter; | |
} | |
sink = result; | |
} | |
BIONIC_BENCHMARK(BM_atomic_load_acquire); | |
static void BM_atomic_store_release(benchmark::State& state) { | |
int i = counter; | |
while (state.KeepRunning()) { | |
test_loc.store(++i, std::memory_order_release); | |
++counter; | |
} | |
} | |
BIONIC_BENCHMARK(BM_atomic_store_release); | |
static void BM_atomic_store_seq_cst(benchmark::State& state) { | |
int i = counter; | |
while (state.KeepRunning()) { | |
test_loc.store(++i, std::memory_order_seq_cst); | |
++counter; | |
} | |
} | |
BIONIC_BENCHMARK(BM_atomic_store_seq_cst); | |
static void BM_atomic_fetch_add_relaxed(benchmark::State& state) { | |
unsigned result = 0; | |
while (state.KeepRunning()) { | |
result += test_loc.fetch_add(1, std::memory_order_relaxed); | |
++counter; | |
} | |
sink = result; | |
} | |
BIONIC_BENCHMARK(BM_atomic_fetch_add_relaxed); | |
static void BM_atomic_fetch_add_seq_cst(benchmark::State& state) { | |
unsigned result = 0; | |
while (state.KeepRunning()) { | |
result += test_loc.fetch_add(1, std::memory_order_seq_cst); | |
++counter; | |
} | |
sink = result; | |
} | |
BIONIC_BENCHMARK(BM_atomic_fetch_add_seq_cst); | |
// The fence benchmarks include a relaxed load to make it much harder to optimize away | |
// the fence. | |
static void BM_atomic_acquire_fence(benchmark::State& state) { | |
unsigned result = 0; | |
while (state.KeepRunning()) { | |
result += test_loc.load(std::memory_order_relaxed); | |
std::atomic_thread_fence(std::memory_order_acquire); | |
++counter; | |
} | |
sink = result; | |
} | |
BIONIC_BENCHMARK(BM_atomic_acquire_fence); | |
static void BM_atomic_seq_cst_fence(benchmark::State& state) { | |
unsigned result = 0; | |
while (state.KeepRunning()) { | |
result += test_loc.load(std::memory_order_relaxed); | |
std::atomic_thread_fence(std::memory_order_seq_cst); | |
++counter; | |
} | |
sink = result; | |
} | |
BIONIC_BENCHMARK(BM_atomic_seq_cst_fence); | |
// For comparison, also throw in a critical section version: | |
static void BM_atomic_fetch_add_cs(benchmark::State& state) { | |
unsigned result = 0; | |
while (state.KeepRunning()) { | |
{ | |
std::lock_guard<std::mutex> _(mtx); | |
result += ++counter; | |
} | |
} | |
sink = result; | |
} | |
BIONIC_BENCHMARK(BM_atomic_fetch_add_cs); |
$ src/Common/benchmarks/atomic_benchmark | |
2020-11-23T22:44:06+03:00 | |
Running src/Common/benchmarks/atomic_benchmark | |
Run on (12 X 4500 MHz CPU s) | |
CPU Caches: | |
L1 Data 32 KiB (x6) | |
L1 Instruction 32 KiB (x6) | |
L2 Unified 256 KiB (x6) | |
L3 Unified 12288 KiB (x1) | |
Load Average: 0.08, 1.15, 1.08 | |
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. | |
---------------------------------------------------------------------- | |
Benchmark Time CPU Iterations | |
---------------------------------------------------------------------- | |
BM_atomic_empty 2.05 ns 2.05 ns 293076586 | |
BM_atomic_load_relaxed 2.07 ns 2.07 ns 338530084 | |
BM_atomic_load_acquire 2.10 ns 2.10 ns 333778097 | |
BM_atomic_store_release 2.02 ns 2.02 ns 347426790 | |
BM_atomic_store_seq_cst 9.64 ns 9.64 ns 72618247 | |
BM_atomic_fetch_add_relaxed 9.64 ns 9.64 ns 72614481 | |
BM_atomic_fetch_add_seq_cst 9.71 ns 9.70 ns 72455484 | |
BM_atomic_acquire_fence 2.10 ns 2.10 ns 333774984 | |
BM_atomic_seq_cst_fence 27.0 ns 27.0 ns 25935192 | |
BM_atomic_fetch_add_cs 39.1 ns 39.0 ns 17922906 |
clang++ -O2 -o atomic_benchmark atomic_benchmark.cpp -lbenchmark -lbenchmark_main | |
# -D_LGPL_SOURCE is significant: URCU is very slow without it | |
clang++ -O2 -o userspace_rcu_bp_benchmark userspace_rcu_bp_benchmark.cpp -lurcu-bp -lbenchmark -lbenchmark_main -D_LGPL_SOURCE=1 -DRCU_BP=1 | |
clang++ -O2 -o userspace_rcu_memb_benchmark userspace_rcu_memb_benchmark.cpp -lurcu-memb -lbenchmark -lbenchmark_main -D_LGPL_SOURCE=1 -DRCU_MEMB=1 |
------------------------------------------------------------------------------ | |
Benchmark Time CPU Iterations | |
------------------------------------------------------------------------------ | |
BM_userspace_rcu 3.55 ns 3.55 ns 180008939 # bp version | |
BM_userspace_rcu_uatomic_add 6.94 ns 6.94 ns 100860254 # bp version | |
BM_userspace_rcu_uatomic_add_return 9.64 ns 9.64 ns 72620501 # bp version | |
BM_atomic_empty 2.05 ns 2.05 ns 293076586 | |
BM_atomic_load_relaxed 2.07 ns 2.07 ns 338530084 | |
BM_atomic_load_acquire 2.10 ns 2.10 ns 333778097 | |
BM_atomic_store_release 2.02 ns 2.02 ns 347426790 | |
BM_atomic_store_seq_cst 9.64 ns 9.64 ns 72618247 | |
BM_atomic_fetch_add_relaxed 9.64 ns 9.64 ns 72614481 | |
BM_atomic_fetch_add_seq_cst 9.71 ns 9.70 ns 72455484 | |
BM_atomic_acquire_fence 2.10 ns 2.10 ns 333774984 | |
BM_atomic_seq_cst_fence 27.0 ns 27.0 ns 25935192 | |
BM_atomic_fetch_add_cs 39.1 ns 39.0 ns 17922906 |
#include <benchmark/benchmark.h> | |
#include <urcu/uatomic.h> | |
#if defined(RCU_MEMB) | |
#include <urcu.h> | |
#elif defined(RCU_BP) | |
#include <urcu-bp.h> | |
#else | |
#error None of RCU_* defined | |
#endif | |
#if !defined(_LGPL_SOURCE) | |
#error URCU is very slow w/o _LGPL_SOURCE | |
#endif | |
static void BM_userspace_rcu(benchmark::State& state) | |
{ | |
rcu_init(); | |
unsigned atomic = 0; | |
while (state.KeepRunning()) | |
{ | |
rcu_read_lock(); | |
++atomic; | |
rcu_read_unlock(); | |
} | |
} | |
BENCHMARK(BM_userspace_rcu); | |
static void BM_userspace_rcu_uatomic_add(benchmark::State& state) | |
{ | |
unsigned atomic = 0; | |
while (state.KeepRunning()) | |
{ | |
uatomic_add(&atomic, 1); | |
} | |
} | |
BENCHMARK(BM_userspace_rcu_uatomic_add); | |
static void BM_userspace_rcu_uatomic_add_return(benchmark::State& state) | |
{ | |
unsigned atomic = 0; | |
while (state.KeepRunning()) | |
{ | |
uatomic_add_return(&atomic, 1); | |
} | |
} | |
BENCHMARK(BM_userspace_rcu_uatomic_add_return); |
$ src/Common/benchmarks/userspace_rcu_bp_benchmark | |
2020-11-23T22:37:49+03:00 | |
Running src/Common/benchmarks/userspace_rcu_bp_benchmark | |
Run on (12 X 4500 MHz CPU s) | |
CPU Caches: | |
L1 Data 32 KiB (x6) | |
L1 Instruction 32 KiB (x6) | |
L2 Unified 256 KiB (x6) | |
L3 Unified 12288 KiB (x1) | |
Load Average: 6.48, 3.69, 1.56 | |
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. | |
------------------------------------------------------------------------------ | |
Benchmark Time CPU Iterations | |
------------------------------------------------------------------------------ | |
BM_userspace_rcu 3.55 ns 3.55 ns 180008939 | |
BM_userspace_rcu_uatomic_add 6.94 ns 6.94 ns 100860254 | |
BM_userspace_rcu_uatomic_add_return 9.64 ns 9.64 ns 72620501 |
$ src/Common/benchmarks/userspace_rcu_memb_benchmark | |
2020-11-23T22:38:01+03:00 | |
Running src/Common/benchmarks/userspace_rcu_memb_benchmark | |
Run on (12 X 4500 MHz CPU s) | |
CPU Caches: | |
L1 Data 32 KiB (x6) | |
L1 Instruction 32 KiB (x6) | |
L2 Unified 256 KiB (x6) | |
L3 Unified 12288 KiB (x1) | |
Load Average: 5.11, 3.52, 1.54 | |
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. | |
------------------------------------------------------------------------------ | |
Benchmark Time CPU Iterations | |
------------------------------------------------------------------------------ | |
BM_userspace_rcu 4.38 ns 4.38 ns 155844656 | |
BM_userspace_rcu_uatomic_add 6.96 ns 6.95 ns 100851261 | |
BM_userspace_rcu_uatomic_add_return 9.64 ns 9.64 ns 72623295 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment