Created
November 23, 2020 19:48
-
-
Save azat/066d165154fa1efe6000fac59062cc25 to your computer and use it in GitHub Desktop.
userspace-RCU-vs-std::atomic
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// https://github.com/aosp-mirror/platform_bionic/blob/master/benchmarks/atomic_benchmark.cpp | |
/* | |
* Copyright (C) 2017 The Android Open Source Project | |
* | |
* Licensed under the Apache License, Version 2.0 (the "License"); | |
* you may not use this file except in compliance with the License. | |
* You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, software | |
* distributed under the License is distributed on an "AS IS" BASIS, | |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
* See the License for the specific language governing permissions and | |
* limitations under the License. | |
*/ | |
// Our goal is to measure the cost of various C++ atomic operations. | |
// Android doesn't really control those. But since some of these operations can be quite | |
// expensive, this may be useful input for development of higher level code. | |
// Expected mappings from C++ atomics to hardware primitives can be found at | |
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html . | |
#include <atomic> | |
#include <mutex> | |
#include <benchmark/benchmark.h> | |
#include "util.h" | |
// We time atomic operations separated by a volatile (not atomic!) increment. This ensures | |
// that the compiler emits memory instructions (e.g. load or store) prior to any fence or the | |
// like. That in turn ensures that the CPU has outstanding memory operations when the fence | |
// is executed. | |
// In most respects, we compute best case values. Since there is only one thread, there are no | |
// coherence misses. | |
// We assume that the compiler is not smart enough to optimize away fences in a single-threaded | |
// program. If that changes, we'll need to add a second thread. | |
// Shared state used by every benchmark below.

// Volatile (deliberately not atomic) counter bumped between the timed operations.
static volatile unsigned counter;

// The atomic location that the load/store/read-modify-write benchmarks operate on.
std::atomic<int> test_loc{0};

// Volatile destination for accumulated results.
static volatile unsigned sink;

// Mutex for the critical-section comparison benchmark.
static std::mutex mtx;
void BM_atomic_empty(benchmark::State& state) { | |
while (state.KeepRunning()) { | |
++counter; | |
} | |
} | |
BIONIC_BENCHMARK(BM_atomic_empty); | |
static void BM_atomic_load_relaxed(benchmark::State& state) { | |
unsigned result = 0; | |
while (state.KeepRunning()) { | |
result += test_loc.load(std::memory_order_relaxed); | |
++counter; | |
} | |
sink = result; | |
} | |
BIONIC_BENCHMARK(BM_atomic_load_relaxed); | |
static void BM_atomic_load_acquire(benchmark::State& state) { | |
unsigned result = 0; | |
while (state.KeepRunning()) { | |
result += test_loc.load(std::memory_order_acquire); | |
++counter; | |
} | |
sink = result; | |
} | |
BIONIC_BENCHMARK(BM_atomic_load_acquire); | |
static void BM_atomic_store_release(benchmark::State& state) { | |
int i = counter; | |
while (state.KeepRunning()) { | |
test_loc.store(++i, std::memory_order_release); | |
++counter; | |
} | |
} | |
BIONIC_BENCHMARK(BM_atomic_store_release); | |
static void BM_atomic_store_seq_cst(benchmark::State& state) { | |
int i = counter; | |
while (state.KeepRunning()) { | |
test_loc.store(++i, std::memory_order_seq_cst); | |
++counter; | |
} | |
} | |
BIONIC_BENCHMARK(BM_atomic_store_seq_cst); | |
static void BM_atomic_fetch_add_relaxed(benchmark::State& state) { | |
unsigned result = 0; | |
while (state.KeepRunning()) { | |
result += test_loc.fetch_add(1, std::memory_order_relaxed); | |
++counter; | |
} | |
sink = result; | |
} | |
BIONIC_BENCHMARK(BM_atomic_fetch_add_relaxed); | |
static void BM_atomic_fetch_add_seq_cst(benchmark::State& state) { | |
unsigned result = 0; | |
while (state.KeepRunning()) { | |
result += test_loc.fetch_add(1, std::memory_order_seq_cst); | |
++counter; | |
} | |
sink = result; | |
} | |
BIONIC_BENCHMARK(BM_atomic_fetch_add_seq_cst); | |
// The fence benchmarks include a relaxed load to make it much harder to optimize away | |
// the fence. | |
static void BM_atomic_acquire_fence(benchmark::State& state) { | |
unsigned result = 0; | |
while (state.KeepRunning()) { | |
result += test_loc.load(std::memory_order_relaxed); | |
std::atomic_thread_fence(std::memory_order_acquire); | |
++counter; | |
} | |
sink = result; | |
} | |
BIONIC_BENCHMARK(BM_atomic_acquire_fence); | |
static void BM_atomic_seq_cst_fence(benchmark::State& state) { | |
unsigned result = 0; | |
while (state.KeepRunning()) { | |
result += test_loc.load(std::memory_order_relaxed); | |
std::atomic_thread_fence(std::memory_order_seq_cst); | |
++counter; | |
} | |
sink = result; | |
} | |
BIONIC_BENCHMARK(BM_atomic_seq_cst_fence); | |
// For comparison, also throw in a critical section version: | |
static void BM_atomic_fetch_add_cs(benchmark::State& state) { | |
unsigned result = 0; | |
while (state.KeepRunning()) { | |
{ | |
std::lock_guard<std::mutex> _(mtx); | |
result += ++counter; | |
} | |
} | |
sink = result; | |
} | |
BIONIC_BENCHMARK(BM_atomic_fetch_add_cs); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ src/Common/benchmarks/atomic_benchmark | |
2020-11-23T22:44:06+03:00 | |
Running src/Common/benchmarks/atomic_benchmark | |
Run on (12 X 4500 MHz CPU s) | |
CPU Caches: | |
L1 Data 32 KiB (x6) | |
L1 Instruction 32 KiB (x6) | |
L2 Unified 256 KiB (x6) | |
L3 Unified 12288 KiB (x1) | |
Load Average: 0.08, 1.15, 1.08 | |
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. | |
---------------------------------------------------------------------- | |
Benchmark Time CPU Iterations | |
---------------------------------------------------------------------- | |
BM_atomic_empty 2.05 ns 2.05 ns 293076586 | |
BM_atomic_load_relaxed 2.07 ns 2.07 ns 338530084 | |
BM_atomic_load_acquire 2.10 ns 2.10 ns 333778097 | |
BM_atomic_store_release 2.02 ns 2.02 ns 347426790 | |
BM_atomic_store_seq_cst 9.64 ns 9.64 ns 72618247 | |
BM_atomic_fetch_add_relaxed 9.64 ns 9.64 ns 72614481 | |
BM_atomic_fetch_add_seq_cst 9.71 ns 9.70 ns 72455484 | |
BM_atomic_acquire_fence 2.10 ns 2.10 ns 333774984 | |
BM_atomic_seq_cst_fence 27.0 ns 27.0 ns 25935192 | |
BM_atomic_fetch_add_cs 39.1 ns 39.0 ns 17922906 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
clang++ -O2 -o atomic_benchmark atomic_benchmark.cpp -lbenchmark -lbenchmark_main | |
# -D_LGPL_SOURCE is significant | |
clang++ -O2 -o userspace_rcu_bp_benchmark userspace_rcu_bp_benchmark.cpp -lurcu-bp -lbenchmark -lbenchmark_main -D_LGPL_SOURCE=1 | |
clang++ -O2 -o userspace_rcu_memb_benchmark userspace_rcu_memb_benchmark.cpp -lurcu-memb -lbenchmark -lbenchmark_main -D_LGPL_SOURCE=1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
------------------------------------------------------------------------------ | |
Benchmark Time CPU Iterations | |
------------------------------------------------------------------------------ | |
BM_userspace_rcu 3.55 ns 3.55 ns 180008939 # bp version | |
BM_userspace_rcu_uatomic_add 6.94 ns 6.94 ns 100860254 # bp version | |
BM_userspace_rcu_uatomic_add_return 9.64 ns 9.64 ns 72620501 # bp version | |
BM_atomic_empty 2.05 ns 2.05 ns 293076586 | |
BM_atomic_load_relaxed 2.07 ns 2.07 ns 338530084 | |
BM_atomic_load_acquire 2.10 ns 2.10 ns 333778097 | |
BM_atomic_store_release 2.02 ns 2.02 ns 347426790 | |
BM_atomic_store_seq_cst 9.64 ns 9.64 ns 72618247 | |
BM_atomic_fetch_add_relaxed 9.64 ns 9.64 ns 72614481 | |
BM_atomic_fetch_add_seq_cst 9.71 ns 9.70 ns 72455484 | |
BM_atomic_acquire_fence 2.10 ns 2.10 ns 333774984 | |
BM_atomic_seq_cst_fence 27.0 ns 27.0 ns 25935192 | |
BM_atomic_fetch_add_cs 39.1 ns 39.0 ns 17922906 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <benchmark/benchmark.h> | |
#include <urcu/uatomic.h> | |
#if defined(RCU_MEMB) | |
#include <urcu.h> | |
#elif defined(RCU_BP) | |
#include <urcu-bp.h> | |
#else | |
#error None of RCU_* defined | |
#endif | |
#if !defined(_LGPL_SOURCE) | |
#error URCU is very slow w/o _LGPL_SOURCE | |
#endif | |
static void BM_userspace_rcu(benchmark::State& state) | |
{ | |
rcu_init(); | |
unsigned atomic = 0; | |
while (state.KeepRunning()) | |
{ | |
rcu_read_lock(); | |
++atomic; | |
rcu_read_unlock(); | |
} | |
} | |
BENCHMARK(BM_userspace_rcu); | |
static void BM_userspace_rcu_uatomic_add(benchmark::State& state) | |
{ | |
unsigned atomic = 0; | |
while (state.KeepRunning()) | |
{ | |
uatomic_add(&atomic, 1); | |
} | |
} | |
BENCHMARK(BM_userspace_rcu_uatomic_add); | |
static void BM_userspace_rcu_uatomic_add_return(benchmark::State& state) | |
{ | |
unsigned atomic = 0; | |
while (state.KeepRunning()) | |
{ | |
uatomic_add_return(&atomic, 1); | |
} | |
} | |
BENCHMARK(BM_userspace_rcu_uatomic_add_return); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ src/Common/benchmarks/userspace_rcu_bp_benchmark | |
2020-11-23T22:37:49+03:00 | |
Running src/Common/benchmarks/userspace_rcu_bp_benchmark | |
Run on (12 X 4500 MHz CPU s) | |
CPU Caches: | |
L1 Data 32 KiB (x6) | |
L1 Instruction 32 KiB (x6) | |
L2 Unified 256 KiB (x6) | |
L3 Unified 12288 KiB (x1) | |
Load Average: 6.48, 3.69, 1.56 | |
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. | |
------------------------------------------------------------------------------ | |
Benchmark Time CPU Iterations | |
------------------------------------------------------------------------------ | |
BM_userspace_rcu 3.55 ns 3.55 ns 180008939 | |
BM_userspace_rcu_uatomic_add 6.94 ns 6.94 ns 100860254 | |
BM_userspace_rcu_uatomic_add_return 9.64 ns 9.64 ns 72620501 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ src/Common/benchmarks/userspace_rcu_memb_benchmark | |
2020-11-23T22:38:01+03:00 | |
Running src/Common/benchmarks/userspace_rcu_memb_benchmark | |
Run on (12 X 4500 MHz CPU s) | |
CPU Caches: | |
L1 Data 32 KiB (x6) | |
L1 Instruction 32 KiB (x6) | |
L2 Unified 256 KiB (x6) | |
L3 Unified 12288 KiB (x1) | |
Load Average: 5.11, 3.52, 1.54 | |
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. | |
------------------------------------------------------------------------------ | |
Benchmark Time CPU Iterations | |
------------------------------------------------------------------------------ | |
BM_userspace_rcu 4.38 ns 4.38 ns 155844656 | |
BM_userspace_rcu_uatomic_add 6.96 ns 6.95 ns 100851261 | |
BM_userspace_rcu_uatomic_add_return 9.64 ns 9.64 ns 72623295 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment