@mpoeter
Created April 5, 2020 15:57
Simple synthetic benchmark comparing a lock-free stack to a lock-based one.
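In short: the lock-free variant below is a Treiber stack that avoids the ABA problem by over-aligning nodes to 64 KiB and packing a 16-bit version tag into the low bits of the head pointer; the lock-based variant guards a plain singly linked list with either std::mutex or a simple test-and-test-and-set spinlock.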
#include <atomic>
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>

#if defined(__sparc__)
#include <synch.h>
#elif defined(__x86_64) || defined(_M_AMD64)
#include <emmintrin.h>
#endif
#if defined(_WIN32)
#include <malloc.h>
void* aligned_malloc(size_t size, size_t alignment) {
    return _aligned_malloc(size, alignment);
}
void aligned_free(void* pointer) {
    _aligned_free(pointer);
}
#else
void* aligned_malloc(size_t size, size_t alignment) {
    void* result = nullptr;
    if (posix_memalign(&result, alignment, size) != 0)
        return nullptr;
    return result;
}
void aligned_free(void* pointer) {
    free(pointer);
}
#endif
// CPU pause hint to reduce contention while spinning (a no-op on other platforms).
void backoff() {
#if defined(__sparc__)
    smt_pause();
#elif defined(__x86_64) || defined(_M_AMD64)
    _mm_pause();
#endif
}
struct spinlock {
    void lock() {
        for (;;) {
            while (locked.load(std::memory_order_relaxed))
                backoff();
            bool expected = false;
            if (locked.compare_exchange_weak(expected, true, std::memory_order_acquire, std::memory_order_relaxed))
                return;
            backoff();
        }
    }
    void unlock() { locked.store(false, std::memory_order_release); }

private:
    std::atomic<bool> locked{false};
};
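// Note: the spinlock above is a test-and-test-and-set lock. It spins on a cheap
// relaxed load that keeps the cache line in shared state, and only attempts the
// acquiring CAS once the lock appears free, pausing between attempts.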
struct lockfree_stack {
    // We intentionally over-align the nodes here, so we can use the lower 16 bits
    // of the head pointer for the version tag. This obviously comes with a high
    // memory overhead, but it is the simplest solution that is fully portable.
    static constexpr size_t alignment = 1 << 16;

    struct node {
        static void* operator new(size_t sz) { return aligned_malloc(sz, alignment); }
        static void operator delete(void* p) { aligned_free(p); }
        node() : data(), next() {}
        unsigned data;
        std::atomic<node*> next;
    };
    lockfree_stack() : head() {}
    ~lockfree_stack() {
        auto n = get_ptr(head.load());
        while (n) {
            auto next = n->next.load();
            delete n;
            n = next;
        }
    }
    void push(node* n) {
        for (;;) {
            auto h = head.load(std::memory_order_relaxed);
            n->next.store(get_ptr(h), std::memory_order_relaxed);
            auto new_h = make_ptr(n, h);
            if (head.compare_exchange_weak(h, new_h, std::memory_order_release, std::memory_order_relaxed))
                return;
            backoff();
        }
    }
    // Note: pop assumes the stack is never empty. This holds in this benchmark
    // because every thread pushes a node before it pops one.
    node* pop() {
        for (;;) {
            auto n = head.load(std::memory_order_acquire);
            auto ptr = get_ptr(n);
            auto new_h = make_ptr(ptr->next.load(std::memory_order_relaxed), n);
            if (head.compare_exchange_weak(n, new_h, std::memory_order_release, std::memory_order_relaxed))
                return ptr;
            backoff();
        }
    }
private:
    static constexpr uintptr_t tag_mask = alignment - 1;
    static constexpr uintptr_t tag_inc = 1;
    std::atomic<node*> head;

    node* make_ptr(node* new_ptr, node* old_ptr) {
        uintptr_t new_ptr_val = reinterpret_cast<uintptr_t>(new_ptr);
        assert((new_ptr_val & tag_mask) == 0);
        // Take the old tag, increment it, and wrap it back into the low bits.
        auto tag = (reinterpret_cast<uintptr_t>(old_ptr) & tag_mask) + tag_inc;
        tag &= tag_mask;
        return reinterpret_cast<node*>(new_ptr_val | tag);
    }
    node* get_ptr(node* n) {
        uintptr_t val = reinterpret_cast<uintptr_t>(n);
        return reinterpret_cast<node*>(val & ~tag_mask);
    }
};
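// A worked example of the tagging scheme (the addresses are made up for
// illustration): with alignment = 1 << 16, every node address has its low 16
// bits zero, e.g. 0x7f2534200000. make_ptr packs a version tag into those bits:
//     head = 0x7f2534200000 | 0x0007   // node pointer, tag 7
// get_ptr strips the tag again (val & ~0xffff), and every successful CAS on
// head bumps the tag. So even if a node is popped, freed, reallocated at the
// same address, and pushed again, the recombined head value differs and the
// classic ABA problem is avoided (up to tag wrap-around after 2^16 updates).
static_assert((lockfree_stack::alignment & (lockfree_stack::alignment - 1)) == 0,
              "the tag scheme relies on a power-of-two alignment");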
template <class Lock>
struct threadsafe_stack {
    struct node {
        node() : data(), next() {}
        unsigned data;
        node* next;
    };

    threadsafe_stack() : head(), lock() {}
    ~threadsafe_stack() {
        auto n = head;
        while (n) {
            auto next = n->next;
            delete n;
            n = next;
        }
    }
    void push(node* n) {
        std::lock_guard<Lock> guard(lock);
        n->next = head;
        head = n;
    }
    // Like lockfree_stack::pop, this assumes the stack is not empty.
    node* pop() {
        std::lock_guard<Lock> guard(lock);
        auto result = head;
        head = head->next;
        return result;
    }

private:
    node* head;
    char padding[64]; // keep head and lock on separate cache lines to avoid false sharing
    Lock lock;
};
static constexpr size_t num_ops = 500000;

template <class T>
void run_benchmark(size_t num_threads, const char* type) {
    std::cout << type << " :\t";
    std::vector<std::thread> threads;
    threads.reserve(num_threads);
    T stack;
    std::atomic<bool> start{false};
    std::atomic<size_t> num_threads_ready{0};
    for (size_t i = 0; i < num_threads; ++i) {
        threads.push_back(std::thread([&stack, &start, &num_threads_ready]() {
            typename T::node* n = new typename T::node();
            ++num_threads_ready;
            while (start.load(std::memory_order_relaxed) == false)
                backoff();
            for (size_t op = 0; op < num_ops; ++op) {
                auto sum = n->data;
                stack.push(n);
                for (unsigned x = 0; x < 1000; ++x) {
                    sum += x; // simulate some workload
                }
                n = stack.pop();
                n->data = sum;
            }
            stack.push(n); // push the node so the destructor can free it
        }));
    }
    while (num_threads_ready.load(std::memory_order_relaxed) < threads.size())
        ; // wait until all threads are ready
    auto start_time = std::chrono::high_resolution_clock::now();
    start.store(true); // signal start
    // wait for threads to finish...
    for (auto& thread : threads)
        thread.join();
    auto end_time = std::chrono::high_resolution_clock::now();
    auto duration_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();
    if (duration_ms == 0)
        duration_ms = 1; // guard against division by zero for very short runs
    auto total_ops = num_threads * num_ops * 2; // each iteration performs one push and one pop
    std::cout << duration_ms << "ms\t" << total_ops / duration_ms << " ops/ms" << std::endl;
}
size_t step_size(size_t n) {
    if (n < 2) return 1;
    if (n < 8) return 2;
    if (n < 24) return 4;
    if (n < 48) return 8;
    if (n < 160) return 16;
    return 32;
}
int main() {
    std::cout << "Running concurrent stack benchmarks; each thread performs " << num_ops
              << " push and pop operations." << std::endl;
    const auto max_threads = std::thread::hardware_concurrency() * 2;
    for (size_t threads = 1; threads <= max_threads; threads += step_size(threads)) {
        std::cout << "\n" << threads << " threads" << std::endl;
        run_benchmark<lockfree_stack>(threads, "lockfree_stack ");
        run_benchmark<threadsafe_stack<std::mutex>>(threads, "threadsafe_stack<mutex> ");
        if (threads <= 64) {
            // the spinlock performs poorly for large numbers of threads...!
            run_benchmark<threadsafe_stack<spinlock>>(threads, "threadsafe_stack<spinlock>");
        }
    }
}
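For readers adapting the code, here is a minimal single-threaded sketch of how the two stacks are meant to be used (the demo function is illustrative, not part of the benchmark): nodes are allocated by the caller, ownership transfers to the stack on push and back to the caller on pop, and each destructor frees whatever is still linked.

void demo() {
    lockfree_stack lf;
    lf.push(new lockfree_stack::node()); // caller allocates the node
    auto* n = lf.pop();                  // pop assumes the stack is non-empty
    n->data = 42;
    lf.push(n);                          // the destructor frees what is left

    threadsafe_stack<std::mutex> ls;
    ls.push(new threadsafe_stack<std::mutex>::node());
    delete ls.pop();                     // a popped node is owned by the caller again
}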
mpoeter commented Apr 5, 2020

I created this benchmark because of the discussion between David Schwartz (alias @JoelKatz) and me in this Stack Overflow thread.

Here are some results from different machines:

Linux 4.19.0-8-amd64 #1 SMP Debian 4.19.98-1 (2020-01-26) x86_64 GNU/Linux
AMD EPYC 7351P 16-Core Processor
16 cores, SMT disabled

[benchmark results chart]

Linux 4.19.0-6-amd64 #1 SMP Debian 4.19.67-2+deb10u2 (2019-11-11) x86_64 GNU/Linux
4x Intel(R) Xeon(R) CPU E7- 8850 @ 2.00GHz
80 cores, SMT disabled

[benchmark results chart]

SunOS 5.11 11.3 sun4v sparc sun4v
4x SPARC-T5-4
512 hardware threads; 64 cores, 8x SMT

[benchmark results chart]
