Lockless Stack
// Copyright 2020 NVIDIA CORPORATION
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// LocklessStack is designed to be simple to use. For a class `Foo` that you want to be contained in a LocklessStack,
// it must have a member of type `LocklessStackLink<Foo>`. This member is what the LocklessStack will use for tracking
// data.
//
// LocklessStack is entirely thread-safe except where declared otherwise. No allocation happens within a
// LocklessStack; instead the caller is responsible for construction/destruction of the contained objects.
//
// Pushing to a LocklessStack is done through `push()`, which is entirely thread-safe. LocklessStack ensures LIFO
// order for each producer pushing to the LocklessStack.
//
// Popping is done through `pop()`, which is also entirely thread-safe. Multiple threads may all attempt to pop from
// the same LocklessStack simultaneously.
//
// Simple example:
//     class Foo
//     {
//     public:
//         LocklessStackLink<Foo> m_link;
//     };
//
//     LocklessStack<Foo, &Foo::m_link> stack;
//     stack.push(new Foo);
//     Foo* p = stack.pop();
//     delete p;
#pragma once

#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <exception>

#if __GNUC__
# include <signal.h>
# include <ucontext.h>
#endif
namespace details
{
#if __GNUC__
// Section start and stop locations for the LocklessStackDetails section, automatically created by the linker.
extern "C" unsigned char __start_LocklessStackDetails[];
extern "C" unsigned char __stop_LocklessStackDetails[];

class SignalHandler
{
public:
    SignalHandler()
    {
        // Zero-initialize so that fields we don't set explicitly are in a known state.
        struct sigaction newAction = {};
        newAction.sa_flags = SA_SIGINFO;
        sigemptyset(&newAction.sa_mask);
        newAction.sa_sigaction = &Handler;
        sigaction(SIGSEGV, &newAction, old());
    }

    ~SignalHandler()
    {
        // Restore the previous handler
        sigaction(SIGSEGV, old(), nullptr);
    }
    // This function is *very specifically* always no-inline, maximum optimizations without a frame pointer, and placed
    // in a special section. This is so `Handler()` can gracefully recover from a very specific SIGSEGV in this
    // function.
    static bool __attribute__((noinline, optimize(3, "omit-frame-pointer"), section("LocklessStackDetails")))
    readNext(void** out, void* in)
    {
        // The asm for this function SHOULD be (x86_64):
        //     mov rax, qword ptr [rsi]
        //     mov qword ptr [rdi], rax
        //     mov rax, 1
        //     ret
        // And for ARMv8.1:
        //     ldr x1, [x1]
        //     str x1, [x0]
        //     mov w0, 1
        //     ret
        // It is very important that the first instruction is the one that can fail and that there are no stack
        // operations. This is all done very carefully because there is a very rare chance that the first operation
        // can cause SIGSEGV if another thread has already modified the stack. We handle that signal with zero cost
        // unless it actually happens. On Windows this is much simpler since SEH exists, but on Linux it has to be
        // handled very carefully with some creative signal handling.
        *out = *(void**)in;
        return true;
    }
    static void EnsureRegistered()
    {
        static SignalHandler handler;
        static_cast<void>(handler); // silence warning about unused
    }

private:
    static struct sigaction* old()
    {
        static struct sigaction oldAction;
        return &oldAction;
    }
    static void Handler(int sig, siginfo_t* info, void* ctx)
    {
        ucontext_t* context = reinterpret_cast<ucontext_t*>(ctx);
# if defined(__x86_64__)
        greg_t& rax = context->uc_mcontext.gregs[REG_RAX];
        greg_t& rip = context->uc_mcontext.gregs[REG_RIP];
        if (rip == greg_t(&readNext))
        {
            // The crash happened where we expected it to, on the first instruction of readNext(). Handle gracefully.
            rax = 0; // Set the return value to false
            // Set rip to the last instruction of readNext(). It should be the return instruction.
            rip = greg_t(&__stop_LocklessStackDetails) - 1;
            if (*(unsigned char*)rip != 0xC3) std::terminate(); // must be a return instruction
            return;
        }
        // If we get here, then either we crashed at a different location within the function, or the compiler
        // generated some prologue code. In the latter case we can't just forward `rip` to the ret instruction
        // because any stack operations wouldn't be undone.
        if (rip >= greg_t(&__start_LocklessStackDetails) && rip < greg_t(&__stop_LocklessStackDetails))
        {
            // SIGSEGV in the expected function but not at the expected location!
            std::terminate();
        }
# elif defined(__aarch64__)
        __u64& pc = context->uc_mcontext.pc;
        __u64& x0 = context->uc_mcontext.regs[0];
        if (pc == __u64(&readNext))
        {
            // The crash happened where we expected it to, on the first instruction of readNext(). Handle gracefully.
            x0 = 0; // Set the return value to false
            // Set pc to the last instruction of readNext(). ARM64 instructions are 32 bits. It should be the return
            // instruction.
            pc = __u64(&__stop_LocklessStackDetails) - sizeof(uint32_t);
            if (*(uint32_t*)pc != 0xd65f03c0) std::terminate(); // must be a return instruction
            return;
        }
        // If we get here, then either we crashed at a different location within the function, or the compiler
        // generated some prologue code. In the latter case we can't just forward `pc` to the ret instruction
        // because any stack operations wouldn't be undone.
        if (pc >= __u64(&__start_LocklessStackDetails) && pc < __u64(&__stop_LocklessStackDetails))
        {
            // SIGSEGV in the expected function but not at the expected location!
            std::terminate();
        }
# else
#  error Unknown architecture!
# endif
        // Chain to the previous handler if one exists. Only call through `sa_sigaction` if the previous handler was
        // registered with SA_SIGINFO; otherwise it expects the `sa_handler` signature.
        struct sigaction* prev = old();
        if (prev->sa_handler == SIG_IGN)
            return;
        else if (prev->sa_flags & SA_SIGINFO)
            prev->sa_sigaction(sig, info, ctx);
        else if (prev->sa_handler != SIG_DFL)
            prev->sa_handler(sig);
        else
        {
            // Run the default signal handler. Normally we would have to re-raise the signal, but since this is
            // SIGSEGV, returning without handling it will cause it to re-occur immediately.
            sigaction(SIGSEGV, old(), nullptr);
        }
    }
};
#elif _MSC_VER
class SignalHandler
{
public:
    static void EnsureRegistered()
    {
    }

    static bool readNext(void** out, void* in)
    {
        // We do this in an SEH block (on Windows) because it's possible (though rare) that another thread could have
        // already popped and destroyed `cur`, which would cause EXCEPTION_ACCESS_VIOLATION. By handling it in an
        // exception handler, we recover cleanly and try again. On 64-bit Windows there is zero cost unless an
        // exception is thrown, at which point the kernel finds the exception and unwind info for the function that
        // we're in.
        __try
        {
            *out = *(void**)in;
            return true;
        }
        __except (1) // 1 == EXCEPTION_EXECUTE_HANDLER
        {
            return false;
        }
    }
};
#else
# error Unsupported compiler
#endif
} // namespace details

/**
 * Defines the link object. Each class contained in a `LocklessStack` must have a member of type
 * `LocklessStackLink<T>`.
 */
template <class T>
class LocklessStackLink
{
public:
    constexpr LocklessStackLink() = default;

private:
    LocklessStackLink<T>* m_next{ nullptr };

    friend T;
    template <class U, LocklessStackLink<U> U::* V>
    friend class LocklessStack;
};

/**
 * An implementation of a lockless stack. The second template parameter is a pointer-to-member parameter to the
 * `LocklessStackLink<T>` member.
 */
template <class T, LocklessStackLink<T> T::* U>
class LocklessStack
{
public:
    /// Constructor
    LocklessStack()
    {
        // Ensure that our signal handler is registered
        details::SignalHandler::EnsureRegistered();
    }

    /// Destructor. Asserts that all items have been popped from the LocklessStack.
    ~LocklessStack()
    {
        // Ensure the stack is empty
        assert(isEmpty());
    }

    /// Indicates whether the stack is empty. Note that this value is transient; another thread could have changed the
    /// stack before this call returns.
    /// @return true if the stack appears empty; false if items appear to exist in the stack
    bool isEmpty() const
    {
        return !m_head.load(std::memory_order_relaxed);
    }

    /// Pushes an item onto the stack. This is safe to call from multiple threads simultaneously.
    /// @param p The item to push onto the stack.
    /// @return true if the stack was previously empty; false otherwise. Note that this is determined atomically,
    /// unlike checking `isEmpty()` before calling this function.
    bool push(T* p)
    {
        uint16_t seq;
        uint64_t expected = m_head.load(std::memory_order_relaxed), temp;
        do
        {
            // Link the new item to the current head and bump the sequence number.
            (p->*U).m_next = decode(expected, seq);
            temp = encode(&(p->*U), seq + 1);
        } while (!m_head.compare_exchange_strong(expected, temp, std::memory_order_release));
        return !expected;
    }

    /// Pops an item from the stack if available. This is safe to call from multiple threads simultaneously.
    /// @return An item popped from the stack. If the stack was empty, then nullptr is returned.
    T* pop()
    {
        uint64_t expected = m_head.load(std::memory_order_relaxed), temp;
        LocklessStackLink<T>* cur;
        uint16_t seq;
        for (;;)
        {
            cur = decode(expected, seq);
            if (!cur)
                return nullptr;

            // Attempt to read the next value
            LocklessStackLink<T>* newhead;
            if (!details::SignalHandler::readNext((void**)&newhead, cur))
            {
                // Another thread changed `cur`, so reload and try again.
                expected = m_head.load(std::memory_order_relaxed);
                continue;
            }

            // Clear the sequence if the new head is nullptr; otherwise add 1 so that the value changes for each op.
            seq = !newhead ? 0 : seq + 1;
            temp = encode(newhead, seq);
            if (m_head.compare_exchange_strong(expected, temp, std::memory_order_release))
                break;
        }
        return convert(cur);
    }
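
    // Explanatory note (derived from the comments in this file): the sequence bits exist to defeat the classic ABA
    // problem. Without them, thread A could read head == X with X->next == Y, get preempted while other threads pop
    // X and Y and then push X back; A's compare-exchange would still succeed and install the stale Y as the new
    // head. Because every push/pop changes the sequence, the stored 64-bit value differs even when the same pointer
    // reappears, so the stale compare-exchange fails. The signal/SEH-protected readNext() covers the separate hazard
    // that X may already have been freed by the time it is dereferenced above.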

private:
    // On 64-bit architectures, we make use of the fact that CPUs only use a certain number of address bits.
    // Intel CPUs require that the 8 to 16 most-significant bits match (all ones or all zeros). Since 8 appears to be
    // the lowest common denominator, we steal 7 of those bits for a sequence number, keeping one so that its value is
    // preserved and the others can be recreated by sign-extension on decode. The sequence is important as it causes
    // the stored value to change even if the stack is pushing and popping the same pointer.
    static_assert(sizeof(size_t) == 8, "64-bit only");
    constexpr static size_t kCPUBits = 7; // MSBs that are limited by CPU hardware and must match bit 56
    constexpr static size_t kCPUMask = ((size_t(1) << (kCPUBits + 1)) - 1) << (63 - kCPUBits);
    constexpr static size_t kSeqBits = kCPUBits + 3; // We also use the lowest 3 (alignment) bits as part of the sequence
    constexpr static size_t kSeqMask = (size_t(1) << kSeqBits) - 1;

    std::atomic_uint64_t m_head{ 0 };
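
    // Worked example of the layout produced by encode()/decode() below: with kCPUBits = 7 and kSeqBits = 10, an
    // 8-byte-aligned pointer p is stored roughly as (p << 7) | seq:
    //     bits 63..10 : bits 56..3 of the pointer (bits 2..0 are zero due to alignment)
    //     bits  9..0  : sequence counter
    // decode() recovers the pointer with an arithmetic (sign-extending) right shift, which recreates pointer bits
    // 63..57 from bit 56, so canonical addresses round-trip exactly.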

    static LocklessStackLink<T>* decode(const size_t& val, uint16_t& seq)
    {
        seq = uint16_t(val & kSeqMask);
        // Shift the pointer bits back into place; the arithmetic shift of ptrdiff_t sign-extends the top bits.
        return reinterpret_cast<LocklessStackLink<T>*>(ptrdiff_t(val & ~kSeqMask) >> kCPUBits);
    }

    static size_t encode(LocklessStackLink<T>* p, uint16_t seq)
    {
        // All OS bits should either be all zero or all one, and the pointer must be 8-byte aligned. Very rarely on
        // Windows we may see this value be the freed-memory fill pattern (all 0xDD bytes) during pop().
        assert(size_t(p) == 0xddddddddddddddddull || (size_t(p) & kCPUMask) == 0 || (size_t(p) & kCPUMask) == kCPUMask);
        assert(size_t(p) == 0xddddddddddddddddull || (size_t(p) % 8) == 0);
        return ((size_t(p) << kCPUBits) & ~kSeqMask) + (seq & uint16_t(kSeqMask));
    }

    static T* convert(LocklessStackLink<T>* p)
    {
        // We need to calculate the offset of our link member and calculate where T is.
        // Note that this doesn't work if T uses virtual inheritance.
        size_t offset = (size_t) reinterpret_cast<char*>(&(((T*)0)->*U));
        return reinterpret_cast<T*>(reinterpret_cast<char*>(p) - offset);
    }
};
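
// ----------------------------------------------------------------------------------------------------------------
// Usage sketch (illustrative only; not part of the header above). A minimal multi-producer/multi-consumer example,
// assuming this file is saved as "LocklessStack.h". The `Node` type and the thread/item counts are hypothetical.
//
//     #include "LocklessStack.h"
//
//     #include <thread>
//     #include <vector>
//
//     struct Node
//     {
//         int value{ 0 };
//         LocklessStackLink<Node> m_link;
//     };
//
//     int main()
//     {
//         LocklessStack<Node, &Node::m_link> stack;
//         std::vector<std::thread> threads;
//         for (int t = 0; t < 4; ++t)
//         {
//             // Producer: allocates nodes and pushes them.
//             threads.emplace_back([&stack, t] {
//                 for (int i = 0; i < 1000; ++i)
//                 {
//                     Node* n = new Node;
//                     n->value = t * 1000 + i;
//                     stack.push(n);
//                 }
//             });
//             // Consumer: pops nodes and deletes them. pop() may return nullptr if the stack is momentarily empty;
//             // deleting nullptr is a no-op.
//             threads.emplace_back([&stack] {
//                 for (int i = 0; i < 1000; ++i)
//                     delete stack.pop();
//             });
//         }
//         for (auto& th : threads)
//             th.join();
//         // Drain anything the consumers missed so that ~LocklessStack()'s isEmpty() assertion holds.
//         while (Node* n = stack.pop())
//             delete n;
//         return 0;
//     }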