Skip to content

Instantly share code, notes, and snippets.

@jkriegshauser
Last active April 16, 2020 03:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jkriegshauser/c57ea23dc41beff96e13c9c381a3f008 to your computer and use it in GitHub Desktop.
Lockless Stack
// Copyright 2020 NVIDIA CORPORATION
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Using LocklessStack is designed to be simple. For a class `Foo` that you want to be contained in a LocklessStack,
// it must have a member of type `LocklessStackLink<Foo>`. This member is what the LocklessStack will use for tracking
// data.
//
// LocklessStack is entirely thread-safe except where declared otherwise. No allocation happens with a LocklessStack;
// instead the caller is responsible for construction/destruction of contained objects.
//
// Pushing to LocklessStack is simply done through `push()`, which is entirely thread-safe. LocklessStack ensures LIFO
// for each producer pushing to LocklessStack.
//
// Popping is done through `pop()`, which is also entirely thread-safe. Multiple threads may all attempt to pop from
// the same LocklessStack simultaneously.
//
// Simple example:
// class Foo
// {
// public:
// LocklessStackLink<Foo> m_link;
// };
//
// LocklessStack<Foo, &Foo::m_link> stack;
// stack.push(new Foo);
// Foo* p = stack.pop();
// delete p;
#pragma once

#include <atomic>
#include <cassert>
#include <cstdint>
#include <exception>

#if __GNUC__
# include <signal.h>
# include <ucontext.h> // ucontext_t / mcontext_t for the SIGSEGV handler
#endif
namespace details
{
#if __GNUC__
// Start/stop addresses of the LocklessStackDetails section. The linker automatically generates
// __start_<name>/__stop_<name> symbols for any section whose name is a valid C identifier.
extern "C" unsigned char __start_LocklessStackDetails[];
extern "C" unsigned char __stop_LocklessStackDetails[];

// RAII registration of a process-wide SIGSEGV handler that allows readNext() to recover gracefully when it
// dereferences a pointer to a node that another thread has already popped and destroyed.
class SignalHandler
{
public:
    SignalHandler()
    {
        // Zero-initialize so fields we don't explicitly set (e.g. sa_restorer on some platforms) are not
        // indeterminate when passed to sigaction().
        struct sigaction newAction = {};
        newAction.sa_flags = SA_SIGINFO;
        sigemptyset(&newAction.sa_mask);
        newAction.sa_sigaction = &Handler;
        // Install our handler; the previously-installed action is saved in old() for chaining/restoration.
        sigaction(SIGSEGV, &newAction, old());
    }

    ~SignalHandler()
    {
        // Restore the previously-installed handler.
        sigaction(SIGSEGV, old(), nullptr);
    }

    // This function is *very specifically* always no-inline, maximum optimizations without a frame pointer, and placed
    // in a special section. This is so `Handler()` can gracefully recover from a very specific SIGSEGV in this
    // function.
    //
    // Attempts to read `*(void**)in` into `*out`. Returns true on success; returns false (by way of Handler()
    // rewriting the faulting context) if the read caused SIGSEGV.
    static bool __attribute__((noinline, optimize(3, "omit-frame-pointer"), section("LocklessStackDetails")))
    readNext(void** out, void* in)
    {
        // The asm for this function SHOULD be (x86_64):
        //   mov rax, qword ptr [rsi]
        //   mov qword ptr [rdi], rax
        //   mov rax, 1
        //   ret
        // And for ARMv8.1:
        //   ldr x1, [x1]
        //   str x1, [x0]
        //   mov w0, 1
        //   ret
        // Very important that the first instruction is the one that can fail, and there are no stack operations.
        // This is all done very carefully because we expect that there is a very rare chance that the first operation
        // could cause SIGSEGV if another thread has already modified the stack. We handle that signal with zero cost
        // unless it happens. On Windows, this is done much simpler since SEH exists, but on Linux we have to very
        // carefully handle with some creative signal handling.
        *out = *(void**)in;
        return true;
    }

    // Idempotently installs the process-wide handler (thread-safe via C++11 magic-static initialization).
    static void EnsureRegistered()
    {
        static SignalHandler handler;
        static_cast<void>(handler); // silence warning about unused
    }

private:
    // Storage for the previously-installed SIGSEGV action. Function-local static avoids global init-order issues.
    static struct sigaction* old()
    {
        static struct sigaction oldAction;
        return &oldAction;
    }

    // SIGSEGV handler: if the fault occurred on the first instruction of readNext(), rewrite the interrupted
    // context so readNext() returns false; otherwise terminate (if inside readNext) or chain to the old handler.
    static void Handler(int sig, siginfo_t* info, void* ctx)
    {
        ucontext_t* context = reinterpret_cast<ucontext_t*>(ctx);
#    if defined(__x86_64__)
        greg_t& rax = context->uc_mcontext.gregs[REG_RAX];
        greg_t& rip = context->uc_mcontext.gregs[REG_RIP];
        if (rip == greg_t(&readNext))
        {
            // The crash happened where we expected it to, on the first instruction of readNext(). Handle gracefully.
            rax = 0; // Return value to false
            // Set to the last instruction of readNext(). It should be the return instruction.
            rip = greg_t(&__stop_LocklessStackDetails) - 1;
            if (*(unsigned char*)rip != 0xC3)
                std::terminate(); // must be a return instruction
            return;
        }
        // If this is the case, then either we crashed at a different location within the function, or there is some
        // prologue code generated by the compiler. In the latter case, we can't just forward `rip` to the ret
        // instruction because any stack operations wouldn't be undone.
        if (rip >= greg_t(&__start_LocklessStackDetails) && rip < greg_t(&__stop_LocklessStackDetails))
            // SIGSEGV in expected function but not at expected location!
            std::terminate();
#    elif defined(__aarch64__)
        __u64& pc = context->uc_mcontext.pc;
        __u64& x0 = context->uc_mcontext.regs[0];
        if (pc == __u64(&readNext))
        {
            // The crash happened where we expected it to, on the first instruction of readNext(). Handle gracefully.
            x0 = 0; // Return value to false
            // Set to the last instruction of readNext(). ARM64 instructions are 32 bits. It should be the return
            // instruction.
            pc = __u64(&__stop_LocklessStackDetails) - sizeof(uint32_t);
            if (*(uint32_t*)pc != 0xd65f03c0)
                std::terminate(); // Must be a return instruction
            return;
        }
        // If this is the case, then either we crashed at a different location within the function, or there is some
        // prologue code generated by the compiler. In the latter case, we can't just forward `pc` to the ret
        // instruction because any stack operations wouldn't be undone.
        // FIX: use __u64 here (previously the x86-only greg_t), consistent with the rest of this branch.
        if (pc >= __u64(&__start_LocklessStackDetails) && pc < __u64(&__stop_LocklessStackDetails))
            // SIGSEGV in expected function but not at expected location!
            std::terminate();
#    else
#        error Unknown architecture!
#    endif
        // Chain to previous handler if one exists.
        // NOTE: sa_handler and sa_sigaction share storage (a union); comparing sa_handler against SIG_IGN/SIG_DFL is
        // the conventional way to classify the saved action regardless of which member was set.
        struct sigaction* prev = old();
        if (prev->sa_handler == SIG_IGN)
            return;
        else if (prev->sa_handler != SIG_DFL)
            prev->sa_sigaction(sig, info, ctx);
        else
        {
            // Run the default signal handler. Normally we would have to re-raise the signal, but since this is
            // SIGSEGV, if we return without handling it should re-occur immediately.
            sigaction(SIGSEGV, old(), nullptr);
        }
    }
};
#elif _MSC_VER
class SignalHandler
{
public:
    static void EnsureRegistered()
    {
        // Nothing to do on Windows: readNext() handles the faulting read with SEH directly.
    }

    static bool readNext(void** out, void* in)
    {
        // We do this in a SEH block (on Windows) because it's possible (though rare) that another thread could have
        // already popped and destroyed `cur` which would cause EXCEPTION_ACCESS_VIOLATION. By handling it in an
        // exception handler, we recover cleanly and try again. On 64-bit Windows, there is zero cost unless an
        // exception is thrown, at which point the kernel will find the Exception info and Unwind Info for the function
        // that we're in.
        __try
        {
            *out = *(void**)in;
            return true;
        }
        __except (1)
        {
            return false;
        }
    }
};
#else
#    error Unsupported compiler
#endif
} // namespace details
/**
 * Intrusive link node for LocklessStack. Every type `T` stored in a `LocklessStack` must contain a member of type
 * `LocklessStackLink<T>`; the stack uses that member to chain entries together.
 */
template <class T>
class LocklessStackLink
{
public:
    /// Default constructor. The link is not initialized here; LocklessStack writes it on push().
    constexpr LocklessStackLink() = default;

private:
    // Only the owning stack (and T itself) may touch the linkage.
    template <class U, LocklessStackLink<U> U::* V>
    friend class LocklessStack;
    friend T;

    LocklessStackLink<T>* m_next; // next link in the chain; written exclusively by LocklessStack
};
/**
 * An implementation of a lockless stack. The second template parameter is a pointer-to-member parameter to the
 * LocklessStackLink<T> member.
 *
 * Thread-safety: push() and pop() may be called concurrently from any number of threads. No allocation occurs;
 * callers own the lifetime of contained objects.
 */
template <class T, LocklessStackLink<T> T::* U>
class LocklessStack
{
public:
    /// Constructor. Ensures the process-wide signal handler used by pop() is registered (no-op on Windows).
    LocklessStack()
    {
        details::SignalHandler::EnsureRegistered();
    }

    /// Destructor. Asserts that all items have been popped from the LocklessStack.
    ~LocklessStack()
    {
        // Ensure the stack is empty
        assert(isEmpty());
    }

    /// Indicates whether the stack is empty. Note that this value is transient. Another thread could have changed the
    /// stack before this call returns.
    /// @return true if the stack appears empty; false if items appear to exist in the stack
    bool isEmpty() const
    {
        return !m_head.load(std::memory_order_relaxed);
    }

    /// Pushes an item onto the stack. This is safe to be executed from multiple threads simultaneously.
    /// @param p The item to push onto the stack.
    /// @return true if the stack was previously empty; false otherwise. Note that this is atomic as opposed to checking
    /// `isEmpty()` prior to calling the function.
    bool push(T* p)
    {
        uint16_t seq;
        uint64_t expected = m_head.load(std::memory_order_relaxed), temp;
        do
        {
            // Link the new node in front of the current head, then attempt to publish it with a bumped sequence.
            (p->*U).m_next = decode(expected, seq);
            temp = encode(&(p->*U), seq + 1);
        } while (!m_head.compare_exchange_strong(expected, temp, std::memory_order_release));
        // On success `expected` holds the pre-push head value; zero means the stack was empty.
        return !expected;
    }

    /// Pops an item from the stack if available. This is safe to be executed from multiple threads simultaneously.
    /// @return An item popped from the stack. If the stack was empty, then nullptr is returned.
    T* pop()
    {
        // FIX: use uint64_t (not size_t) so the type exactly matches `m_head`'s value type for
        // compare_exchange_strong on every platform; push() already does this. size_t and uint64_t are the same
        // width here (see static_assert below) but are distinct types on some ABIs.
        uint64_t expected = m_head.load(std::memory_order_relaxed), temp;
        LocklessStackLink<T>* cur;
        uint16_t seq;
        for (;;)
        {
            cur = decode(expected, seq);
            if (!cur)
                return nullptr; // stack empty
            // Attempt to read the next value. This can fault if another thread popped and destroyed `cur`;
            // SignalHandler::readNext reports that as `false` instead of crashing.
            LocklessStackLink<T>* newhead;
            if (!details::SignalHandler::readNext((void**)&newhead, cur))
            {
                // Another thread changed `cur`, so reload and try again.
                expected = m_head.load(std::memory_order_relaxed);
                continue;
            }
            // Clear sequence if head is nullptr, otherwise add 1 so it's changing for each op.
            seq = !newhead ? 0 : seq + 1;
            temp = encode(newhead, seq);
            if (m_head.compare_exchange_strong(expected, temp, std::memory_order_release))
                break;
        }
        return convert(cur);
    }

private:
    // On 64-bit architectures, we make use of the fact that CPUs only use a certain number of address bits.
    // Intel CPUs require that these 8 to 16 most-significant-bits match (all 1s or 0s). Since 8 appears to be the
    // lowest common denominator, we steal 7 bits (to save the value of one of the bits so that they can match) for a
    // sequence number. The sequence is important as it causes the resulting stored value to change even if the stack is
    // pushing and popping the same value (the classic ABA problem).
    static_assert(sizeof(size_t) == 8, "64-bit only");
    constexpr const static size_t kCPUBits = 7; // MSBs that are limited by CPU hardware and must match the 56th bit
    constexpr const static size_t kCPUMask = ((size_t(1) << (kCPUBits + 1)) - 1) << (63 - kCPUBits);
    constexpr const static size_t kSeqBits = kCPUBits + 3; // We also use the lowest 3 bits as part of the sequence
    constexpr const static size_t kSeqMask = (size_t(1) << kSeqBits) - 1;

    // Packed head: pointer (shifted) plus sequence counter. Brace-init replaces the C++20-deprecated
    // ATOMIC_VAR_INIT macro; the initial value (empty stack) is unchanged.
    std::atomic_uint64_t m_head{ 0 };

    /// Unpacks a stored head value into the link pointer; writes the sequence portion to `seq`.
    static LocklessStackLink<T>* decode(const size_t& val, uint16_t& seq)
    {
        seq = val & kSeqMask;
        return reinterpret_cast<LocklessStackLink<T>*>(ptrdiff_t(val & ~kSeqMask) >> kCPUBits); // Use sign-extension
    }

    /// Packs a link pointer and sequence into a single 64-bit head value.
    static size_t encode(LocklessStackLink<T>* p, uint16_t seq)
    {
        // All OS bits should either be zero or one, and it needs to be 8-byte-aligned. Very rarely on Windows we'll see
        // this value be the deleted memory marker (all Ds) during pop()
        assert(size_t(p) == 0xddddddddddddddddull || (size_t(p) & kCPUMask) == 0 || (size_t(p) & kCPUMask) == kCPUMask);
        assert(size_t(p) == 0xddddddddddddddddull || (size_t(p) % 8) == 0);
        return ((size_t(p) << kCPUBits) & ~kSeqMask) + (seq & uint16_t(kSeqMask));
    }

    /// Recovers the containing T* from a pointer to its embedded link member.
    static T* convert(LocklessStackLink<T>* p)
    {
        // We need to calculate the offset of our link member and calculate where T is.
        // Note that this doesn't work if T uses virtual inheritance
        size_t offset = (size_t) reinterpret_cast<char*>(&(((T*)0)->*U));
        return reinterpret_cast<T*>(reinterpret_cast<char*>(p) - offset);
    }
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment