Skip to content

Instantly share code, notes, and snippets.

@jkriegshauser
Last active April 16, 2020 03:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jkriegshauser/c57ea23dc41beff96e13c9c381a3f008 to your computer and use it in GitHub Desktop.
Lockless Stack
// Copyright 2020 NVIDIA CORPORATION
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Using LocklessStack is designed to be simple. For a class `Foo` that you want to be contained in a LocklessStack,
// it must have a member of type `LocklessStackLink<Foo>`. This member is what the LocklessStack will use for tracking
// data.
//
// LocklessStack is entirely thread-safe except where declared otherwise. No allocation happens with a LocklessStack;
// instead the caller is responsible for construction/destruction of contained objects.
//
// Pushing to LocklessStack is simply done through `push()`, which is entirely thread-safe. LocklessStack ensures LIFO
// for each producer pushing to LocklessStack.
//
// Popping is done through `pop()`, which is also entirely thread-safe. Multiple threads may all attempt to pop from
// the same LocklessStack simultaneously.
//
// Simple example:
// class Foo
// {
// public:
// LocklessStackLink<Foo> m_link;
// };
//
// LocklessStack<Foo, &Foo::m_link> stack;
// stack.push(new Foo);
// Foo* p = stack.pop();
// delete p;
#pragma once

#include <atomic>
#include <cassert>
#include <cstdint>
#include <exception>

#if __GNUC__
# include <signal.h>
# include <ucontext.h> // ucontext_t / mcontext_t for the SIGSEGV handler
#endif
namespace details
{
#if __GNUC__
// Start/stop addresses of the LocklessStackDetails section. The linker automatically generates
// __start_<name>/__stop_<name> symbols for any section whose name is a valid C identifier.
extern "C" unsigned char __start_LocklessStackDetails[];
extern "C" unsigned char __stop_LocklessStackDetails[];

// RAII registration of a process-wide SIGSEGV handler that allows readNext() to recover gracefully when it
// dereferences a pointer to a node that another thread has already popped and destroyed.
class SignalHandler
{
public:
    SignalHandler()
    {
        // Zero-initialize so fields we don't explicitly set (e.g. sa_restorer on some platforms) are not
        // indeterminate when passed to sigaction().
        struct sigaction newAction = {};
        newAction.sa_flags = SA_SIGINFO;
        sigemptyset(&newAction.sa_mask);
        newAction.sa_sigaction = &Handler;
        // Install our handler; the previously-installed action is saved in old() for chaining/restoration.
        sigaction(SIGSEGV, &newAction, old());
    }

    ~SignalHandler()
    {
        // Restore the previously-installed handler.
        sigaction(SIGSEGV, old(), nullptr);
    }

    // This function is *very specifically* always no-inline, maximum optimizations without a frame pointer, and placed
    // in a special section. This is so `Handler()` can gracefully recover from a very specific SIGSEGV in this
    // function.
    //
    // Attempts to read `*(void**)in` into `*out`. Returns true on success; returns false (by way of Handler()
    // rewriting the faulting context) if the read caused SIGSEGV.
    static bool __attribute__((noinline, optimize(3, "omit-frame-pointer"), section("LocklessStackDetails")))
    readNext(void** out, void* in)
    {
        // The asm for this function SHOULD be (x86_64):
        //   mov rax, qword ptr [rsi]
        //   mov qword ptr [rdi], rax
        //   mov rax, 1
        //   ret
        // And for ARMv8.1:
        //   ldr x1, [x1]
        //   str x1, [x0]
        //   mov w0, 1
        //   ret
        // Very important that the first instruction is the one that can fail, and there are no stack operations.
        // This is all done very carefully because we expect that there is a very rare chance that the first operation
        // could cause SIGSEGV if another thread has already modified the stack. We handle that signal with zero cost
        // unless it happens. On Windows, this is done much simpler since SEH exists, but on Linux we have to very
        // carefully handle with some creative signal handling.
        *out = *(void**)in;
        return true;
    }

    // Idempotently installs the process-wide handler (thread-safe via C++11 magic-static initialization).
    static void EnsureRegistered()
    {
        static SignalHandler handler;
        static_cast<void>(handler); // silence warning about unused
    }

private:
    // Storage for the previously-installed SIGSEGV action. Function-local static avoids global init-order issues.
    static struct sigaction* old()
    {
        static struct sigaction oldAction;
        return &oldAction;
    }

    // SIGSEGV handler: if the fault occurred on the first instruction of readNext(), rewrite the interrupted
    // context so readNext() returns false; otherwise terminate (if inside readNext) or chain to the old handler.
    static void Handler(int sig, siginfo_t* info, void* ctx)
    {
        ucontext_t* context = reinterpret_cast<ucontext_t*>(ctx);
#    if defined(__x86_64__)
        greg_t& rax = context->uc_mcontext.gregs[REG_RAX];
        greg_t& rip = context->uc_mcontext.gregs[REG_RIP];
        if (rip == greg_t(&readNext))
        {
            // The crash happened where we expected it to, on the first instruction of readNext(). Handle gracefully.
            rax = 0; // Return value to false
            // Set to the last instruction of readNext(). It should be the return instruction.
            rip = greg_t(&__stop_LocklessStackDetails) - 1;
            if (*(unsigned char*)rip != 0xC3)
                std::terminate(); // must be a return instruction
            return;
        }
        // If this is the case, then either we crashed at a different location within the function, or there is some
        // prologue code generated by the compiler. In the latter case, we can't just forward `rip` to the ret
        // instruction because any stack operations wouldn't be undone.
        if (rip >= greg_t(&__start_LocklessStackDetails) && rip < greg_t(&__stop_LocklessStackDetails))
            // SIGSEGV in expected function but not at expected location!
            std::terminate();
#    elif defined(__aarch64__)
        __u64& pc = context->uc_mcontext.pc;
        __u64& x0 = context->uc_mcontext.regs[0];
        if (pc == __u64(&readNext))
        {
            // The crash happened where we expected it to, on the first instruction of readNext(). Handle gracefully.
            x0 = 0; // Return value to false
            // Set to the last instruction of readNext(). ARM64 instructions are 32 bits. It should be the return
            // instruction.
            pc = __u64(&__stop_LocklessStackDetails) - sizeof(uint32_t);
            if (*(uint32_t*)pc != 0xd65f03c0)
                std::terminate(); // Must be a return instruction
            return;
        }
        // If this is the case, then either we crashed at a different location within the function, or there is some
        // prologue code generated by the compiler. In the latter case, we can't just forward `pc` to the ret
        // instruction because any stack operations wouldn't be undone.
        // FIX: use __u64 here (previously the x86-only greg_t), consistent with the rest of this branch.
        if (pc >= __u64(&__start_LocklessStackDetails) && pc < __u64(&__stop_LocklessStackDetails))
            // SIGSEGV in expected function but not at expected location!
            std::terminate();
#    else
#        error Unknown architecture!
#    endif
        // Chain to previous handler if one exists.
        // NOTE: sa_handler and sa_sigaction share storage (a union); comparing sa_handler against SIG_IGN/SIG_DFL is
        // the conventional way to classify the saved action regardless of which member was set.
        struct sigaction* prev = old();
        if (prev->sa_handler == SIG_IGN)
            return;
        else if (prev->sa_handler != SIG_DFL)
            prev->sa_sigaction(sig, info, ctx);
        else
        {
            // Run the default signal handler. Normally we would have to re-raise the signal, but since this is
            // SIGSEGV, if we return without handling it should re-occur immediately.
            sigaction(SIGSEGV, old(), nullptr);
        }
    }
};
#elif _MSC_VER
class SignalHandler
{
public:
    static void EnsureRegistered()
    {
        // Nothing to do on Windows: readNext() handles the faulting read with SEH directly.
    }

    static bool readNext(void** out, void* in)
    {
        // We do this in a SEH block (on Windows) because it's possible (though rare) that another thread could have
        // already popped and destroyed `cur` which would cause EXCEPTION_ACCESS_VIOLATION. By handling it in an
        // exception handler, we recover cleanly and try again. On 64-bit Windows, there is zero cost unless an
        // exception is thrown, at which point the kernel will find the Exception info and Unwind Info for the function
        // that we're in.
        __try
        {
            *out = *(void**)in;
            return true;
        }
        __except (1)
        {
            return false;
        }
    }
};
#else
#    error Unsupported compiler
#endif
} // namespace details
/**
 * Intrusive link node for LocklessStack. Every type `T` stored in a `LocklessStack` must contain a member of type
 * `LocklessStackLink<T>`; the stack uses that member to chain entries together.
 */
template <class T>
class LocklessStackLink
{
public:
    /// Default constructor. The link is not initialized here; LocklessStack writes it on push().
    constexpr LocklessStackLink() = default;

private:
    // Only the owning stack (and T itself) may touch the linkage.
    template <class U, LocklessStackLink<U> U::* V>
    friend class LocklessStack;
    friend T;

    LocklessStackLink<T>* m_next; // next link in the chain; written exclusively by LocklessStack
};
/**
 * An implementation of a lockless stack. The second template parameter is a pointer-to-member parameter to the
 * LocklessStackLink<T> member.
 *
 * Thread-safety: push() and pop() may be called concurrently from any number of threads. No allocation occurs;
 * callers own the lifetime of contained objects.
 */
template <class T, LocklessStackLink<T> T::* U>
class LocklessStack
{
public:
    /// Constructor. Ensures the process-wide signal handler used by pop() is registered (no-op on Windows).
    LocklessStack()
    {
        details::SignalHandler::EnsureRegistered();
    }

    /// Destructor. Asserts that all items have been popped from the LocklessStack.
    ~LocklessStack()
    {
        // Ensure the stack is empty
        assert(isEmpty());
    }

    /// Indicates whether the stack is empty. Note that this value is transient. Another thread could have changed the
    /// stack before this call returns.
    /// @return true if the stack appears empty; false if items appear to exist in the stack
    bool isEmpty() const
    {
        return !m_head.load(std::memory_order_relaxed);
    }

    /// Pushes an item onto the stack. This is safe to be executed from multiple threads simultaneously.
    /// @param p The item to push onto the stack.
    /// @return true if the stack was previously empty; false otherwise. Note that this is atomic as opposed to checking
    /// `isEmpty()` prior to calling the function.
    bool push(T* p)
    {
        uint16_t seq;
        uint64_t expected = m_head.load(std::memory_order_relaxed), temp;
        do
        {
            // Link the new node in front of the current head, then attempt to publish it with a bumped sequence.
            (p->*U).m_next = decode(expected, seq);
            temp = encode(&(p->*U), seq + 1);
        } while (!m_head.compare_exchange_strong(expected, temp, std::memory_order_release));
        // On success `expected` holds the pre-push head value; zero means the stack was empty.
        return !expected;
    }

    /// Pops an item from the stack if available. This is safe to be executed from multiple threads simultaneously.
    /// @return An item popped from the stack. If the stack was empty, then nullptr is returned.
    T* pop()
    {
        // FIX: use uint64_t (not size_t) so the type exactly matches `m_head`'s value type for
        // compare_exchange_strong on every platform; push() already does this. size_t and uint64_t are the same
        // width here (see static_assert below) but are distinct types on some ABIs.
        uint64_t expected = m_head.load(std::memory_order_relaxed), temp;
        LocklessStackLink<T>* cur;
        uint16_t seq;
        for (;;)
        {
            cur = decode(expected, seq);
            if (!cur)
                return nullptr; // stack empty
            // Attempt to read the next value. This can fault if another thread popped and destroyed `cur`;
            // SignalHandler::readNext reports that as `false` instead of crashing.
            LocklessStackLink<T>* newhead;
            if (!details::SignalHandler::readNext((void**)&newhead, cur))
            {
                // Another thread changed `cur`, so reload and try again.
                expected = m_head.load(std::memory_order_relaxed);
                continue;
            }
            // Clear sequence if head is nullptr, otherwise add 1 so it's changing for each op.
            seq = !newhead ? 0 : seq + 1;
            temp = encode(newhead, seq);
            if (m_head.compare_exchange_strong(expected, temp, std::memory_order_release))
                break;
        }
        return convert(cur);
    }

private:
    // On 64-bit architectures, we make use of the fact that CPUs only use a certain number of address bits.
    // Intel CPUs require that these 8 to 16 most-significant-bits match (all 1s or 0s). Since 8 appears to be the
    // lowest common denominator, we steal 7 bits (to save the value of one of the bits so that they can match) for a
    // sequence number. The sequence is important as it causes the resulting stored value to change even if the stack is
    // pushing and popping the same value (the classic ABA problem).
    static_assert(sizeof(size_t) == 8, "64-bit only");
    constexpr const static size_t kCPUBits = 7; // MSBs that are limited by CPU hardware and must match the 56th bit
    constexpr const static size_t kCPUMask = ((size_t(1) << (kCPUBits + 1)) - 1) << (63 - kCPUBits);
    constexpr const static size_t kSeqBits = kCPUBits + 3; // We also use the lowest 3 bits as part of the sequence
    constexpr const static size_t kSeqMask = (size_t(1) << kSeqBits) - 1;

    // Packed head: pointer (shifted) plus sequence counter. Brace-init replaces the C++20-deprecated
    // ATOMIC_VAR_INIT macro; the initial value (empty stack) is unchanged.
    std::atomic_uint64_t m_head{ 0 };

    /// Unpacks a stored head value into the link pointer; writes the sequence portion to `seq`.
    static LocklessStackLink<T>* decode(const size_t& val, uint16_t& seq)
    {
        seq = val & kSeqMask;
        return reinterpret_cast<LocklessStackLink<T>*>(ptrdiff_t(val & ~kSeqMask) >> kCPUBits); // Use sign-extension
    }

    /// Packs a link pointer and sequence into a single 64-bit head value.
    static size_t encode(LocklessStackLink<T>* p, uint16_t seq)
    {
        // All OS bits should either be zero or one, and it needs to be 8-byte-aligned. Very rarely on Windows we'll see
        // this value be the deleted memory marker (all Ds) during pop()
        assert(size_t(p) == 0xddddddddddddddddull || (size_t(p) & kCPUMask) == 0 || (size_t(p) & kCPUMask) == kCPUMask);
        assert(size_t(p) == 0xddddddddddddddddull || (size_t(p) % 8) == 0);
        return ((size_t(p) << kCPUBits) & ~kSeqMask) + (seq & uint16_t(kSeqMask));
    }

    /// Recovers the containing T* from a pointer to its embedded link member.
    static T* convert(LocklessStackLink<T>* p)
    {
        // We need to calculate the offset of our link member and calculate where T is.
        // Note that this doesn't work if T uses virtual inheritance
        size_t offset = (size_t) reinterpret_cast<char*>(&(((T*)0)->*U));
        return reinterpret_cast<T*>(reinterpret_cast<char*>(p) - offset);
    }
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment