glampert/linear_allocator.cpp

## linear_allocator.cpp
// ============================================================================
// This code is in the Public Domain - I claim no copyrights over it.
// Use at your own leisure and risk.
//
// Compiled and tested with:
//  c++ -std=c++11 -fno-exceptions -Wall -Wextra -pedantic -O3 linear_allocator.cpp
// ============================================================================

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include <atomic>
#include <memory>
#include <thread>
#include <utility>
#include <vector>

// ========================================================

inline bool is_power_of_two(const std::size_t size)
{
    return size != 0 && (size & (size - 1)) == 0;
}

inline bool is_aligned_ptr(const void * ptr, const std::size_t alignment)
{
    // Aligned if the pointer is evenly divisible by the alignment value.
    return (reinterpret_cast<std::uintptr_t>(ptr) & (alignment - 1)) == 0;
}

inline std::size_t align_size(const std::size_t size, const std::size_t alignment)
{
    // Add the minimum extra needed to the size for pointer alignment.
    // This size can then be used to malloc() some memory
    // and then have the pointer aligned with align_ptr().
    return size + (alignment - 1);
}

template<typename T>
inline T * align_ptr(const T * ptr, const std::size_t alignment)
{
    // Cast to integer and align:
    const std::uintptr_t uint_ptr    = reinterpret_cast<std::uintptr_t>(ptr);
    const std::uintptr_t aligned_ptr = (uint_ptr + (alignment - 1)) & ~(alignment - 1);

    // Re-cast to void*, validate and return:
    T * user_ptr = reinterpret_cast<T *>(aligned_ptr);
    assert(is_aligned_ptr(user_ptr, alignment));
    return user_ptr;
}

// ========================================================

struct allocator_out_of_mem_abort
{
    void operator()(const char * user_msg, std::size_t num_bytes, std::size_t alignment) const
    {
        std::fprintf(stderr, "[%s]: Failed to allocate %zu bytes with %zub alignment!\n",
                     user_msg, num_bytes, alignment);
        std::abort();
    }
};

// ========================================================

template<typename OutOfMemHandler = allocator_out_of_mem_abort>
class linear_allocator final
{
public:

    // Not copyable.
    linear_allocator(const linear_allocator &) = delete;
    linear_allocator & operator = (const linear_allocator &) = delete;

    // Allocated the stack memory block.
    explicit linear_allocator(std::uint32_t max_stack_size_bytes)
        : m_stack{ new std::uint8_t[max_stack_size_bytes] }
        , m_size{ max_stack_size_bytes }
        , m_top{ 0 }
    {
        // Should be on any decent architecture.
        assert(m_top.is_lock_free());
    }

    // Allocates a chunk of bytes from the linear stack, bumping the stack top counter
    // and calling the user-defined error handler if out-of-memory (stack overflowed).
    std::uint8_t * allocate_bytes(std::size_t num_bytes, std::size_t alignment = 16)
    {
        assert(num_bytes != 0);
        assert(alignment != 0);
        assert(is_power_of_two(alignment));

        num_bytes = align_size(num_bytes, alignment);
        assert(num_bytes <= UINT32_MAX);

        const std::uint32_t old_top = m_top.fetch_add(static_cast<std::uint32_t>(num_bytes));
        if ((old_top + num_bytes) > m_size)
        {
            OutOfMemHandler out_of_mem{};
            out_of_mem("linear_allocator", num_bytes, alignment);
            return nullptr;
        }

        return align_ptr(&m_stack[old_top], alignment);
    }

    // Allocate with cast to user-defined type T.
    // The returned element is *not* constructed, so the caller is
    // responsible for calling placement new on the pointer to run
    // class constructors.
    template<typename T>
    T * allocate(std::size_t count, std::size_t alignment = 16)
    {
        return reinterpret_cast<T *>(allocate_bytes(count * sizeof(T), alignment));
    }

    // Does nothing - only provided for consistency.
    // The only way to deallocate memory is by rolling back the stack top counter.
    void free(const void *) { }

    // Rests the stack top counter to a given position (zero by default),
    // effectively marking any memory beyond that position as free for
    // new allocations.
    void reset(std::uint32_t position = 0)
    {
        m_top = position;
    }

    // Accessors:
    std::uint8_t * get_base() const { return m_stack; }
    std::uint32_t  get_size() const { return m_size;  }
    std::uint32_t  get_top()  const { return m_top;   }

private:

    // The linear_allocator is a simple block of memory where
    // allocations always move forward in the block - i.e.: it
    // is a stack-like allocator. Total size of the stack mem
    // block never changes after initialization.
    std::unique_ptr<std::uint8_t[]> m_stack;
    const std::uint32_t m_size;

    // The stack top counter is atomic, so it is safe to allocate
    // from the same linear_allocator across different threads.
    std::atomic<std::uint32_t> m_top;
};

// ========================================================

template<typename T>
T * construct(T * obj, const T & other) // Copy constructor
{
    return ::new(obj) T(other);
}

template<typename T, typename... Args>
T * construct(T * obj, Args&&... args) // Parameterized or default constructor
{
    return ::new(obj) T(std::forward<Args>(args)...);
}

template<typename T>
void destroy(T * obj)
{
    if (obj != nullptr)
    {
        obj->~T();
    }
}

// ========================================================

void thread_func(std::uint8_t thread_idx, std::uint64_t * out, linear_allocator<> * shared_allocator)
{
    const int MaxThreadAlloc = 2048;
    const int N = std::max(1, std::rand() % 128);

    std::printf("Thread %u running %i iterations...\n", static_cast<unsigned>(thread_idx), N);

    std::vector<std::pair<std::uint8_t *, int>> blocks;
    blocks.reserve(N);

    // Do a bunch of small/medium allocation every other iteration of the loop:
    for (int i = 0; i < N; ++i)
    {
        if (i & 1)
        {
            const int size = std::max(1, std::rand() % MaxThreadAlloc);
            std::uint8_t * ptr = shared_allocator->allocate_bytes(size);
            std::memset(ptr, thread_idx, size);
            blocks.emplace_back(ptr, size);
        }
    }

    // Makes sure we have a visible side-effect so the compiler
    // doesn't strip out the whole loop; this value is ignored.
    std::uint64_t sum = 0;

    // Assert that our memory was not overwritten by other threads:
    for (const auto & block_and_size : blocks)
    {
        const std::uint8_t * ptr = block_and_size.first;
        const int size = block_and_size.second;

        for (int i = 0; i < size; ++i)
        {
            assert(ptr[i] == thread_idx);
            sum += ptr[i];
        }
    }

    (*out) += sum;
}

// ========================================================

int main()
{
    linear_allocator<> shared_allocator{ 2048 * 1024 }; // 2MB

    std::uint64_t ignored = 0;
    std::vector<std::thread> threads;

    // 10 runs with 10 thread each
    for (int i = 0; i < 10; ++i)
    {
        for (int t = 0; t < 10; ++t)
        {
            threads.emplace_back(thread_func, static_cast<std::uint8_t>(t), &ignored, &shared_allocator);
        }

        for (auto & t : threads)
        {
            t.join();
        }

        threads.clear();

        std::printf("Mem used: %ub\n", shared_allocator.get_top());
        shared_allocator.reset();
    }

    return static_cast<int>(ignored);
}
	// ============================================================================
	// This code is in the Public Domain - I claim no copyrights over it.
	// Use at your own leisure and risk.
	//
	// Compiled and tested with:
	// c++ -std=c++11 -fno-exceptions -Wall -Wextra -pedantic -O3 linear_allocator.cpp
	// ============================================================================

	#include <cassert>
	#include <cstdint>
	#include <cstdio>
	#include <cstdlib>
	#include <cstring>

	#include <atomic>
	#include <memory>
	#include <thread>
	#include <utility>
	#include <vector>

	// ========================================================

	inline bool is_power_of_two(const std::size_t size)
	{
	return size != 0 && (size & (size - 1)) == 0;
	}

	inline bool is_aligned_ptr(const void * ptr, const std::size_t alignment)
	{
	// Aligned if the pointer is evenly divisible by the alignment value.
	return (reinterpret_cast<std::uintptr_t>(ptr) & (alignment - 1)) == 0;
	}

	inline std::size_t align_size(const std::size_t size, const std::size_t alignment)
	{
	// Add the minimum extra needed to the size for pointer alignment.
	// This size can then be used to malloc() some memory
	// and then have the pointer aligned with align_ptr().
	return size + (alignment - 1);
	}

	template<typename T>
	inline T * align_ptr(const T * ptr, const std::size_t alignment)
	{
	// Cast to integer and align:
	const std::uintptr_t uint_ptr = reinterpret_cast<std::uintptr_t>(ptr);
	const std::uintptr_t aligned_ptr = (uint_ptr + (alignment - 1)) & ~(alignment - 1);

	// Re-cast to void*, validate and return:
	T * user_ptr = reinterpret_cast<T *>(aligned_ptr);
	assert(is_aligned_ptr(user_ptr, alignment));
	return user_ptr;
	}

	// ========================================================

	struct allocator_out_of_mem_abort
	{
	void operator()(const char * user_msg, std::size_t num_bytes, std::size_t alignment) const
	{
	std::fprintf(stderr, "[%s]: Failed to allocate %zu bytes with %zub alignment!\n",
	user_msg, num_bytes, alignment);
	std::abort();
	}
	};

	// ========================================================

	template<typename OutOfMemHandler = allocator_out_of_mem_abort>
	class linear_allocator final
	{
	public:

	// Not copyable.
	linear_allocator(const linear_allocator &) = delete;
	linear_allocator & operator = (const linear_allocator &) = delete;

	// Allocated the stack memory block.
	explicit linear_allocator(std::uint32_t max_stack_size_bytes)
	: m_stack{ new std::uint8_t[max_stack_size_bytes] }
	, m_size{ max_stack_size_bytes }
	, m_top{ 0 }
	{
	// Should be on any decent architecture.
	assert(m_top.is_lock_free());
	}

	// Allocates a chunk of bytes from the linear stack, bumping the stack top counter
	// and calling the user-defined error handler if out-of-memory (stack overflowed).
	std::uint8_t * allocate_bytes(std::size_t num_bytes, std::size_t alignment = 16)
	{
	assert(num_bytes != 0);
	assert(alignment != 0);
	assert(is_power_of_two(alignment));

	num_bytes = align_size(num_bytes, alignment);
	assert(num_bytes <= UINT32_MAX);

	const std::uint32_t old_top = m_top.fetch_add(static_cast<std::uint32_t>(num_bytes));
	if ((old_top + num_bytes) > m_size)
	{
	OutOfMemHandler out_of_mem{};
	out_of_mem("linear_allocator", num_bytes, alignment);
	return nullptr;
	}

	return align_ptr(&m_stack[old_top], alignment);
	}

	// Allocate with cast to user-defined type T.
	// The returned element is not constructed, so the caller is
	// responsible for calling placement new on the pointer to run
	// class constructors.
	template<typename T>
	T * allocate(std::size_t count, std::size_t alignment = 16)
	{
	return reinterpret_cast<T >(allocate_bytes(count sizeof(T), alignment));
	}

	// Does nothing - only provided for consistency.
	// The only way to deallocate memory is by rolling back the stack top counter.
	void free(const void *) { }

	// Rests the stack top counter to a given position (zero by default),
	// effectively marking any memory beyond that position as free for
	// new allocations.
	void reset(std::uint32_t position = 0)
	{
	m_top = position;
	}

	// Accessors:
	std::uint8_t * get_base() const { return m_stack; }
	std::uint32_t get_size() const { return m_size; }
	std::uint32_t get_top() const { return m_top; }

	private:

	// The linear_allocator is a simple block of memory where
	// allocations always move forward in the block - i.e.: it
	// is a stack-like allocator. Total size of the stack mem
	// block never changes after initialization.
	std::unique_ptr<std::uint8_t[]> m_stack;
	const std::uint32_t m_size;

	// The stack top counter is atomic, so it is safe to allocate
	// from the same linear_allocator across different threads.
	std::atomic<std::uint32_t> m_top;
	};

	// ========================================================

	template<typename T>
	T * construct(T * obj, const T & other) // Copy constructor
	{
	return ::new(obj) T(other);
	}

	template<typename T, typename... Args>
	T * construct(T * obj, Args&&... args) // Parameterized or default constructor
	{
	return ::new(obj) T(std::forward<Args>(args)...);
	}

	template<typename T>
	void destroy(T * obj)
	{
	if (obj != nullptr)
	{
	obj->~T();
	}
	}

	// ========================================================

	void thread_func(std::uint8_t thread_idx, std::uint64_t * out, linear_allocator<> * shared_allocator)
	{
	const int MaxThreadAlloc = 2048;
	const int N = std::max(1, std::rand() % 128);

	std::printf("Thread %u running %i iterations...\n", static_cast<unsigned>(thread_idx), N);

	std::vector<std::pair<std::uint8_t *, int>> blocks;
	blocks.reserve(N);

	// Do a bunch of small/medium allocation every other iteration of the loop:
	for (int i = 0; i < N; ++i)
	{
	if (i & 1)
	{
	const int size = std::max(1, std::rand() % MaxThreadAlloc);
	std::uint8_t * ptr = shared_allocator->allocate_bytes(size);
	std::memset(ptr, thread_idx, size);
	blocks.emplace_back(ptr, size);
	}
	}

	// Makes sure we have a visible side-effect so the compiler
	// doesn't strip out the whole loop; this value is ignored.
	std::uint64_t sum = 0;

	// Assert that our memory was not overwritten by other threads:
	for (const auto & block_and_size : blocks)
	{
	const std::uint8_t * ptr = block_and_size.first;
	const int size = block_and_size.second;

	for (int i = 0; i < size; ++i)
	{
	assert(ptr[i] == thread_idx);
	sum += ptr[i];
	}
	}

	(*out) += sum;
	}

	// ========================================================

	int main()
	{
	linear_allocator<> shared_allocator{ 2048 * 1024 }; // 2MB

	std::uint64_t ignored = 0;
	std::vector<std::thread> threads;

	// 10 runs with 10 thread each
	for (int i = 0; i < 10; ++i)
	{
	for (int t = 0; t < 10; ++t)
	{
	threads.emplace_back(thread_func, static_cast<std::uint8_t>(t), &ignored, &shared_allocator);
	}

	for (auto & t : threads)
	{
	t.join();
	}

	threads.clear();

	std::printf("Mem used: %ub\n", shared_allocator.get_top());
	shared_allocator.reset();
	}

	return static_cast<int>(ignored);
	}