Created
October 20, 2023 16:02
-
-
Save Gerold103/0199ac4c2facc2b34ef25f8ea44a0601 to your computer and use it in GitHub Desktop.
BenchObjectPool
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <atomic>
#include <cassert>
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <ctime>
#include <iostream>
#include <mutex>
#include <random>
#include <thread>
#include <vector>
// Forward declaration: the shared pool which thread-local pools exchange
// whole batches of objects with.
template <typename T, uint32_t BatchSize>
class ThreadGlobalPool;
// Number of objects a thread-local pool accumulates before handing a full
// batch back to the global pool.
static constexpr uint32_t theThreadLocalBatchSize = 32;
// Benchmark configuration: worker thread count, live objects per thread,
// and how many allocate-all/free-all rounds each thread performs.
static constexpr uint32_t theThreadCount = 5;
static constexpr uint32_t theValueCount = 300000;
static constexpr uint32_t theIterCount = 200;
////////////////////////////////////////////////////////////////////////////////////////// | |
// Intrusive singly-linked list. Nodes chain through the T member selected by
// myLink (defaults to T::myNext), so the list itself never allocates.
template<typename T, T* T::*myLink = &T::myNext>
class ForwardList
{
public:
	ForwardList() : myHead(nullptr), myTail(nullptr) {}

	// True when no elements are linked.
	bool IsEmpty() const { return myHead == nullptr; }

	// Detach and return the head element. Must not be called on an empty list.
	T* PopFirst()
	{
		T* first = myHead;
		myHead = first->*myLink;
		if (myHead == nullptr)
			myTail = nullptr;
		return first;
	}

	// Link aItem as the new tail.
	void Append(T* aItem)
	{
		if (myHead == nullptr)
			myHead = aItem;
		else
			myTail->*myLink = aItem;
		myTail = aItem;
		aItem->*myLink = nullptr;
	}

	// Link aItem as the new head.
	void Prepend(T* aItem)
	{
		aItem->*myLink = myHead;
		myHead = aItem;
		if (myTail == nullptr)
			myTail = aItem;
	}

private:
	T* myHead;
	T* myTail;
};
////////////////////////////////////////////////////////////////////////////////////////// | |
// One pooled slot for a T. While the slot is free it acts as an intrusive
// free-list node; while allocated it is raw storage for the object.
template <typename T>
struct ThreadLocalPoolObject
{
	// Trivial ctor/dtor on purpose: the pool manages the union members
	// manually via placement-new, never through this struct's lifecycle.
	ThreadLocalPoolObject() {}
	~ThreadLocalPoolObject() {}
	// Union allows to keep proper size and alignment while not to invoke any
	// constructors.
	union
	{
		// Storage for the pooled T (never constructed here).
		T myBytes;
		// Free-list link, valid only while the slot is not allocated.
		ThreadLocalPoolObject* myNext;
	};
};
////////////////////////////////////////////////////////////////////////////////////////// | |
// A batch ("chain") of free objects plus bookkeeping. Chains themselves are
// linked into the global pool's free-chain list through myNext.
template <typename T>
struct ThreadLocalPoolObjectChain
{
	using Object = ThreadLocalPoolObject<T>;
	using ObjectList = ForwardList<Object>;
	ThreadLocalPoolObjectChain();
	// The free objects currently held by this chain.
	ObjectList myList;
	// Number of objects in myList.
	uint32_t mySize;
	// Link used when this chain sits in ForwardList<Chain> in the global pool.
	ThreadLocalPoolObjectChain* myNext;
};
////////////////////////////////////////////////////////////////////////////////////////// | |
// Storage cell sized/aligned to hold either one pooled object or one chain
// header. A freshly allocated batch uses its first unit as the chain.
template <typename T>
struct ThreadLocalPoolUnit
{
	using Object = ThreadLocalPoolObject<T>;
	using Chain = ThreadLocalPoolObjectChain<T>;
	// Trivial ctor/dtor: the pool placement-news the active union member.
	ThreadLocalPoolUnit() {}
	~ThreadLocalPoolUnit() {}
	// Each pooled unit is either an object or a list of objects (head and tail
	// pointer). This gives flexibility when a pool is empty and an object is freed -
	// no need to allocate any list-containers. The first freed object simply becomes
	// the container.
	union
	{
		Object myObject;
		Chain myChain;
	};
};
////////////////////////////////////////////////////////////////////////////////////////// | |
// Per-thread object pool. Alloc/free touch only thread-local state in the
// common case; the global pool's mutex is taken only when a whole batch is
// moved in or out.
template <typename T, uint32_t BatchSize>
class ThreadLocalPool
{
public:
	using Chain = ThreadLocalPoolObjectChain<T>;
	using Object = typename Chain::Object;
	using ObjectList = typename Chain::ObjectList;
	using GlobalPool = ThreadGlobalPool<T, BatchSize>;
	using Unit = ThreadLocalPoolUnit<T>;
	ThreadLocalPool();
	~ThreadLocalPool();
	// Returns the calling thread's pool instance for this (T, BatchSize).
	static ThreadLocalPool& GetInstance();
	// Returns raw storage for one T; no constructor is invoked here.
	void* AllocateT();
	// Takes back storage previously returned by AllocateT; no destructor is
	// invoked here.
	void FreeT(
		void* aObject);
	// Current batch of free objects, or null when the pool is empty.
	Chain* myChain;
	// Set by the destructor; guards against use after thread-local teardown.
	bool myIsDestroyed;
	static_assert(BatchSize > 0, "BatchSize must be > 0");
};
////////////////////////////////////////////////////////////////////////////////////////// | |
// Process-wide pool shared by all threads. Holds full chains donated by
// thread-local pools; all access goes under myMutex.
template <typename T, uint32_t BatchSize>
class ThreadGlobalPool
{
public:
	using LocalPool = ThreadLocalPool<T, BatchSize>;
	using ChainList = ForwardList<typename LocalPool::Chain>;
	// Returns the process-wide singleton for this (T, BatchSize).
	static ThreadGlobalPool& GetInstance();
	// Protects myFreeChains.
	std::mutex myMutex;
	// Thread-local pools return excess objects to the global pool. It allows to keep
	// reusing the objects even if some threads allocate more than free, or vice versa.
	ChainList myFreeChains;
};
////////////////////////////////////////////////////////////////////////////////////////// | |
template <typename T, size_t BatchSize> | |
class ThreadPooled | |
{ | |
public: | |
using LocalPool = ThreadLocalPool<T, BatchSize>; | |
void* operator new( | |
size_t) { return LocalPool::GetInstance().AllocateT(); } | |
void operator delete( | |
void* aPtr) { LocalPool::GetInstance().FreeT(aPtr); } | |
}; | |
////////////////////////////////////////////////////////////////////////////////////////// | |
// Benchmark payload for the non-pooled runs: allocated with plain operator
// new. The padding inflates the object to the requested size.
template<uint32_t PaddigSize>
struct ValueBase
{
	static constexpr uint32_t ourPaddingSize = PaddigSize;
	static constexpr uint32_t ourBatchSize = 0;
	static constexpr bool ourIsPooled = false;

	ValueBase(int aValue, int aOwner)
		: myValue(aValue)
		, myOwner(aOwner)
	{
	}

	// Owner defaults to 0.
	ValueBase(int aValue)
		: ValueBase(aValue, 0)
	{
	}

	int myValue;
	int myOwner;
	uint8_t myPadding[PaddigSize];
};
// Pooled variant of the benchmark payload: same data as ValueBase, but
// operator new/delete are inherited from ThreadPooled, so allocations go
// through the thread-local pool.
template<uint32_t PaddigSize, uint32_t BatchSize>
struct Value
	: public ValueBase<PaddigSize>
	, public ThreadPooled<Value<PaddigSize, BatchSize>, BatchSize>
{
	static constexpr uint32_t ourBatchSize = BatchSize;
	static constexpr bool ourIsPooled = true;
	Value(
		int aValue,
		int aOwner) : ValueBase<PaddigSize>(aValue, aOwner) {}
	Value(
		int aValue) : ValueBase<PaddigSize>(aValue) {}
};
////////////////////////////////////////////////////////////////////////////////////////// | |
// Monotonic timestamp in microseconds (defined below).
static uint64_t
getUsec();
// Runs the benchmark once for the given value type (defined below).
template<typename ValueT>
static void
run();
////////////////////////////////////////////////////////////////////////////////////////// | |
int | |
main() | |
{ | |
std::cout << "Thread count: " << theThreadCount | |
<< ", value count: " << theValueCount | |
<< '\n' << std::endl; | |
run<ValueBase<1>>(); | |
run<Value<1, theThreadLocalBatchSize>>(); | |
run<ValueBase<512>>(); | |
run<Value<512, theThreadLocalBatchSize>>(); | |
run<ValueBase<1024>>(); | |
run<Value<1024, theThreadLocalBatchSize>>(); | |
return 0; | |
} | |
////////////////////////////////////////////////////////////////////////////////////////// | |
template<typename T> | |
inline | |
ThreadLocalPoolObjectChain<T>::ThreadLocalPoolObjectChain() | |
: mySize(0) | |
{ | |
} | |
////////////////////////////////////////////////////////////////////////////////////////// | |
// A pool starts empty; the first AllocateT pulls or allocates a chain.
template <typename T, uint32_t BatchSize>
inline
ThreadLocalPool<T, BatchSize>::ThreadLocalPool()
	: myChain(nullptr)
	, myIsDestroyed(false)
{
}
template <typename T, uint32_t BatchSize> | |
ThreadLocalPool<T, BatchSize>::~ThreadLocalPool() | |
{ | |
assert(!myIsDestroyed); | |
myIsDestroyed = true; | |
if (myChain != nullptr) | |
{ | |
GlobalPool& glob = GlobalPool::GetInstance(); | |
glob.myMutex.lock(); | |
// Prepend to pop cache-hot data first, when need to allocate next time. | |
glob.myFreeChains.Prepend(myChain); | |
glob.myMutex.unlock(); | |
myChain = nullptr; | |
} | |
} | |
template <typename T, uint32_t BatchSize>
inline ThreadLocalPool<T, BatchSize>&
ThreadLocalPool<T, BatchSize>::GetInstance()
{
	// One pool instance per thread per (T, BatchSize) instantiation.
	static thread_local ThreadLocalPool ourInstance;
	// Catches allocations attempted after this thread's pool was destroyed
	// during thread-local teardown.
	assert(!ourInstance.myIsDestroyed);
	return ourInstance;
}
template <typename T, uint32_t BatchSize> | |
void* | |
ThreadLocalPool<T, BatchSize>::AllocateT() | |
{ | |
assert(!myIsDestroyed); | |
if (myChain == nullptr) | |
{ | |
GlobalPool& glob = GlobalPool::GetInstance(); | |
glob.myMutex.lock(); | |
if (!glob.myFreeChains.IsEmpty()) | |
myChain = glob.myFreeChains.PopFirst(); | |
glob.myMutex.unlock(); | |
if (myChain == nullptr) | |
{ | |
Unit* units = new Unit[BatchSize + 1]; | |
// First unit -> container. Other units - objects. | |
Chain* chain = new (&units[0]) Chain(); | |
for (uint32_t i = 1; i < BatchSize + 1; ++i) | |
chain->myList.Append(new (&units[i]) Object()); | |
chain->mySize = BatchSize; | |
myChain = chain; | |
} | |
} | |
ObjectList* list = &myChain->myList; | |
if (!list->IsEmpty()) | |
{ | |
assert(myChain->mySize > 0); | |
--myChain->mySize; | |
return (void*)list->PopFirst(); | |
} | |
assert(myChain->mySize == 0); | |
// Can use the container itself as an object too. | |
void* obj = (void*)myChain; | |
myChain = nullptr; | |
return obj; | |
} | |
template <typename T, uint32_t BatchSize> | |
void | |
ThreadLocalPool<T, BatchSize>::FreeT( | |
void* aObject) | |
{ | |
assert(!myIsDestroyed); | |
if (myChain == nullptr) | |
{ | |
// First object becomes the container for next objects. | |
myChain = new (aObject) Chain(); | |
return; | |
} | |
if (myChain->mySize == BatchSize) | |
{ | |
GlobalPool& glob = GlobalPool::GetInstance(); | |
glob.myMutex.lock(); | |
// Prepend to pop cache-hot data first, when need to allocate next time. | |
glob.myFreeChains.Prepend(myChain); | |
glob.myMutex.unlock(); | |
myChain = new (aObject) Chain();; | |
return; | |
} | |
// Prepend to pop cache-hot data first, when need to allocate next time. | |
myChain->myList.Prepend((Object*)aObject); | |
++myChain->mySize; | |
} | |
////////////////////////////////////////////////////////////////////////////////////////// | |
template <typename T, uint32_t BatchSize>
inline ThreadGlobalPool<T, BatchSize>&
ThreadGlobalPool<T, BatchSize>::GetInstance()
{
	// Meyers singleton: initialization is thread-safe since C++11.
	static ThreadGlobalPool ourInstance;
	return ourInstance;
}
////////////////////////////////////////////////////////////////////////////////////////// | |
// Monotonic timestamp in microseconds. Only differences between two calls
// are meaningful; the epoch is unspecified. Uses std::chrono::steady_clock
// instead of POSIX clock_gettime(CLOCK_MONOTONIC, ...) so the code is
// portable while keeping the same monotonic-microseconds contract.
static uint64_t
getUsec()
{
	using namespace std::chrono;
	return (uint64_t)duration_cast<microseconds>(
		steady_clock::now().time_since_epoch()).count();
}
template<typename ValueT> | |
static void | |
run() | |
{ | |
std::cout << "Pooled: " << ValueT::ourIsPooled | |
<< ", object size: " << sizeof(ValueT) | |
<< std::endl; | |
std::vector<std::thread> threads; | |
threads.reserve(theThreadCount); | |
std::condition_variable condVar; | |
std::mutex mutex; | |
bool doStart = false; | |
std::atomic_int threadIdCounter(1); | |
for (uint32_t threadI = 0; threadI < theThreadCount; ++threadI) | |
{ | |
threads.emplace_back([&]() { | |
std::mt19937 randomGen; | |
randomGen.seed(((uint64_t)&randomGen + time(NULL)) & UINT32_MAX); | |
int threadId = threadIdCounter.fetch_add(1, std::memory_order_relaxed); | |
std::vector<ValueT*> values; | |
values.resize(theValueCount); | |
{ | |
std::unique_lock lock(mutex); | |
while (!doStart) | |
condVar.wait(lock); | |
} | |
for (uint32_t iterI = 0; iterI < theIterCount; ++iterI) | |
{ | |
for (uint32_t valI = 0; valI < theValueCount; ++valI) | |
values[valI] = new ValueT(valI, threadId); | |
for (uint32_t valI = 0; valI < theValueCount; ++valI) | |
{ | |
uint32_t lastIdx = theValueCount - valI - 1; | |
std::uniform_int_distribution<uint32_t> dist(0, lastIdx); | |
uint32_t idx = dist(randomGen); | |
ValueT* v = values[idx]; | |
assert(v->myValue == (int)idx); | |
assert(v->myOwner == threadId); | |
delete v; | |
if (idx == lastIdx) | |
continue; | |
// Cyclic removal. | |
v = values[lastIdx]; | |
assert(v->myValue == (int)lastIdx); | |
assert(v->myOwner == threadId); | |
v->myValue = idx; | |
values[idx] = v; | |
} | |
} | |
}); | |
} | |
uint64_t t1 = getUsec(); | |
mutex.lock(); | |
doStart = true; | |
condVar.notify_all(); | |
mutex.unlock(); | |
for (std::thread& t : threads) | |
t.join(); | |
uint64_t t2 = getUsec(); | |
std::cout << "Took " << (t2 - t1) / 1000.0 << " ms\n" << std::endl; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment