Skip to content

Instantly share code, notes, and snippets.

@martinus
Last active April 26, 2021 07:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save martinus/a5d1c5114cfd1930eec9b620a7803f7a to your computer and use it in GitHub Desktop.
Save martinus/a5d1c5114cfd1930eec9b620a7803f7a to your computer and use it in GitHub Desktop.
#include <cstddef>
#include <cstdint> // UINTPTR_MAX
#include <cstring> // memcpy
#include <memory> // allocator, allocator_traits, uninitialized_*, destroy_n
#include <new> // launder
#include <type_traits>
#include <utility>
#if _MSC_VER
# include <intrin.h> // required for bit-scan intrinsics
#endif
namespace ankerl {
// This random-access container behaves very similar to std::vector, except it never invalidates references and never
// reallocates memory. Instead, it doubles its memory when it gets full. Random access is O(1). It implements the bit trick
// described in [1] (see discussion in [2]). It also seems to have similar properties as stationary_vector [3] and is highly
// similar to cphstl::pile [4].
//
// WARNING: This is just a prototype that is not (yet) well tested.
//
// Features are:
//
// * Small-Vector-Optimization: First bucket is used for SVO, no allocations are necessary.
// * Fast O(1) random access. In my benchmark access is ~67% slower than for a std::vector.
// * No memory relocations. References will always stay valid through the lifetime of the container.
//
//
// Benchmarks
//
// * std::sort, runtime in ns per n*log2(n)
//
// n | vector | stable_vector | how much slower?
// -----+--------+---------------+------------------
// 2^10 | 4.02 | 7.03 | 75% slower
// 2^15 | 4.02 | 6.97 | 73% slower
// 2^20 | 3.98 | 6.71 | 69% slower
// 2^25 | 3.98 | 6.65 | 67% slower
//
// * std::stable_sort, runtime in ns per n*log2(n)
//
// n | vector | stable_vector | how much slower?
// -----+--------+---------------+------------------
// 2^10 | 4.58 | 7.77 | 70% slower
// 2^15 | 4.54 | 7.40 | 63% slower
// 2^20 | 4.67 | 7.46 | 60% slower
// 2^25 | 4.83 | 7.52 | 56% slower
//
// On average stable_vector is 67% slower. That's much better than pile from [4], which averages 162% slower.
//
// References
// [1] https://plflib.org/matt_bentley_-_pot_blocks_bit_tricks.pdf
// [2] https://www.reddit.com/r/cpp/comments/mwi9dk/new_paper_random_access_of_elements_in_data/
// [3] https://github.com/mehrdadn/libvital/blob/master/doc/vital/container/stationary_vector.md
// [4] Worst-Case-Efficient Dynamic Arrays in Practice http://www.cphstl.dk/Paper/Dynamic-arrays/sea16.pdf
// [5] Experiences with the design and implementation of space-efficient deques
// http://www.cphstl.dk/Report/Deque/cphstl-report-2001-7.pdf
template <typename T, class Allocator = std::allocator<T>, std::size_t StartSize = 128 / sizeof(T) < 8U ? 8U : 64 / sizeof(T)>
class stable_vector : private Allocator {
    // The bit-trick addressing in operator[] requires every block size (and therefore
    // StartSize) to be a power of two; otherwise elements would be silently mapped to
    // the wrong slots. The default StartSize expression can produce a non-power-of-two
    // for unusual sizeof(T) values, so enforce it here at compile time.
    static_assert(StartSize != 0U && (StartSize & (StartSize - 1U)) == 0U,
                  "StartSize must be a nonzero power of two");

    // constexpr version to find the index of the most significant set bit.
    // E.g. for 33 (100001b) this returns 5. Needed at compile time because the
    // intrinsic-based version below can't be constexpr on MSVC.
    static constexpr auto most_significant_active_bit_cx(std::size_t num) noexcept -> std::size_t {
        auto result = std::size_t();
        while ((num >>= 1U) != 0U) {
            ++result;
        }
        return result;
    }

public:
    using value_type = T;

    // Random-access iterator over stable_vector. It stores the container pointer plus a
    // plain element index; all navigation is index arithmetic, and element access is
    // delegated to stable_vector::operator[].
    template <bool is_const>
    class iter {
        using container_t = typename std::conditional_t<is_const, stable_vector const, stable_vector>;
        container_t* m_sv{};
        std::size_t m_idx{};

        // iterator and const_iterator must access each other's members for the
        // converting constructor and for mixed (iterator vs const_iterator) comparisons.
        friend class iter<!is_const>;

    public:
        using iterator_category = std::random_access_iterator_tag;
        using value_type = T;
        using difference_type = std::ptrdiff_t;
        using pointer = std::conditional_t<is_const, T const*, T*>;
        using reference = std::conditional_t<is_const, T const&, T&>;

        iter() = default;

        iter(container_t* sv, std::size_t idx) noexcept
            : m_sv(sv)
            , m_idx(idx) {}

        // implicit iterator -> const_iterator conversion (the reverse is disabled via SFINAE)
        // NOLINTNEXTLINE(hicpp-explicit-conversions)
        template <bool O, typename = std::enable_if_t<is_const && !O>>
        iter(iter<O> const& other) noexcept
            : m_sv(other.m_sv)
            , m_idx(other.m_idx) {}

        auto operator++() noexcept -> iter& {
            ++m_idx;
            return *this;
        }
        auto operator--() noexcept -> iter& {
            --m_idx;
            return *this;
        }
        // Post-increment/-decrement return the iterator's *previous* position.
        // (The previous implementation called a non-existent iter(std::size_t)
        // constructor and dropped m_sv.)
        auto operator++(int) noexcept -> iter {
            auto before = *this;
            ++m_idx;
            return before;
        }
        auto operator--(int) noexcept -> iter {
            auto before = *this;
            --m_idx;
            return before;
        }
        auto operator-(iter const& other) const noexcept -> difference_type {
            return static_cast<difference_type>(m_idx - other.m_idx);
        }
        auto operator-(difference_type d) const noexcept -> iter {
            return iter(m_sv, m_idx - d);
        }
        auto operator+(difference_type d) const noexcept -> iter {
            return iter(m_sv, m_idx + d);
        }
        // n + it form, required of a random access iterator
        friend auto operator+(difference_type d, iter const& it) noexcept -> iter {
            return it + d;
        }
        auto operator+=(difference_type d) noexcept -> iter& {
            m_idx += d;
            return *this;
        }
        auto operator-=(difference_type d) noexcept -> iter& {
            m_idx -= d;
            return *this;
        }
        auto operator*() const noexcept -> reference {
            return (*m_sv)[m_idx];
        }
        auto operator->() const noexcept -> pointer {
            return &(*m_sv)[m_idx];
        }
        // it[n], required of a random access iterator
        auto operator[](difference_type d) const noexcept -> reference {
            return (*m_sv)[m_idx + d];
        }
        template <bool O>
        auto operator==(iter<O> const& other) const noexcept -> bool {
            return m_idx == other.m_idx;
        }
        template <bool O>
        auto operator!=(iter<O> const& other) const noexcept -> bool {
            return m_idx != other.m_idx;
        }
        template <bool O>
        auto operator<(iter<O> const& other) const noexcept -> bool {
            return m_idx < other.m_idx;
        }
        template <bool O>
        auto operator>(iter<O> const& other) const noexcept -> bool {
            return m_idx > other.m_idx;
        }
        template <bool O>
        auto operator<=(iter<O> const& other) const noexcept -> bool {
            return m_idx <= other.m_idx;
        }
        template <bool O>
        auto operator>=(iter<O> const& other) const noexcept -> bool {
            return m_idx >= other.m_idx;
        }
    };

    using iterator = iter<false>;
    using const_iterator = iter<true>;

    auto begin() -> iterator {
        return iterator(this, 0U);
    }
    auto end() -> iterator {
        return iterator(this, m_size);
    }
    auto begin() const -> const_iterator {
        return cbegin();
    }
    auto end() const -> const_iterator {
        return cend();
    }
    auto cbegin() const -> const_iterator {
        return const_iterator(this, 0U);
    }
    auto cend() const -> const_iterator {
        return const_iterator(this, m_size);
    }

    explicit stable_vector() = default;

    stable_vector(stable_vector const& other) {
        copy(other);
    }

    auto operator=(stable_vector const& other) -> stable_vector& {
        if (this != &other) {
            copy(other);
        }
        return *this;
    }

    // Moving steals the block pointer array; only elements that live in the SVO
    // buffer are moved individually. NOTE: declared noexcept, so T's move
    // constructor must not throw for SVO-resident elements.
    stable_vector(stable_vector&& other) noexcept {
        // std::move is required here: move_into_uninitialized takes an rvalue reference.
        move_into_uninitialized(std::move(other));
    }

    auto operator=(stable_vector&& other) noexcept -> stable_vector& {
        // self-move guard: without it, a self-move deallocates our blocks but leaves
        // a stale m_capacity behind, corrupting all later growth and indexing.
        if (this != &other) {
            clear();
            deallocate_uninitialized_memory();
            move_into_uninitialized(std::move(other));
        }
        return *this;
    }

    [[nodiscard]] auto size() const -> std::size_t {
        return m_size;
    }

    // Constructs a new element in place at the end, growing capacity first if needed.
    // Returns a reference to the new element. References to existing elements are
    // never invalidated by growth.
    template <typename... Args>
    auto emplace_back(Args&&... args) -> T& {
        if (m_size == m_capacity) {
            grow();
        }
        // operator[] only computes an address here; the slot m_size is raw storage.
        auto* location = &operator[](m_size);
        auto* ret = ::new (static_cast<void*>(location)) T(std::forward<Args>(args)...);
        ++m_size;
        return *ret;
    }

    void push_back(T const& obj) {
        emplace_back(obj);
    }
    void push_back(T&& obj) {
        emplace_back(std::move(obj));
    }

    // Destroys the last element. Precondition: !empty().
    void pop_back() noexcept {
        --m_size;
        if constexpr (!std::is_trivially_destructible_v<T>) {
            operator[](m_size).~T();
        }
    }

    // Destroys all elements. Allocated blocks are kept for reuse.
    void clear() noexcept {
        if constexpr (std::is_trivially_destructible_v<T>) {
            m_size = 0;
        } else {
            while (!empty()) {
                pop_back();
            }
        }
    }

    // const so that it is callable on const containers (it does not mutate).
    [[nodiscard]] constexpr auto empty() const noexcept -> bool {
        return m_size == 0U;
    }

    // Correctly access m_svo's memory through std::launder: the T objects are
    // placement-new'ed into raw aligned storage.
    constexpr auto svo() noexcept -> T* {
        return std::launder(reinterpret_cast<T*>(m_svo));
    }
    constexpr auto svo() const noexcept -> T const* {
        return std::launder(reinterpret_cast<T const*>(m_svo));
    }

    // Precondition for front()/back(): !empty(). The first element always lives in the SVO.
    auto front() const noexcept -> T const& {
        return svo()[0];
    }
    auto front() noexcept -> T& {
        return svo()[0];
    }
    auto back() const noexcept -> T const& {
        return operator[](m_size - 1);
    }
    auto back() noexcept -> T& {
        return operator[](m_size - 1);
    }

    // O(1) random access via the bit trick from [1]: an element with index
    // i >= StartSize lives in block (msb(i) - msb(StartSize)), at offset i with
    // its most significant bit cleared.
    constexpr auto operator[](std::size_t i) const noexcept -> T const& {
        if (i < StartSize) {
            return svo()[i];
        }
        auto msab = most_significant_active_bit(i);
        // block index is number of bits in i, minus number of bits already covered by the first block (the svo).
        auto* block = m_blocks[msab - num_bits_svo];
        // clear the highest bit to get to the index within the block.
        // std::size_t{1} (not 1U) so the shift is well-defined for msab >= 32 on 64-bit targets.
        return block[i ^ (std::size_t{1} << msab)];
    }

    auto operator[](std::size_t i) noexcept -> T& {
        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
        return const_cast<T&>(std::as_const(*this)[i]);
    }

    ~stable_vector() noexcept {
        clear();
        deallocate_uninitialized_memory();
    }

    [[nodiscard]] auto capacity() const noexcept -> std::size_t {
        return m_capacity;
    }

private:
    // Uses intrinsics to find the most significant set bit. E.g. for 33 (100001b) this returns 5.
    // On MSVC we use _BitScanReverse instead of __lzcnt: __lzcnt silently computes the
    // wrong result on CPUs without the LZCNT instruction, while BSR is always available.
    // This can't be constexpr on MSVC, hence the separate constexpr version above.
    static inline auto most_significant_active_bit(std::size_t num) noexcept -> std::size_t {
#ifdef _MSC_VER
        unsigned long index = 0;
# if UINTPTR_MAX == UINT32_MAX
        _BitScanReverse(&index, num);
# else
        _BitScanReverse64(&index, num);
# endif
        return index;
#else
# if UINTPTR_MAX == UINT32_MAX
        return 31U - __builtin_clz(num);
# else
        return 63U - __builtin_clzll(num);
# endif
#endif
    }

    // Allocates a new block, doubling the container's capacity. The new block's size
    // equals the current capacity, so block k holds StartSize << k elements.
    void grow() {
        if (m_blocks == nullptr) {
            m_blocks = ::new T*[max_num_blocks];
        }
        // go through allocator_traits so non-std allocators with defaulted members work too
        m_blocks[calc_block_idx(m_capacity)] = std::allocator_traits<Allocator>::allocate(*this, m_capacity);
        m_capacity *= 2;
    }

    // Grows (doubling each step) until at least s elements fit.
    void reserve(std::size_t s) {
        while (m_capacity < s) {
            grow();
        }
    }

    // Frees all heap blocks and the block pointer array. Elements must already be
    // destroyed (clear() is called defensively first).
    void deallocate_uninitialized_memory() noexcept {
        clear();
        if (m_blocks != nullptr) {
            auto num_blocks = calc_block_idx(m_capacity);
            auto block_size = StartSize;
            for (std::size_t i = 0; i < num_blocks; ++i) {
                std::allocator_traits<Allocator>::deallocate(*this, m_blocks[i], block_size);
                block_size *= 2;
            }
            delete[] m_blocks;
            m_blocks = nullptr;
        }
    }

    // Destroys the current contents and copy-constructs all of other's elements into
    // this container, growing as needed. No strong exception guarantee (prototype).
    void copy(stable_vector const& other) {
        clear();
        reserve(other.size());
        // Walk the source bucket by bucket: std::size_t(-1) denotes the SVO bucket,
        // 0..n the heap blocks, whose sizes double with each step.
        auto current_block_idx = std::size_t(-1);
        auto block_size = StartSize;
        auto num_left_to_copy = other.size();
        while (num_left_to_copy != 0) {
            auto num_copy = num_left_to_copy < block_size ? num_left_to_copy : block_size;
            if (current_block_idx == std::size_t(-1)) {
                std::uninitialized_copy_n(other.svo(), num_copy, svo());
            } else {
                std::uninitialized_copy_n(other.m_blocks[current_block_idx], num_copy, m_blocks[current_block_idx]);
            }
            num_left_to_copy -= num_copy;
            ++current_block_idx;
            block_size = StartSize << current_block_idx;
        }
        m_size = other.size();
    }

    // Takes over other's state. Heap blocks are stolen via the pointer array; only the
    // SVO part (or whatever fraction of it is in use) is moved element by element.
    // Precondition: *this holds no elements and owns no blocks.
    void move_into_uninitialized(stable_vector&& other) {
        m_size = std::exchange(other.m_size, 0U);
        m_capacity = std::exchange(other.m_capacity, StartSize);
        m_blocks = std::exchange(other.m_blocks, nullptr);
        // only the SVO (or parts of it) needs to be moved
        auto num_move = m_size < StartSize ? m_size : StartSize;
        if (num_move > 0) {
            std::uninitialized_move_n(other.svo(), num_move, svo());
            // other.m_size is already 0, so other's destructor will not destroy the
            // moved-from elements; destroy them here to avoid leaking resources.
            std::destroy_n(other.svo(), num_move);
        }
    }

    // Index into m_blocks for the block that starts at element index i
    // (i is always StartSize << k for some k >= 0 when called).
    static auto calc_block_idx(std::size_t i) noexcept -> std::size_t {
        return most_significant_active_bit(i) - num_bits_svo;
    }

    static constexpr auto num_bits_svo = most_significant_active_bit_cx(StartSize);
    // Each block doubles capacity, so there can never be more blocks than bits in
    // std::size_t minus the bits already covered by the SVO.
    static constexpr auto max_num_blocks = sizeof(std::size_t) * 8U - most_significant_active_bit_cx(StartSize);

    std::size_t m_size{}; // current number of elements
    std::size_t m_capacity{StartSize}; // currently allocated capacity
    T** m_blocks{}; // array of pointers to all heap blocks; nullptr until the first grow()
    // uninitialized memory for small-vector optimization (SVO)
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays)
    typename std::aligned_storage<sizeof(T), alignof(T)>::type m_svo[StartSize];
};
} // namespace ankerl
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment