Skip to content

Instantly share code, notes, and snippets.

@Raimo33
Created May 8, 2025 22:00
Show Gist options
  • Save Raimo33/a128fbe54989be071980293ca4de853e to your computer and use it in GitHub Desktop.
Save Raimo33/a128fbe54989be071980293ca4de853e to your computer and use it in GitHub Desktop.
SIMD LowerBound
template <typename T, typename Comparator>
HOT ssize_t forward_lower_bound(std::span<const T> data, const T elem, const Comparator &comp) noexcept
{
static_assert(std::is_integral<T>::value, "T must be an integral type");
static_assert(std::hardware_constructive_interference_size == 64, "Cache line size must be 64 bytes");
static constexpr uint8_t chunk_size = sizeof(__m512i) / sizeof(T);
const T *begin = data.data();
const size_t size = data.size();
size_t remaining = size;
const T *it = begin;
bool keep_looking;
bool found = false;
PREFETCH_R(it, 0);
const bool safe = (remaining >= chunk_size);
PREFETCH_R(it + chunk_size * safe, 0);
uint8_t misalignment = utils::simd::misalignment_forwards(it, 64);
misalignment = (misalignment > remaining) ? remaining : misalignment;
keep_looking = (misalignment > 0);
while (keep_looking)
{
found = comp(*it, elem);
--misalignment;
--remaining;
keep_looking = (misalignment > 0);
keep_looking &= !found;
it += keep_looking;
}
#ifdef __AVX512F__
static constexpr uint8_t UNROLL_FACTOR = 8;
static constexpr uint16_t combined_chunks_size = UNROLL_FACTOR * chunk_size;
const __m512i elem_vec = utils::simd::create_vector<__m512i, T>(elem);
constexpr int opcode = utils::simd::get_opcode<T, Comparator>();
std::array<__m512i, UNROLL_FACTOR> chunks;
std::array<__mmask64, UNROLL_FACTOR> masks;
keep_looking = (remaining >= combined_chunks_size);
keep_looking &= !found;
while (keep_looking)
{
it = std::assume_aligned<64>(it);
#pragma GCC unroll UNROLL_FACTOR
for (uint8_t i = 0; i < UNROLL_FACTOR; ++i)
chunks[i] = _mm512_load_si512(it + i * chunk_size);
it += combined_chunks_size;
remaining -= combined_chunks_size;
keep_looking = (remaining >= combined_chunks_size);
const uint8_t safe_chunk_size = chunk_size * keep_looking;
#pragma GCC unroll UNROLL_FACTOR
for (uint8_t i = 0; i < UNROLL_FACTOR; ++i)
PREFETCH_R(it + i * safe_chunk_size, 0);
#pragma GCC unroll UNROLL_FACTOR
for (uint8_t i = 0; i < UNROLL_FACTOR; ++i)
masks[i] = utils::simd::compare<__m512i, T>(chunks[i], elem_vec, opcode);
#pragma GCC unroll UNROLL_FACTOR
for (uint8_t i = 0; i < UNROLL_FACTOR; ++i)
found |= masks[i];
keep_looking &= !found;
}
if (found) [[likely]]
{
uint8_t i = UNROLL_FACTOR;
for (const auto &mask : masks)
{
i++;
if (mask) [[likely]]
{
const uint8_t bit = __builtin_ctzll(mask);
const uint16_t matched_idx = i * chunk_size + bit;
return it - begin - combined_chunks_size + matched_idx;
}
}
}
#endif
keep_looking = (remaining > 0);
keep_looking &= !found;
while (keep_looking)
{
found = comp(*it, elem);
--remaining;
keep_looking = (remaining > 0);
keep_looking &= !found;
it += keep_looking;
}
return found ? it - begin : -1;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment