Skip to content

Instantly share code, notes, and snippets.

@bkietz
Last active September 21, 2023 21:04
Show Gist options
  • Save bkietz/345c0a76c010e7b23c3b9d9c4fc11e1f to your computer and use it in GitHub Desktop.
Save bkietz/345c0a76c010e7b23c3b9d9c4fc11e1f to your computer and use it in GitHub Desktop.
C++17 string search with SSE4.2's `_mm_cmpistrm`
#if defined(__SSE4_2__)
#if defined(__has_builtin)
#if __has_builtin(__builtin_ctz)
#include <immintrin.h>
#include <cstring>
#define ARROW_TOO_SIMD_FIND_FIRST_OF
#endif
#endif
#endif
namespace arrow_too {
/// Use like
///
/// find_first.of<' ', '\t', '\n', '\r', '\0'>("hello world");
///
/// Max 16 needles are supported
struct find_first {
template <char... NEEDLES>
size_t of(std::string_view s) const {
return _impl<false, NEEDLES...>(s);
}
template <char... NEEDLES>
size_t not_of(std::string_view s) const {
return _impl<true, NEEDLES...>(s);
}
template <bool SWAP_POLARITY, char... NEEDLES>
size_t _impl(std::string_view s) const {
size_t i = 0;
#ifdef ARROW_TOO_SIMD_FIND_FIRST_OF
#undef ARROW_TOO_SIMD_FIND_FIRST_OF
struct Batch {
explicit Batch(void const *data) { memcpy(&i, data, sizeof(i)); }
__m128i i;
};
thread_local const Batch NEEDLES_BATCH{[] {
char needles_array[sizeof(__m128i)] = {NEEDLES...};
// CXX(17:dcl.init.aggr#8.2) If the initializer list does not have an explicit
// intitializer for an element of the array, the corresponding element will be
// initialized from an empty initializer list. In this case that means any elements
// not initialized from NEEDLES will be `char{}`, aka '\0'. This would imlicitly
// add '\0' to every lookup set except ones with exactly 16 chars, so we overwrite
// that padding with the first character of NEEDLES.
for (size_t i = sizeof...(NEEDLES); i < sizeof(needles_array); ++i) {
needles_array[i] = needles_array[0];
}
return Batch{&needles_array};
}()};
constexpr auto FLAGS = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK
| (SWAP_POLARITY ? _SIDD_NEGATIVE_POLARITY : 0);
for (size_t batch_count = s.size() / sizeof(Batch); batch_count > 0; --batch_count) {
Batch batch{s.data() + i};
auto mask = static_cast<uint16_t>(
_mm_cvtsi128_si32(_mm_cmpistrm(NEEDLES_BATCH.i, batch.i, FLAGS)));
if (mask != 0) [[likely]] {
return i + __builtin_ctz(mask);
}
i += sizeof(Batch);
}
s = s.substr(i);
#endif
for (char c : s) {
auto any_match = (... or (c == NEEDLES));
if constexpr (not SWAP_POLARITY) {
if (any_match) break;
// For example, NEEDLES contains a delimiter which we're seeking.
// If any_match is true, we've found a delimiter.
} else {
if (not any_match) break;
// For example, NEEDLES contains whitespace which we're skipping.
// If any_match is false, we've found a non-whitespace.
}
++i;
}
return i;
}
} find_first;
} // namespace arrow_too
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment