Skip to content

Instantly share code, notes, and snippets.

@StefanoBelli
Last active January 2, 2019 23:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save StefanoBelli/8730f88d00a7075f6bf14cdd898c05fd to your computer and use it in GitHub Desktop.
Save StefanoBelli/8730f88d00a7075f6bf14cdd898c05fd to your computer and use it in GitHub Desktop.
Uses the x86 SSE4.2 intrinsics provided by MSVC, Clang and GCC to implement a few string-manipulation functions (string equality and string length).
// x86_fast_string.cpp
#include <immintrin.h>
#include <nmmintrin.h>
#include <cstddef>
#include <cstdint>
// ---------------------------------------------------------------------------
// Compiler abstraction layer.
//
//   x86_fetch_eflags(dst)   - stores the current EFLAGS value into the 32-bit
//                             integer lvalue `dst`.
//   vec128                  - 128-bit integer vector type.
//   intrin_loadunal_128     - unaligned 128-bit load (MOVDQU).
//   intrin_pcmpistri        - PCMPISTRI, returns the index result.
//   LOADUNAL128_CAST(expr)  - casts a byte pointer to the load's pointer type.
//
// NOTE(review): x86_fetch_eflags only reports the flags at the point where the
// asm statement executes. The compiler is free to schedule flag-modifying
// instructions between an intrinsic and this macro, so it must not be relied
// on to observe the flags produced by a preceding intrinsic; prefer the
// dedicated flag intrinsics (_mm_cmpistrz, _mm_cmpistrs, ...) for that.
// ---------------------------------------------------------------------------
#if defined(_MSC_VER)
// MSVC inline assembler block (available on 32-bit x86 targets only).
#define x86_fetch_eflags(mem_dst_32) \
__asm { \
__asm pushfd \
__asm pop eax \
__asm lea ecx, mem_dst_32 \
__asm mov [ecx], eax \
}
#elif defined(__GNUC__)
#if defined(__x86_64__)
// The flags are popped straight into a compiler-chosen register (%q0 is its
// 64-bit name), so no register is overwritten behind the compiler's back.
// RSP is first moved below the 128-byte System V red zone so that PUSHFQ
// cannot overwrite locals a leaf function keeps there; LEA leaves the flags
// untouched.
#define x86_fetch_eflags(mem_dst_32) \
__asm__ __volatile__( \
"leaq -128(%%rsp), %%rsp\n\t" \
"pushfq\n\t" \
"popq %q0\n\t" \
"leaq 128(%%rsp), %%rsp" \
: "=r"(mem_dst_32))
#elif defined(__i386__) || defined(_X86_)
// 32-bit variant: there is no red zone, the flags are popped directly into a
// compiler-chosen register.
#define x86_fetch_eflags(mem_dst_32) \
__asm__ __volatile__( \
"pushfl\n\t" \
"popl %0" \
: "=r"(mem_dst_32))
#endif
#endif
// The <nmmintrin.h> intrinsics are spelled identically on MSVC, GCC and Clang,
// so a single set of aliases serves every compiler.
#define vec128 __m128i
#define intrin_loadunal_128 _mm_loadu_si128
#define intrin_pcmpistri _mm_cmpistri
#define LOADUNAL128_CAST(expr) ((const vec128*)(expr))
#if defined(__SSE4_2__) || defined(__AVX__)
// Compile-time check for SSE4.2 support:
//   GCC/Clang: building with -msse4.2 defines __SSE4_2__.
//   MSVC: there is no dedicated SSE4.2 macro; build with /arch:AVX, which defines __AVX__
//         (_M_IX86_FP is then expected to equal 2 on 32-bit targets -- unverified).
// Tests two NUL-terminated byte strings for equality, 16 bytes at a time,
// using the SSE4.2 PCMPISTRI instruction.
//
// Parameters:
//   str, str1 - pointers to NUL-terminated strings; both must be non-null.
// Returns:
//   true when both strings have the same length and the same bytes,
//   false otherwise.
//
// Notes:
//   * The end-of-string test uses _mm_cmpistrz on the same operands as the
//     index query instead of reading EFLAGS through separate inline assembly;
//     the compiler may place other instructions between an intrinsic and an
//     asm statement, so the flags seen there are not guaranteed to be the
//     ones PCMPISTRI produced.
//   * A 16-byte load is only issued when it stays inside the 4 KiB page the
//     pointer lives in; otherwise that chunk is compared byte by byte so that
//     memory in a following (possibly unmapped) page is never touched.
#if defined(__GNUC__) || defined(__clang__)
__attribute__((target("sse4.2")))  // make the ISA requirement explicit for GCC/Clang
#endif
bool fast_x86_sse42_streq(const char* str, const char* str1) noexcept {
    // EQUAL_EACH compares byte i with byte i; NEGATIVE_POLARITY inverts the
    // result so the returned index is the first position that differs
    // (16 when the two chunks match up to and including their terminator).
    constexpr int kMode = _SIDD_CMP_EQUAL_EACH | _SIDD_SBYTE_OPS | _SIDD_NEGATIVE_POLARITY;
    constexpr int kChunk = 16;
    constexpr std::uintptr_t kPageMask = 4096 - 1;           // smallest x86 page is 4 KiB
    constexpr std::uintptr_t kLastSafeOffset = 4096 - kChunk;  // last in-page offset for a full load

    for (std::size_t off = 0;; off += kChunk) {
        const char* lhs = str + off;
        const char* rhs = str1 + off;

        const std::uintptr_t lhs_in_page = reinterpret_cast<std::uintptr_t>(lhs) & kPageMask;
        const std::uintptr_t rhs_in_page = reinterpret_cast<std::uintptr_t>(rhs) & kPageMask;
        if (lhs_in_page > kLastSafeOffset || rhs_in_page > kLastSafeOffset) {
            // A vector load would straddle a page boundary: compare this chunk
            // one byte at a time and stop at the terminator.
            for (int i = 0; i < kChunk; ++i) {
                if (lhs[i] != rhs[i])
                    return false;
                if (lhs[i] == '\0')
                    return true;
            }
            continue;
        }

        const __m128i lhs_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(lhs));  // MOVDQU
        const __m128i rhs_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(rhs));  // MOVDQU

        if (_mm_cmpistri(lhs_vec, rhs_vec, kMode) != kChunk)  // PCMPISTRI: index of first mismatch
            return false;

        // The chunks matched, so a terminator in one implies a terminator at
        // the same position in the other: checking a single operand suffices.
        if (_mm_cmpistrz(lhs_vec, rhs_vec, kMode))
            return true;
    }
}
// Returns the length of a NUL-terminated byte string (terminator excluded),
// scanning 16 bytes at a time with the SSE4.2 PCMPISTRI instruction.
//
// Parameters:
//   nul_str - pointer to a NUL-terminated string; must be non-null.
// Returns:
//   the number of bytes that precede the first NUL byte.
//
// Notes:
//   Bytes are examined one at a time until the cursor is 16-byte aligned;
//   from there on only aligned 16-byte loads are issued. An aligned 16-byte
//   block never spans two pages, so the scan cannot touch a page beyond the
//   one that holds the terminator (an unaligned load near the end of a page
//   could).
#if defined(__GNUC__) || defined(__clang__)
__attribute__((target("sse4.2")))  // make the ISA requirement explicit for GCC/Clang
#endif
std::size_t fast_x86_sse42_strlen(const char* nul_str) noexcept {
    constexpr int kMode = _SIDD_CMP_EQUAL_EACH | _SIDD_SBYTE_OPS;
    constexpr int kChunk = 16;
    constexpr std::uintptr_t kAlignMask = kChunk - 1;

    // Scalar prologue: advance to the next 16-byte boundary.
    const char* cursor = nul_str;
    while ((reinterpret_cast<std::uintptr_t>(cursor) & kAlignMask) != 0) {
        if (*cursor == '\0')
            return static_cast<std::size_t>(cursor - nul_str);
        ++cursor;
    }

    // Comparing against an all-zero (i.e. empty) string: with EQUAL_EACH a
    // position only counts as a match when it lies past the end of both
    // operands, so the returned index is the offset of the first NUL in the
    // chunk, or 16 when the chunk holds no NUL.
    const __m128i empty = _mm_setzero_si128();
    for (;;) {
        const __m128i chunk = _mm_load_si128(reinterpret_cast<const __m128i*>(cursor));
        const int nul_index = _mm_cmpistri(chunk, empty, kMode);
        if (nul_index < kChunk)
            return static_cast<std::size_t>(cursor - nul_str) + static_cast<std::size_t>(nul_index);
        cursor += kChunk;
    }
}
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment