Last active
January 2, 2019 23:11
-
-
Save StefanoBelli/8730f88d00a7075f6bf14cdd898c05fd to your computer and use it in GitHub Desktop.
Uses the x86 intrinsics provided by MSVC, Clang, and GCC to implement some string-manipulation functions with the SSE4.2 instruction set
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// x86_fast_string.cpp | |
#include <immintrin.h>
#include <nmmintrin.h>

#include <cstddef>
#include <cstdint>
// Compiler abstraction layer for the SSE4.2 string helpers in this file.
//
// x86_fetch_eflags(dst): stores the low 32 bits of the current (E/R)FLAGS register
// into the integer lvalue `dst`. The compiler-provided readers are used instead of
// hand-written pushf/pop assembly, because that assembly
//   * overwrote rax/rcx without telling the compiler (ECX is where PCMPISTRI
//     returns its index),
//   * pushed into the x86-64 red zone, and
//   * cannot be built by MSVC for x64 (no inline __asm there).
// NOTE: the compiler is free to schedule flag-modifying instructions between an
// intrinsic and this read, so the value is NOT guaranteed to be the flag output of
// a preceding intrinsic. Prefer _mm_cmpistrz/_mm_cmpistrs/_mm_cmpistrc for that.
#if defined(_MSC_VER)
#include <intrin.h>  // __readeflags
#define x86_fetch_eflags(mem_dst_32) \
    ((mem_dst_32) = static_cast<int>(__readeflags()))
#elif defined(__GNUC__)  // GCC and Clang
#if defined(__x86_64__)
#define x86_fetch_eflags(mem_dst_32) \
    ((mem_dst_32) = static_cast<int>(__builtin_ia32_readeflags_u64()))
#elif defined(__i386__) || defined(_X86_)
#define x86_fetch_eflags(mem_dst_32) \
    ((mem_dst_32) = static_cast<int>(__builtin_ia32_readeflags_u32()))
#endif
#endif

// The <nmmintrin.h> intrinsics are spelled identically on MSVC, GCC and Clang, so
// one set of aliases serves all three. (The GCC-only __builtin_ia32_loaddqu builtin
// is rejected by current Clang, which also defines __GNUC__.)
#define vec128 __m128i                        // 128-bit integer vector
#define intrin_loadunal_128 _mm_loadu_si128   // MOVDQU: unaligned 16-byte load
#define intrin_pcmpistri _mm_cmpistri         // PCMPISTRI: implicit-length compare, index result
#define LOADUNAL128_CAST(expr) (reinterpret_cast<const vec128*>(expr))
#if defined(__SSE4_2__) || defined(__AVX__) | |
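// Compile-time check for SSE4.2 support:
//   GCC/Clang: build with -msse4.2, which defines __SSE4_2__.
//   MSVC: no SSE4.2-specific macro exists; build with /arch:AVX so that __AVX__ is defined.
//         (_M_IX86_FP == 2 only signals SSE2-or-later on 32-bit x86, so it cannot stand in for this check.)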
/// @brief Tests two NUL-terminated byte strings for equality using SSE4.2 PCMPISTRI.
///
/// The strings are walked 16 bytes at a time. PCMPISTRI treats both operands as
/// implicit-length strings, so bytes located after a terminator never affect the
/// outcome.
///
/// @param str   First NUL-terminated string; must not be null.
/// @param str1  Second NUL-terminated string; must not be null.
/// @return true when both strings have identical length and contents, else false.
///
/// @note The end-of-string condition comes from _mm_cmpistrz (the ZF output of the
///       same PCMPISTRI) instead of a separate EFLAGS read: the compiler may place
///       flag-modifying instructions between an intrinsic and a later flags read,
///       which made the loop condition unreliable.
/// @note When a 16-byte load from either string would straddle a 4 KiB page
///       boundary, that chunk is compared byte by byte, so memory beyond the
///       terminator on a following (possibly unmapped) page is never touched.
#if defined(__GNUC__) || defined(__clang__)
// Lets GCC/Clang build this function even if the translation unit's default
// target lacks SSE4.2; it has no effect when SSE4.2 is already enabled.
__attribute__((target("sse4.2")))
#endif
bool fast_x86_sse42_streq(const char* str, const char* str1) noexcept {
    // Negative polarity turns "equal each" into "first position that differs".
    constexpr int kMode =
        _SIDD_CMP_EQUAL_EACH | _SIDD_SBYTE_OPS | _SIDD_NEGATIVE_POLARITY;
    constexpr int kChunk = 16;  // bytes per vector; also PCMPISTRI's "no index" value
    constexpr std::uintptr_t kPageSize = 4096;  // smallest x86 page size
    constexpr std::uintptr_t kPageMask = kPageSize - 1;
    constexpr std::uintptr_t kLastSafeOffset = kPageSize - kChunk;

    // size_t offset: an int would overflow on strings longer than INT_MAX bytes.
    for (std::size_t off = 0;; off += kChunk) {
        const char* const lhs = str + off;
        const char* const rhs = str1 + off;

        const std::uintptr_t lhs_in_page =
            reinterpret_cast<std::uintptr_t>(lhs) & kPageMask;
        const std::uintptr_t rhs_in_page =
            reinterpret_cast<std::uintptr_t>(rhs) & kPageMask;
        if (lhs_in_page > kLastSafeOffset || rhs_in_page > kLastSafeOffset) {
            // A vector load here would cross into the next page: go scalar and
            // stop at the terminator.
            for (int i = 0; i < kChunk; ++i) {
                if (lhs[i] != rhs[i])
                    return false;
                if (lhs[i] == '\0')
                    return true;
            }
            continue;
        }

        const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(lhs));  // MOVDQU
        const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(rhs));  // MOVDQU

        // Index of the first differing byte, or 16 when the chunks match up to
        // (and including) a shared terminator.
        if (_mm_cmpistri(a, b, kMode) != kChunk)
            return false;

        // No mismatch in this chunk, so a terminator in one operand implies a
        // terminator at the same position in the other; checking ZF (NUL in the
        // second operand) is therefore sufficient.
        if (_mm_cmpistrz(a, b, kMode))
            return true;
    }
}
/// @brief Returns the length of a NUL-terminated byte string using SSE4.2 PCMPISTRI.
///
/// @param nul_str  NUL-terminated string; must not be null.
/// @return Number of bytes that precede the terminating NUL.
///
/// @note Bytes are inspected one at a time until the cursor is 16-byte aligned;
///       from there on only aligned 16-byte loads are issued. An aligned 16-byte
///       block never spans two pages, so the load that contains the terminator
///       cannot fault on a following unmapped page, which unaligned loads starting
///       at `nul_str` could.
#if defined(__GNUC__) || defined(__clang__)
// Lets GCC/Clang build this function even if the translation unit's default
// target lacks SSE4.2; it has no effect when SSE4.2 is already enabled.
__attribute__((target("sse4.2")))
#endif
std::size_t fast_x86_sse42_strlen(const char* nul_str) noexcept {
    constexpr int kMode = _SIDD_CMP_EQUAL_EACH | _SIDD_SBYTE_OPS;
    constexpr int kChunk = 16;
    constexpr std::uintptr_t kAlignMask = kChunk - 1;

    const char* cursor = nul_str;

    // Scalar head: at most 15 bytes, until the cursor sits on a 16-byte boundary.
    while ((reinterpret_cast<std::uintptr_t>(cursor) & kAlignMask) != 0) {
        if (*cursor == '\0')
            return static_cast<std::size_t>(cursor - nul_str);
        ++cursor;
    }

    // Comparing against an all-zero (i.e. empty) implicit-length string: positions
    // where both operands are past their end compare as "equal", positions where
    // only one is compare as "not equal". The first "equal" index is therefore the
    // position of the NUL in the chunk, or 16 if the chunk holds none.
    const __m128i empty = _mm_setzero_si128();
    for (;;) {
        const __m128i chunk = _mm_load_si128(reinterpret_cast<const __m128i*>(cursor));
        const int nul_index = _mm_cmpistri(chunk, empty, kMode);
        if (nul_index < kChunk)
            return static_cast<std::size_t>(cursor - nul_str) +
                   static_cast<std::size_t>(nul_index);
        cursor += kChunk;
    }
}
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment