Created
January 7, 2022 09:51
-
-
Save zwliew/d4058cd2053ecb18c4f1724427febfdc to your computer and use it in GitHub Desktop.
mempbrk
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <nmmintrin.h> | |
#include <string.h> | |
#define HAVE_SSE4_2 | |
static const char ___m128i_shift_right[31] = | |
{ | |
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, | |
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 | |
}; | |
static inline __m128i | |
__m128i_shift_right (__m128i value, unsigned long int offset) | |
{ | |
/* _mm_loadu_si128() works with unaligned data, cast safe */ | |
return _mm_shuffle_epi8 (value, | |
_mm_loadu_si128 ((__m128i *) (___m128i_shift_right + offset))); | |
} | |
const char *__mempbrk_portable(const char *dest, size_t destlen, const char *breakset, size_t breaksetlen) | |
{ | |
for (; dest != dest + destlen; ++dest) { | |
for (const char *curbreak = breakset; curbreak != breakset + breaksetlen; ++curbreak) { | |
if (*dest == *curbreak) { | |
return dest; | |
} | |
} | |
} | |
return NULL; | |
} | |
/* We use 0x2: | |
_SIDD_SBYTE_OPS | |
| _SIDD_CMP_EQUAL_ANY | |
| _SIDD_POSITIVE_POLARITY | |
| _SIDD_LEAST_SIGNIFICANT | |
on pcmpistri to compare xmm/mem128 | |
0 1 2 3 4 5 6 7 8 9 A B C D E F | |
X X X X X X X X X X X X X X X X | |
against xmm | |
0 1 2 3 4 5 6 7 8 9 A B C D E F | |
A A A A A A A A A A A A A A A A | |
to find out if the first 16byte data element has any byte A and | |
the offset of the first byte. There are 3 cases: | |
1. The first 16byte data element has the byte A at the offset X. | |
2. The first 16byte data element has EOS and doesn't have the byte A. | |
3. The first 16byte data element is valid and doesn't have the byte A. | |
Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: | |
1 X 1 0/1 0 | |
2 16 0 1 0 | |
3 16 0 0 0 | |
We exit from the loop for cases 1 and 2 with jbe which branches | |
when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset | |
X for case 1. */ | |
const char * | |
__mempbrk_sse42(const char *s, size_t slen, const char *a, size_t alen) | |
{ | |
if (alen == 0) | |
return NULL; | |
const char *aligned; | |
int offset = (int) ((size_t) s & 15); | |
__m128i mask = _mm_setzero_si128(); | |
memcpy(&mask, a, alen); | |
if (offset != 0) | |
{ | |
/* Check partial string. */ | |
aligned = (const char *) ((size_t) s & -16L); | |
__m128i value = __m128i_shift_right (_mm_load_si128 ((__m128i *) aligned), offset); | |
int length = _mm_cmpistri (mask, value, 0x2); | |
/* No need to check ZFlag since ZFlag is always 1. */ | |
int cflag = _mm_cmpistrc (mask, value, 0x2); | |
if (cflag) { | |
return s + length; | |
} | |
int index = _mm_cmpistri (value, value, 0x3a); | |
/* Find where the NULL terminator is. */ | |
if (index < 16 - offset) | |
{ | |
/* found NUL @ 'index', need to switch to slower mempbrk */ | |
return __mempbrk_portable(s + index + 1, slen - index - 1, a, alen); /* destlen is bigger than 16 & idx < 16 so no underflow here */ | |
} | |
aligned += 16; | |
s -= (16 - offset); | |
} | |
else | |
aligned = s; | |
while (slen >= 16) | |
{ | |
__m128i value = _mm_load_si128 ((__m128i *) aligned); | |
int idx = _mm_cmpistri (mask, value, 0x2); | |
int cflag = _mm_cmpistrc (mask, value, 0x2); | |
int zflag = _mm_cmpistrz (mask, value, 0x2); | |
if (cflag) { | |
return aligned + idx; | |
} | |
if (zflag) | |
{ | |
/* found NUL, need to switch to slower mempbrk */ | |
return __mempbrk_portable(aligned, slen, a, alen); | |
} | |
aligned += 16; | |
slen -= 16; | |
} | |
return __mempbrk_portable(aligned, slen, a, alen); | |
} | |
const char *mempbrk(const char *dest, size_t destlen, const char *breakset, size_t breaksetlen) | |
{ | |
#ifdef HAVE_SSE4_2 | |
if (destlen >= 16 && breaksetlen <= 16) | |
return __mempbrk_sse42(dest, destlen, breakset, breaksetlen); | |
#endif | |
return __mempbrk_portable(dest, destlen, breakset, breaksetlen); | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment