Skip to content

Instantly share code, notes, and snippets.

@zwliew
Created January 7, 2022 09:51
Show Gist options
  • Save zwliew/d4058cd2053ecb18c4f1724427febfdc to your computer and use it in GitHub Desktop.
Save zwliew/d4058cd2053ecb18c4f1724427febfdc to your computer and use it in GitHub Desktop.
mempbrk
#include <nmmintrin.h>
#include <string.h>
#define HAVE_SSE4_2
static const char ___m128i_shift_right[31] =
{
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
static inline __m128i
__m128i_shift_right (__m128i value, unsigned long int offset)
{
/* _mm_loadu_si128() works with unaligned data, cast safe */
return _mm_shuffle_epi8 (value,
_mm_loadu_si128 ((__m128i *) (___m128i_shift_right + offset)));
}
const char *__mempbrk_portable(const char *dest, size_t destlen, const char *breakset, size_t breaksetlen)
{
for (; dest != dest + destlen; ++dest) {
for (const char *curbreak = breakset; curbreak != breakset + breaksetlen; ++curbreak) {
if (*dest == *curbreak) {
return dest;
}
}
}
return NULL;
}
/* We use 0x2:
_SIDD_SBYTE_OPS
| _SIDD_CMP_EQUAL_ANY
| _SIDD_POSITIVE_POLARITY
| _SIDD_LEAST_SIGNIFICANT
on pcmpistri to compare xmm/mem128
0 1 2 3 4 5 6 7 8 9 A B C D E F
X X X X X X X X X X X X X X X X
against xmm
0 1 2 3 4 5 6 7 8 9 A B C D E F
A A A A A A A A A A A A A A A A
to find out if the first 16byte data element has any byte A and
the offset of the first byte. There are 3 cases:
1. The first 16byte data element has the byte A at the offset X.
2. The first 16byte data element has EOS and doesn't have the byte A.
3. The first 16byte data element is valid and doesn't have the byte A.
Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
1 X 1 0/1 0
2 16 0 1 0
3 16 0 0 0
We exit from the loop for cases 1 and 2 with jbe which branches
when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset
X for case 1. */
const char *
__mempbrk_sse42(const char *s, size_t slen, const char *a, size_t alen)
{
if (alen == 0)
return NULL;
const char *aligned;
int offset = (int) ((size_t) s & 15);
__m128i mask = _mm_setzero_si128();
memcpy(&mask, a, alen);
if (offset != 0)
{
/* Check partial string. */
aligned = (const char *) ((size_t) s & -16L);
__m128i value = __m128i_shift_right (_mm_load_si128 ((__m128i *) aligned), offset);
int length = _mm_cmpistri (mask, value, 0x2);
/* No need to check ZFlag since ZFlag is always 1. */
int cflag = _mm_cmpistrc (mask, value, 0x2);
if (cflag) {
return s + length;
}
int index = _mm_cmpistri (value, value, 0x3a);
/* Find where the NULL terminator is. */
if (index < 16 - offset)
{
/* found NUL @ 'index', need to switch to slower mempbrk */
return __mempbrk_portable(s + index + 1, slen - index - 1, a, alen); /* destlen is bigger than 16 & idx < 16 so no underflow here */
}
aligned += 16;
s -= (16 - offset);
}
else
aligned = s;
while (slen >= 16)
{
__m128i value = _mm_load_si128 ((__m128i *) aligned);
int idx = _mm_cmpistri (mask, value, 0x2);
int cflag = _mm_cmpistrc (mask, value, 0x2);
int zflag = _mm_cmpistrz (mask, value, 0x2);
if (cflag) {
return aligned + idx;
}
if (zflag)
{
/* found NUL, need to switch to slower mempbrk */
return __mempbrk_portable(aligned, slen, a, alen);
}
aligned += 16;
slen -= 16;
}
return __mempbrk_portable(aligned, slen, a, alen);
}
const char *mempbrk(const char *dest, size_t destlen, const char *breakset, size_t breaksetlen)
{
#ifdef HAVE_SSE4_2
if (destlen >= 16 && breaksetlen <= 16)
return __mempbrk_sse42(dest, destlen, breakset, breaksetlen);
#endif
return __mempbrk_portable(dest, destlen, breakset, breaksetlen);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment