Skip to content

Instantly share code, notes, and snippets.

@magurosan
Last active November 19, 2020 00:14
Show Gist options
  • Save magurosan/11faa0e9650996bdbb5c65b77fa39174 to your computer and use it in GitHub Desktop.
Save magurosan/11faa0e9650996bdbb5c65b77fa39174 to your computer and use it in GitHub Desktop.
strlen : AVX2/AVX512BW instrinsics implementation
/*
require http://homepage1.nifty.com/herumi/soft/xbyak_e.html
g++ -O3 -fomit-frame-pointer -march=core2 -msse4 -fno-operator-names strlen_sse42.cpp && ./a.out
Xeon X5650 2.67GHz + Linux 2.6.32 + gcc 4.6.0
ave 2.00 5.04 7.03 9.95 12.03 16.35 20.04 33.08 66.62 132.98 261.78 518.13 1063.83
strlenLIBC 11.74 5.00 3.99 3.21 2.84 2.30 2.05 1.42 0.85 0.55 0.38 0.29 0.24
strlenC 13.94 8.18 6.69 5.43 4.87 4.16 3.76 3.08 2.58 2.29 2.17 2.05 2.02
strlenSSE2 12.72 6.30 4.97 3.81 3.27 2.55 2.20 1.53 0.94 0.56 0.36 0.25 0.19
strlenSSE42 8.84 3.73 3.01 2.59 2.44 2.16 1.99 1.46 0.84 0.54 0.35 0.27 0.21
strlenSSE42_C 8.93 3.76 3.03 2.59 2.43 2.12 1.94 1.42 0.84 0.54 0.35 0.26 0.21
Core i7-2600 CPU 3.40GHz + Linux 2.6.35 + gcc 4.4.5
ave 2.00 5.04 7.03 9.95 12.03 16.35 20.04 33.08 66.62 132.98 261.78 518.13 1063.83
strlenLIBC 11.58 5.69 4.59 3.55 3.04 2.37 2.03 1.21 0.56 0.36 0.27 0.22 0.20
strlenC 14.23 6.84 5.39 4.33 3.98 3.32 3.03 2.44 1.89 1.58 1.40 1.29 1.25
strlenSSE2 12.20 5.70 4.39 3.33 2.83 2.11 1.74 1.07 0.53 0.30 0.22 0.18 0.15
strlenSSE42 12.19 4.99 3.89 3.17 2.87 2.48 2.18 1.31 0.68 0.42 0.30 0.24 0.21
strlenSSE42_C 12.41 5.10 3.97 3.25 2.97 2.53 2.19 1.29 0.67 0.40 0.29 0.23 0.21
Core i7-2600 CPU 3.40GHz + Windows 7 + VC2010
ave 2.00 5.04 7.03 9.95 12.03 16.35 20.04 33.08 66.62 132.98 261.78 518.13 1063.83
strlenLIBC 16.10 8.20 6.41 5.12 4.52 3.54 2.89 1.78 0.96 0.60 0.45 0.38 0.34
strlenC 13.99 6.70 5.31 4.26 3.90 3.27 2.98 2.40 1.89 1.58 1.40 1.30 1.25
strlenSSE2 11.27 5.33 4.08 3.08 2.59 1.94 1.63 1.01 0.48 0.27 0.19 0.16 0.13
strlenSSE42 12.34 5.09 3.96 3.24 2.97 2.54 2.20 1.32 0.67 0.40 0.29 0.24 0.21
strlenSSE42_C 12.39 5.08 3.98 3.24 2.98 2.62 2.35 1.59 0.73 0.43 0.32 0.25 0.22
*/
/*
add AVX2 / AVX512BW intrinsic implementation by magurosan(Masaki Ota)
clang++ strlen_avx512bw.cpp -march=native -mavx512bw -m64 -O2 && ./a.out
Core i7-1068NG7 2.3GHz(max 4.1GHz) + MacOS 11.0 + Apple clang 12.0.0
ave 2.00 5.04 7.03 9.95 12.03 16.35 20.04 33.08 66.62 132.98 261.78 518.13 1063.83
strlenLIBC 6.75 2.90 2.32 1.61 1.24 0.93 0.76 0.46 0.26 0.16 0.11 0.10 0.08
strlenC 10.75 5.25 3.85 2.87 2.56 2.17 1.87 1.46 1.31 1.08 1.06 1.00 1.03
strlenSSE2 7.93 3.92 2.95 1.80 1.57 1.03 0.90 0.54 0.37 0.19 0.13 0.09 0.08
strlenSSE42 7.41 3.29 2.50 1.77 1.73 1.36 1.06 0.65 0.33 0.23 0.18 0.16 0.15
strlenSSE42_C 8.28 3.59 2.60 2.05 1.70 1.39 1.25 0.74 0.36 0.24 0.19 0.16 0.16
strlenAVX2 7.86 3.56 2.69 1.94 1.62 1.13 0.88 0.60 0.33 0.19 0.12 0.07 0.05
strlenAVX512BW 7.06 3.18 2.32 1.60 1.41 1.03 0.86 0.51 0.27 0.15 0.09 0.06 0.04
*/
#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <assert.h>
#include <vector>
#include <xbyak/xbyak.h>
#include <xbyak/xbyak_util.h>
#include "util.hpp"
#include <immintrin.h>
#ifdef COMPARE_LATEST_GLIBC
extern "C" size_t gcc_strlen(const char *p);
#endif
size_t strlenSSE2(const char *p)
{
const char *const top = p;
__m128i c16 = _mm_setzero_si128();
/* 16 byte alignment */
size_t ip = reinterpret_cast<size_t>(p);
size_t n = ip & 15;
if (n > 0) {
ip &= ~15;
__m128i x = *(const __m128i*)ip;
__m128i a = _mm_cmpeq_epi8(x, c16);
unsigned long mask = _mm_movemask_epi8(a);
mask &= 0xffffffffUL << n;
if (mask) {
return my_bsf(mask) - n;
}
p += 16 - n;
}
/*
thanks to egtra-san
*/
assert((reinterpret_cast<size_t>(p) & 15) == 0);
if (reinterpret_cast<size_t>(p) & 31) {
__m128i x = *(const __m128i*)&p[0];
__m128i a = _mm_cmpeq_epi8(x, c16);
unsigned long mask = _mm_movemask_epi8(a);
if (mask) {
return p + my_bsf(mask) - top;
}
p += 16;
}
assert((reinterpret_cast<size_t>(p) & 31) == 0);
for (;;) {
__m128i x = *(const __m128i*)&p[0];
__m128i y = *(const __m128i*)&p[16];
__m128i a = _mm_cmpeq_epi8(x, c16);
__m128i b = _mm_cmpeq_epi8(y, c16);
unsigned long mask = (_mm_movemask_epi8(b) << 16) | _mm_movemask_epi8(a);
if (mask) {
return p + my_bsf(mask) - top;
}
p += 32;
}
}
size_t strlenAVX2(const char *p)
{
const char *const top = p;
__m256i c32 = _mm256_setzero_si256();
/* 32 byte alignment */
size_t ip = reinterpret_cast<size_t>(p);
size_t n = ip & 31;
if (n > 0) {
ip &= ~31;
__m256i x = _mm256_load_si256((__m256i*)ip);
__m256i a = _mm256_cmpeq_epi8(c32, x);
unsigned long mask = _mm256_movemask_epi8(a);
mask = mask >> n;
if (mask) {
return my_bsf(mask);
}
p += 32 - n;
}
/* 64B alighment, for prevent page faults */
assert((reinterpret_cast<size_t>(p) & 31) == 0);
for (;;) {
__m256i v = _mm256_load_si256((__m256i*)&p[0]);
__m256i a = _mm256_cmpeq_epi8(c32, v);
unsigned long mask = _mm256_movemask_epi8(a);
if (mask) {
return p + my_bsf(mask) - top;
}
p += 32;
}
}
size_t strlenAVX512BW(const char *p)
{
const char *const top = p;
__m512i c64 = _mm512_setzero_si512();
/* 64 byte alignment */
size_t ip = reinterpret_cast<size_t>(p);
size_t n = ip & 63;
if (n > 0) {
ip &= ~63;
__m512i x = _mm512_load_si512((__m512i*)ip);
__mmask64 a = _mm512_testn_epi8_mask(x, x);
uint64_t mask = (uint64_t)a >> n;
if (mask) {
return _tzcnt_u64(mask);
}
p += 64 - n;
}
assert((reinterpret_cast<size_t>(p) & 63) == 0);
for(;;) {
__m512i x = _mm512_load_si512((__m512i*)&p[0]);
__mmask64 a = _mm512_testn_epi8_mask(x, x);
if (a) {
return p + _tzcnt_u64(a) - top;
}
p += 64;
}
}
struct StrlenSSE42 : Xbyak::CodeGenerator {
StrlenSSE42()
{
inLocalLabel();
using namespace Xbyak;
#if defined(XBYAK64_WIN)
const Reg64& p = rdx;
const Reg64& c = rcx;
const Reg64& a = rax;
mov(rdx, rcx);
#elif defined(XBYAK64_GCC)
const Reg64& p = rdi;
const Reg64& c = rcx;
const Reg64& a = rax;
#else
const Reg32& p = edx;
const Reg32& c = ecx;
const Reg32& a = eax;
mov(edx, ptr [esp + 4]);
#endif
mov(eax, 0xff01);
movd(xm0, eax);
#if 0 // generated code by gcc 4.6.0
lea(rdx, ptr [p - 16]);
xor(c, c);
jmp(".skip");
align(16);
L("@@");
mov(rdx, rax);
L(".skip");
lea(rax, ptr [edx + 16]);
pcmpistri(xm0, ptr [edx + 16], 0x14);
jnz("@b");
add(a, c);
sub(a, p);
ret();
#else
#if 0
lea(a, ptr [p - 16]);
#else
mov(a, p);
jmp(".in");
#endif
L("@@");
add(a, 16);
L(".in");
pcmpistri(xm0, ptr [a], 0x14);
jnz("@b");
add(a, c);
sub(a, p);
ret();
#endif
outLocalLabel();
}
} strlenSSE42_code;
size_t strlenSSE42_C(const char* top)
{
const __m128i im = _mm_set1_epi32(0xff01);
const char *p = top;
while (!_mm_cmpistrz(im, _mm_loadu_si128((const __m128i*)p), 0x14)) {
p += 16;
}
p += _mm_cmpistri(im, _mm_loadu_si128((const __m128i*)p), 0x14);
return p - top;
}
struct Result {
int hit;
double len;
double time;
Result() {}
Result(int hit, double len, double time) : hit(hit), len(len), time(time) {}
void put() const
{
printf("hit=%d len=%.2f time= %.2f clk\n", hit, len, time);
}
};
void createTable(char *p, size_t num, int ave)
{
XorShift128 r;
int c = 1;
for (size_t i = 0; i < num; i++) {
p[i] = (char)c++;
if (c == 256) c = 1;
if ((r.get() % ave) == 0) p[i] = 0;
}
p[num - 1] = 0;
}
Result test(const char *top, size_t n, size_t count, size_t func(const char*))
{
Xbyak::util::Clock clk;
int hit = 0;
for (size_t i = 0; i < count; i++) {
clk.begin();
const char *p = top;
int remain = n;
while (remain > 0) {
size_t len = func(p);
hit++;
remain -= len + 1;
p += len + 1;
}
clk.end();
}
return Result(hit, n / (double)hit * count, clk.getClock() / (double)count / n);
}
size_t strlenC(const char *p)
{
const char *top = p;
while (*p) {
p++;
}
return p - top;
}
size_t (*strlenSSE42)(const char*) = (size_t (*)(const char*))strlenSSE42_code.getCode();
int main()
{
const size_t count = 4000;
const size_t N = 100000;
std::vector<char> v(N);
char *begin = &v[0];
static const int aveTbl[] = { 2, 5, 7, 10, 12, 16, 20, 32, 64, 128, 256, 512, 1024 };
static const struct {
const char *name;
size_t (*func)(const char*);
} funcTbl[] = {
{ "strlenLIBC ", strlen },
#ifdef COMPARE_LATEST_GLIBC
{ "gcc_strlen ", gcc_strlen },
#endif
{ "strlenC ", strlenC },
{ "strlenSSE2 ", strlenSSE2 },
{ "strlenSSE42 ", strlenSSE42 },
{ "strlenSSE42_C ", strlenSSE42_C },
{ "strlenAVX2 ", strlenAVX2 },
{ "strlenAVX512BW", strlenAVX512BW },
};
Result rv[NUM_OF_ARRAY(funcTbl)][NUM_OF_ARRAY(aveTbl)];
for (size_t i = 0; i < NUM_OF_ARRAY(aveTbl); i++) {
int ave = aveTbl[i];
createTable(begin, N, ave);
printf("test %d, %d\n", (int)i, ave);
for (size_t j = 0; j < NUM_OF_ARRAY(funcTbl); j++) {
puts(funcTbl[j].name);
rv[j][i] = test(begin, N, count, funcTbl[j].func);
rv[j][i].put();
if (rv[j][i].hit != rv[0][i].hit) {
printf("ERROR!!! ok=%d, ng=%d\n", (int)rv[0][i].hit, (int)rv[j][i].hit);
}
}
}
puts("end");
printf("ave ");
for (size_t i = 0; i < NUM_OF_ARRAY(aveTbl); i++) {
printf("%6.2f ", rv[0][i].len);
}
printf("\n");
for (size_t i = 0; i < NUM_OF_ARRAY(funcTbl); i++) {
printf("%s ", funcTbl[i].name);
for (size_t j = 0; j < NUM_OF_ARRAY(aveTbl); j++) {
printf("%6.2f ", rv[i][j].time);
}
printf("\n");
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment