Skip to content

Instantly share code, notes, and snippets.

@magurosan
Last active August 21, 2017 01:09
Show Gist options
  • Save magurosan/be2ae5a435941b67863d26a2dbb8f94b to your computer and use it in GitHub Desktop.
Save magurosan/be2ae5a435941b67863d26a2dbb8f94b to your computer and use it in GitHub Desktop.
C++/Xbyak strlen generator for AVX512BW
#include <assert.h>
#include <locale.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <wchar.h>

#include <xbyak/xbyak.h>
#include <xbyak/xbyak_util.h>
class StrlenGenerator : Xbyak::CodeGenerator {
public:
//
// e.g.
// StrlenGenerator gen(sizeof(char), true) => strlen_s
//
StrlenGenerator(uint32_t n_scale) {
using namespace Xbyak;
// assume charsize
assert(n_scale == 1 || n_scale == 2 || n_scale == 4);
auto cmpeq_insn = [&](const Opmask& mask, const Zmm& z, const Operand& op) {
if (n_scale == 1) vpcmpeqb(mask, z, op);
else if (n_scale == 2) vpcmpeqw(mask, z, op);
else vpcmpeqd(mask, z, op);
};
auto kortest_insn = [&](const Opmask& mask1, const Opmask& mask2) {
if (n_scale == 1) kortestq(mask1, mask2);
else if (n_scale == 2) kortestd(mask1, mask2);
else kortestw(mask1, mask2);
};
auto kmov_mask_to_reg = [&](const Reg64& reg, const Opmask& mask) {
if (n_scale == 1) kmovq(reg, mask);
else if (n_scale == 2) kmovd(Reg32(reg.getIdx()), mask);
else kmovw(Reg32(reg.getIdx()), mask);
};
auto kmov_reg_to_mask = [&](const Opmask& mask, const Reg64& reg) {
if (n_scale == 1) kmovq(mask, reg);
else if (n_scale == 2) kmovd(mask, Reg32(reg.getIdx()));
else kmovw(mask, Reg32(reg.getIdx()));
};
auto tzcnt_opt = [&](const Reg64& reg1, const Reg64& reg2) {
if (n_scale == 1) tzcnt(reg1, reg2);
else tzcnt(Reg32(reg1.getIdx()), Reg32(reg2.getIdx()));
};
#if defined(_WIN64) //assume Win64 x64 ABI
mov(rdx, rcx);
#else //assume x86-64 SysV ABI
mov(rdx, rdi);
mov(rcx, rdi);
#endif
vpxor(xmm0, xmm0, xmm0); // zmm0 all zero
// misalign offset
and(ecx, 0x3F);
if (n_scale != 1) shr(ecx, n_scale);
//gen mask
xor_(eax, eax);
if (n_scale == 1) {
neg(rax);
shl(rax, cl);
}else {
neg(eax);
shl(rax, cl);
}
kmov_reg_to_mask(k1, rax);
neg(rcx);
cmpeq_insn(k1|k1, zmm0, ptr[rdx + rcx*n_scale]);
kortest_insn(k1, k1);
jng("k1_end");
add(rcx, 64 / n_scale);
lea(rax, ptr[rdx + rcx*n_scale]);
test(rax, 128 - 1);
jz("strlen_loop");
cmpeq_insn(k1, zmm0, ptr[rdx + rcx*n_scale]);
kortest_insn(k1, k1);
jng("k1_end");
add(rcx, 64 / n_scale);
jmp("strlen_loop");
align(16);
L("strlen_loop"); {
cmpeq_insn(k1, zmm0, ptr[rdx + rcx*n_scale]);
cmpeq_insn(k2, zmm0, ptr[rdx + rcx*n_scale + 64]);
kortest_insn(k1, k2);
jnz("strlen_loop_end");
sub(rcx, -128 / n_scale);
jmp("strlen_loop");
}
L("strlen_loop_end");
ktestq(k1, k1);
jz("k2_end");
L("k1_end");
kmov_mask_to_reg(rdx, k1);
tzcnt_opt(rdx, rdx);
lea(rax, ptr[rcx + rdx]);
ret();
L("k2_end");
kmov_mask_to_reg(rdx, k2);
tzcnt_opt(rdx, rdx);
lea(rax, ptr[rcx + rdx + 64/n_scale]);
ret();
}
const uint8_t *getCode() const {
return Xbyak::CodeGenerator::getCode();
}
size_t getSize() {
return Xbyak::CodeGenerator::getSize();
}
};
#if defined(_DEBUG)
#include <stdio.h>
int main()
{
typedef size_t(*strlen_func)(char *);
typedef size_t(*wcslen_func)(wchar_t *);
StrlenGenerator strlenGen(sizeof(char)), wcslenGen(sizeof(wchar_t));
auto my_strlen = reinterpret_cast<strlen_func>(strlenGen.getCode());
auto my_wcslen = reinterpret_cast<wcslen_func>(wcslenGen.getCode());
my_strlen("");
my_wcslen(L"");
char* str = "abcde";
printf("%s: %d, %d", str, strlen(str), my_strlen(str));
wchar_t* wstr = L"5000兆円欲しい!";
wprintf(L"%s: %d, %d", wstr, wcslen(wstr), my_wcslen(wstr));
return 0;
}
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment