Skip to content

Instantly share code, notes, and snippets.

@usstq
Last active August 24, 2021 01:50
Show Gist options
  • Save usstq/46621706650cdb4200a9b3d1d66b494c to your computer and use it in GitHub Desktop.
Save usstq/46621706650cdb4200a9b3d1d66b494c to your computer and use it in GitHub Desktop.
xbyak profile with vtune
/*
Description:
Code2 is consistently faster than Code because legacy SSE2 instructions (like movdqu)
do not cooperate well with AVX registers. Use the AVX forms (like vmovdqu) as much as possible.
------------------------------------------------------------------------------
Install vtune & make a symbolic link
/opt/intel/vtune_amplifier -> /home/xxx/intel/oneapi/vtune/2021.5.0/
How to compile: add following rules to Makefile:
mytest: mytest.cpp ../xbyak/xbyak_util.h
$(CXX) $(CFLAGS) mytest.cpp -o $@ -DXBYAK_USE_VTUNE -I /opt/intel/vtune_amplifier/include/ -L /opt/intel/vtune_amplifier/lib64 -ljitprofiling -ldl
How to profile JIT-code with perf or VTune
$ source /opt/intel/vtune_amplifier/env/vars.sh
$ vtune-gui
application : mytest
application parameters: 2
Hotspots
*/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <chrono>
#include <iostream>

#include <xbyak/xbyak_util.h>
// JIT kernel "f": a counted loop whose body stores xmm5..xmm7 to a scratch
// area on the stack with movdqu and then clears ymm0..ymm2 with vpxor.
// The generated function takes no arguments and returns 1 in eax.
struct Code : public Xbyak::CodeGenerator {
  Code() {
    const int kFrameBytes = 32 * 3;        // scratch space reserved below rsp
    const int kIterations = 200123 * 200;  // loop trip count
    const int kSlotBytes = 16;             // distance between stored registers
    const Xbyak::Xmm stored[] = {xmm5, xmm6, xmm7};
    const Xbyak::Ymm cleared[] = {ymm0, ymm1, ymm2};

    // Prologue: reserve the scratch area and load the loop counter.
    sub(rsp, kFrameBytes);
    mov(rax, kIterations);

    L(".sss");
    for (int slot = 0; slot < 3; ++slot) {
      movdqu(ptr[rsp + slot * kSlotBytes], stored[slot]);
    }
    for (const auto &reg : cleared) {
      vpxor(reg, reg, reg);
    }
    // The counter is decremented through eax; loop until it reaches zero.
    sub(eax, 1);
    jnz(".sss");

    // Epilogue: release the scratch area and return 1.
    add(rsp, kFrameBytes);
    mov(eax, 1);
    ret();
  }
};
// JIT kernel "g": same loop shape as Code, but the stores use vmovdqu,
// the stack slots are 32 bytes apart, and the registers cleared with vpxor
// are ymm5..ymm7 (the same register numbers that were just stored).
// The generated function takes no arguments and returns 1 in eax.
struct Code2 : public Xbyak::CodeGenerator {
  Code2() {
    const int kFrameBytes = 32 * 3;        // scratch space reserved below rsp
    const int kIterations = 200123 * 200;  // loop trip count
    const int kSlotBytes = 32;             // distance between stored registers
    const Xbyak::Xmm stored[] = {xmm5, xmm6, xmm7};
    // A variant clearing ymm0..ymm2 instead was tried and left disabled.
    const Xbyak::Ymm cleared[] = {ymm5, ymm6, ymm7};

    // Prologue: reserve the scratch area and load the loop counter.
    sub(rsp, kFrameBytes);
    mov(rax, kIterations);

    L(".sss");
    for (int slot = 0; slot < 3; ++slot) {
      vmovdqu(ptr[rsp + slot * kSlotBytes], stored[slot]);
    }
    for (const auto &reg : cleared) {
      vpxor(reg, reg, reg);
    }
    // The counter is decremented through eax; loop until it reaches zero.
    sub(eax, 1);
    jnz(".sss");

    // Epilogue: release the scratch area and return 1.
    add(rsp, kFrameBytes);
    mov(eax, 1);
    ret();
  }
};
// Calls func() exactly once and returns the elapsed time in milliseconds.
//
// name: optional label; when non-null, "<name> took <ms>ms" is written to
//       std::cout. Pass nullptr to measure silently.
// func: any callable invocable with no arguments; its result is discarded.
//
// The interval is taken with std::chrono::steady_clock, which is monotonic.
// std::chrono::high_resolution_clock may be an alias of system_clock, so a
// wall-clock adjustment during the measurement could skew the result or even
// make it negative.
template <typename F> float dura(const char *name, F func) {
  using Clock = std::chrono::steady_clock;

  const auto t0 = Clock::now();
  func();
  const auto t1 = Clock::now();

  // duration<float> counts seconds; scale to milliseconds for reporting.
  const std::chrono::duration<float> fsec = t1 - t0;
  const float ret = fsec.count() * 1000;
  if (name)
    std::cout << name << " took " << ret << "ms\n";
  return ret;
}
int main(int argc, char *argv[]) {
int mode = argc == 1 ? 0 : atoi(argv[1]);
Code c;
Code2 c2;
int (*f)() = (int (*)())c.getCode();
int (*g)() = (int (*)())c2.getCode();
printf("f:%p, %d\n", (const void *)f, (int)c.getSize());
printf("g:%p, %d\n", (const void *)g, (int)c2.getSize());
Xbyak::util::Profiler prof;
printf("mode=%d\n", mode);
prof.init(mode);
prof.set("f", (const void *)f, c.getSize());
prof.set("g", (const void *)g, c2.getSize());
auto ft = dura("f", f);
auto gt = dura("g", g);
printf("g is faster than f by %.1f%%\n", 100*((1./gt) - (1./ft))/(1./ft) );
puts("end");
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment