Skip to content

Instantly share code, notes, and snippets.

@magurosan
Last active November 29, 2017 14:23
Show Gist options
  • Save magurosan/93e52bcf8c20737b204f664c7757aa1b to your computer and use it in GitHub Desktop.
Save magurosan/93e52bcf8c20737b204f664c7757aa1b to your computer and use it in GitHub Desktop.
collatz count benchmark in Xbyak (require Xbyak and collatz.cpp => http://bit.ly/2uTTEpd ). and slide is here => https://www.slideshare.net/MasakiOta3/intel-goldmontmpx
// collatz_bench.cpp : コンソール アプリケーションのエントリ ポイントを定義します。
//
#include "stdafx.h"
#include <stdlib.h>
#include <stdint.h>
#include "xbyak/xbyak.h"
#include "xbyak/xbyak_util.h"
//#include <Windows.h> // for _BitscanForward64
#include <mmsystem.h>
#include <intrin.h>
#pragma comment(lib, "winmm.lib")
#include <iostream>
#include <vector>
#include <string>
#include <tuple>
#include <algorithm>
//Singleton Classのようなもの
class CpuOptInfo {
private:
Xbyak::util::Cpu cpu_;
bool is_popcnt_alt_bsf;
bool benchPopcntBsf() const {
if (!hasPOPCNT()) return false;
static const int COUNT = 500;
volatile DWORD64 sum = 0;
DWORD64 x = __rdtsc();
for (DWORD64 i = 1; i <= COUNT; i++) {
DWORD c;
_BitScanForward64(&c, i);
sum += c;
}
DWORD64 bsf_time = __rdtsc() - x;
sum = 0;
x = __rdtsc();
for (DWORD64 i = 1; i <= COUNT; i++) {
sum += __popcnt64((DWORD64)((i & (0-i))) - 1);
}
DWORD64 popcnt_time = __rdtsc() - x;
return popcnt_time < bsf_time;
}
public:
~CpuOptInfo() {}
CpuOptInfo() : cpu_(), is_popcnt_alt_bsf(benchPopcntBsf()) {}
const Xbyak::util::Cpu& cpu() { return cpu_; }
static CpuOptInfo& getInstance() {
static CpuOptInfo inst;
return inst;
}
bool hasPOPCNT() const { return cpu_.has(Xbyak::util::Cpu::tPOPCNT); }
bool hasTZCNT() const { return cpu_.has(Xbyak::util::Cpu::tBMI1); }
bool hasMPX() const { return cpu_.has(Xbyak::util::Cpu::tMPX); }
bool altBSF_POPCNT() const { return is_popcnt_alt_bsf; }
};
class CollatzCountGenerator : public Xbyak::CodeGenerator {
CpuOptInfo cpuInfo;
// bsf, tzcnt, popcnt((n & -n) -1)
void bsf_opt(Xbyak::Reg dest, Xbyak::Reg src) {
if (cpuInfo.hasTZCNT()) {
tzcnt(dest, src);
} else if (cpuInfo.hasPOPCNT() && cpuInfo.altBSF_POPCNT()) {
xor_(dest, dest); // 0
sub(dest, src); // -n
and_(dest, src); // -n & n
dec(dest); // (-n & n) -1
popcnt(dest, dest);
} else {
bsf(rcx, rdx);
}
}
public:
// check_mode:
// 0: no-check
// 1: USE MPX
// 2: USE CMP
CollatzCountGenerator(int check_mode = 0)
: cpuInfo(CpuOptInfo::getInstance())
{
// generated function
// uint64_t func(uint64_t src);
// Note:
// Win64 x64 ABI: RAX, RCX, RDX, R8, R9, R10, R11 is caller save
bool useMPX = check_mode == 1 && cpuInfo.hasMPX();
xor_(eax, eax);
mov(rdx, rcx);
if (check_mode) {
mov(r8, ((UINT64_MAX/3) - 1));
if (useMPX) {
bndmk(bnd0, ptr[rax+r8]);
}
}
#define BND if (useMPX) {db(0xF2);} else {}// bnd prefix
cmp(rcx, 1);
BND jle("func_end");
test(rcx, 1);
BND jz("even");
//BND jmp("odd");
//align(16);
L("odd"); {
if (check_mode) {
if (useMPX) {
bndcu(bnd0, rdx);
} else {
cmp(r8, rdx);
jl("error");
}
}
lea(rdx, ptr[rdx + rdx * 2 + 1]); // n = n * 3 + 1;
inc(rax); //count++;
}
L("even"); {
bsf_opt(rcx, rdx); // tzcnt/bsf or emulation(popcnt)
shr(rdx, cl);
add(rax, rcx);
}
cmp(rdx, 1);
BND jne("odd");
L("func_end");
ret();
if (check_mode && !useMPX) {
L("error");
xor_(ecx, ecx);
div(rcx);
}
#undef BND
}
typedef uint64_t(*func_type)(uint64_t);
func_type getCode() {
return reinterpret_cast<func_type>(Xbyak::CodeGenerator::getCode());
}
};
int main(int argc, char argv)
{
HANDLE hCurrProcess = GetCurrentProcess();
::SetProcessAffinityMask(hCurrProcess, 1);
::SetPriorityClass(hCurrProcess, REALTIME_PRIORITY_CLASS);
static const int N_COUNT = 10000000;
typedef uint64_t(*collatz_func)(uint64_t);
typedef std::tuple<collatz_func, std::string> Bench;
std::cout << "Collatz Problem Step Count Benchmark" << std::endl;
extern uint64_t collatz_count_ref(uint64_t n);
extern uint64_t collatz_count_opt_loop(uint64_t n);
extern uint64_t collatz_count_opt_bsf(uint64_t n);
extern uint64_t collatz_count_opt_tzcnt(uint64_t n);
extern uint64_t collatz_count_opt_popcnt(uint64_t n);
CollatzCountGenerator dyn(0), dyn_chkmpx(1), dyn_chkcmp(2);
collatz_func
collatz_count_xbyak_no_chk = dyn.getCode(),
collatz_count_xbyak_chkmpx = dyn_chkmpx.getCode(),
collatz_count_xbyak_chkcmp = dyn_chkcmp.getCode();
std::vector<Bench> benchmarks = {
std::make_tuple(collatz_count_ref, "reference"),
std::make_tuple(collatz_count_opt_loop, "static(loop)"),
std::make_tuple(collatz_count_opt_bsf, "static(bsf)"),
std::make_tuple(collatz_count_opt_tzcnt, "static(tzcnt)"),
std::make_tuple(collatz_count_opt_popcnt, "static(popcnt)"),
std::make_tuple(collatz_count_xbyak_no_chk, "xbyak(no check)"),
std::make_tuple(collatz_count_xbyak_chkmpx, "xbyak_chk(mpx)"),
std::make_tuple(collatz_count_xbyak_chkcmp, "xbyak_ckk(no mpx)"),
};
#if defined(_DEBUG)
collatz_count_xbyak_chkmpx(0x7BADCAFEDEADBEEF);
#endif
// VC++構造化例外をC++例外に変換
_set_se_translator([](unsigned int code, struct _EXCEPTION_POINTERS* ep) {
char buf[80];
sprintf_s(buf, sizeof(buf), "Error(0x%08X)", code);
throw buf;
});
std::for_each(benchmarks.begin(), benchmarks.end(), [](Bench& x) {
collatz_func func = std::get<0>(x);
std::string name = std::get<1>(x);
DWORD startTime = timeGetTime();
uint64_t count = 0;
for (int i = 1; i <= N_COUNT; i++) {
try {
count += func(i);
}
catch (char* buf) {
std::cout << name << ": " << buf << "at param=" << i << std::endl;
return;
}
}
DWORD duration = timeGetTime() - startTime;
std::cout << name << ": ans=" << count << ", time=" << duration << "[ms]" << std::endl;
});
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment