Last active
November 29, 2017 14:23
-
-
Save magurosan/93e52bcf8c20737b204f664c7757aa1b to your computer and use it in GitHub Desktop.
collatz count benchmark in Xbyak (require Xbyak and collatz.cpp => http://bit.ly/2uTTEpd ). and slide is here => https://www.slideshare.net/MasakiOta3/intel-goldmontmpx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// collatz_bench.cpp : コンソール アプリケーションのエントリ ポイントを定義します。 | |
// | |
#include "stdafx.h" | |
#include <stdlib.h> | |
#include <stdint.h> | |
#include "xbyak/xbyak.h" | |
#include "xbyak/xbyak_util.h" | |
//#include <Windows.h> // for _BitscanForward64 | |
#include <mmsystem.h> | |
#include <intrin.h> | |
#pragma comment(lib, "winmm.lib") | |
#include <iostream> | |
#include <vector> | |
#include <string> | |
#include <tuple> | |
#include <algorithm> | |
//Singleton Classのようなもの | |
class CpuOptInfo { | |
private: | |
Xbyak::util::Cpu cpu_; | |
bool is_popcnt_alt_bsf; | |
bool benchPopcntBsf() const { | |
if (!hasPOPCNT()) return false; | |
static const int COUNT = 500; | |
volatile DWORD64 sum = 0; | |
DWORD64 x = __rdtsc(); | |
for (DWORD64 i = 1; i <= COUNT; i++) { | |
DWORD c; | |
_BitScanForward64(&c, i); | |
sum += c; | |
} | |
DWORD64 bsf_time = __rdtsc() - x; | |
sum = 0; | |
x = __rdtsc(); | |
for (DWORD64 i = 1; i <= COUNT; i++) { | |
sum += __popcnt64((DWORD64)((i & (0-i))) - 1); | |
} | |
DWORD64 popcnt_time = __rdtsc() - x; | |
return popcnt_time < bsf_time; | |
} | |
public: | |
~CpuOptInfo() {} | |
CpuOptInfo() : cpu_(), is_popcnt_alt_bsf(benchPopcntBsf()) {} | |
const Xbyak::util::Cpu& cpu() { return cpu_; } | |
static CpuOptInfo& getInstance() { | |
static CpuOptInfo inst; | |
return inst; | |
} | |
bool hasPOPCNT() const { return cpu_.has(Xbyak::util::Cpu::tPOPCNT); } | |
bool hasTZCNT() const { return cpu_.has(Xbyak::util::Cpu::tBMI1); } | |
bool hasMPX() const { return cpu_.has(Xbyak::util::Cpu::tMPX); } | |
bool altBSF_POPCNT() const { return is_popcnt_alt_bsf; } | |
}; | |
class CollatzCountGenerator : public Xbyak::CodeGenerator { | |
CpuOptInfo cpuInfo; | |
// bsf, tzcnt, popcnt((n & -n) -1) | |
void bsf_opt(Xbyak::Reg dest, Xbyak::Reg src) { | |
if (cpuInfo.hasTZCNT()) { | |
tzcnt(dest, src); | |
} else if (cpuInfo.hasPOPCNT() && cpuInfo.altBSF_POPCNT()) { | |
xor_(dest, dest); // 0 | |
sub(dest, src); // -n | |
and_(dest, src); // -n & n | |
dec(dest); // (-n & n) -1 | |
popcnt(dest, dest); | |
} else { | |
bsf(rcx, rdx); | |
} | |
} | |
public: | |
// check_mode: | |
// 0: no-check | |
// 1: USE MPX | |
// 2: USE CMP | |
CollatzCountGenerator(int check_mode = 0) | |
: cpuInfo(CpuOptInfo::getInstance()) | |
{ | |
// generated function | |
// uint64_t func(uint64_t src); | |
// Note: | |
// Win64 x64 ABI: RAX, RCX, RDX, R8, R9, R10, R11 is caller save | |
bool useMPX = check_mode == 1 && cpuInfo.hasMPX(); | |
xor_(eax, eax); | |
mov(rdx, rcx); | |
if (check_mode) { | |
mov(r8, ((UINT64_MAX/3) - 1)); | |
if (useMPX) { | |
bndmk(bnd0, ptr[rax+r8]); | |
} | |
} | |
#define BND if (useMPX) {db(0xF2);} else {}// bnd prefix | |
cmp(rcx, 1); | |
BND jle("func_end"); | |
test(rcx, 1); | |
BND jz("even"); | |
//BND jmp("odd"); | |
//align(16); | |
L("odd"); { | |
if (check_mode) { | |
if (useMPX) { | |
bndcu(bnd0, rdx); | |
} else { | |
cmp(r8, rdx); | |
jl("error"); | |
} | |
} | |
lea(rdx, ptr[rdx + rdx * 2 + 1]); // n = n * 3 + 1; | |
inc(rax); //count++; | |
} | |
L("even"); { | |
bsf_opt(rcx, rdx); // tzcnt/bsf or emulation(popcnt) | |
shr(rdx, cl); | |
add(rax, rcx); | |
} | |
cmp(rdx, 1); | |
BND jne("odd"); | |
L("func_end"); | |
ret(); | |
if (check_mode && !useMPX) { | |
L("error"); | |
xor_(ecx, ecx); | |
div(rcx); | |
} | |
#undef BND | |
} | |
typedef uint64_t(*func_type)(uint64_t); | |
func_type getCode() { | |
return reinterpret_cast<func_type>(Xbyak::CodeGenerator::getCode()); | |
} | |
}; | |
int main(int argc, char argv) | |
{ | |
HANDLE hCurrProcess = GetCurrentProcess(); | |
::SetProcessAffinityMask(hCurrProcess, 1); | |
::SetPriorityClass(hCurrProcess, REALTIME_PRIORITY_CLASS); | |
static const int N_COUNT = 10000000; | |
typedef uint64_t(*collatz_func)(uint64_t); | |
typedef std::tuple<collatz_func, std::string> Bench; | |
std::cout << "Collatz Problem Step Count Benchmark" << std::endl; | |
extern uint64_t collatz_count_ref(uint64_t n); | |
extern uint64_t collatz_count_opt_loop(uint64_t n); | |
extern uint64_t collatz_count_opt_bsf(uint64_t n); | |
extern uint64_t collatz_count_opt_tzcnt(uint64_t n); | |
extern uint64_t collatz_count_opt_popcnt(uint64_t n); | |
CollatzCountGenerator dyn(0), dyn_chkmpx(1), dyn_chkcmp(2); | |
collatz_func | |
collatz_count_xbyak_no_chk = dyn.getCode(), | |
collatz_count_xbyak_chkmpx = dyn_chkmpx.getCode(), | |
collatz_count_xbyak_chkcmp = dyn_chkcmp.getCode(); | |
std::vector<Bench> benchmarks = { | |
std::make_tuple(collatz_count_ref, "reference"), | |
std::make_tuple(collatz_count_opt_loop, "static(loop)"), | |
std::make_tuple(collatz_count_opt_bsf, "static(bsf)"), | |
std::make_tuple(collatz_count_opt_tzcnt, "static(tzcnt)"), | |
std::make_tuple(collatz_count_opt_popcnt, "static(popcnt)"), | |
std::make_tuple(collatz_count_xbyak_no_chk, "xbyak(no check)"), | |
std::make_tuple(collatz_count_xbyak_chkmpx, "xbyak_chk(mpx)"), | |
std::make_tuple(collatz_count_xbyak_chkcmp, "xbyak_ckk(no mpx)"), | |
}; | |
#if defined(_DEBUG) | |
collatz_count_xbyak_chkmpx(0x7BADCAFEDEADBEEF); | |
#endif | |
// VC++構造化例外をC++例外に変換 | |
_set_se_translator([](unsigned int code, struct _EXCEPTION_POINTERS* ep) { | |
char buf[80]; | |
sprintf_s(buf, sizeof(buf), "Error(0x%08X)", code); | |
throw buf; | |
}); | |
std::for_each(benchmarks.begin(), benchmarks.end(), [](Bench& x) { | |
collatz_func func = std::get<0>(x); | |
std::string name = std::get<1>(x); | |
DWORD startTime = timeGetTime(); | |
uint64_t count = 0; | |
for (int i = 1; i <= N_COUNT; i++) { | |
try { | |
count += func(i); | |
} | |
catch (char* buf) { | |
std::cout << name << ": " << buf << "at param=" << i << std::endl; | |
return; | |
} | |
} | |
DWORD duration = timeGetTime() - startTime; | |
std::cout << name << ": ans=" << count << ", time=" << duration << "[ms]" << std::endl; | |
}); | |
return 0; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment