Skip to content

Instantly share code, notes, and snippets.

@pervognsen
Last active Jan 29, 2022
Embed
What would you like to do?
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#if defined(__x86_64__)
// Debug trap: raises a breakpoint (int3) at the current instruction.
#define BREAK asm("int3")
#else
#error Implement macros for your CPU.
#endif
// Optimization barriers: force x to be materialized in a register.
// R(x) tells the compiler x is read; RW(x) that it is read and rewritten,
// so the value cannot be constant-folded or hoisted away (see delay_loop).
#define R(x) asm("" : : "r"(x))
#define RW(x) asm("" : "+r"(x))
#define INLINE __attribute__((always_inline))
#define NOINLINE __attribute__((noinline))
// Guaranteed tail call (clang musttail) -- dispatch never grows the stack.
#define TAIL __attribute__((musttail))
// Pin the System V calling convention so interpreter state stays in the
// same registers across handlers regardless of the platform default ABI.
#define TAILABI __attribute__((sysv_abi))
typedef int8_t i8;
typedef uint8_t u8;
typedef int16_t i16;
typedef uint16_t u16;
typedef int32_t i32;
typedef uint32_t u32;
typedef int64_t i64;
typedef uint64_t u64;
// Select exactly one dispatch implementation below.
//#define SYNCHRONOUS 1
#define PREFETCHED 1
#if SYNCHRONOUS // 5900HX (Zen 3): 6.23 secs/1000000000 ins, 160 MIPS, 28.12 cycles/ins
// Baseline threaded dispatch. Every handler is a TAILABI function taking
// (ip, ins); GOTO is a musttail call, so control threads from handler to
// handler with the interpreter state kept in fixed registers.
#define BLOCK(f, ...) TAILABI const u32 *(f)(const u32 *ip, u32 ins, __VA_ARGS__)
#define GOTO(f, ...) TAIL return (f)(ip, ins, __VA_ARGS__)
// Opcode -> handler table, indexed by the low byte of the instruction word.
BLOCK(*vm_ops[256]);
INLINE
BLOCK(vm_dispatch) {
// Fetch the 32-bit instruction at ip and jump to its opcode's handler.
ins = *ip;
u8 op = ins;
BLOCK(*blk) = vm_ops[op];
GOTO(blk);
}
// Run the program at ip until a handler returns (HLT returns the halting ip,
// ERR returns NULL). The initial ins value is a dummy; dispatch reloads it.
const u32 *vm_exec(const u32 *ip) {
return vm_dispatch(ip, 0);
}
#elif PREFETCHED // 5900HX (Zen 3): 4.26 secs/1000000000 ins, 235 MIPS, 19.24 cycles/ins
// Software-pipelined dispatch: alongside (ip, ins), every handler also
// carries the already-loaded next instruction word and its handler pointer.
// This overlaps the table load for instruction k+1 with the execution of
// instruction k, hiding dispatch latency (compare the MIPS numbers above).
#define BLOCK(f, ...) TAILABI const u32 *(f)(const u32 *ip, u32 ins, u32 next_ins, void *next_blk, __VA_ARGS__)
#define GOTO(f, ...) TAIL return (f)(ip, ins, next_ins, next_blk, __VA_ARGS__)
BLOCK(*vm_ops[256]);
INLINE
BLOCK(vm_dispatch) {
// Consume the prefetched (ins, blk) pair for the instruction at ip...
ins = next_ins;
BLOCK(*blk) = next_blk;
// ...and prefetch the word and handler for the following instruction.
next_ins = ip[1];
u8 next_op = next_ins;
next_blk = vm_ops[next_op];
GOTO(blk);
}
// Entry point: prime the prefetch pipeline with the first instruction,
// then dispatch. The initial ins value is a dummy; dispatch overwrites it.
const u32 *vm_exec(const u32 *ip) {
u32 next_ins = *ip;
u8 next_op = next_ins;
void *next_blk = vm_ops[next_op];
return vm_dispatch(ip, 0, next_ins, next_blk);
}
#else
#error Select an implementation.
#endif
// Opcode byte values (low byte of the instruction word). Even codes are the
// register-register ALU forms; each odd *I code is the matching form that
// uses the c operand byte as a sign-extended 8-bit immediate (see `ic`).
enum {
ADD = 0x00,
ADDI = 0x01,
SUB = 0x02,
SUBI = 0x03,
AND = 0x04,
ANDI = 0x05,
OR = 0x06,
ORI = 0x07,
XOR = 0x08,
XORI = 0x09,
LSL = 0x0A,
LSLI = 0x0B,
LSR = 0x0C,
LSRI = 0x0D,
ASR = 0x0E,
ASRI = 0x0F,
// Control/trap opcodes sit at the top of the opcode space.
BRK = 0xFD,
HLT = 0xFE,
ERR = 0xFF,
};
// 256 general-purpose 64-bit registers, addressed by the a/b/c operand bytes.
u64 vm_regs[256];
// Operand accessors for the current 32-bit instruction word `ins`:
// byte 0 = opcode, byte 1 = a (destination), byte 2 = b, byte 3 = c.
#define a ((u8)(ins >> 8))
#define b ((u8)(ins >> 16))
#define c ((u8)(ins >> 24))
#define ra (vm_regs[a])
#define rb (vm_regs[b])
#define rc (vm_regs[c])
// Immediate operand: the c byte sign-extended to 64 bits.
#define ic ((u64)(i8)c)
// Advance past the current instruction word and dispatch the next one.
INLINE
BLOCK(vm_next) {
ip++;
GOTO(vm_dispatch);
}
// ALU handlers: ra = rb OP (rc | ic), then fall through to vm_next.
BLOCK(vm_add) {
ra = rb + rc;
GOTO(vm_next);
}
BLOCK(vm_addi) {
ra = rb + ic;
GOTO(vm_next);
}
BLOCK(vm_sub) {
ra = rb - rc;
GOTO(vm_next);
}
BLOCK(vm_subi) {
ra = rb - ic;
GOTO(vm_next);
}
BLOCK(vm_and) {
ra = rb & rc;
GOTO(vm_next);
}
BLOCK(vm_andi) {
ra = rb & ic;
GOTO(vm_next);
}
BLOCK(vm_or) {
ra = rb | rc;
GOTO(vm_next);
}
BLOCK(vm_ori) {
ra = rb | ic;
GOTO(vm_next);
}
BLOCK(vm_xor) {
ra = rb ^ rc;
GOTO(vm_next);
}
BLOCK(vm_xori) {
ra = rb ^ ic;
GOTO(vm_next);
}
// Shifts mask the count to 0..63: well-defined in C (shifting a 64-bit value
// by >= 64 is UB) and matching x86-64 shift semantics.
BLOCK(vm_lsl) {
ra = rb << (rc & 63);
GOTO(vm_next);
}
BLOCK(vm_lsli) {
ra = rb << (ic & 63);
GOTO(vm_next);
}
BLOCK(vm_lsr) {
ra = rb >> (rc & 63);
GOTO(vm_next);
}
BLOCK(vm_lsri) {
ra = rb >> (ic & 63);
GOTO(vm_next);
}
// Arithmetic right shift: cast to signed so the sign bit is replicated.
BLOCK(vm_asr) {
ra = (i64)rb >> (rc & 63);
GOTO(vm_next);
}
BLOCK(vm_asri) {
ra = (i64)rb >> (ic & 63);
GOTO(vm_next);
}
// BRK: raise a debug trap (int3), then continue with the next instruction.
BLOCK(vm_brk) {
BREAK;
GOTO(vm_next);
}
// HLT: stop interpreting; the address of the HLT word unwinds to vm_exec.
BLOCK(vm_hlt) {
return ip;
}
// ERR: unimplemented/illegal opcode -- trap, then report failure via NULL.
BLOCK(vm_err) {
BREAK;
return NULL;
}
// The one-letter operand macros are scoped to the handlers above.
#undef a
#undef b
#undef c
#undef ra
#undef rb
#undef rc
#undef ic
void vm_init(void) {
memset(vm_regs, 0, sizeof(vm_regs));
for (int op = 0; op < 256; op++)
vm_ops[op] = vm_err;
vm_ops[ADD] = vm_add;
vm_ops[ADDI] = vm_addi;
vm_ops[SUB] = vm_sub;
vm_ops[SUBI] = vm_subi;
vm_ops[AND] = vm_and;
vm_ops[ANDI] = vm_andi;
vm_ops[OR] = vm_or;
vm_ops[ORI] = vm_ori;
vm_ops[XOR] = vm_xor;
vm_ops[XORI] = vm_xori;
vm_ops[LSL] = vm_lsl;
vm_ops[LSLI] = vm_lsli;
vm_ops[LSR] = vm_lsr;
vm_ops[LSRI] = vm_lsri;
vm_ops[ASR] = vm_asr;
vm_ops[ASRI] = vm_asri;
vm_ops[BRK] = vm_brk;
vm_ops[HLT] = vm_hlt;
}
#ifdef _WIN32
// Minimal hand-rolled Win32 high-resolution timer (avoids <windows.h>).
#pragma comment(lib, "kernel32.lib")
typedef int BOOL;
typedef int64_t LONGLONG;
typedef struct { LONGLONG QuadPart; } LARGE_INTEGER;
#define WINAPI __stdcall
WINAPI BOOL QueryPerformanceFrequency(LARGE_INTEGER *lpFrequency);
WINAPI BOOL QueryPerformanceCounter(LARGE_INTEGER *lpPerformanceCount);
// Baseline QPC reading taken by start_timer; 0 means "not armed".
LARGE_INTEGER timer_base;
void start_timer(void) {
QueryPerformanceCounter(&timer_base);
}
// Returns seconds elapsed since the matching start_timer call and disarms
// the timer, so a stop without a start (or a second stop) yields 0.0.
double stop_timer(void) {
if (timer_base.QuadPart == 0) return 0.0;
LARGE_INTEGER now;
QueryPerformanceCounter(&now);
LARGE_INTEGER freq;
QueryPerformanceFrequency(&freq);
double time = (double)(now.QuadPart - timer_base.QuadPart) / freq.QuadPart;
timer_base.QuadPart = 0;
return time;
}
#else
#error Implement timers for your platform.
#endif
NOINLINE
// Calibrated busy loop: executes (roughly) `cycles` dependent decrements,
// unrolled 16x per iteration. The RW barriers force i through a register
// after every decrement so the compiler can neither vectorize nor collapse
// the chain into a single subtraction. Returns the final value of i, which
// is <= 0 (the overshoot, in (-16, 0]) so callers can correct for it.
int64_t delay_loop(int64_t cycles) {
int64_t i = cycles;
do {
RW(i); --i; RW(i); --i; RW(i); --i; RW(i); --i;
RW(i); --i; RW(i); --i; RW(i); --i; RW(i); --i;
RW(i); --i; RW(i); --i; RW(i); --i; RW(i); --i;
RW(i); --i; RW(i); --i; RW(i); --i; RW(i); --i;
} while (i > 0);
R(i);
return i;
}
// Estimated CPU frequency in Hz, filled in by calc_cpufreq.
double cpufreq;
// Estimate the CPU frequency by timing ~2^33 dependent decrements.
// NOTE(review): assumes the dependent decrement chain in delay_loop retires
// at one per cycle -- plausible on modern x86, but unverified here.
// `adjust` (<= 0) corrects for the loop's overshoot past zero.
void calc_cpufreq(void) {
int64_t cycles = 1ull << 33;
start_timer();
int64_t adjust = delay_loop(cycles);
double time = stop_timer();
cpufreq = (cycles - adjust) / time;
}
// Deterministic pseudo-random stream (Marsaglia xorshift64, shift triple
// 13/7/17) used to generate instruction words. Not seedable; every run
// produces the same sequence, which keeps benchmark runs comparable.
uint64_t random_u64(void) {
    static uint64_t state = 0x2545F4914F6CDD1D;
    uint64_t s = state;
    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;
    state = s;
    return s;
}
// Benchmark driver: estimate the CPU frequency, build a one-billion-word
// random program of ALU instructions terminated by HLT, and time two
// interpretation runs, reporting seconds, MIPS, and cycles/instruction.
int main(void) {
    printf("Estimating CPU frequency...\n");
    calc_cpufreq();
    printf("CPU frequency: %.2f GHz\n", 1e-9 * cpufreq);
    u64 n = 1000000000;
    u32 *ip = malloc(n * sizeof(u32));
    // ~4 GB allocation: bail out cleanly instead of dereferencing NULL.
    if (!ip) {
        fprintf(stderr, "Failed to allocate %llu bytes for the program.\n",
                (unsigned long long)(n * sizeof(u32)));
        return 1;
    }
    // Random instruction words; clearing bits 4-7 of the opcode byte
    // restricts opcodes to the implemented ALU range 0x00-0x0F.
    for (u64 i = 0; i < n; i++)
        ip[i] = random_u64() & ~0xF0;
    ip[n-2] = HLT; // normal termination point
    ip[n-1] = ERR; // trap if execution ever runs past the HLT
    for (int i = 0; i < 2; i++) {
        vm_init();
        start_timer();
        const u32 *last_ip = vm_exec(ip);
        double time = stop_timer();
        // The VM must halt exactly at the HLT slot; anything else is a bug.
        if (last_ip != &ip[n-2]) BREAK;
        double cycles = time * cpufreq;
        // Cast for %llu: u64 may be unsigned long on LP64 targets, and a
        // mismatched printf length modifier is undefined behavior.
        printf("Run %d: %.2f secs/%llu ins, %.0f MIPS, %.2f cycles/ins\n",
               i, time, (unsigned long long)n, 1e-6 * n / time, cycles / n);
    }
    free(ip);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment