Skip to content

Instantly share code, notes, and snippets.

@pervognsen
Last active Jan 29, 2022
Embed
What would you like to do?
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#if defined(__x86_64__)
// Debug trap: raises a breakpoint (int3) at the current instruction.
#define BREAK asm("int3")
#else
#error Implement macros for your CPU.
#endif
// Optimization barriers: force x to be materialized in a register.
// R(x) tells the compiler x is read; RW(x) that it is read and rewritten,
// so the value cannot be constant-folded or hoisted away (see delay_loop).
#define R(x) asm("" : : "r"(x))
#define RW(x) asm("" : "+r"(x))
#define INLINE __attribute__((always_inline))
#define NOINLINE __attribute__((noinline))
// Guaranteed tail call (clang musttail) -- dispatch never grows the stack.
#define TAIL __attribute__((musttail))
// Pin the System V calling convention so interpreter state stays in the
// same registers across handlers regardless of the platform default ABI.
#define TAILABI __attribute__((sysv_abi))
typedef int8_t i8;
typedef uint8_t u8;
typedef int16_t i16;
typedef uint16_t u16;
typedef int32_t i32;
typedef uint32_t u32;
typedef int64_t i64;
typedef uint64_t u64;
// Select exactly one dispatch implementation below.
//#define SYNCHRONOUS 1
#define PREFETCHED 1
#if SYNCHRONOUS // 5900HX (Zen 3): 6.23 secs/1000000000 ins, 160 MIPS, 28.12 cycles/ins
// Baseline threaded dispatch. Every handler is a TAILABI function taking
// (ip, ins); GOTO is a musttail call, so control threads from handler to
// handler with the interpreter state kept in fixed registers.
#define BLOCK(f, ...) TAILABI const u32 *(f)(const u32 *ip, u32 ins, __VA_ARGS__)
#define GOTO(f, ...) TAIL return (f)(ip, ins, __VA_ARGS__)
// Opcode -> handler table, indexed by the low byte of the instruction word.
BLOCK(*vm_ops[256]);
INLINE
BLOCK(vm_dispatch) {
// Fetch the 32-bit instruction at ip and jump to its opcode's handler.
ins = *ip;
u8 op = ins;
BLOCK(*blk) = vm_ops[op];
GOTO(blk);
}
// Run the program at ip until a handler returns (HLT returns the halting ip,
// ERR returns NULL). The initial ins value is a dummy; dispatch reloads it.
const u32 *vm_exec(const u32 *ip) {
return vm_dispatch(ip, 0);
}
#elif PREFETCHED // 5900HX (Zen 3): 4.26 secs/1000000000 ins, 235 MIPS, 19.24 cycles/ins
// Software-pipelined dispatch: alongside (ip, ins), every handler also
// carries the already-loaded next instruction word and its handler pointer.
// This overlaps the table load for instruction k+1 with the execution of
// instruction k, hiding dispatch latency (compare the MIPS numbers above).
#define BLOCK(f, ...) TAILABI const u32 *(f)(const u32 *ip, u32 ins, u32 next_ins, void *next_blk, __VA_ARGS__)
#define GOTO(f, ...) TAIL return (f)(ip, ins, next_ins, next_blk, __VA_ARGS__)
BLOCK(*vm_ops[256]);
INLINE
BLOCK(vm_dispatch) {
// Consume the prefetched (ins, blk) pair for the instruction at ip...
ins = next_ins;
BLOCK(*blk) = next_blk;
// ...and prefetch the word and handler for the following instruction.
next_ins = ip[1];
u8 next_op = next_ins;
next_blk = vm_ops[next_op];
GOTO(blk);
}
// Entry point: prime the prefetch pipeline with the first instruction,
// then dispatch. The initial ins value is a dummy; dispatch overwrites it.
const u32 *vm_exec(const u32 *ip) {
u32 next_ins = *ip;
u8 next_op = next_ins;
void *next_blk = vm_ops[next_op];
return vm_dispatch(ip, 0, next_ins, next_blk);
}
#else
#error Select an implementation.
#endif
// Opcode byte values (low byte of the instruction word). Even codes are the
// register-register ALU forms; each odd *I code is the matching form that
// uses the c operand byte as a sign-extended 8-bit immediate (see `ic`).
enum {
ADD = 0x00,
ADDI = 0x01,
SUB = 0x02,
SUBI = 0x03,
AND = 0x04,
ANDI = 0x05,
OR = 0x06,
ORI = 0x07,
XOR = 0x08,
XORI = 0x09,
LSL = 0x0A,
LSLI = 0x0B,
LSR = 0x0C,
LSRI = 0x0D,
ASR = 0x0E,
ASRI = 0x0F,
// Control/trap opcodes sit at the top of the opcode space.
BRK = 0xFD,
HLT = 0xFE,
ERR = 0xFF,
};
// 256 general-purpose 64-bit registers, addressed by the a/b/c operand bytes.
u64 vm_regs[256];
// Operand accessors for the current 32-bit instruction word `ins`:
// byte 0 = opcode, byte 1 = a (destination), byte 2 = b, byte 3 = c.
#define a ((u8)(ins >> 8))
#define b ((u8)(ins >> 16))
#define c ((u8)(ins >> 24))
#define ra (vm_regs[a])
#define rb (vm_regs[b])
#define rc (vm_regs[c])
// Immediate operand: the c byte sign-extended to 64 bits.
#define ic ((u64)(i8)c)
// Advance past the current instruction word and dispatch the next one.
INLINE
BLOCK(vm_next) {
ip++;
GOTO(vm_dispatch);
}
// ALU handlers: ra = rb OP (rc | ic), then fall through to vm_next.
BLOCK(vm_add) {
ra = rb + rc;
GOTO(vm_next);
}
BLOCK(vm_addi) {
ra = rb + ic;
GOTO(vm_next);
}
BLOCK(vm_sub) {
ra = rb - rc;
GOTO(vm_next);
}
BLOCK(vm_subi) {
ra = rb - ic;
GOTO(vm_next);
}
BLOCK(vm_and) {
ra = rb & rc;
GOTO(vm_next);
}
BLOCK(vm_andi) {
ra = rb & ic;
GOTO(vm_next);
}
BLOCK(vm_or) {
ra = rb | rc;
GOTO(vm_next);
}
BLOCK(vm_ori) {
ra = rb | ic;
GOTO(vm_next);
}
BLOCK(vm_xor) {
ra = rb ^ rc;
GOTO(vm_next);
}
BLOCK(vm_xori) {
ra = rb ^ ic;
GOTO(vm_next);
}
// Shifts mask the count to 0..63: well-defined in C (shifting a 64-bit value
// by >= 64 is UB) and matching x86-64 shift semantics.
BLOCK(vm_lsl) {
ra = rb << (rc & 63);
GOTO(vm_next);
}
BLOCK(vm_lsli) {
ra = rb << (ic & 63);
GOTO(vm_next);
}
BLOCK(vm_lsr) {
ra = rb >> (rc & 63);
GOTO(vm_next);
}
BLOCK(vm_lsri) {
ra = rb >> (ic & 63);
GOTO(vm_next);
}
// Arithmetic right shift: cast to signed so the sign bit is replicated.
BLOCK(vm_asr) {
ra = (i64)rb >> (rc & 63);
GOTO(vm_next);
}
BLOCK(vm_asri) {
ra = (i64)rb >> (ic & 63);
GOTO(vm_next);
}
// BRK: raise a debug trap (int3), then continue with the next instruction.
BLOCK(vm_brk) {
BREAK;
GOTO(vm_next);
}
// HLT: stop interpreting; the address of the HLT word unwinds to vm_exec.
BLOCK(vm_hlt) {
return ip;
}
// ERR: unimplemented/illegal opcode -- trap, then report failure via NULL.
BLOCK(vm_err) {
BREAK;
return NULL;
}
// The one-letter operand macros are scoped to the handlers above.
#undef a
#undef b
#undef c
#undef ra
#undef rb
#undef rc
#undef ic
void vm_init(void) {
memset(vm_regs, 0, sizeof(vm_regs));
for (int op = 0; op < 256; op++)
vm_ops[op] = vm_err;
vm_ops[ADD] = vm_add;
vm_ops[ADDI] = vm_addi;
vm_ops[SUB] = vm_sub;
vm_ops[SUBI] = vm_subi;
vm_ops[AND] = vm_and;
vm_ops[ANDI] = vm_andi;
vm_ops[OR] = vm_or;
vm_ops[ORI] = vm_ori;
vm_ops[XOR] = vm_xor;
vm_ops[XORI] = vm_xori;
vm_ops[LSL] = vm_lsl;
vm_ops[LSLI] = vm_lsli;
vm_ops[LSR] = vm_lsr;
vm_ops[LSRI] = vm_lsri;
vm_ops[ASR] = vm_asr;
vm_ops[ASRI] = vm_asri;
vm_ops[BRK] = vm_brk;
vm_ops[HLT] = vm_hlt;
}
#ifdef _WIN32
// Minimal hand-rolled Win32 high-resolution timer (avoids <windows.h>).
#pragma comment(lib, "kernel32.lib")
typedef int BOOL;
typedef int64_t LONGLONG;
typedef struct { LONGLONG QuadPart; } LARGE_INTEGER;
#define WINAPI __stdcall
WINAPI BOOL QueryPerformanceFrequency(LARGE_INTEGER *lpFrequency);
WINAPI BOOL QueryPerformanceCounter(LARGE_INTEGER *lpPerformanceCount);
// Baseline QPC reading taken by start_timer; 0 means "not armed".
LARGE_INTEGER timer_base;
void start_timer(void) {
QueryPerformanceCounter(&timer_base);
}
// Returns seconds elapsed since the matching start_timer call and disarms
// the timer, so a stop without a start (or a second stop) yields 0.0.
double stop_timer(void) {
if (timer_base.QuadPart == 0) return 0.0;
LARGE_INTEGER now;
QueryPerformanceCounter(&now);
LARGE_INTEGER freq;
QueryPerformanceFrequency(&freq);
double time = (double)(now.QuadPart - timer_base.QuadPart) / freq.QuadPart;
timer_base.QuadPart = 0;
return time;
}
#else
#error Implement timers for your platform.
#endif
NOINLINE
// Calibrated busy loop: executes (roughly) `cycles` dependent decrements,
// unrolled 16x per iteration. The RW barriers force i through a register
// after every decrement so the compiler can neither vectorize nor collapse
// the chain into a single subtraction. Returns the final value of i, which
// is <= 0 (the overshoot, in (-16, 0]) so callers can correct for it.
int64_t delay_loop(int64_t cycles) {
int64_t i = cycles;
do {
RW(i); --i; RW(i); --i; RW(i); --i; RW(i); --i;
RW(i); --i; RW(i); --i; RW(i); --i; RW(i); --i;
RW(i); --i; RW(i); --i; RW(i); --i; RW(i); --i;
RW(i); --i; RW(i); --i; RW(i); --i; RW(i); --i;
} while (i > 0);
R(i);
return i;
}
// Estimated CPU frequency in Hz, filled in by calc_cpufreq.
double cpufreq;
// Estimate the CPU frequency by timing ~2^33 dependent decrements.
// NOTE(review): assumes the dependent decrement chain in delay_loop retires
// at one per cycle -- plausible on modern x86, but unverified here.
// `adjust` (<= 0) corrects for the loop's overshoot past zero.
void calc_cpufreq(void) {
int64_t cycles = 1ull << 33;
start_timer();
int64_t adjust = delay_loop(cycles);
double time = stop_timer();
cpufreq = (cycles - adjust) / time;
}
// Deterministic pseudo-random stream (Marsaglia xorshift64, shift triple
// 13/7/17) used to generate instruction words. Not seedable; every run
// produces the same sequence, which keeps benchmark runs comparable.
uint64_t random_u64(void) {
    static uint64_t state = 0x2545F4914F6CDD1D;
    uint64_t s = state;
    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;
    state = s;
    return s;
}
// Benchmark driver: estimate the CPU frequency, build a one-billion-word
// random program of ALU instructions terminated by HLT, and time two
// interpretation runs, reporting seconds, MIPS, and cycles/instruction.
int main(void) {
    printf("Estimating CPU frequency...\n");
    calc_cpufreq();
    printf("CPU frequency: %.2f GHz\n", 1e-9 * cpufreq);
    u64 n = 1000000000;
    u32 *ip = malloc(n * sizeof(u32));
    // ~4 GB allocation: bail out cleanly instead of dereferencing NULL.
    if (!ip) {
        fprintf(stderr, "Failed to allocate %llu bytes for the program.\n",
                (unsigned long long)(n * sizeof(u32)));
        return 1;
    }
    // Random instruction words; clearing bits 4-7 of the opcode byte
    // restricts opcodes to the implemented ALU range 0x00-0x0F.
    for (u64 i = 0; i < n; i++)
        ip[i] = random_u64() & ~0xF0;
    ip[n-2] = HLT; // normal termination point
    ip[n-1] = ERR; // trap if execution ever runs past the HLT
    for (int i = 0; i < 2; i++) {
        vm_init();
        start_timer();
        const u32 *last_ip = vm_exec(ip);
        double time = stop_timer();
        // The VM must halt exactly at the HLT slot; anything else is a bug.
        if (last_ip != &ip[n-2]) BREAK;
        double cycles = time * cpufreq;
        // Cast for %llu: u64 may be unsigned long on LP64 targets, and a
        // mismatched printf length modifier is undefined behavior.
        printf("Run %d: %.2f secs/%llu ins, %.0f MIPS, %.2f cycles/ins\n",
               i, time, (unsigned long long)n, 1e-6 * n / time, cycles / n);
    }
    free(ip);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment