This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Estimating CPU frequency...
// CPU frequency: 4.52 GHz
// sum1: value = 15182118497126522709, 0.31 secs, 5.14 cycles/elem
// sum2: value = 15182118497126522709, 0.17 secs, 2.93 cycles/elem
// RW(x): an empty inline-asm statement that names x as a read-write register
// operand ("+r"). The asm template is empty, so no instructions are emitted;
// the constraint alone tells the compiler that x is consumed and redefined at
// this point, so it cannot assume anything about x's value across it.
// Spelled __asm__ so the macro is still accepted in strict ISO modes
// (-std=c11, -ansi), where the plain `asm` keyword is disabled.
#define RW(x) __asm__("" : "+r"(x))
typedef struct Node { | |
u64 value; | |
struct Node *next; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Example: Opcode dispatch in a bytecode VM. Assume the opcode case dispatching is mispredict-heavy,
// and that pc, ins, next_ins, next_opcase are always in registers.
// Operand-field extractors for the instruction word `ins`, which must be in
// scope wherever these macros are used. Each one yields a single 8-bit field:
//   a = bits 8..15, b = bits 16..23, c = bits 24..31.
// NOTE(review): bits 0..7 are presumably the opcode; confirm against the
// dispatch code.
#define a ((ins >> 8) & 0xFF)
#define b ((ins >> 16) & 0xFF)
#define c ((ins >> 24) & 0xFF)
// Version 1: Synchronous instruction fetch and opcode dispatch. The big bottleneck is that, given how light
// the essential work is for each opcode case (e.g. something like ADD is typical), you're dominated
// by the cost of the opcode dispatch branch mispredicts. When there's a mispredict, the pipeline restarts