// Gist by @pervognsen, last active July 12, 2023.
// Example: Opcode dispatch in a bytecode VM. Assume the opcode case dispatching is mispredict heavy,
// and that pc, ins, next_ins, next_opcase are always in registers.
#define a ((ins >> 8) & 0xFF)
#define b ((ins >> 16) & 0xFF)
#define c ((ins >> 24) & 0xFF)
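// Added sketch, not part of the original gist: assumed scaffolding so the fragments below have
// concrete types to read against. Only the names that already appear in the code (load_ins, opcases,
// regs, pc, ins, select, likely/unlikely) come from the original; the sizes, the register type, and
// the OP_* constants are guesses.
#include <stdint.h>
#include <string.h>
#include <stdbool.h>
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
#define CODE_SIZE (1 << 20)
static uint8_t code[CODE_SIZE]; // bytecode; instructions are 4 bytes and pc is a byte offset
static uint64_t regs[256]; // register file; a/b/c are 8-bit register indices
// Load the 32-bit instruction at byte offset pc.
static inline uint32_t load_ins(uint32_t pc) {
uint32_t ins;
memcpy(&ins, &code[pc], sizeof(ins));
return ins;
}
// Branchless select used by Version 3 below; assumed (hoped) to compile to a conditional move.
static inline uint32_t select(bool cond, uint32_t if_true, uint32_t if_false) {
return cond ? if_true : if_false;
}
// The opcode cases are labels inside the interpreter function, so the opcases table and the dispatch
// state would be set up there with GNU C's address-of-label extension, along the lines of:
// uint32_t pc = 0, ins, next_ins, next_pc;
// void *opcase, *next_opcase;
// static void *opcases[256] = { [OP_ADD] = &&case_ADD, [OP_JEQ] = &&case_JEQ, /* ... */ };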
// Version 1: Synchronous instruction fetch and opcode dispatch. The big bottleneck is that given how light
// the essential work is for each opcode case (e.g. something like ADD is typical), you're dominated
// by the cost of the opcode dispatch branch mispredicts. When there's a mispredict, the pipeline restarts
// at the right opcode case. To prepare for the next opcode after that, the code in NEXT has to fetch the
// instruction at pc + 4 and then look up the opcode case pointer for the opcode in the fetched instruction.
// Assuming all the loads hit L1, on Zen 3 the first load would be 4 cycles, then the dependent load (with a
// complex addressing mode) from the opcode table is 5 cycles, plus the 1 cycle latency from the & 0xFF, so
// it's going to be at least 9 cycles before the control dependency for the next opcode dispatch is ready.
// So if that next opcode dispatch also mispredicts, you won't find out until 9 cycles later than you
// would have if the indirect jump target had been ready immediately after the restart. So, you can think of the
// latency of the control dependency chain as being added to the mispredict penalty. Depending on your CPU,
// the usual number quoted is 15-20 cycles, and 9 cycles is 50% or more of that. On Skylake where the mispredict
// penalty is 15-16 and L1 loads are slower, it's closer to 100%.
#define NEXT \
do { \
pc += 4; \
ins = load_ins(pc); \
goto *opcases[ins & 0xFF]; \
} while (0)
case_ADD:
regs[a] = regs[b] + regs[c];
NEXT;
case_JEQ:
pc = regs[b] == regs[c] ? pc + (int8_t)a : pc + 4;
ins = load_ins(pc);
goto *opcases[ins & 0xFF];
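// Added sketch: how the Version 1 loop would be entered, assuming some entry_pc. A single plain
// fetch/dispatch starts it off; every opcode case then continues via NEXT (or its own fetch, as
// case_JEQ does above).
pc = entry_pc;
ins = load_ins(pc);
goto *opcases[ins & 0xFF];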
// Version 2: Speculative instruction/opcase prefetch. As soon as we enter an opcode case, we know the indirect jump target
// for the sequentially next PC (pc + 4), which is almost always the actual next PC (except for control flow opcodes whose branch is taken).
// This means the effective penalty for a back-to-back opcode dispatch mispredict goes down by the 9+ cycles quoted in the
// last comment block. One way to think about what's going on here is that when a dispatch mispredict happens, you get a 15-20
// cycle window in which work you already had in flight completes "for free", so it's ready with 0 latency after the post-mispredict restart.
// Obviously "for free" has to be heavily qualified. In stop-and-go mispredict-heavy code like this example, your pipeline
// resources are underutilized (in an absolute and throughput vs goodput sense) and "stealing" from a predicted-but-maybe-wasted
// code path can be a good trade-off. E.g. if you have a hard-to-predict two-way branch where branch prediction is effectively
// useless, you can get started on work for _both_ branches in parallel so that whatever branch ends up being the right one, you
// will have a headstart on the initial wave, especially the latency-sensitive parts.
#define NEXT \
do { \
pc += 4; \
ins = next_ins; \
opcase = next_opcase; \
next_ins = load_ins(pc + 4); \
next_opcase = opcases[next_ins & 0xFF]; \
goto *opcase; \
} while (0)
case_ADD:
regs[a] = regs[b] + regs[c];
NEXT;
// Variant 1: Branchless, doesn't benefit from prefetched ins/opcase.
case_JEQ:
pc = regs[b] == regs[c] ? pc + (int8_t)a : pc + 4;
ins = load_ins(pc);
opcase = opcases[ins & 0xFF];
next_ins = load_ins(pc + 4);
next_opcase = opcases[next_ins & 0xFF];
goto *opcase;
// Variant 2: Branch-based, predict JEQ not taken, benefits from prefetched ins/opcase when not taken.
case_JEQ: {
// Computing the taken-path state up front saves 9+ cycles (assuming the next 'goto *opcase' mispredicts) when the 'unlikely' path is taken.
bool taken = regs[b] == regs[c];
uint32_t taken_pc = pc + (int8_t)a;
uint32_t taken_ins = load_ins(taken_pc);
void *taken_opcase = opcases[taken_ins & 0xFF];
if (unlikely(taken)) {
pc = taken_pc;
ins = taken_ins;
opcase = taken_opcase;
} else {
pc += 4;
ins = next_ins;
opcase = next_opcase;
}
next_ins = load_ins(pc + 4);
next_opcase = opcases[next_ins & 0xFF];
goto *opcase;
}
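// Added sketch: entering the Version 2 loop (again assuming some entry_pc). The prefetch state has
// to be primed so the first NEXT finds next_ins/next_opcase already valid.
pc = entry_pc;
ins = load_ins(pc);
next_ins = load_ins(pc + 4);
next_opcase = opcases[next_ins & 0xFF];
goto *opcases[ins & 0xFF];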
// Version 3: Let's go nuts and add a fully associative, unbounded branch target buffer. This means taken branches
// will be on the hot path as long as their branch targets are consistent. This is mostly just a fun example.
// In order to avoid adding an extra load's worth of latency to the prefetch path, we store the predicted
// opcase directly in the BTB alongside the predicted PC. Note that next_opcase's latency is now that of 1 load
// instead of 2 loads although the BTB load will be colder than the shared opcase table load if the PC is cold.
// You could also cache the instruction in the BTB to remove the dependent instruction load, but remember we
// are focused mainly on reducing opcase latency. I'm not sure how practical this all is, but at least it's
// interesting to contemplate. Caching the predicted opcase in the BTB feels like it belongs somewhere in the
// family tree of threaded code techniques. A serious implementation of this technique would want to use
// something less naive than this hysteresis-free "last taken" predictor, but it's the simplest thing you can
// do beyond statically predicting pc + 4, so it's the first thing students are asked to implement when learning
// how to design and implement a pipelined CPU in a computer architecture course.
#define NEXT \
do { \
if (likely(next_pc == pc + 4)) { \
pc = next_pc; \
ins = next_ins; \
opcase = next_opcase; \
} else { \
uint32_t prev_pc = pc; \
pc += 4; \
ins = load_ins(pc); \
opcase = opcases[ins & 0xFF]; \
btb_pc[prev_pc] = pc; \
btb_opcase[prev_pc] = opcase; \
} \
next_pc = btb_pc[pc]; \
next_ins = load_ins(next_pc); \
next_opcase = btb_opcase[pc]; \
goto *opcase; \
} while (0)
case_ADD:
// Same as before
case_JEQ: {
bool taken = regs[b] == regs[c];
uint32_t taken_pc = pc + (int8_t)a;
uint32_t actual_next_pc = select(taken, taken_pc, pc + 4);
// The condition compares against the branchlessly selected actual_next_pc so the compiler doesn't introduce extra short-circuiting branches.
if (likely(next_pc == actual_next_pc)) {
pc = next_pc;
ins = next_ins;
opcase = next_opcase;
} else {
uint32_t prev_pc = pc;
// A branchless fallback path (variant 1) seems reasonable but branch-based (variant 2) would work too.
pc = actual_next_pc;
ins = load_ins(pc);
opcase = opcases[ins & 0xFF];
btb_pc[prev_pc] = pc;
btb_opcase[prev_pc] = opcase;
}
next_pc = btb_pc[pc];
next_ins = load_ins(next_pc);
next_opcase = btb_opcase[pc];
goto *opcase;
}
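// Added sketch: the BTB referenced above, spelled out under the "fully associative, unbounded"
// assumption, indexed directly by pc (a real implementation would want a small tagged,
// set-associative table). Pre-seeding every slot with the static fall-through prediction keeps
// load_ins(next_pc) and goto *next_opcase well-defined the first time each pc is visited.
// entry_pc is an assumed name, and in practice you'd only loop over the loaded bytecode.
static uint32_t btb_pc[CODE_SIZE];
static void *btb_opcase[CODE_SIZE];
// Inside the interpreter function, after opcases is set up:
for (uint32_t p = 0; p + 8 <= CODE_SIZE; p += 4) {
btb_pc[p] = p + 4;
btb_opcase[p] = opcases[load_ins(p + 4) & 0xFF];
}
// Priming before the first dispatch then mirrors Version 2:
pc = entry_pc;
ins = load_ins(pc);
opcase = opcases[ins & 0xFF];
next_pc = btb_pc[pc];
next_ins = load_ins(next_pc);
next_opcase = btb_opcase[pc];
goto *opcase;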