Skip to content

Instantly share code, notes, and snippets.

@rygorous
Created May 4, 2017 00:07
Embed
What would you like to do?
Quite approximate, but good enough for the code it was targeting.
// Approximate scheduling model for the Cortex-A53 (in-order, dual-issue).
// Quite approximate, but good enough for the code it was targeting.
struct CortexA53SchedModel : public SchedModel
{
    // Forwarding network producer and consumer type classes. The combination of these
    // determines the latency (see the table in determine_latency).
    enum FwdProd
    {
        P_SHF, // variable shifts, imm movs, ADR/ADRP, ALU fast-forward: basic ALU ops; CCMP/CCMN (can forward to another ALU op in same cycle, "0-latency")
        P_ALU, // ALU+shift; all bitfield ops; EXTR; RBIT/REV*; CLS/CLZ; CSEL/CSET etc.
        P_LDR, // loads
        P____, // dataflow sink, nothing produced
    };
    enum FwdCons
    {
        C_ALU, // basic ALU ops; CCMP/CCMN; CSEL/CSET etc; CLS/CLZ; first (unshifted) src in ALU+shift
        C_SHF, // second (shifted) src in ALU+shift; SBFM/UBFM/BFM/EXTR/RBIT/REV*/var shifts all sources.
        C_AGU, // address generation unit (loads/stores)
        C_STR, // store data
        C_BRA, // branch
        C____, // nothing consumed
    };

    // Instruction classification into equivalence classes wrt scheduling properties.
    //
    // LLVM's scheduling model for the A53 seems incorrect. GCC's looks better, but
    // still appears to disagree with my observations in some cases?
    //
    // Anyway, this table lists distinct scheduling classes, the producer type for
    // the instruction, and the consumer type for the source operands 0-2.
#define A53_SCHED_CLASSES \
    /*cls                  prod,  con0,  con1,  con2 */ \
    T(SC_BRANCH,           P____, C_BRA, C____, C____) /* branch (not correct for indirect branches, but screw them) */ \
    T(SC_IMMED,            P_SHF, C_SHF, C____, C____) /* immediate moves, ADR/ADRP */ \
    T(SC_ALU,              P_ALU, C_ALU, C_ALU, C____) /* general ALU op */ \
    T(SC_ALU_FAST,         P_SHF, C_ALU, C_ALU, C____) /* fast ALU op (can forward within same cycle) */ \
    T(SC_ALU_SHIFT,        P_ALU, C_ALU, C_SHF, C____) /* ALU+shift op */ \
    T(SC_ALU_COMPLEX,      P_ALU, C_SHF, C_SHF, C____) /* complex ALU ops read inputs in shift stage */ \
    T(SC_VARSHIFT,         P_SHF, C_SHF, C_SHF, C____) /* variable shifts: at most 1 per cycle */ \
    T(SC_LOAD,             P_LDR, C_AGU, C____, C____) /* basic load */ \
    T(SC_LOAD_IDX,         P_LDR, C_AGU, C____, C____) /* load indexed */ \
    T(SC_LOAD_PAIR_W,      P_LDR, C_AGU, C____, C____) /* load pair (W-form) */ \
    T(SC_LOAD_PAIR_IDX_W,  P_LDR, C_AGU, C____, C____) /* load pair indexed (W-form) */ \
    T(SC_LOAD_PAIR_X,      P_LDR, C_AGU, C____, C____) /* load pair (X-form) */ \
    T(SC_LOAD_PAIR_IDX_X,  P_LDR, C_AGU, C____, C____) /* load pair indexed (X-form) */ \
    T(SC_STORE,            P_LDR, C_STR, C_AGU, C____) /* basic store */ \
    T(SC_STORE_IDX,        P_LDR, C_STR, C_AGU, C____) /* store indexed */ \
    T(SC_STORE_PAIR,       P_LDR, C_STR, C_STR, C_AGU) /* store pair */ \
    T(SC_STORE_PAIR_IDX,   P_LDR, C_STR, C_STR, C_AGU) /* store pair indexed */ \
    /* end */

    // One enumerator per row of the table above.
    enum SchedClass
    {
#define T(cls,prod,con0,con1,con2) cls,
        A53_SCHED_CLASSES
#undef T
    };

    // Per-class forwarding info: producer type of the result, consumer type
    // per source operand slot.
    struct SchedInfo
    {
        uint8_t prod;    // FwdProd of the instruction's result
        uint8_t cons[3]; // FwdCons for source operands 0-2
    };

    // Look up the static forwarding info table entry for a scheduling class.
    static const SchedInfo &get_sched_info(SchedClass cls)
    {
        static const SchedInfo nfo[] = {
#define T(cls,prod,con0,con1,con2) { prod, { con0,con1,con2 } },
            A53_SCHED_CLASSES
#undef T
        };
        return nfo[cls];
    }
#undef A53_SCHED_CLASSES

    // Map an instruction to its scheduling equivalence class.
    static SchedClass get_sched_class(const Inst &inst)
    {
        switch (inst.op)
        {
        // Most basic branches
        case OPC_B: case OPC_Bcc: case OPC_RET: case OPC_CBZ: case OPC_CBNZ: case OPC_TBZ: case OPC_TBNZ:
            return SC_BRANCH;
        // ALU ops
        case OPC_ADD: case OPC_ADDS:
        case OPC_ADC: case OPC_ADCS:
        case OPC_SUB: case OPC_SUBS:
        case OPC_SBC: case OPC_SBCS:
        case OPC_AND: case OPC_ANDS:
        case OPC_BIC: case OPC_BICS:
        case OPC_EON: case OPC_EOR:
        case OPC_ORN: case OPC_ORR:
        case OPC_CCMP: case OPC_CCMN:
            // A shifted/extended second operand routes through the shift stage.
            return inst.src[1].is_shift_or_extend() ? SC_ALU_SHIFT : SC_ALU_FAST;
        case OPC_CLS: case OPC_CLZ:
        case OPC_CSEL: case OPC_CSINC: case OPC_CSINV: case OPC_CSNEG:
            return SC_ALU;
        case OPC_MOV:
            return (inst.src[0].cls == Op::IMM) ? SC_IMMED : SC_ALU_FAST;
        case OPC_REV: case OPC_REV16: case OPC_REV32: case OPC_RBIT:
        case OPC_BFM: case OPC_SBFM: case OPC_UBFM: case OPC_EXTR:
            return SC_ALU_COMPLEX;
        case OPC_ASRV: case OPC_LSLV: case OPC_LSRV: case OPC_RORV:
            return SC_VARSHIFT;
        case OPC_LDR: case OPC_LDUR: case OPC_LDRSW: case OPC_LDURSW:
        case OPC_LDRB: case OPC_LDURB: case OPC_LDRSB: case OPC_LDURSB:
        case OPC_LDRH: case OPC_LDURH: case OPC_LDRSH: case OPC_LDURSH:
        case OPC_PRFM:
            assert(inst.src[0].cls == Op::ADDR);
            return inst.src[0].addr.has_writeback() ? SC_LOAD_IDX : SC_LOAD;
        case OPC_LDP: case OPC_LDPSW:
            assert(inst.src[0].cls == Op::ADDR);
            // BUGFIX: writeback addressing selects the _IDX class, consistent
            // with the single-register load case above (the IDX forms consume
            // an extra writeback port for the base-register update in
            // get_resource_mask). The two ternary arms were previously swapped.
            if (inst.dst[0].cls == Op::R32)
                return inst.src[0].addr.has_writeback() ? SC_LOAD_PAIR_IDX_W : SC_LOAD_PAIR_W;
            else
                return inst.src[0].addr.has_writeback() ? SC_LOAD_PAIR_IDX_X : SC_LOAD_PAIR_X;
        case OPC_STR: case OPC_STUR:
        case OPC_STRB: case OPC_STURB:
        case OPC_STRH: case OPC_STURH:
            assert(inst.src[1].cls == Op::ADDR);
            return inst.src[1].addr.has_writeback() ? SC_STORE_IDX : SC_STORE;
        case OPC_STP:
            assert(inst.src[2].cls == Op::ADDR);
            return inst.src[2].addr.has_writeback() ? SC_STORE_PAIR_IDX : SC_STORE_PAIR;
        default:
            printf("?%s\n", inst.info().asmname);
            assert(!"NYI");
            return SC_ALU;
        }
    }

    // Per-cycle structural hazard tracker. Resource availability is packed
    // into one uint64_t, one nibble per resource category, each with a +8
    // bias so that underflow after a subtraction is detectable by checking
    // the 8-bit of every nibble.
    struct CTracker : public ConflictTracker
    {
        // Resource masks
        // NOTE: if changing this, update analyze_trace below!
        static const uint64_t kResSlot = 0x000001ull; // total slots filled/cycle
        static const uint64_t kResWBP  = 0x000010ull; // writeback port
        static const uint64_t kResI01  = 0x000100ull; // integer pipes 0/1
        static const uint64_t kResLS   = 0x001000ull; // load/store pipe
        static const uint64_t kResB    = 0x010000ull; // branch pipe
        static const uint64_t kResVarS = 0x100000ull; // variable shift. we can have one in either pipe, but not in both at once.
        // not yet modeled: multiplier, divider, float/SIMD
        static const uint64_t kResAll  = 2*kResSlot + 3*kResWBP + 2*kResI01 + kResLS + kResB + kResVarS;
        static const uint64_t kResBias = 0x888888ull; // +8 bias in every category!
        static const uint64_t kResInit = kResAll + kResBias;

        // On init, cycle has all resources available.
        uint64_t res_avail = kResInit;

        // TODO make multi-cycle resource consumption work
        // Try to issue inst in the current cycle; returns false on a
        // structural hazard (no state is consumed in that case).
        virtual bool try_schedule(const Inst &inst)
        {
            SchedClass cls = get_sched_class(inst);
            // Owen says: LDP X-form needs to go in the first issue slot.
            // First in issue slot => all resources available for this cycle.
            //
            // NOTE multi-cycle instrs can dual-issue during the last cycle.
            if ((cls == SC_LOAD_PAIR_X || cls == SC_LOAD_PAIR_IDX_X) && res_avail != kResInit)
                return false;
            // We subtract all occupied resources from the available mask. If that drops us below the bias value
            // in any of the categories, that counter went below zero, and we can't dispatch in this cycle.
            uint64_t updated = res_avail - get_resource_mask(inst, cls);
            if ((updated & kResBias) != kResBias)
                return false;
            res_avail = updated;
            return true;
        }

        // Start a fresh cycle with all resources available.
        void reset()
        {
            res_avail = kResInit;
        }

        // Resources consumed by one instruction of the given class.
        static uint64_t get_resource_mask(const Inst &inst, SchedClass cls)
        {
            uint64_t wback = 0;
            // Destination takes up a writeback port if it's not ZR
            if ((inst.dst[0].cls == Op::R32 || inst.dst[0].cls == Op::R64) && inst.dst[0].reg.num != Reg::ZR)
                wback = kResWBP;
            switch (cls)
            {
            case SC_BRANCH:          return kResSlot + kResB;
            case SC_IMMED:           return kResSlot + kResI01 + wback;
            case SC_ALU:             return kResSlot + kResI01 + wback;
            case SC_ALU_FAST:        return kResSlot + kResI01 + wback;
            case SC_ALU_SHIFT:       return kResSlot + kResI01 + wback;
            case SC_ALU_COMPLEX:     return kResSlot + kResI01 + wback;
            case SC_VARSHIFT:        return kResSlot + kResI01 + wback + kResVarS;
            case SC_LOAD:            return kResSlot + kResLS + kResWBP;
            case SC_LOAD_IDX:        return kResSlot + kResLS + 2*kResWBP;
            case SC_LOAD_PAIR_W:     return kResSlot + kResLS + 2*kResWBP;
            case SC_LOAD_PAIR_IDX_W: return kResSlot + kResLS + 3*kResWBP;
            case SC_LOAD_PAIR_X:     return kResSlot + kResLS + kResWBP;   // NOTE needs an extra cycle (64b load pipe)
            case SC_LOAD_PAIR_IDX_X: return kResSlot + kResLS + 2*kResWBP; // NOTE needs an extra cycle (64b load pipe)
            case SC_STORE:           return kResSlot + kResLS;
            case SC_STORE_IDX:       return kResSlot + kResLS + kResWBP;
            case SC_STORE_PAIR:      return kResSlot + kResLS;
            case SC_STORE_PAIR_IDX:  return kResSlot + kResLS + kResWBP;
            default: assert(!"NYI"); return kResSlot;
            }
        }
    };

    // Latency in cycles from producer's result to source operand cons_src of
    // consumer, determined by the (FwdProd, FwdCons) pair of the edge.
    virtual int32_t determine_latency(const Inst &producer, Producer::Type type, const Inst &consumer, int cons_src) const
    {
        // The latency table, indexed [FwdProd][FwdCons].
        static const uint8_t lat[P____][C____] =
        {
            //          ALU SHF AGU STR BRA
            /* SHF->*/ {  0,  1,  2,  0,  0 },
            /* ALU->*/ {  1,  2,  3,  0,  0 },
            /* LDR->*/ {  2,  3,  3,  1,  0 }, // NOTE LDR->BRA not actually measured (the rest is tested, though)
        };
        SchedClass prod_class = get_sched_class(producer);
        SchedClass cons_class = get_sched_class(consumer);
        const SchedInfo &prod_info = get_sched_info(prod_class);
        const SchedInfo &cons_info = get_sched_info(cons_class);
        // Figure out what type of forwarding consumer/producer pair we have
        uint8_t prod_code = prod_info.prod;
        uint8_t cons_code = cons_info.cons[(cons_src >= 0) ? cons_src : 0];
        assert(prod_code < P____ && cons_code < C____); // must be valid dataflow edges!
        if (prod_code >= P____ || cons_code >= C____)
        {
            //printf("???prod=%s cons=%s src=%d\n", producer.info().asmname, consumer.info().asmname, cons_src);
            return 0;
        }
        return lat[prod_code][cons_code];
    }

    // Hand out a fresh per-cycle conflict tracker.
    virtual std::unique_ptr<ConflictTracker> create_conflict_tracker() const
    {
        return std::make_unique<CTracker>();
    }
    // no analyze_trace for now, this is in-order, print the schedule instead
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment