Quite approximate, but good enough for the code it was targeting.
struct CortexA53SchedModel : public SchedModel
{
    // Forwarding network producer and consumer type classes. The combination of these
    // determines the latency.
    enum FwdProd
    {
        P_SHF, // variable shifts, imm movs, ADR/ADRP, ALU fast-forward: basic ALU ops; CCMP/CCMN (can forward to another ALU op in same cycle, "0-latency")
        P_ALU, // ALU+shift; all bitfield ops; EXTR; RBIT/REV*; CLS/CLZ; CSEL/CSET etc.
        P_LDR, // loads
        P____, // dataflow sink, nothing produced
    };
    enum FwdCons
    {
        C_ALU, // basic ALU ops; CCMP/CCMN; CSEL/CSET etc; CLS/CLZ; first (unshifted) src in ALU+shift
        C_SHF, // second (shifted) src in ALU+shift; SBFM/UBFM/BFM/EXTR/RBIT/REV*/var shifts all sources.
        C_AGU, // address generation unit (loads/stores)
        C_STR, // store data
        C_BRA, // branch
        C____, // nothing consumed
    };
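    // Worked example (added for illustration, not from the original): an ADD classified
    // as a fast ALU op produces at P_SHF; if its result feeds the shifted source of a
    // later "ADD xd, xn, xm, lsl #k", that operand is consumed at C_SHF, so the latency
    // table in determine_latency() below gives lat[P_SHF][C_SHF] = 1 cycle. The same
    // result feeding a load/store address (C_AGU) would cost lat[P_SHF][C_AGU] = 2.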
    // Instruction classification into equivalence classes wrt scheduling properties.
    //
    // LLVM's scheduling model for the A53 seems incorrect. GCC's looks better, but
    // still appears to disagree with my observations in some cases.
    //
    // Anyway, this table lists the distinct scheduling classes, the producer type for
    // the instruction, and the consumer types for source operands 0-2.
    #define A53_SCHED_CLASSES \
        /*cls                  prod,  con0,  con1,  con2 */ \
        T(SC_BRANCH,           P____, C_BRA, C____, C____) /* branch (not correct for indirect branches, but screw them) */ \
        T(SC_IMMED,            P_SHF, C_SHF, C____, C____) /* immediate moves, ADR/ADRP */ \
        T(SC_ALU,              P_ALU, C_ALU, C_ALU, C____) /* general ALU op */ \
        T(SC_ALU_FAST,         P_SHF, C_ALU, C_ALU, C____) /* fast ALU op (can forward within same cycle) */ \
        T(SC_ALU_SHIFT,        P_ALU, C_ALU, C_SHF, C____) /* ALU+shift op */ \
        T(SC_ALU_COMPLEX,      P_ALU, C_SHF, C_SHF, C____) /* complex ALU ops read inputs in shift stage */ \
        T(SC_VARSHIFT,         P_SHF, C_SHF, C_SHF, C____) /* variable shifts: at most 1 per cycle */ \
        T(SC_LOAD,             P_LDR, C_AGU, C____, C____) /* basic load */ \
        T(SC_LOAD_IDX,         P_LDR, C_AGU, C____, C____) /* load indexed */ \
        T(SC_LOAD_PAIR_W,      P_LDR, C_AGU, C____, C____) /* load pair (W-form) */ \
        T(SC_LOAD_PAIR_IDX_W,  P_LDR, C_AGU, C____, C____) /* load pair indexed (W-form) */ \
        T(SC_LOAD_PAIR_X,      P_LDR, C_AGU, C____, C____) /* load pair (X-form) */ \
        T(SC_LOAD_PAIR_IDX_X,  P_LDR, C_AGU, C____, C____) /* load pair indexed (X-form) */ \
        T(SC_STORE,            P_LDR, C_STR, C_AGU, C____) /* basic store */ \
        T(SC_STORE_IDX,        P_LDR, C_STR, C_AGU, C____) /* store indexed */ \
        T(SC_STORE_PAIR,       P_LDR, C_STR, C_STR, C_AGU) /* store pair */ \
        T(SC_STORE_PAIR_IDX,   P_LDR, C_STR, C_STR, C_AGU) /* store pair indexed */ \
        /* end */
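    // How to read a row (illustrative note, not from the original): for SC_STORE, source
    // operand 0 is the store data (consumed at C_STR) and source operand 1 the address
    // (C_AGU); for SC_ALU_SHIFT, operand 0 is read at the ALU stage (C_ALU) while the
    // shifted operand 1 is needed a cycle earlier (C_SHF), which is why the C_SHF column
    // in the latency table below is one cycle higher than the C_ALU column in every row.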
    enum SchedClass
    {
        #define T(cls,prod,con0,con1,con2) cls,
        A53_SCHED_CLASSES
        #undef T
    };
    struct SchedInfo
    {
        uint8_t prod;
        uint8_t cons[3];
    };
    static const SchedInfo &get_sched_info(SchedClass cls)
    {
        static const SchedInfo nfo[] = {
            #define T(cls,prod,con0,con1,con2) { prod, { con0,con1,con2 } },
            A53_SCHED_CLASSES
            #undef T
        };
        return nfo[cls];
    }
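    // Note (added): each T(...) entry above expands to one SchedInfo initializer, e.g.
    // T(SC_ALU, P_ALU, C_ALU, C_ALU, C____) becomes { P_ALU, { C_ALU, C_ALU, C____ } },
    // so nfo[] is indexed directly by SchedClass.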
    #undef A53_SCHED_CLASSES
    static SchedClass get_sched_class(const Inst &inst)
    {
        switch (inst.op)
        {
        // Most basic branches
        case OPC_B: case OPC_Bcc: case OPC_RET: case OPC_CBZ: case OPC_CBNZ: case OPC_TBZ: case OPC_TBNZ:
            return SC_BRANCH;
        // ALU ops
        case OPC_ADD: case OPC_ADDS:
        case OPC_ADC: case OPC_ADCS:
        case OPC_SUB: case OPC_SUBS:
        case OPC_SBC: case OPC_SBCS:
        case OPC_AND: case OPC_ANDS:
        case OPC_BIC: case OPC_BICS:
        case OPC_EON: case OPC_EOR:
        case OPC_ORN: case OPC_ORR:
        case OPC_CCMP: case OPC_CCMN:
            return inst.src[1].is_shift_or_extend() ? SC_ALU_SHIFT : SC_ALU_FAST;
        case OPC_CLS: case OPC_CLZ:
        case OPC_CSEL: case OPC_CSINC: case OPC_CSINV: case OPC_CSNEG:
            return SC_ALU;
        case OPC_MOV:
            return (inst.src[0].cls == Op::IMM) ? SC_IMMED : SC_ALU_FAST;
        case OPC_REV: case OPC_REV16: case OPC_REV32: case OPC_RBIT:
        case OPC_BFM: case OPC_SBFM: case OPC_UBFM: case OPC_EXTR:
            return SC_ALU_COMPLEX;
        case OPC_ASRV: case OPC_LSLV: case OPC_LSRV: case OPC_RORV:
            return SC_VARSHIFT;
        case OPC_LDR: case OPC_LDUR: case OPC_LDRSW: case OPC_LDURSW:
        case OPC_LDRB: case OPC_LDURB: case OPC_LDRSB: case OPC_LDURSB:
        case OPC_LDRH: case OPC_LDURH: case OPC_LDRSH: case OPC_LDURSH:
        case OPC_PRFM:
            assert(inst.src[0].cls == Op::ADDR);
            return inst.src[0].addr.has_writeback() ? SC_LOAD_IDX : SC_LOAD;
        case OPC_LDP: case OPC_LDPSW:
            assert(inst.src[0].cls == Op::ADDR);
            // writeback => indexed form, matching the other load/store cases above and below
            if (inst.dst[0].cls == Op::R32)
                return inst.src[0].addr.has_writeback() ? SC_LOAD_PAIR_IDX_W : SC_LOAD_PAIR_W;
            else
                return inst.src[0].addr.has_writeback() ? SC_LOAD_PAIR_IDX_X : SC_LOAD_PAIR_X;
        case OPC_STR: case OPC_STUR:
        case OPC_STRB: case OPC_STURB:
        case OPC_STRH: case OPC_STURH:
            assert(inst.src[1].cls == Op::ADDR);
            return inst.src[1].addr.has_writeback() ? SC_STORE_IDX : SC_STORE;
        case OPC_STP:
            assert(inst.src[2].cls == Op::ADDR);
            return inst.src[2].addr.has_writeback() ? SC_STORE_PAIR_IDX : SC_STORE_PAIR;
        default:
            printf("?%s\n", inst.info().asmname);
            assert(!"NYI");
            return SC_ALU;
        }
    }
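    // Classification example (added): "ADD x0, x1, x2" has no shifted/extended source,
    // so it lands in SC_ALU_FAST and can forward its result within the same cycle;
    // "ADD x0, x1, x2, lsl #3" takes the SC_ALU_SHIFT path instead and produces at P_ALU.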
    struct CTracker : public ConflictTracker
    {
        // Resource masks
        // NOTE: if changing this, update analyze_trace below!
        static const uint64_t kResSlot = 0x000001ull; // total slots filled/cycle
        static const uint64_t kResWBP  = 0x000010ull; // writeback port
        static const uint64_t kResI01  = 0x000100ull; // integer pipes 0/1
        static const uint64_t kResLS   = 0x001000ull; // load/store pipe
        static const uint64_t kResB    = 0x010000ull; // branch pipe
        static const uint64_t kResVarS = 0x100000ull; // variable shift. we can have one in either pipe, but not in both at once.
        // not yet modeled: multiplier, divider, float/SIMD
        static const uint64_t kResAll  = 2*kResSlot + 3*kResWBP + 2*kResI01 + kResLS + kResB + kResVarS;
        static const uint64_t kResBias = 0x888888ull; // +8 bias in every category!
        static const uint64_t kResInit = kResAll + kResBias;
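        // Added note: each hex nibble of res_avail is an independent counter, so
        // kResAll = 0x111232 and kResInit = 0x999ABA, i.e. per cycle: 1 variable-shift
        // token, 1 branch pipe, 1 load/store pipe, 2 integer pipes, 3 writeback ports
        // and 2 issue slots, each with the +8 bias added on top.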
        // On init, cycle has all resources available.
        uint64_t res_avail = kResInit;
        // TODO make multi-cycle resource consumption work
        virtual bool try_schedule(const Inst &inst)
        {
            SchedClass cls = get_sched_class(inst);
            // Owen says: LDP X-form needs to go in the first issue slot.
            // First in issue slot => all resources available for this cycle.
            //
            // NOTE multi-cycle instrs can dual-issue during the last cycle.
            if ((cls == SC_LOAD_PAIR_X || cls == SC_LOAD_PAIR_IDX_X) && res_avail != kResInit)
                return false;
            // We subtract all occupied resources from the available mask. If that drops us below the bias value
            // in any of the categories, that counter went below zero, and we can't dispatch in this cycle.
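            // Worked example (added): two plain ALU ops in one cycle take res_avail from
            // 0x999ABA to 0x999898 (the slot and I01 nibbles both hit the bias floor of 8);
            // a third ALU op would pull those nibbles below 8, the (updated & kResBias)
            // check fails, and the instruction is pushed to the next cycle.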
            uint64_t updated = res_avail - get_resource_mask(inst, cls);
            if ((updated & kResBias) != kResBias)
                return false;
            res_avail = updated;
            return true;
        }
        void reset()
        {
            res_avail = kResInit;
        }
        static uint64_t get_resource_mask(const Inst &inst, SchedClass cls)
        {
            uint64_t wback = 0;
            // Destination takes up a writeback port if it's not ZR
            if ((inst.dst[0].cls == Op::R32 || inst.dst[0].cls == Op::R64) && inst.dst[0].reg.num != Reg::ZR)
                wback = kResWBP;
            switch (cls)
            {
            case SC_BRANCH:          return kResSlot + kResB;
            case SC_IMMED:           return kResSlot + kResI01 + wback;
            case SC_ALU:             return kResSlot + kResI01 + wback;
            case SC_ALU_FAST:        return kResSlot + kResI01 + wback;
            case SC_ALU_SHIFT:       return kResSlot + kResI01 + wback;
            case SC_ALU_COMPLEX:     return kResSlot + kResI01 + wback;
            case SC_VARSHIFT:        return kResSlot + kResI01 + wback + kResVarS;
            case SC_LOAD:            return kResSlot + kResLS + kResWBP;
            case SC_LOAD_IDX:        return kResSlot + kResLS + 2*kResWBP;
            case SC_LOAD_PAIR_W:     return kResSlot + kResLS + 2*kResWBP;
            case SC_LOAD_PAIR_IDX_W: return kResSlot + kResLS + 3*kResWBP;
            case SC_LOAD_PAIR_X:     return kResSlot + kResLS + kResWBP;   // NOTE needs an extra cycle (64b load pipe)
            case SC_LOAD_PAIR_IDX_X: return kResSlot + kResLS + 2*kResWBP; // NOTE needs an extra cycle (64b load pipe)
            case SC_STORE:           return kResSlot + kResLS;
            case SC_STORE_IDX:       return kResSlot + kResLS + kResWBP;
            case SC_STORE_PAIR:      return kResSlot + kResLS;
            case SC_STORE_PAIR_IDX:  return kResSlot + kResLS + kResWBP;
            default:                 assert(!"NYI"); return kResSlot;
            }
        }
    };
    virtual int32_t determine_latency(const Inst &producer, Producer::Type type, const Inst &consumer, int cons_src) const
    {
        // The latency table
        static const uint8_t lat[P____][C____] =
        {
            //         ALU SHF AGU STR BRA
            /* SHF->*/ { 0,  1,  2,  0,  0 },
            /* ALU->*/ { 1,  2,  3,  0,  0 },
            /* LDR->*/ { 2,  3,  3,  1,  0 }, // NOTE LDR->BRA not actually measured (the rest is tested, though)
        };
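        // Added example: lat[P_LDR][C_AGU] == 3, i.e. a load whose result is used as the
        // address of a following load or store delays that access by 3 cycles, while
        // lat[P_LDR][C_STR] == 1 means load-to-store-data forwarding only costs 1 cycle.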
        SchedClass prod_class = get_sched_class(producer);
        SchedClass cons_class = get_sched_class(consumer);
        const SchedInfo &prod_info = get_sched_info(prod_class);
        const SchedInfo &cons_info = get_sched_info(cons_class);
        // Figure out what type of forwarding consumer/producer pair we have
        uint8_t prod_code = prod_info.prod;
        uint8_t cons_code = cons_info.cons[(cons_src >= 0) ? cons_src : 0];
        assert(prod_code < P____ && cons_code < C____); // must be valid dataflow edges!
        if (prod_code >= P____ || cons_code >= C____)
        {
            //printf("???prod=%s cons=%s src=%d\n", producer.info().asmname, consumer.info().asmname, cons_src);
            return 0;
        }
        return lat[prod_code][cons_code];
    }
    virtual std::unique_ptr<ConflictTracker> create_conflict_tracker() const
    {
        return std::make_unique<CTracker>();
    }
    // no analyze_trace for now, this is in-order, print the schedule instead
};
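// Hypothetical usage sketch (added; the surrounding scheduler types are assumed, not
// shown in this file): a cycle-by-cycle list scheduler would own one CortexA53SchedModel,
// grab a fresh conflict tracker, and pack ready instructions until try_schedule() says
// the cycle is full, e.g.:
//
//   CortexA53SchedModel model;
//   std::unique_ptr<ConflictTracker> tracker = model.create_conflict_tracker();
//   for (const Inst *inst : ready_insts)     // ready_insts: hypothetical ready list
//       if (!tracker->try_schedule(*inst))   // out of slots/pipes/WB ports this cycle
//           break;                           // close the cycle, start a new tracker
//
// determine_latency() then tells the scheduler how many cycles after its producer each
// source operand becomes ready.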