Skip to content

Instantly share code, notes, and snippets.

@rygorous
Created May 29, 2017 22:22
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rygorous/ccaca25f29b8bde85fba3d47b318d1c4 to your computer and use it in GitHub Desktop.
Save rygorous/ccaca25f29b8bde85fba3d47b318d1c4 to your computer and use it in GitHub Desktop.
A53 latency tester
// latency tester generator
// Instruction templates for the latency tests.
// T() pastes exactly one PROD_* (producer) string followed by one CONS_* (consumer)
// string into its inline-asm loop. The %N placeholders index T()'s asm operand list:
//   %1 - register written by every producer and read by every consumer
//   %2 - second source register for the ALU/shift forms
//   %4 - base address of T()'s scratch buffer (used by the load/store forms)
// The trailing comment on each line lists the instruction classes the template stands for.
#define PROD_ALF "add %1,%1,%2\n" // ALU fast-forward: basic ALU ops; CCMP/CCMN
#define PROD_ALU "add %1,%1,%2,lsl #13\n" // ALU+shift; all bitfield move; EXTR; RBIT/REV*; CLS/CLZ; CSEL/CSET etc.
#define PROD_SHF "lslv %1,%1,%2\n" // variable shifts, imm movs (e.g. "movz %1,#0,lsl #16")
#define PROD_LDR "ldr %1,[%4]\n" // load
#define CONS_ALU "add %1,%1,%2\n" // basic ALU ops; CCMP/CCMN; CSEL/CSET etc; CLZ/CLS; first (unshifted) src in ALU+shift
#define CONS_SHF "add %1,%2,%1,lsl #13\n" // second (shifted) src in ALU+shift; SBFM/UBFM/BFM/RBIT/REV*/var shifts/EXTR all sources
#define CONS_AGU "ldr %1,[%4,%1]\n" // load/store address generation unit
#define CONS_STR "str %1,[%4]\n" // store data
// Pair one producer kind with each of the four consumer kinds, in the order
// ALU, SHF, AGU, STR. T must be defined by the user of this list.
#define LATENCY_TO_ALL(prod) \
    T(prod, ALU) T(prod, SHF) \
    T(prod, AGU) T(prod, STR)
// The full test matrix: every producer kind (ALF, ALU, SHF, LDR, in that order)
// expanded against all consumers via LATENCY_TO_ALL.
#define ALL_LATENCY_TESTS \
    LATENCY_TO_ALL(ALF) LATENCY_TO_ALL(ALU) \
    LATENCY_TO_ALL(SHF) LATENCY_TO_ALL(LDR)
// Map a short kind name (e.g. ALU) onto the matching template macro by token pasting:
// PROD_PREFIX(ALU) -> PROD_ALU, CONS_PREFIX(ALU) -> CONS_ALU.
#define PROD_PREFIX(kind) PROD_ ## kind
#define CONS_PREFIX(kind) CONS_ ## kind
// T(prod,cons) stamps out one benchmark routine:
//
//     static void latency_<prod>_<cons>(U32 niter);
//
// The routine runs a hand-scheduled loop niter times. Every iteration issues the
// PROD_<prod> instruction immediately followed by the CONS_<cons> instruction,
// bracketed by two LDPs that load buf[2] and buf[3] into scratch registers.
//
// niter: number of loop iterations. The loop is bottom-tested (subs ... b.ne), so
//        a count of 0 would wrap around to 2^32 iterations; it returns immediately
//        instead.
//
// Asm operands: %0=a  %1=b  %2=c  %3=d  %4=e (points at buf)  %5=niter  %6=f
//               %7=the constant 1<<63.
//               %0 and %7 are not referenced by any template visible in this file.
//               NOTE(review): presumably left over from experiments - confirm before removing,
//               since removing %0 would renumber every other operand.
//
// Clobbers:
//   "cc"     - SUBS writes the condition flags.
//   "memory" - the templates load from buf (and CONS_STR stores to it), but buf is
//              only passed as a pointer in a register. Without this clobber the
//              compiler is not required to materialise buf's zero-initialisation
//              before the asm runs, and CONS_AGU uses a value loaded from buf as an
//              address offset, so buf really has to contain zeros.
#define T(prod,cons) \
static void latency_##prod##_##cons(U32 niter) \
{ \
    U64 buf[4] = { 0, 0, 0, 0 }; \
    U64 *e = buf; \
    U64 a=0,b=0,c=0,d=0,f=0; \
    if (niter == 0) \
        return; /* the do-while style loop below cannot express zero iterations */ \
    __asm__ volatile(".align 3\n" \
        "1:\n" \
        "ldp %3,%6,[%4,#16]\n" /* must be slot0 - to clear pipe */ \
        "subs %w5,%w5,#1\n" /* dual-issues in second cycle of LDP */ \
        PROD_PREFIX(prod) \
        CONS_PREFIX(cons) \
        "ldp %3,%6,[%4,#16]\n" /* forces us to slot0 again */ \
        "b.ne 1b\n" /* dual-issues in second cycle of LDP */ \
        : "+&r"(a), "+&r"(b), "+&r"(c), "+&r"(d), "+&r"(e), "+&r"(niter), "+&r"(f) \
        : "r"(1ull<<63) \
        : "cc", "memory"); \
}
// Instantiate the whole matrix: 4 producers x 4 consumers = 16 functions named
// latency_<prod>_<cons> (latency_ALF_ALU ... latency_LDR_STR).
ALL_LATENCY_TESTS
// T is only a stamping helper for the list above; drop it once the functions exist.
#undef T
// prod=LDR with the add consumer (presumably cons=ALU; there is no CONS_ADD template) gives 7 cycles/iter, assumed breakdown is:
// c0 ldp (first)
// c1 ldp (second) + subs
// c2 ldr
// c3 <stall>
// c4 add
// c5 ldp (first)
// c6 ldp (second) + b.ne
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment