A53 latency tester
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // latency tester generator | |
| // | |
| // PROD_* strings are the producer instruction under test; CONS_* strings are | |
| // the consumer instruction. T() pastes one PROD_* directly followed by one | |
| // CONS_* into its asm loop body. Operand numbers refer to T()'s asm operand | |
| // list: %1 is b, %2 is c and %4 is e (which points at buf). | |
| #define PROD_ALF "add %1,%1,%2\n" // ALU fast-forward: basic ALU ops; CCMP/CCMN | |
| #define PROD_ALU "add %1,%1,%2,lsl #13\n" // ALU+shift; all bitfield move; EXTR; RBIT/REV*; CLS/CLZ; CSEL/CSET etc. | |
| #define PROD_SHF "lslv %1,%1,%2\n" // variable shifts, imm movs (e.g. "movz %1,#0,lsl #16") | |
| #define PROD_LDR "ldr %1,[%4]\n" // load | |
| #define CONS_ALU "add %1,%1,%2\n" // basic ALU ops; CCMP/CCMN; CSEL/CSET etc; CLZ/CLS; first (unshifted) src in ALU+shift | |
| #define CONS_SHF "add %1,%2,%1,lsl #13\n" // second (shifted) src in ALU+shift; SBFM/UBFM/BFM/RBIT/REV*/var shifts/EXTR all sources | |
| #define CONS_AGU "ldr %1,[%4,%1]\n" // load/store address generation unit | |
| #define CONS_STR "str %1,[%4]\n" // store data | |
| // Expands T(producer, consumer) once for each consumer kind (ALU, SHF, AGU, | |
| // STR) with the given producer kind. T must be defined where this is used. | |
| #define LATENCY_TO_ALL(producer) \ | |
| T(producer,ALU) \ | |
| T(producer,SHF) \ | |
| T(producer,AGU) \ | |
| T(producer,STR) \ | |
| /* end of consumer list */ | |
| // Full test matrix: every producer kind (ALF, ALU, SHF, LDR) crossed with | |
| // every consumer kind listed in LATENCY_TO_ALL. | |
| #define ALL_LATENCY_TESTS \ | |
| LATENCY_TO_ALL(ALF) LATENCY_TO_ALL(ALU) \ | |
| LATENCY_TO_ALL(SHF) LATENCY_TO_ALL(LDR) \ | |
| /* end of producer list */ | |
| // Token-pasting helpers: turn a short kind name (e.g. ALF, AGU) into the | |
| // matching PROD_<kind> / CONS_<kind> instruction-string macro name. | |
| #define PROD_PREFIX(kind) PROD_##kind | |
| #define CONS_PREFIX(kind) CONS_##kind | |
| // T(prod,cons) defines static void latency_<prod>_<cons>(U32 niter): an asm | |
| // loop that executes PROD_<prod> directly followed by CONS_<cons>, bracketed | |
| // by LDP instructions, niter times. niter must be non-zero: the counter is | |
| // decremented before it is tested, so 0 would wrap around to 2^32 iterations. | |
| // | |
| // asm operands: %0=a %1=b %2=c %3=d %4=e (points at buf) %5=niter %6=f | |
| // %7=1ull<<63. %0 and %7 are not referenced by any instruction string in this | |
| // file. | |
| // | |
| // Clobbers: "cc" because SUBS writes the condition flags; "memory" because | |
| // the loop reads buf through %4 (LDP/LDR) and CONS_STR writes to it. Without | |
| // "memory" the compiler is not obliged to have stored buf's initial zeros | |
| // before the asm runs. | |
| #define T(prod,cons) \ | |
| static void latency_##prod##_##cons(U32 niter) \ | |
| { \ | |
| U64 buf[4] = { 0, 0, 0, 0 }; \ | |
| U64 *e = buf; \ | |
| U64 a=0,b=0,c=0,d=0,f=0; \ | |
| __asm__ volatile(".align 3\n" \ | |
| "1:\n" \ | |
| "ldp %3,%6,[%4,#16]\n" /* must be slot0 - to clear pipe */ \ | |
| "subs %w5,%w5,#1\n" /* dual-issues in second cycle of LDP */ \ | |
| PROD_PREFIX(prod) /* producer instruction under test */ \ | |
| CONS_PREFIX(cons) /* consumer instruction under test */ \ | |
| "ldp %3,%6,[%4,#16]\n" /* forces us to slot0 again */ \ | |
| "b.ne 1b\n" /* dual-issues in second cycle of LDP */ \ | |
| : "+&r"(a), "+&r"(b), "+&r"(c), "+&r"(d), "+&r"(e), "+&r"(niter), "+&r"(f) \ | |
| : "r"(1ull<<63) \ | |
| : "cc", "memory"); \ | |
| } | |
| // Emit one latency_<prod>_<cons>() function for each of the 4 producers x 4 | |
| // consumers, then remove T so it cannot be expanded again by accident. | |
| ALL_LATENCY_TESTS | |
| #undef T | |
| // prod=LDR cons=ALU (an add) gives 7 cycles/iter, assumed breakdown is: | |
| // c0 ldp (first) | |
| // c1 ldp (second) + subs | |
| // c2 ldr | |
| // c3 <stall> | |
| // c4 add | |
| // c5 ldp (first) | |
| // c6 ldp (second) + b.ne | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment