Last active
July 17, 2016 14:08
-
-
Save rygorous/14f7b3afe12102ec88edfd1a2136fa6b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// values in nanoseconds | |
static U64 timer() | |
{ | |
struct timespec ts; | |
if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) | |
return 0; | |
return (U64)ts.tv_sec * 1000000000 + (U64)ts.tv_nsec; | |
} | |
// Converts the interval between two timer() readings (nanoseconds) to seconds.
// start: earlier timestamp; end: later timestamp.
static double duration_in_s(U64 start, U64 end)
{
    U64 elapsed_ns = end - start;
    return (double)elapsed_ns / 1.0e9;
}
// Signature shared by the micro-benchmark kernels: each one receives the
// number of loop iterations to execute and returns nothing.
typedef void testfunc(U32 niter); | |
// Baseline kernel: the loop contains only the counter decrement and the
// conditional branch, i.e. the loop overhead itself.
// niter: number of iterations. The counter is decremented before it is
// tested, so niter == 0 wraps around instead of exiting immediately.
static void micro_nothing(U32 niter)
{
    // a..e are not referenced by the loop body; presumably kept so the
    // operand list (and %5 == niter) matches the other kernels.
    U32 a=0,b=0,c=0,d=0,e=0;
    __asm__ volatile(
        "1:\n"
        "subs %5,#1\n"
        "bne 1b\n"
        : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
        :
        : "cc"); // subs rewrites the condition flags; tell the compiler
}
// Kernel with four adds per iteration that all accumulate into the same
// register (%0), so each add depends on the result of the previous one.
// niter: number of iterations. The counter is decremented before it is
// tested, so niter == 0 wraps around instead of exiting immediately.
static void micro_four_dep_adds(U32 niter)
{
    U32 a=0,b=0,c=0,d=0,e=0;
    __asm__ volatile(
        "1:\n"
        "add %0,%4\n"
        "add %0,%4\n"
        "add %0,%4\n"
        "add %0,%4\n"
        "subs %5,#1\n"
        "bne 1b\n"
        : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
        :
        : "cc"); // subs rewrites the condition flags; tell the compiler
}
// Kernel with four adds per iteration, each accumulating into a different
// register (%0..%3), so the adds within one iteration do not depend on
// each other.
// niter: number of iterations. The counter is decremented before it is
// tested, so niter == 0 wraps around instead of exiting immediately.
static void micro_four_indep_adds(U32 niter)
{
    U32 a=0,b=0,c=0,d=0,e=0;
    __asm__ volatile(
        "1:\n"
        "add %0,%4\n"
        "add %1,%4\n"
        "add %2,%4\n"
        "add %3,%4\n"
        "subs %5,#1\n"
        "bne 1b\n"
        : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
        :
        : "cc"); // subs rewrites the condition flags; tell the compiler
}
// Kernel with four word stores per iteration to byte offsets 0, 4, 8 and 12
// of a local buffer (assumes U32 is 4 bytes, so the offsets land on the
// four elements of buf -- TODO confirm against the U32 typedef).
// niter: number of iterations. The counter is decremented before it is
// tested, so niter == 0 wraps around instead of exiting immediately.
static void micro_aligned_store(U32 niter)
{
    U32 buf[4];
    U32 *e = buf;
    U32 a=0,b=0,c=0,d=0;
    __asm__ volatile(
        "1:\n"
        "str %0,[%4,#0]\n"
        "str %1,[%4,#4]\n"
        "str %2,[%4,#8]\n"
        "str %3,[%4,#12]\n"
        "subs %5,#1\n"
        "bne 1b\n"
        : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
        :
        // "cc": subs rewrites the condition flags.
        // "memory": the stores write buf through %4, which is not listed
        // as a memory operand.
        : "cc", "memory");
}
// Kernel with four word stores per iteration to byte offsets 1, 5, 9 and 13
// of a local buffer, i.e. each store is shifted one byte from the element
// boundaries of buf (assumes U32 is 4 bytes -- TODO confirm). buf has five
// elements so the last store, ending at byte 16, stays inside it.
// niter: number of iterations. The counter is decremented before it is
// tested, so niter == 0 wraps around instead of exiting immediately.
static void micro_unaligned_store(U32 niter)
{
    U32 buf[5];
    U32 *e = buf;
    U32 a=0,b=0,c=0,d=0;
    __asm__ volatile(
        "1:\n"
        "str %0,[%4,#1]\n"
        "str %1,[%4,#5]\n"
        "str %2,[%4,#9]\n"
        "str %3,[%4,#13]\n"
        "subs %5,#1\n"
        "bne 1b\n"
        : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
        :
        // "cc": subs rewrites the condition flags.
        // "memory": the stores write buf through %4, which is not listed
        // as a memory operand.
        : "cc", "memory");
}
// Kernel that stores a word at offset 0 of a local buffer and immediately
// loads the same word back, twice per iteration. Every load reads the
// location just written and every store writes the value just loaded, so
// the four memory operations form one dependency chain ("stlf" presumably
// stands for store-to-load forwarding).
// niter: number of iterations. The counter is decremented before it is
// tested, so niter == 0 wraps around instead of exiting immediately.
static void micro_aligned_full_stlf(U32 niter)
{
    U32 buf[4];
    U32 *e = buf;
    U32 a=0,b=0,c=0,d=0;
    __asm__ volatile(
        "1:\n"
        "str %0,[%4,#0]\n"
        "ldr %0,[%4,#0]\n"
        "str %0,[%4,#0]\n"
        "ldr %0,[%4,#0]\n"
        "subs %5,#1\n"
        "bne 1b\n"
        : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
        :
        // "cc": subs rewrites the condition flags.
        // "memory": buf is written and read through %4, which is not
        // listed as a memory operand.
        : "cc", "memory");
}
// Kernel that stores a word at offset 0 of a local buffer and immediately
// loads back a single byte (ldrb) from the same address, twice per
// iteration. Each load covers only part of the preceding store, and its
// result feeds the next store ("stlf" presumably stands for store-to-load
// forwarding).
// niter: number of iterations. The counter is decremented before it is
// tested, so niter == 0 wraps around instead of exiting immediately.
static void micro_aligned_partial_stlf(U32 niter)
{
    U32 buf[4];
    U32 *e = buf;
    U32 a=0,b=0,c=0,d=0;
    __asm__ volatile(
        "1:\n"
        "str %0,[%4,#0]\n"
        "ldrb %0,[%4,#0]\n"
        "str %0,[%4,#0]\n"
        "ldrb %0,[%4,#0]\n"
        "subs %5,#1\n"
        "bne 1b\n"
        : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
        :
        // "cc": subs rewrites the condition flags.
        // "memory": buf is written and read through %4, which is not
        // listed as a memory operand.
        : "cc", "memory");
}
// Kernel that stores a word at byte offset 1 of a local buffer and
// immediately loads the same word back, twice per iteration. Same
// dependency chain as the aligned variant, but the accessed word is shifted
// one byte from the start of buf ("stlf" presumably stands for
// store-to-load forwarding).
// niter: number of iterations. The counter is decremented before it is
// tested, so niter == 0 wraps around instead of exiting immediately.
static void micro_unaligned_full_stlf(U32 niter)
{
    U32 buf[5];
    U32 *e = buf;
    U32 a=0,b=0,c=0,d=0;
    __asm__ volatile(
        "1:\n"
        "str %0,[%4,#1]\n"
        "ldr %0,[%4,#1]\n"
        "str %0,[%4,#1]\n"
        "ldr %0,[%4,#1]\n"
        "subs %5,#1\n"
        "bne 1b\n"
        : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
        :
        // "cc": subs rewrites the condition flags.
        // "memory": buf is written and read through %4, which is not
        // listed as a memory operand.
        : "cc", "memory");
}
// Kernel that stores a word at byte offset 1 of a local buffer and
// immediately loads back a single byte (ldrb) from the same address, twice
// per iteration. Each load covers only part of the preceding store, and its
// result feeds the next store ("stlf" presumably stands for store-to-load
// forwarding).
// niter: number of iterations. The counter is decremented before it is
// tested, so niter == 0 wraps around instead of exiting immediately.
static void micro_unaligned_partial_stlf(U32 niter)
{
    U32 buf[5];
    U32 *e = buf;
    U32 a=0,b=0,c=0,d=0;
    __asm__ volatile(
        "1:\n"
        "str %0,[%4,#1]\n"
        "ldrb %0,[%4,#1]\n"
        "str %0,[%4,#1]\n"
        "ldrb %0,[%4,#1]\n"
        "subs %5,#1\n"
        "bne 1b\n"
        : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
        :
        // "cc": subs rewrites the condition flags.
        // "memory": buf is written and read through %4, which is not
        // listed as a memory operand.
        : "cc", "memory");
}
// Runs one benchmark kernel, times it, and logs the cost per iteration in
// nanoseconds and in clock cycles.
// name: label printed at the start of the log line.
// fn:   kernel to measure; it is called with the iteration count.
static void arm_microbench(const char *name, testfunc *fn)
{
    const int iterations = 1000000000; // 1 billion
    // Hard-coded clock rate used to convert nanoseconds into cycles.
    const double clock_rate_ghz = 1.65;

    // Untimed warm-up pass with a quarter of the iterations.
    fn(iterations / 4);

    // Timed pass.
    const U64 t_begin = timer();
    fn(iterations);
    const U64 t_end = timer();

    const double elapsed_s = duration_in_s(t_begin, t_end);
    const double ns_per_iter = (elapsed_s * 1e9) / (double)iterations;
    const double clocks_per_iter = ns_per_iter * clock_rate_ghz;

    LOGI("%s: %.2f ns/iter (=%.2f cycles @ %.2fGHz)\n", name, ns_per_iter, clocks_per_iter, clock_rate_ghz);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[ExynosM1] Starting test. | |
[ExynosM1] nothing: 0.39 ns/iter (=1.01 cycles @ 2.6GHz) | |
[ExynosM1] four_dep_adds: 1.54 ns/iter (=4.01 cycles @ 2.6GHz) | |
[ExynosM1] four_indep_adds: 0.75 ns/iter (=1.94 cycles @ 2.6GHz) | |
[ExynosM1] aligned_store: 1.54 ns/iter (=4.01 cycles @ 2.6GHz) | |
[ExynosM1] unaligned_store: 2.31 ns/iter (=6.02 cycles @ 2.6GHz) | |
[ExynosM1] aligned_full_stlf: 10.02 ns/iter (=26.05 cycles @ 2.6GHz) | |
[ExynosM1] aligned_partial_stlf: 9.99 ns/iter (=25.98 cycles @ 2.6GHz) | |
[ExynosM1] unaligned_full_stlf: 9.89 ns/iter (=25.71 cycles @ 2.6GHz) | |
[ExynosM1] unaligned_partial_stlf: 9.59 ns/iter (=24.94 cycles @ 2.6GHz) | |
[ExynosM1] Terminating. | |
[NvDenver] nothing: 0.67 ns/iter (=1.68 cycles @ 2.5GHz) | |
[NvDenver] four_dep_adds: 4.25 ns/iter (=10.62 cycles @ 2.5GHz) | |
[NvDenver] four_indep_adds: 1.73 ns/iter (=4.34 cycles @ 2.5GHz) | |
[NvDenver] aligned_store: 0.99 ns/iter (=2.47 cycles @ 2.5GHz) | |
[NvDenver] unaligned_store: 0.99 ns/iter (=2.47 cycles @ 2.5GHz) | |
[NvDenver] aligned_full_stlf: 0.79 ns/iter (=1.98 cycles @ 2.5GHz) | |
[NvDenver] aligned_partial_stlf: 0.80 ns/iter (=2.01 cycles @ 2.5GHz) | |
[NvDenver] unaligned_full_stlf: 0.79 ns/iter (=1.98 cycles @ 2.5GHz) | |
[NvDenver] unaligned_partial_stlf: 0.79 ns/iter (=1.98 cycles @ 2.5GHz) | |
[QcomKryo] nothing: 0.47 ns/iter (=1.01 cycles @ 2.15GHz) | |
[QcomKryo] four_dep_adds: 1.87 ns/iter (=4.03 cycles @ 2.15GHz) | |
[QcomKryo] four_indep_adds: 0.97 ns/iter (=2.08 cycles @ 2.15GHz) | |
[QcomKryo] aligned_store: 1.87 ns/iter (=4.02 cycles @ 2.15GHz) | |
[QcomKryo] unaligned_store: 1.87 ns/iter (=4.01 cycles @ 2.15GHz) | |
[QcomKryo] aligned_full_stlf: 11.75 ns/iter (=25.26 cycles @ 2.15GHz) | |
[QcomKryo] aligned_partial_stlf: 11.89 ns/iter (=25.57 cycles @ 2.15GHz) | |
[QcomKryo] unaligned_full_stlf: 12.08 ns/iter (=25.98 cycles @ 2.15GHz) | |
[QcomKryo] unaligned_partial_stlf: 11.85 ns/iter (=25.47 cycles @ 2.15GHz) | |
[QcomKrait] nothing: 1.44 ns/iter (=2.16 cycles @ 1.50GHz) | |
[QcomKrait] four_dep_adds: 5.06 ns/iter (=7.59 cycles @ 1.50GHz) | |
[QcomKrait] four_indep_adds: 1.67 ns/iter (=2.50 cycles @ 1.50GHz) | |
[QcomKrait] aligned_store: 2.66 ns/iter (=3.98 cycles @ 1.50GHz) | |
[QcomKrait] unaligned_store: 2.65 ns/iter (=3.98 cycles @ 1.50GHz) | |
[QcomKrait] aligned_full_stlf: 9.20 ns/iter (=13.81 cycles @ 1.50GHz) | |
[QcomKrait] aligned_partial_stlf: 8.90 ns/iter (=13.34 cycles @ 1.50GHz) | |
[QcomKrait] unaligned_full_stlf: 8.90 ns/iter (=13.35 cycles @ 1.50GHz) | |
[QcomKrait] unaligned_partial_stlf: 9.04 ns/iter (=13.57 cycles @ 1.50GHz) | |
[CortexA57] nothing: 1.06 ns/iter (=2.01 cycles @ 1.90GHz) | |
[CortexA57] four_dep_adds: 2.12 ns/iter (=4.03 cycles @ 1.90GHz) | |
[CortexA57] four_indep_adds: 1.97 ns/iter (=3.74 cycles @ 1.90GHz) | |
[CortexA57] aligned_store: 3.02 ns/iter (=5.74 cycles @ 1.90GHz) | |
[CortexA57] unaligned_store: 3.38 ns/iter (=6.43 cycles @ 1.90GHz) | |
[CortexA57] aligned_full_stlf: 6.35 ns/iter (=12.07 cycles @ 1.90GHz) | |
[CortexA57] aligned_partial_stlf: 7.02 ns/iter (=13.35 cycles @ 1.90GHz) | |
[CortexA57] unaligned_full_stlf: 6.75 ns/iter (=12.83 cycles @ 1.90GHz) | |
[CortexA57] unaligned_partial_stlf: 6.56 ns/iter (=12.47 cycles @ 1.90GHz) | |
[CortexA15] nothing: 1.24 ns/iter (=2.04 cycles @ 1.65GHz) | |
[CortexA15] four_dep_adds: 2.43 ns/iter (=4.02 cycles @ 1.65GHz) | |
[CortexA15] four_indep_adds: 1.77 ns/iter (=2.93 cycles @ 1.65GHz) | |
[CortexA15] aligned_store: 2.44 ns/iter (=4.03 cycles @ 1.65GHz) | |
[CortexA15] unaligned_store: 3.00 ns/iter (=4.94 cycles @ 1.65GHz) | |
[CortexA15] aligned_full_stlf: 7.93 ns/iter (=13.09 cycles @ 1.65GHz) | |
[CortexA15] aligned_partial_stlf: 7.19 ns/iter (=11.86 cycles @ 1.65GHz) | |
[CortexA15] unaligned_full_stlf: 8.10 ns/iter (=13.37 cycles @ 1.65GHz) | |
[CortexA15] unaligned_partial_stlf: 7.64 ns/iter (=12.60 cycles @ 1.65GHz) | |
---- For reference: Apple A8X | |
[AppleA8X] nothing: 0.70 ns/iter (=1.06 cycles @ 1.50GHz) | |
[AppleA8X] four_dep_adds: 2.72 ns/iter (=4.08 cycles @ 1.50GHz) | |
[AppleA8X] four_indep_adds: 0.87 ns/iter (=1.30 cycles @ 1.50GHz) | |
[AppleA8X] aligned_store: 1.62 ns/iter (=2.44 cycles @ 1.50GHz) | |
[AppleA8X] unaligned_store: 2.10 ns/iter (=3.14 cycles @ 1.50GHz) | |
[AppleA8X] aligned_full_stlf: 7.55 ns/iter (=11.32 cycles @ 1.50GHz) | |
[AppleA8X] aligned_partial_stlf: 7.36 ns/iter (=11.05 cycles @ 1.50GHz) | |
[AppleA8X] unaligned_full_stlf: 7.45 ns/iter (=11.17 cycles @ 1.50GHz) | |
[AppleA8X] unaligned_partial_stlf: 7.45 ns/iter (=11.17 cycles @ 1.50GHz) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment