Skip to content

Instantly share code, notes, and snippets.

@rygorous
Last active July 17, 2016 14:08
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rygorous/14f7b3afe12102ec88edfd1a2136fa6b to your computer and use it in GitHub Desktop.
Save rygorous/14f7b3afe12102ec88edfd1a2136fa6b to your computer and use it in GitHub Desktop.
// values in nanoseconds
static U64 timer()
{
struct timespec ts;
if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0)
return 0;
return (U64)ts.tv_sec * 1000000000 + (U64)ts.tv_nsec;
}
// Converts the span between two nanosecond timestamps into seconds.
// The difference is taken in unsigned arithmetic before the conversion to double.
static double duration_in_s(U64 start, U64 end)
{
    U64 elapsed_ns = end - start;
    return (double)elapsed_ns / 1.0e9;
}
// Signature of a micro-benchmark kernel: runs its loop body niter times and returns nothing.
typedef void testfunc(U32 niter);
// Kernel containing only the countdown loop itself (decrement + conditional
// branch), i.e. the loop overhead with no payload instructions.
//
// niter must be nonzero: the counter is decremented before it is tested, so
// a value of 0 wraps around instead of terminating immediately.
static void micro_nothing(U32 niter)
{
    // a..e are not referenced by the asm template; they are still bound to
    // registers as read-write operands, and niter is operand %5.
    U32 a=0,b=0,c=0,d=0,e=0;
    __asm__ volatile(
        "1:\n"
        "subs %5,#1\n"
        "bne 1b\n"
        : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
        :
        // subs rewrites the condition flags; declare that to the compiler.
        : "cc");
}
// Kernel whose loop body is four adds that all accumulate into the same
// register (%0), so each add consumes the result of the previous one.
//
// niter must be nonzero: the counter is decremented before it is tested, so
// a value of 0 wraps around instead of terminating immediately.
static void micro_four_dep_adds(U32 niter)
{
    // Operands: %0..%3 = a..d, %4 = e (the addend, zero), %5 = niter.
    U32 a=0,b=0,c=0,d=0,e=0;
    __asm__ volatile(
        "1:\n"
        "add %0,%4\n"
        "add %0,%4\n"
        "add %0,%4\n"
        "add %0,%4\n"
        "subs %5,#1\n"
        "bne 1b\n"
        : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
        :
        // subs rewrites the condition flags; declare that to the compiler.
        : "cc");
}
// Kernel whose loop body is four adds into four different registers
// (%0..%3), so within one iteration no add depends on another.
//
// niter must be nonzero: the counter is decremented before it is tested, so
// a value of 0 wraps around instead of terminating immediately.
static void micro_four_indep_adds(U32 niter)
{
    // Operands: %0..%3 = a..d (accumulators), %4 = e (the addend, zero),
    // %5 = niter.
    U32 a=0,b=0,c=0,d=0,e=0;
    __asm__ volatile(
        "1:\n"
        "add %0,%4\n"
        "add %1,%4\n"
        "add %2,%4\n"
        "add %3,%4\n"
        "subs %5,#1\n"
        "bne 1b\n"
        : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
        :
        // subs rewrites the condition flags; declare that to the compiler.
        : "cc");
}
// Kernel whose loop body is four word stores to byte offsets 0, 4, 8 and 12
// of a local U32 array (word-aligned, assuming U32 is a naturally aligned
// 4-byte type).
//
// niter must be nonzero: the counter is decremented before it is tested, so
// a value of 0 wraps around instead of terminating immediately.
static void micro_aligned_store(U32 niter)
{
    U32 buf[4];
    U32 *e = buf; // base address, passed to the asm as operand %4
    U32 a=0,b=0,c=0,d=0; // stored values, operands %0..%3
    __asm__ volatile(
        "1:\n"
        "str %0,[%4,#0]\n"
        "str %1,[%4,#4]\n"
        "str %2,[%4,#8]\n"
        "str %3,[%4,#12]\n"
        "subs %5,#1\n"
        "bne 1b\n"
        : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
        :
        // subs rewrites the condition flags, and the stores write buf through
        // a pointer the compiler only sees as a register value.
        : "cc", "memory");
}
// Kernel whose loop body is four word stores to byte offsets 1, 5, 9 and 13
// of a local U32 array, i.e. each address is one byte past a multiple of 4
// relative to the array start.
//
// niter must be nonzero: the counter is decremented before it is tested, so
// a value of 0 wraps around instead of terminating immediately.
static void micro_unaligned_store(U32 niter)
{
    // Five elements rather than four: the last store starts at byte 13 and
    // (assuming 4-byte U32) extends through byte 16.
    U32 buf[5];
    U32 *e = buf; // base address, passed to the asm as operand %4
    U32 a=0,b=0,c=0,d=0; // stored values, operands %0..%3
    __asm__ volatile(
        "1:\n"
        "str %0,[%4,#1]\n"
        "str %1,[%4,#5]\n"
        "str %2,[%4,#9]\n"
        "str %3,[%4,#13]\n"
        "subs %5,#1\n"
        "bne 1b\n"
        : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
        :
        // subs rewrites the condition flags, and the stores write buf through
        // a pointer the compiler only sees as a register value.
        : "cc", "memory");
}
// Kernel whose loop body stores a word at offset 0 of a local U32 array and
// immediately loads the full word back from the same address, twice. The
// loaded value feeds the next store (all four instructions use %0), so the
// store/load pairs form a serial chain. The name suggests this targets
// store-to-load forwarding of a full-width, aligned access.
//
// niter must be nonzero: the counter is decremented before it is tested, so
// a value of 0 wraps around instead of terminating immediately.
static void micro_aligned_full_stlf(U32 niter)
{
    U32 buf[4];
    U32 *e = buf; // base address, passed to the asm as operand %4
    // Only a (%0) is referenced by the template; b..d are still bound as operands.
    U32 a=0,b=0,c=0,d=0;
    __asm__ volatile(
        "1:\n"
        "str %0,[%4,#0]\n"
        "ldr %0,[%4,#0]\n"
        "str %0,[%4,#0]\n"
        "ldr %0,[%4,#0]\n"
        "subs %5,#1\n"
        "bne 1b\n"
        : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
        :
        // subs rewrites the condition flags, and the asm reads and writes buf
        // through a pointer the compiler only sees as a register value.
        : "cc", "memory");
}
// Kernel whose loop body stores a word at offset 0 of a local U32 array and
// immediately loads back only a single byte (ldrb) from the same address,
// twice. The loaded value feeds the next store (all four instructions use
// %0), so the store/load pairs form a serial chain. The name suggests this
// targets store-to-load forwarding where the load is narrower than the store.
//
// niter must be nonzero: the counter is decremented before it is tested, so
// a value of 0 wraps around instead of terminating immediately.
static void micro_aligned_partial_stlf(U32 niter)
{
    U32 buf[4];
    U32 *e = buf; // base address, passed to the asm as operand %4
    // Only a (%0) is referenced by the template; b..d are still bound as operands.
    U32 a=0,b=0,c=0,d=0;
    __asm__ volatile(
        "1:\n"
        "str %0,[%4,#0]\n"
        "ldrb %0,[%4,#0]\n"
        "str %0,[%4,#0]\n"
        "ldrb %0,[%4,#0]\n"
        "subs %5,#1\n"
        "bne 1b\n"
        : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
        :
        // subs rewrites the condition flags, and the asm reads and writes buf
        // through a pointer the compiler only sees as a register value.
        : "cc", "memory");
}
// Kernel whose loop body stores a word at byte offset 1 of a local U32 array
// and immediately loads the full word back from the same address, twice. The
// loaded value feeds the next store (all four instructions use %0), so the
// store/load pairs form a serial chain. The name suggests this targets
// store-to-load forwarding of a full-width access at an unaligned address.
//
// niter must be nonzero: the counter is decremented before it is tested, so
// a value of 0 wraps around instead of terminating immediately.
static void micro_unaligned_full_stlf(U32 niter)
{
    U32 buf[5];
    U32 *e = buf; // base address, passed to the asm as operand %4
    // Only a (%0) is referenced by the template; b..d are still bound as operands.
    U32 a=0,b=0,c=0,d=0;
    __asm__ volatile(
        "1:\n"
        "str %0,[%4,#1]\n"
        "ldr %0,[%4,#1]\n"
        "str %0,[%4,#1]\n"
        "ldr %0,[%4,#1]\n"
        "subs %5,#1\n"
        "bne 1b\n"
        : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
        :
        // subs rewrites the condition flags, and the asm reads and writes buf
        // through a pointer the compiler only sees as a register value.
        : "cc", "memory");
}
// Kernel whose loop body stores a word at byte offset 1 of a local U32 array
// and immediately loads back only a single byte (ldrb) from the same address,
// twice. The loaded value feeds the next store (all four instructions use
// %0), so the store/load pairs form a serial chain. The name suggests this
// targets store-to-load forwarding where the store is unaligned and the load
// is narrower than the store.
//
// niter must be nonzero: the counter is decremented before it is tested, so
// a value of 0 wraps around instead of terminating immediately.
static void micro_unaligned_partial_stlf(U32 niter)
{
    U32 buf[5];
    U32 *e = buf; // base address, passed to the asm as operand %4
    // Only a (%0) is referenced by the template; b..d are still bound as operands.
    U32 a=0,b=0,c=0,d=0;
    __asm__ volatile(
        "1:\n"
        "str %0,[%4,#1]\n"
        "ldrb %0,[%4,#1]\n"
        "str %0,[%4,#1]\n"
        "ldrb %0,[%4,#1]\n"
        "subs %5,#1\n"
        "bne 1b\n"
        : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
        :
        // subs rewrites the condition flags, and the asm reads and writes buf
        // through a pointer the compiler only sees as a register value.
        : "cc", "memory");
}
// Runs one benchmark kernel and logs its cost per loop iteration.
//
// name: label printed at the start of the log line.
// fn:   kernel to measure; called once untimed, then once timed.
//
// The cycle figure is derived from the wall-clock time and a hard-coded
// clock rate; it is not read from a hardware counter.
static void arm_microbench(const char *name, testfunc *fn)
{
    const int total_iters = 1000000000; // 1 billion
    const double ghz = 1.65;            // clock rate used for the cycle estimate

    // Untimed warm-up pass at a quarter of the measured iteration count.
    fn(total_iters / 4);

    // Timed pass.
    U64 t0 = timer();
    fn(total_iters);
    U64 t1 = timer();

    double elapsed_s = duration_in_s(t0, t1);
    double ns_each = (elapsed_s * 1e9) / (double)total_iters;
    double cycles_each = ns_each * ghz; // ns * (cycles per ns)

    LOGI("%s: %.2f ns/iter (=%.2f cycles @ %.2fGHz)\n", name, ns_each, cycles_each, ghz);
}
[ExynosM1] Starting test.
[ExynosM1] nothing: 0.39 ns/iter (=1.01 cycles @ 2.6GHz)
[ExynosM1] four_dep_adds: 1.54 ns/iter (=4.01 cycles @ 2.6GHz)
[ExynosM1] four_indep_adds: 0.75 ns/iter (=1.94 cycles @ 2.6GHz)
[ExynosM1] aligned_store: 1.54 ns/iter (=4.01 cycles @ 2.6GHz)
[ExynosM1] unaligned_store: 2.31 ns/iter (=6.02 cycles @ 2.6GHz)
[ExynosM1] aligned_full_stlf: 10.02 ns/iter (=26.05 cycles @ 2.6GHz)
[ExynosM1] aligned_partial_stlf: 9.99 ns/iter (=25.98 cycles @ 2.6GHz)
[ExynosM1] unaligned_full_stlf: 9.89 ns/iter (=25.71 cycles @ 2.6GHz)
[ExynosM1] unaligned_partial_stlf: 9.59 ns/iter (=24.94 cycles @ 2.6GHz)
[ExynosM1] Terminating.
[NvDenver] nothing: 0.67 ns/iter (=1.68 cycles @ 2.5GHz)
[NvDenver] four_dep_adds: 4.25 ns/iter (=10.62 cycles @ 2.5GHz)
[NvDenver] four_indep_adds: 1.73 ns/iter (=4.34 cycles @ 2.5GHz)
[NvDenver] aligned_store: 0.99 ns/iter (=2.47 cycles @ 2.5GHz)
[NvDenver] unaligned_store: 0.99 ns/iter (=2.47 cycles @ 2.5GHz)
[NvDenver] aligned_full_stlf: 0.79 ns/iter (=1.98 cycles @ 2.5GHz)
[NvDenver] aligned_partial_stlf: 0.80 ns/iter (=2.01 cycles @ 2.5GHz)
[NvDenver] unaligned_full_stlf: 0.79 ns/iter (=1.98 cycles @ 2.5GHz)
[NvDenver] unaligned_partial_stlf: 0.79 ns/iter (=1.98 cycles @ 2.5GHz)
[QcomKryo] nothing: 0.47 ns/iter (=1.01 cycles @ 2.15GHz)
[QcomKryo] four_dep_adds: 1.87 ns/iter (=4.03 cycles @ 2.15GHz)
[QcomKryo] four_indep_adds: 0.97 ns/iter (=2.08 cycles @ 2.15GHz)
[QcomKryo] aligned_store: 1.87 ns/iter (=4.02 cycles @ 2.15GHz)
[QcomKryo] unaligned_store: 1.87 ns/iter (=4.01 cycles @ 2.15GHz)
[QcomKryo] aligned_full_stlf: 11.75 ns/iter (=25.26 cycles @ 2.15GHz)
[QcomKryo] aligned_partial_stlf: 11.89 ns/iter (=25.57 cycles @ 2.15GHz)
[QcomKryo] unaligned_full_stlf: 12.08 ns/iter (=25.98 cycles @ 2.15GHz)
[QcomKryo] unaligned_partial_stlf: 11.85 ns/iter (=25.47 cycles @ 2.15GHz)
[QcomKrait] nothing: 1.44 ns/iter (=2.16 cycles @ 1.50GHz)
[QcomKrait] four_dep_adds: 5.06 ns/iter (=7.59 cycles @ 1.50GHz)
[QcomKrait] four_indep_adds: 1.67 ns/iter (=2.50 cycles @ 1.50GHz)
[QcomKrait] aligned_store: 2.66 ns/iter (=3.98 cycles @ 1.50GHz)
[QcomKrait] unaligned_store: 2.65 ns/iter (=3.98 cycles @ 1.50GHz)
[QcomKrait] aligned_full_stlf: 9.20 ns/iter (=13.81 cycles @ 1.50GHz)
[QcomKrait] aligned_partial_stlf: 8.90 ns/iter (=13.34 cycles @ 1.50GHz)
[QcomKrait] unaligned_full_stlf: 8.90 ns/iter (=13.35 cycles @ 1.50GHz)
[QcomKrait] unaligned_partial_stlf: 9.04 ns/iter (=13.57 cycles @ 1.50GHz)
[CortexA57] nothing: 1.06 ns/iter (=2.01 cycles @ 1.90GHz)
[CortexA57] four_dep_adds: 2.12 ns/iter (=4.03 cycles @ 1.90GHz)
[CortexA57] four_indep_adds: 1.97 ns/iter (=3.74 cycles @ 1.90GHz)
[CortexA57] aligned_store: 3.02 ns/iter (=5.74 cycles @ 1.90GHz)
[CortexA57] unaligned_store: 3.38 ns/iter (=6.43 cycles @ 1.90GHz)
[CortexA57] aligned_full_stlf: 6.35 ns/iter (=12.07 cycles @ 1.90GHz)
[CortexA57] aligned_partial_stlf: 7.02 ns/iter (=13.35 cycles @ 1.90GHz)
[CortexA57] unaligned_full_stlf: 6.75 ns/iter (=12.83 cycles @ 1.90GHz)
[CortexA57] unaligned_partial_stlf: 6.56 ns/iter (=12.47 cycles @ 1.90GHz)
[CortexA15] nothing: 1.24 ns/iter (=2.04 cycles @ 1.65GHz)
[CortexA15] four_dep_adds: 2.43 ns/iter (=4.02 cycles @ 1.65GHz)
[CortexA15] four_indep_adds: 1.77 ns/iter (=2.93 cycles @ 1.65GHz)
[CortexA15] aligned_store: 2.44 ns/iter (=4.03 cycles @ 1.65GHz)
[CortexA15] unaligned_store: 3.00 ns/iter (=4.94 cycles @ 1.65GHz)
[CortexA15] aligned_full_stlf: 7.93 ns/iter (=13.09 cycles @ 1.65GHz)
[CortexA15] aligned_partial_stlf: 7.19 ns/iter (=11.86 cycles @ 1.65GHz)
[CortexA15] unaligned_full_stlf: 8.10 ns/iter (=13.37 cycles @ 1.65GHz)
[CortexA15] unaligned_partial_stlf: 7.64 ns/iter (=12.60 cycles @ 1.65GHz)
---- For reference: Apple A8X
[AppleA8X] nothing: 0.70 ns/iter (=1.06 cycles @ 1.50GHz)
[AppleA8X] four_dep_adds: 2.72 ns/iter (=4.08 cycles @ 1.50GHz)
[AppleA8X] four_indep_adds: 0.87 ns/iter (=1.30 cycles @ 1.50GHz)
[AppleA8X] aligned_store: 1.62 ns/iter (=2.44 cycles @ 1.50GHz)
[AppleA8X] unaligned_store: 2.10 ns/iter (=3.14 cycles @ 1.50GHz)
[AppleA8X] aligned_full_stlf: 7.55 ns/iter (=11.32 cycles @ 1.50GHz)
[AppleA8X] aligned_partial_stlf: 7.36 ns/iter (=11.05 cycles @ 1.50GHz)
[AppleA8X] unaligned_full_stlf: 7.45 ns/iter (=11.17 cycles @ 1.50GHz)
[AppleA8X] unaligned_partial_stlf: 7.45 ns/iter (=11.17 cycles @ 1.50GHz)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment