rygorous/microbench.cpp

## microbench.cpp
// values in nanoseconds
static U64 timer()
{
  struct timespec ts;
  if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0)
    return 0;

  return (U64)ts.tv_sec * 1000000000 + (U64)ts.tv_nsec;
}

// Converts the gap between two nanosecond readings into seconds.
// The difference is taken in integer arithmetic before the conversion to double.
static double duration_in_s(U64 start, U64 end)
{
  const double ns_per_s = 1.0e9;
  U64 elapsed_ns = end - start;
  return (double)elapsed_ns / ns_per_s;
}


// Function type of a benchmark kernel: takes the number of loop iterations
// to run and returns nothing. arm_microbench() receives kernels as testfunc*.
typedef void testfunc(U32 niter);

// Baseline kernel: the loop body is only the counter decrement and the
// backwards branch that every other kernel in this file also ends with.
//
// niter is decremented before it is tested, so the loop body always runs at
// least once; a count of 0 would wrap around instead of skipping the loop.
static void micro_nothing(U32 niter)
{
  // a..e are not referenced by the template but stay in the operand list,
  // matching the other kernels (presumably to keep register setup identical
  // across tests - confirm before trimming).
  U32 a=0,b=0,c=0,d=0,e=0;
  __asm__ volatile(
          "1:\n"
          "subs %5,#1\n"
          "bne 1b\n"
          : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
          : // no input-only operands
          : "cc"); // subs rewrites the condition flags
}

// Four adds per iteration that all accumulate into the same register (%0),
// so each add consumes the result of the one before it.
//
// niter is decremented before it is tested, so the loop body always runs at
// least once; a count of 0 would wrap around instead of skipping the loop.
static void micro_four_dep_adds(U32 niter)
{
  // b, c and d are unused by the template but kept in the operand list,
  // matching the other kernels.
  U32 a=0,b=0,c=0,d=0,e=0;
  __asm__ volatile(
          "1:\n"
          "add %0,%4\n"
          "add %0,%4\n"
          "add %0,%4\n"
          "add %0,%4\n"
          "subs %5,#1\n"
          "bne 1b\n"
          : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
          : // no input-only operands
          : "cc"); // subs rewrites the condition flags
}

// Four adds per iteration, each into a different destination register
// (%0..%3) with the common source %4, so no add depends on another add
// from the same iteration.
//
// niter is decremented before it is tested, so the loop body always runs at
// least once; a count of 0 would wrap around instead of skipping the loop.
static void micro_four_indep_adds(U32 niter)
{
  U32 a=0,b=0,c=0,d=0,e=0;
  __asm__ volatile(
          "1:\n"
          "add %0,%4\n"
          "add %1,%4\n"
          "add %2,%4\n"
          "add %3,%4\n"
          "subs %5,#1\n"
          "bne 1b\n"
          : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
          : // no input-only operands
          : "cc"); // subs rewrites the condition flags
}

// Four word stores per iteration to byte offsets 0, 4, 8 and 12 of a local
// U32 array.
//
// niter is decremented before it is tested, so the loop body always runs at
// least once; a count of 0 would wrap around instead of skipping the loop.
static void micro_aligned_store(U32 niter)
{
  U32 buf[4];
  U32 *e = buf; // base address used by every store (%4)
  U32 a=0,b=0,c=0,d=0;
  __asm__ volatile(
          "1:\n"
          "str %0,[%4,#0]\n"
          "str %1,[%4,#4]\n"
          "str %2,[%4,#8]\n"
          "str %3,[%4,#12]\n"
          "subs %5,#1\n"
          "bne 1b\n"
          : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
          : // no input-only operands
          : "cc", "memory"); // subs rewrites the flags; str writes buf behind the compiler's back
}

// Four word stores per iteration to byte offsets 1, 5, 9 and 13 of a local
// U32 array, i.e. each store is shifted one byte from the start of an element.
// buf has one element more than in micro_aligned_store.
//
// niter is decremented before it is tested, so the loop body always runs at
// least once; a count of 0 would wrap around instead of skipping the loop.
static void micro_unaligned_store(U32 niter)
{
  U32 buf[5];
  U32 *e = buf; // base address used by every store (%4)
  U32 a=0,b=0,c=0,d=0;
  __asm__ volatile(
          "1:\n"
          "str %0,[%4,#1]\n"
          "str %1,[%4,#5]\n"
          "str %2,[%4,#9]\n"
          "str %3,[%4,#13]\n"
          "subs %5,#1\n"
          "bne 1b\n"
          : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
          : // no input-only operands
          : "cc", "memory"); // subs rewrites the flags; str writes buf behind the compiler's back
}

// Two store/load pairs per iteration on the first word of a local U32 array:
// a word is stored from %0 and the same word is loaded straight back into %0,
// so every store depends on the load before it.
// ("stlf" presumably stands for store-to-load forwarding.)
//
// niter is decremented before it is tested, so the loop body always runs at
// least once; a count of 0 would wrap around instead of skipping the loop.
static void micro_aligned_full_stlf(U32 niter)
{
  U32 buf[4];
  U32 *e = buf; // base address of the word that is stored and reloaded (%4)
  // b, c and d are unused by the template but kept in the operand list,
  // matching the other kernels.
  U32 a=0,b=0,c=0,d=0;
  __asm__ volatile(
          "1:\n"
          "str %0,[%4,#0]\n"
          "ldr %0,[%4,#0]\n"
          "str %0,[%4,#0]\n"
          "ldr %0,[%4,#0]\n"
          "subs %5,#1\n"
          "bne 1b\n"
          : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
          : // no input-only operands
          : "cc", "memory"); // subs rewrites the flags; str/ldr access buf behind the compiler's back
}

// Two store/load pairs per iteration on the start of a local U32 array:
// a full word is stored from %0, then only a single byte (ldrb) is loaded
// back from the same address into %0, which feeds the next store.
// ("stlf" presumably stands for store-to-load forwarding.)
//
// niter is decremented before it is tested, so the loop body always runs at
// least once; a count of 0 would wrap around instead of skipping the loop.
static void micro_aligned_partial_stlf(U32 niter)
{
  U32 buf[4];
  U32 *e = buf; // base address of the location that is stored and reloaded (%4)
  // b, c and d are unused by the template but kept in the operand list,
  // matching the other kernels.
  U32 a=0,b=0,c=0,d=0;
  __asm__ volatile(
          "1:\n"
          "str %0,[%4,#0]\n"
          "ldrb %0,[%4,#0]\n"
          "str %0,[%4,#0]\n"
          "ldrb %0,[%4,#0]\n"
          "subs %5,#1\n"
          "bne 1b\n"
          : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
          : // no input-only operands
          : "cc", "memory"); // subs rewrites the flags; str/ldrb access buf behind the compiler's back
}

// Same store/load-back pattern as micro_aligned_full_stlf, but the word lives
// at byte offset 1 of a local U32 array instead of offset 0.
// ("stlf" presumably stands for store-to-load forwarding.)
//
// niter is decremented before it is tested, so the loop body always runs at
// least once; a count of 0 would wrap around instead of skipping the loop.
static void micro_unaligned_full_stlf(U32 niter)
{
  U32 buf[5];
  U32 *e = buf; // base address; the accessed word starts one byte past it (%4)
  // b, c and d are unused by the template but kept in the operand list,
  // matching the other kernels.
  U32 a=0,b=0,c=0,d=0;
  __asm__ volatile(
          "1:\n"
          "str %0,[%4,#1]\n"
          "ldr %0,[%4,#1]\n"
          "str %0,[%4,#1]\n"
          "ldr %0,[%4,#1]\n"
          "subs %5,#1\n"
          "bne 1b\n"
          : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
          : // no input-only operands
          : "cc", "memory"); // subs rewrites the flags; str/ldr access buf behind the compiler's back
}

// Same word-store / byte-load-back pattern as micro_aligned_partial_stlf, but
// at byte offset 1 of a local U32 array instead of offset 0.
// ("stlf" presumably stands for store-to-load forwarding.)
//
// niter is decremented before it is tested, so the loop body always runs at
// least once; a count of 0 would wrap around instead of skipping the loop.
static void micro_unaligned_partial_stlf(U32 niter)
{
  U32 buf[5];
  U32 *e = buf; // base address; the accessed location starts one byte past it (%4)
  // b, c and d are unused by the template but kept in the operand list,
  // matching the other kernels.
  U32 a=0,b=0,c=0,d=0;
  __asm__ volatile(
          "1:\n"
          "str %0,[%4,#1]\n"
          "ldrb %0,[%4,#1]\n"
          "str %0,[%4,#1]\n"
          "ldrb %0,[%4,#1]\n"
          "subs %5,#1\n"
          "bne 1b\n"
          : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
          : // no input-only operands
          : "cc", "memory"); // subs rewrites the flags; str/ldrb access buf behind the compiler's back
}

// Times one benchmark kernel and logs its cost per iteration.
//   name - label printed at the start of the log line
//   fn   - kernel to measure; called once untimed, then once timed
// The cycle figure in the log is computed from the hard-coded clock rate
// below; it is not measured.
static void arm_microbench(const char *name, testfunc *fn)
{
  const int iterations = 1000000000; // 1 billion
  const double ghz = 1.65;

  // Untimed warm-up pass at a quarter of the full count.
  fn(iterations / 4);

  U64 t0 = timer();
  fn(iterations);
  U64 t1 = timer();

  double elapsed_ns = duration_in_s(t0, t1) * 1e9;
  double ns_each = elapsed_ns / (double)iterations;
  double cycles_each = ns_each * ghz;

  LOGI("%s: %.2f ns/iter (=%.2f cycles @ %.2fGHz)\n", name, ns_each, cycles_each, ghz);
}

## results.txt
[ExynosM1]      Starting test.
[ExynosM1]      nothing: 0.39 ns/iter (=1.01 cycles @ 2.6GHz)
[ExynosM1]      four_dep_adds: 1.54 ns/iter (=4.01 cycles @ 2.6GHz)
[ExynosM1]      four_indep_adds: 0.75 ns/iter (=1.94 cycles @ 2.6GHz)
[ExynosM1]      aligned_store: 1.54 ns/iter (=4.01 cycles @ 2.6GHz)
[ExynosM1]      unaligned_store: 2.31 ns/iter (=6.02 cycles @ 2.6GHz)
[ExynosM1]      aligned_full_stlf: 10.02 ns/iter (=26.05 cycles @ 2.6GHz)
[ExynosM1]      aligned_partial_stlf: 9.99 ns/iter (=25.98 cycles @ 2.6GHz)
[ExynosM1]      unaligned_full_stlf: 9.89 ns/iter (=25.71 cycles @ 2.6GHz)
[ExynosM1]      unaligned_partial_stlf: 9.59 ns/iter (=24.94 cycles @ 2.6GHz)
[ExynosM1]      Terminating.

[NvDenver]      nothing: 0.67 ns/iter (=1.68 cycles @ 2.5GHz)
[NvDenver]      four_dep_adds: 4.25 ns/iter (=10.62 cycles @ 2.5GHz)
[NvDenver]      four_indep_adds: 1.73 ns/iter (=4.34 cycles @ 2.5GHz)
[NvDenver]      aligned_store: 0.99 ns/iter (=2.47 cycles @ 2.5GHz)
[NvDenver]      unaligned_store: 0.99 ns/iter (=2.47 cycles @ 2.5GHz)
[NvDenver]      aligned_full_stlf: 0.79 ns/iter (=1.98 cycles @ 2.5GHz)
[NvDenver]      aligned_partial_stlf: 0.80 ns/iter (=2.01 cycles @ 2.5GHz)
[NvDenver]      unaligned_full_stlf: 0.79 ns/iter (=1.98 cycles @ 2.5GHz)
[NvDenver]      unaligned_partial_stlf: 0.79 ns/iter (=1.98 cycles @ 2.5GHz)

[QcomKryo]      nothing: 0.47 ns/iter (=1.01 cycles @ 2.15GHz)
[QcomKryo]      four_dep_adds: 1.87 ns/iter (=4.03 cycles @ 2.15GHz)
[QcomKryo]      four_indep_adds: 0.97 ns/iter (=2.08 cycles @ 2.15GHz)
[QcomKryo]      aligned_store: 1.87 ns/iter (=4.02 cycles @ 2.15GHz)
[QcomKryo]      unaligned_store: 1.87 ns/iter (=4.01 cycles @ 2.15GHz)
[QcomKryo]      aligned_full_stlf: 11.75 ns/iter (=25.26 cycles @ 2.15GHz)
[QcomKryo]      aligned_partial_stlf: 11.89 ns/iter (=25.57 cycles @ 2.15GHz)
[QcomKryo]      unaligned_full_stlf: 12.08 ns/iter (=25.98 cycles @ 2.15GHz)
[QcomKryo]      unaligned_partial_stlf: 11.85 ns/iter (=25.47 cycles @ 2.15GHz)

[QcomKrait]     nothing: 1.44 ns/iter (=2.16 cycles @ 1.50GHz)
[QcomKrait]     four_dep_adds: 5.06 ns/iter (=7.59 cycles @ 1.50GHz)
[QcomKrait]     four_indep_adds: 1.67 ns/iter (=2.50 cycles @ 1.50GHz)
[QcomKrait]     aligned_store: 2.66 ns/iter (=3.98 cycles @ 1.50GHz)
[QcomKrait]     unaligned_store: 2.65 ns/iter (=3.98 cycles @ 1.50GHz)
[QcomKrait]     aligned_full_stlf: 9.20 ns/iter (=13.81 cycles @ 1.50GHz)
[QcomKrait]     aligned_partial_stlf: 8.90 ns/iter (=13.34 cycles @ 1.50GHz)
[QcomKrait]     unaligned_full_stlf: 8.90 ns/iter (=13.35 cycles @ 1.50GHz)
[QcomKrait]     unaligned_partial_stlf: 9.04 ns/iter (=13.57 cycles @ 1.50GHz)

[CortexA57]     nothing: 1.06 ns/iter (=2.01 cycles @ 1.90GHz)
[CortexA57]     four_dep_adds: 2.12 ns/iter (=4.03 cycles @ 1.90GHz)
[CortexA57]     four_indep_adds: 1.97 ns/iter (=3.74 cycles @ 1.90GHz)
[CortexA57]     aligned_store: 3.02 ns/iter (=5.74 cycles @ 1.90GHz)
[CortexA57]     unaligned_store: 3.38 ns/iter (=6.43 cycles @ 1.90GHz)
[CortexA57]     aligned_full_stlf: 6.35 ns/iter (=12.07 cycles @ 1.90GHz)
[CortexA57]     aligned_partial_stlf: 7.02 ns/iter (=13.35 cycles @ 1.90GHz)
[CortexA57]     unaligned_full_stlf: 6.75 ns/iter (=12.83 cycles @ 1.90GHz)
[CortexA57]     unaligned_partial_stlf: 6.56 ns/iter (=12.47 cycles @ 1.90GHz)

[CortexA15]     nothing: 1.24 ns/iter (=2.04 cycles @ 1.65GHz)
[CortexA15]     four_dep_adds: 2.43 ns/iter (=4.02 cycles @ 1.65GHz)
[CortexA15]     four_indep_adds: 1.77 ns/iter (=2.93 cycles @ 1.65GHz)
[CortexA15]     aligned_store: 2.44 ns/iter (=4.03 cycles @ 1.65GHz)
[CortexA15]     unaligned_store: 3.00 ns/iter (=4.94 cycles @ 1.65GHz)
[CortexA15]     aligned_full_stlf: 7.93 ns/iter (=13.09 cycles @ 1.65GHz)
[CortexA15]     aligned_partial_stlf: 7.19 ns/iter (=11.86 cycles @ 1.65GHz)
[CortexA15]     unaligned_full_stlf: 8.10 ns/iter (=13.37 cycles @ 1.65GHz)
[CortexA15]     unaligned_partial_stlf: 7.64 ns/iter (=12.60 cycles @ 1.65GHz)

---- For reference: Apple A8X

[AppleA8X]      nothing: 0.70 ns/iter (=1.06 cycles @ 1.50GHz)
[AppleA8X]      four_dep_adds: 2.72 ns/iter (=4.08 cycles @ 1.50GHz)
[AppleA8X]      four_indep_adds: 0.87 ns/iter (=1.30 cycles @ 1.50GHz)
[AppleA8X]      aligned_store: 1.62 ns/iter (=2.44 cycles @ 1.50GHz)
[AppleA8X]      unaligned_store: 2.10 ns/iter (=3.14 cycles @ 1.50GHz)
[AppleA8X]      aligned_full_stlf: 7.55 ns/iter (=11.32 cycles @ 1.50GHz)
[AppleA8X]      aligned_partial_stlf: 7.36 ns/iter (=11.05 cycles @ 1.50GHz)
[AppleA8X]      unaligned_full_stlf: 7.45 ns/iter (=11.17 cycles @ 1.50GHz)
[AppleA8X]      unaligned_partial_stlf: 7.45 ns/iter (=11.17 cycles @ 1.50GHz)
	// values in nanoseconds
	static U64 timer()
	{
	struct timespec ts;
	if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0)
	return 0;

	return (U64)ts.tv_sec * 1000000000 + (U64)ts.tv_nsec;
	}

	// Converts the gap between two nanosecond readings into seconds.
	// The difference is taken in integer arithmetic before the conversion to double.
	static double duration_in_s(U64 start, U64 end)
	{
	  const double ns_per_s = 1.0e9;
	  U64 elapsed_ns = end - start;
	  return (double)elapsed_ns / ns_per_s;
	}



	// Function type of a benchmark kernel: takes the number of loop iterations
	// to run and returns nothing.
	typedef void testfunc(U32 niter);

	// Baseline kernel: the loop body is only the counter decrement and the
	// backwards branch that every other kernel in this file also ends with.
	//
	// niter is decremented before it is tested, so the loop body always runs at
	// least once; a count of 0 would wrap around instead of skipping the loop.
	static void micro_nothing(U32 niter)
	{
	  // a..e are not referenced by the template but stay in the operand list,
	  // matching the other kernels (presumably to keep register setup identical
	  // across tests - confirm before trimming).
	  U32 a=0,b=0,c=0,d=0,e=0;
	  __asm__ volatile(
	          "1:\n"
	          "subs %5,#1\n"
	          "bne 1b\n"
	          : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
	          : // no input-only operands
	          : "cc"); // subs rewrites the condition flags
	}

	// Four adds per iteration that all accumulate into the same register (%0),
	// so each add consumes the result of the one before it.
	//
	// niter is decremented before it is tested, so the loop body always runs at
	// least once; a count of 0 would wrap around instead of skipping the loop.
	static void micro_four_dep_adds(U32 niter)
	{
	  // b, c and d are unused by the template but kept in the operand list,
	  // matching the other kernels.
	  U32 a=0,b=0,c=0,d=0,e=0;
	  __asm__ volatile(
	          "1:\n"
	          "add %0,%4\n"
	          "add %0,%4\n"
	          "add %0,%4\n"
	          "add %0,%4\n"
	          "subs %5,#1\n"
	          "bne 1b\n"
	          : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
	          : // no input-only operands
	          : "cc"); // subs rewrites the condition flags
	}

	// Four adds per iteration, each into a different destination register
	// (%0..%3) with the common source %4, so no add depends on another add
	// from the same iteration.
	//
	// niter is decremented before it is tested, so the loop body always runs at
	// least once; a count of 0 would wrap around instead of skipping the loop.
	static void micro_four_indep_adds(U32 niter)
	{
	  U32 a=0,b=0,c=0,d=0,e=0;
	  __asm__ volatile(
	          "1:\n"
	          "add %0,%4\n"
	          "add %1,%4\n"
	          "add %2,%4\n"
	          "add %3,%4\n"
	          "subs %5,#1\n"
	          "bne 1b\n"
	          : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
	          : // no input-only operands
	          : "cc"); // subs rewrites the condition flags
	}

	// Four word stores per iteration to byte offsets 0, 4, 8 and 12 of a local
	// U32 array.
	//
	// niter is decremented before it is tested, so the loop body always runs at
	// least once; a count of 0 would wrap around instead of skipping the loop.
	static void micro_aligned_store(U32 niter)
	{
	  U32 buf[4];
	  U32 *e = buf; // base address used by every store (%4)
	  U32 a=0,b=0,c=0,d=0;
	  __asm__ volatile(
	          "1:\n"
	          "str %0,[%4,#0]\n"
	          "str %1,[%4,#4]\n"
	          "str %2,[%4,#8]\n"
	          "str %3,[%4,#12]\n"
	          "subs %5,#1\n"
	          "bne 1b\n"
	          : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
	          : // no input-only operands
	          : "cc", "memory"); // subs rewrites the flags; str writes buf behind the compiler's back
	}

	// Four word stores per iteration to byte offsets 1, 5, 9 and 13 of a local
	// U32 array, i.e. each store is shifted one byte from the start of an element.
	// buf has one element more than in micro_aligned_store.
	//
	// niter is decremented before it is tested, so the loop body always runs at
	// least once; a count of 0 would wrap around instead of skipping the loop.
	static void micro_unaligned_store(U32 niter)
	{
	  U32 buf[5];
	  U32 *e = buf; // base address used by every store (%4)
	  U32 a=0,b=0,c=0,d=0;
	  __asm__ volatile(
	          "1:\n"
	          "str %0,[%4,#1]\n"
	          "str %1,[%4,#5]\n"
	          "str %2,[%4,#9]\n"
	          "str %3,[%4,#13]\n"
	          "subs %5,#1\n"
	          "bne 1b\n"
	          : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
	          : // no input-only operands
	          : "cc", "memory"); // subs rewrites the flags; str writes buf behind the compiler's back
	}

	// Two store/load pairs per iteration on the first word of a local U32 array:
	// a word is stored from %0 and the same word is loaded straight back into %0,
	// so every store depends on the load before it.
	// ("stlf" presumably stands for store-to-load forwarding.)
	//
	// niter is decremented before it is tested, so the loop body always runs at
	// least once; a count of 0 would wrap around instead of skipping the loop.
	static void micro_aligned_full_stlf(U32 niter)
	{
	  U32 buf[4];
	  U32 *e = buf; // base address of the word that is stored and reloaded (%4)
	  // b, c and d are unused by the template but kept in the operand list,
	  // matching the other kernels.
	  U32 a=0,b=0,c=0,d=0;
	  __asm__ volatile(
	          "1:\n"
	          "str %0,[%4,#0]\n"
	          "ldr %0,[%4,#0]\n"
	          "str %0,[%4,#0]\n"
	          "ldr %0,[%4,#0]\n"
	          "subs %5,#1\n"
	          "bne 1b\n"
	          : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
	          : // no input-only operands
	          : "cc", "memory"); // subs rewrites the flags; str/ldr access buf behind the compiler's back
	}

	// Two store/load pairs per iteration on the start of a local U32 array:
	// a full word is stored from %0, then only a single byte (ldrb) is loaded
	// back from the same address into %0, which feeds the next store.
	// ("stlf" presumably stands for store-to-load forwarding.)
	//
	// niter is decremented before it is tested, so the loop body always runs at
	// least once; a count of 0 would wrap around instead of skipping the loop.
	static void micro_aligned_partial_stlf(U32 niter)
	{
	  U32 buf[4];
	  U32 *e = buf; // base address of the location that is stored and reloaded (%4)
	  // b, c and d are unused by the template but kept in the operand list,
	  // matching the other kernels.
	  U32 a=0,b=0,c=0,d=0;
	  __asm__ volatile(
	          "1:\n"
	          "str %0,[%4,#0]\n"
	          "ldrb %0,[%4,#0]\n"
	          "str %0,[%4,#0]\n"
	          "ldrb %0,[%4,#0]\n"
	          "subs %5,#1\n"
	          "bne 1b\n"
	          : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
	          : // no input-only operands
	          : "cc", "memory"); // subs rewrites the flags; str/ldrb access buf behind the compiler's back
	}

	// Two store/load pairs per iteration on a word at byte offset 1 of a local
	// U32 array: the word is stored from %0 and loaded straight back into %0,
	// so every store depends on the load before it.
	// ("stlf" presumably stands for store-to-load forwarding.)
	//
	// niter is decremented before it is tested, so the loop body always runs at
	// least once; a count of 0 would wrap around instead of skipping the loop.
	static void micro_unaligned_full_stlf(U32 niter)
	{
	  U32 buf[5];
	  U32 *e = buf; // base address; the accessed word starts one byte past it (%4)
	  // b, c and d are unused by the template but kept in the operand list,
	  // matching the other kernels.
	  U32 a=0,b=0,c=0,d=0;
	  __asm__ volatile(
	          "1:\n"
	          "str %0,[%4,#1]\n"
	          "ldr %0,[%4,#1]\n"
	          "str %0,[%4,#1]\n"
	          "ldr %0,[%4,#1]\n"
	          "subs %5,#1\n"
	          "bne 1b\n"
	          : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
	          : // no input-only operands
	          : "cc", "memory"); // subs rewrites the flags; str/ldr access buf behind the compiler's back
	}

	// Two store/load pairs per iteration at byte offset 1 of a local U32 array:
	// a full word is stored from %0, then only a single byte (ldrb) is loaded
	// back from the same address into %0, which feeds the next store.
	// ("stlf" presumably stands for store-to-load forwarding.)
	//
	// niter is decremented before it is tested, so the loop body always runs at
	// least once; a count of 0 would wrap around instead of skipping the loop.
	static void micro_unaligned_partial_stlf(U32 niter)
	{
	  U32 buf[5];
	  U32 *e = buf; // base address; the accessed location starts one byte past it (%4)
	  // b, c and d are unused by the template but kept in the operand list,
	  // matching the other kernels.
	  U32 a=0,b=0,c=0,d=0;
	  __asm__ volatile(
	          "1:\n"
	          "str %0,[%4,#1]\n"
	          "ldrb %0,[%4,#1]\n"
	          "str %0,[%4,#1]\n"
	          "ldrb %0,[%4,#1]\n"
	          "subs %5,#1\n"
	          "bne 1b\n"
	          : "+r"(a), "+r"(b), "+r"(c), "+r"(d), "+r"(e), "+r"(niter)
	          : // no input-only operands
	          : "cc", "memory"); // subs rewrites the flags; str/ldrb access buf behind the compiler's back
	}

	// Times one benchmark kernel and logs its cost per iteration.
	//   name - NUL-terminated label printed at the start of the log line; it
	//          must be a string pointer because it is consumed by "%s"
	//   fn   - kernel to measure; called once untimed, then once timed
	// The cycle figure in the log is computed from the hard-coded
	// clock_rate_ghz below; it is not measured.
	static void arm_microbench(const char *name, testfunc *fn)
	{
	  int nIter = 1000000000; // 1 billion

	  // warm up: untimed pass at a quarter of the full count
	  fn(nIter / 4);

	  U64 startt = timer();
	  fn(nIter);
	  U64 endt = timer();
	  double secs = duration_in_s(startt, endt);
	  double clock_rate_ghz = 1.65;
	  double ns_per_iter = (secs * 1e9) / (double)nIter;
	  double clocks_per_iter = ns_per_iter * clock_rate_ghz;

	  LOGI("%s: %.2f ns/iter (=%.2f cycles @ %.2fGHz)\n", name, ns_per_iter, clocks_per_iter, clock_rate_ghz);
	}
	[ExynosM1] Starting test.
	[ExynosM1] nothing: 0.39 ns/iter (=1.01 cycles @ 2.6GHz)
	[ExynosM1] four_dep_adds: 1.54 ns/iter (=4.01 cycles @ 2.6GHz)
	[ExynosM1] four_indep_adds: 0.75 ns/iter (=1.94 cycles @ 2.6GHz)
	[ExynosM1] aligned_store: 1.54 ns/iter (=4.01 cycles @ 2.6GHz)
	[ExynosM1] unaligned_store: 2.31 ns/iter (=6.02 cycles @ 2.6GHz)
	[ExynosM1] aligned_full_stlf: 10.02 ns/iter (=26.05 cycles @ 2.6GHz)
	[ExynosM1] aligned_partial_stlf: 9.99 ns/iter (=25.98 cycles @ 2.6GHz)
	[ExynosM1] unaligned_full_stlf: 9.89 ns/iter (=25.71 cycles @ 2.6GHz)
	[ExynosM1] unaligned_partial_stlf: 9.59 ns/iter (=24.94 cycles @ 2.6GHz)
	[ExynosM1] Terminating.

	[NvDenver] nothing: 0.67 ns/iter (=1.68 cycles @ 2.5GHz)
	[NvDenver] four_dep_adds: 4.25 ns/iter (=10.62 cycles @ 2.5GHz)
	[NvDenver] four_indep_adds: 1.73 ns/iter (=4.34 cycles @ 2.5GHz)
	[NvDenver] aligned_store: 0.99 ns/iter (=2.47 cycles @ 2.5GHz)
	[NvDenver] unaligned_store: 0.99 ns/iter (=2.47 cycles @ 2.5GHz)
	[NvDenver] aligned_full_stlf: 0.79 ns/iter (=1.98 cycles @ 2.5GHz)
	[NvDenver] aligned_partial_stlf: 0.80 ns/iter (=2.01 cycles @ 2.5GHz)
	[NvDenver] unaligned_full_stlf: 0.79 ns/iter (=1.98 cycles @ 2.5GHz)
	[NvDenver] unaligned_partial_stlf: 0.79 ns/iter (=1.98 cycles @ 2.5GHz)

	[QcomKryo] nothing: 0.47 ns/iter (=1.01 cycles @ 2.15GHz)
	[QcomKryo] four_dep_adds: 1.87 ns/iter (=4.03 cycles @ 2.15GHz)
	[QcomKryo] four_indep_adds: 0.97 ns/iter (=2.08 cycles @ 2.15GHz)
	[QcomKryo] aligned_store: 1.87 ns/iter (=4.02 cycles @ 2.15GHz)
	[QcomKryo] unaligned_store: 1.87 ns/iter (=4.01 cycles @ 2.15GHz)
	[QcomKryo] aligned_full_stlf: 11.75 ns/iter (=25.26 cycles @ 2.15GHz)
	[QcomKryo] aligned_partial_stlf: 11.89 ns/iter (=25.57 cycles @ 2.15GHz)
	[QcomKryo] unaligned_full_stlf: 12.08 ns/iter (=25.98 cycles @ 2.15GHz)
	[QcomKryo] unaligned_partial_stlf: 11.85 ns/iter (=25.47 cycles @ 2.15GHz)

	[QcomKrait] nothing: 1.44 ns/iter (=2.16 cycles @ 1.50GHz)
	[QcomKrait] four_dep_adds: 5.06 ns/iter (=7.59 cycles @ 1.50GHz)
	[QcomKrait] four_indep_adds: 1.67 ns/iter (=2.50 cycles @ 1.50GHz)
	[QcomKrait] aligned_store: 2.66 ns/iter (=3.98 cycles @ 1.50GHz)
	[QcomKrait] unaligned_store: 2.65 ns/iter (=3.98 cycles @ 1.50GHz)
	[QcomKrait] aligned_full_stlf: 9.20 ns/iter (=13.81 cycles @ 1.50GHz)
	[QcomKrait] aligned_partial_stlf: 8.90 ns/iter (=13.34 cycles @ 1.50GHz)
	[QcomKrait] unaligned_full_stlf: 8.90 ns/iter (=13.35 cycles @ 1.50GHz)
	[QcomKrait] unaligned_partial_stlf: 9.04 ns/iter (=13.57 cycles @ 1.50GHz)

	[CortexA57] nothing: 1.06 ns/iter (=2.01 cycles @ 1.90GHz)
	[CortexA57] four_dep_adds: 2.12 ns/iter (=4.03 cycles @ 1.90GHz)
	[CortexA57] four_indep_adds: 1.97 ns/iter (=3.74 cycles @ 1.90GHz)
	[CortexA57] aligned_store: 3.02 ns/iter (=5.74 cycles @ 1.90GHz)
	[CortexA57] unaligned_store: 3.38 ns/iter (=6.43 cycles @ 1.90GHz)
	[CortexA57] aligned_full_stlf: 6.35 ns/iter (=12.07 cycles @ 1.90GHz)
	[CortexA57] aligned_partial_stlf: 7.02 ns/iter (=13.35 cycles @ 1.90GHz)
	[CortexA57] unaligned_full_stlf: 6.75 ns/iter (=12.83 cycles @ 1.90GHz)
	[CortexA57] unaligned_partial_stlf: 6.56 ns/iter (=12.47 cycles @ 1.90GHz)

	[CortexA15] nothing: 1.24 ns/iter (=2.04 cycles @ 1.65GHz)
	[CortexA15] four_dep_adds: 2.43 ns/iter (=4.02 cycles @ 1.65GHz)
	[CortexA15] four_indep_adds: 1.77 ns/iter (=2.93 cycles @ 1.65GHz)
	[CortexA15] aligned_store: 2.44 ns/iter (=4.03 cycles @ 1.65GHz)
	[CortexA15] unaligned_store: 3.00 ns/iter (=4.94 cycles @ 1.65GHz)
	[CortexA15] aligned_full_stlf: 7.93 ns/iter (=13.09 cycles @ 1.65GHz)
	[CortexA15] aligned_partial_stlf: 7.19 ns/iter (=11.86 cycles @ 1.65GHz)
	[CortexA15] unaligned_full_stlf: 8.10 ns/iter (=13.37 cycles @ 1.65GHz)
	[CortexA15] unaligned_partial_stlf: 7.64 ns/iter (=12.60 cycles @ 1.65GHz)

	---- For reference: Apple A8X

	[AppleA8X] nothing: 0.70 ns/iter (=1.06 cycles @ 1.50GHz)
	[AppleA8X] four_dep_adds: 2.72 ns/iter (=4.08 cycles @ 1.50GHz)
	[AppleA8X] four_indep_adds: 0.87 ns/iter (=1.30 cycles @ 1.50GHz)
	[AppleA8X] aligned_store: 1.62 ns/iter (=2.44 cycles @ 1.50GHz)
	[AppleA8X] unaligned_store: 2.10 ns/iter (=3.14 cycles @ 1.50GHz)
	[AppleA8X] aligned_full_stlf: 7.55 ns/iter (=11.32 cycles @ 1.50GHz)
	[AppleA8X] aligned_partial_stlf: 7.36 ns/iter (=11.05 cycles @ 1.50GHz)
	[AppleA8X] unaligned_full_stlf: 7.45 ns/iter (=11.17 cycles @ 1.50GHz)
	[AppleA8X] unaligned_partial_stlf: 7.45 ns/iter (=11.17 cycles @ 1.50GHz)