// Cortex-A57 fun.

#include <stdint.h>
#include <arm_neon.h>

static void micro_nothing(void *context, int nreps)
{
// Benchmarks at 2.0 cycles/iter. (The measurement harness isn't part of this
// gist; a sketch of one follows this function.)
//
// Apparently, the core can't retire more than one taken branch every 2 clocks.
// (Or, more precisely, it probably can't retire _the same branch_
// more than once every 2 clocks.)
__asm__ volatile(
".align 4\n"
"1:\n"
"subs %w0,%w0,#1\n"
"bne 1b\n"
: "+r"(nreps));
}
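// Not part of the original gist: a minimal sketch of the kind of harness the
// cycles/iter numbers in these comments could come from. CPU_HZ,
// report_cycles_per_iter and micro_fn are placeholder names I made up, and a
// real harness would pin the thread, fix the core clock, and repeat the
// measurement; treat this as illustrative only.
#include <stdio.h>
#include <time.h>

#define CPU_HZ 2.0e9 /* placeholder: the benchmarked core's fixed clock rate */

typedef void (*micro_fn)(void *context, int nreps);

static void report_cycles_per_iter(const char *name, micro_fn fn, int nreps)
{
    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    fn(NULL, nreps);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    double secs = (double)(t1.tv_sec - t0.tv_sec) + 1e-9 * (t1.tv_nsec - t0.tv_nsec);
    printf("%s: %.2f cycles/iter\n", name, secs * CPU_HZ / nreps);
}
// e.g. report_cycles_per_iter("micro_nothing", micro_nothing, 1 << 26);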
static void micro_four_indep_adds(void *context, int nreps)
{
int64_t a=0, b=0, c=0, d=0;
// Benchmarks at 2.5 cycles/iter. As per dispatch limits, it should go:
//
// c0: add 1; add 2 (2x I01)
// c1: add 3; add 4 (2x I01)
// c2: subs; bne; add 1 (2x I01, B)
// c3: add 2; add 3 (2x I01)
// c4: add 4; subs; bne (2x I01, B)
//
// 5 cycles for every two iters, so that one checks out
// (see the issue-model sketch after micro_four_indep_adds2).
__asm__ volatile(
".align 4\n"
"1:\n"
"add %1,%1,#1\n"
"add %2,%2,#1\n"
"add %3,%3,#1\n"
"add %4,%4,#1\n"
"subs %w0,%w0,#1\n"
"bne 1b\n"
: "+r"(nreps), "+r"(a), "+r"(b), "+r"(c), "+r"(d));
}
static void micro_four_indep_adds2(void *context, int nreps)
{
int64_t a=0, b=0, c=0, d=0;
int32_t magic=1;
// Benchmarks at 2.0 cycles/iter, as expected by dispatch limits:
//
// c0: add 1; add 2; add 3 (2x I01, M)
// c1: add 4; subs; bne (2x I01, B)
__asm__ volatile(
".align 4\n"
"1:\n"
"add %1,%1,#1\n"
"add %2,%2,%w5,uxtw\n"
"add %3,%3,#1\n"
"add %4,%4,#1\n"
"subs %w0,%w0,#1\n"
"bne 1b\n"
: "+r"(nreps), "+r"(a), "+r"(b), "+r"(c), "+r"(d)
: "r"(magic));
}
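// Not part of the original gist: a tiny sketch of the per-cycle issue model
// the schedules in these comments assume (2x I0/I1 ALU slots, 1x M slot,
// 1x B slot, 2x F0/F1 slots, and at most one taken branch retired per
// 2 clocks). This is my paraphrase of the reasoning above, not a validated
// A57 model, and it ignores the NEON writeback-port limit discussed further
// down; uop_mix and model_cycles_per_iter are made-up names.
struct uop_mix { int i01, m, b, f01; };

static double model_cycles_per_iter(struct uop_mix u)
{
    double bound = u.i01 / 2.0;                   // two integer ALU pipes
    if (u.m > bound) bound = u.m;                 // one multi-cycle pipe
    if (u.f01 / 2.0 > bound) bound = u.f01 / 2.0; // two NEON pipes
    if (u.b * 2.0 > bound) bound = u.b * 2.0;     // >= 2 clocks per taken branch
    return bound;
}
// micro_nothing:          {i01=1, b=1}      -> 2.0, observed 2.0
// micro_four_indep_adds:  {i01=5, b=1}      -> 2.5, observed 2.5
// micro_four_indep_adds2: {i01=4, m=1, b=1} -> 2.0, observed 2.0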
static void micro_uzp1_q(void *context, int nreps)
{
uint8x16_t n0, n1, n2, n3, n4, n5, n6, n7;
n0 = n1 = n2 = n3 = n4 = n5 = n6 = n7 = vdupq_n_u8(0);
// Benchmarks at 5.67 cycles/iter, and I don't understand why.
//
// From the docs, I would expect:
//
// c0: uzp1 1 / uzp1 2 / subs (I01, 2x F01)
// c1: uzp1 3 / uzp1 4 (2x F01)
// c2: uzp1 5 / uzp1 6 (2x F01)
// c3: uzp1 7 / uzp1 8 / bne (2x F01, B)
//
// NOTE: ARM slides say 3x 64-bit writeback ports. That would mean something like
// the schedule below. (The placement of the integer ops in here isn't exact;
// where exactly they go depends on how far the frontend forges ahead, but it
// doesn't matter much.)
//
// c0: uzp1 1 (F0) / uzp1 2 first half (F1) / subs (I0)
// c1: uzp1 3 (F0) / uzp1 2 second half (F1)
// c2: uzp1 4 (F0) / uzp1 5 first half (F1)
// c3: uzp1 6 (F0) / uzp1 5 second half (F1)
// c4: uzp1 7 (F0) / uzp1 8 first half (F1)
// c5: uzp1 8 second half (F1) / bne (B) / uzp1 1 (F0)
// c6: uzp1 2 (F0) / subs (I0) / uzp1 3 first half (F1)
// c7: uzp1 4 (F0) / uzp1 3 second half (F1)
// c8: uzp1 5 (F0) / uzp1 6 first half (F1)
// c9: uzp1 7 (F0) / uzp1 6 second half (F1)
// c10: uzp1 8 (F0) / bne (B) / uzp1 1 first half (F1)
// c11: uzp1 2 (F0) / uzp1 1 second half (F1) / subs (I0)
// c12: uzp1 3 (F0) / uzp1 4 first half (F1)
// c13: uzp1 5 (F0) / uzp1 4 second half (F1)
// c14: uzp1 6 (F0) / uzp1 7 first half (F1)
// c15: uzp1 8 (F0) / uzp1 7 second half (F1) / bne (B)
//
// (and after that it repeats)
// gives *16* cycles for 3 iters = 16/3 = 5.33 ... not what happens!
//
// However, the variant where a branch is always the last uop on
// that cycle gives:
//
// c0: uzp1 1 (F0) / uzp1 2 first half (F1) / subs (I0)
// c1: uzp1 3 (F0) / uzp1 2 second half (F1)
// c2: uzp1 4 (F0) / uzp1 5 first half (F1)
// c3: uzp1 6 (F0) / uzp1 5 second half (F1)
// c4: uzp1 7 (F0) / uzp1 8 first half (F1)
// c5: uzp1 8 second half (F1) / bne (B)
// (repeats after that)
//
// for 6.00 cycles per iter, which is too slow by 0.33. As I said, I don't get it.
//
// Maybe the uops going to F0/F1 are load-balanced in advance with no accounting
// for the WB port conflict stall cycles, and then the distribution ends up
// lopsided? It's really hard to say without a more detailed pipeline
// description. (The plain writeback-port bound is written out in code right
// after this function.)
__asm__ volatile(
".align 4\n"
"1:\n"
"uzp1 %1.16B, %1.16B, %1.16B\n"
"uzp1 %2.16B, %2.16B, %2.16B\n"
"subs %w0,%w0,#1\n"
"uzp1 %3.16B, %3.16B, %3.16B\n"
"uzp1 %4.16B, %4.16B, %4.16B\n"
"uzp1 %5.16B, %5.16B, %5.16B\n"
"uzp1 %6.16B, %6.16B, %6.16B\n"
"uzp1 %7.16B, %7.16B, %7.16B\n"
"uzp1 %8.16B, %8.16B, %8.16B\n"
"bne 1b\n"
: "+r"(nreps), "+w"(n0), "+w"(n1), "+w"(n2), "+w"(n3), "+w"(n4), "+w"(n5), "+w"(n6), "+w"(n7));
}
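// Not part of the original gist: the writeback-port arithmetic from the
// comment above, written out. It assumes 2x F0/F1 issue slots and 3x 64-bit
// NEON writeback ports (per the ARM slides), with a 128-bit result counting
// as two 64-bit writebacks; neon_wb_bound is a made-up name.
static double neon_wb_bound(int q_ops, int d_ops)
{
    double issue_bound = (q_ops + d_ops) / 2.0;    // 2 NEON issue slots/cycle
    double wb_bound = (2.0 * q_ops + d_ops) / 3.0; // 3x 64-bit writebacks/cycle
    return issue_bound > wb_bound ? issue_bound : wb_bound;
}
// micro_uzp1_q (8x Q form):        neon_wb_bound(8, 0) = 5.33, observed 5.67
// micro_uzp1_d (8x D form, below): neon_wb_bound(0, 8) = 4.00, observed 4.00
// mixed Q/D tests below:           neon_wb_bound(4, 4) = 4.00, observed 4.33-4.57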
// (Other tests that use, say, "trn1" or "add" are elided; same result for all full-throughput 128b ops.)
static void micro_uzp1_q2(void *context, int nreps)
{
uint8x16_t n0, n1, n2, n3, n4, n5, n6, n7;
n0 = n1 = n2 = n3 = n4 = n5 = n6 = n7 = vdupq_n_u8(0);
// The idea is to test whether forwarding (the second use of each register
// consumes a result produced only a few instructions earlier) makes a
// difference and/or has more bandwidth. Not as far as I can tell: this
// benchmarks at 9.05 cycles/iter (for 12 unzips).
__asm__ volatile(
".align 4\n"
"1:\n"
"uzp1 %1.16B, %1.16B, %1.16B\n"
"uzp1 %2.16B, %2.16B, %2.16B\n"
"subs %w0,%w0,#1\n"
"uzp1 %3.16B, %3.16B, %3.16B\n"
"uzp1 %4.16B, %4.16B, %4.16B\n"
"uzp1 %5.16B, %5.16B, %5.16B\n"
"uzp1 %6.16B, %6.16B, %6.16B\n"
"uzp1 %1.16B, %1.16B, %1.16B\n"
"uzp1 %2.16B, %2.16B, %2.16B\n"
"uzp1 %3.16B, %3.16B, %3.16B\n"
"uzp1 %4.16B, %4.16B, %4.16B\n"
"uzp1 %5.16B, %5.16B, %5.16B\n"
"uzp1 %6.16B, %6.16B, %6.16B\n"
"bne 1b\n"
: "+r"(nreps), "+w"(n0), "+w"(n1), "+w"(n2), "+w"(n3), "+w"(n4), "+w"(n5), "+w"(n6), "+w"(n7));
}
static void micro_uzp1_d(void *context, int nreps)
{
uint8x16_t n0, n1, n2, n3, n4, n5, n6, n7;
n0 = n1 = n2 = n3 = n4 = n5 = n6 = n7 = vdupq_n_u8(0);
// Benchmarks at 4.00 cycles/iter, as I would expect:
//
// c0: uzp1 1 / uzp1 2 / subs (I01, 2x F01)
// c1: uzp1 3 / uzp1 4 (2x F01)
// c2: uzp1 5 / uzp1 6 (2x F01)
// c3: uzp1 7 / uzp1 8 / bne (2x F01, B)
__asm__ volatile(
".align 4\n"
"1:\n"
"uzp1 %1.8b, %1.8b, %1.8b\n"
"uzp1 %2.8b, %2.8b, %2.8b\n"
"subs %w0,%w0,#1\n"
"uzp1 %3.8b, %3.8b, %3.8b\n"
"uzp1 %4.8b, %4.8b, %4.8b\n"
"uzp1 %5.8b, %5.8b, %5.8b\n"
"uzp1 %6.8b, %6.8b, %6.8b\n"
"uzp1 %7.8b, %7.8b, %7.8b\n"
"uzp1 %8.8b, %8.8b, %8.8b\n"
"bne 1b\n"
: "+r"(nreps), "+w"(n0), "+w"(n1), "+w"(n2), "+w"(n3), "+w"(n4), "+w"(n5), "+w"(n6), "+w"(n7));
}
static void micro_uzp1_mix1(void *context, int nreps)
{
uint8x16_t n0, n1, n2, n3, n4, n5, n6, n7;
n0 = n1 = n2 = n3 = n4 = n5 = n6 = n7 = vdupq_n_u8(0);
// Benchmarks at 4.33 cycles/iter, not the 4.0 I would expect:
//
// c0: uzp1 1.16 / uzp1 2.8 / subs (I01, 2x F01)
// c1: uzp1 3.16 / uzp1 4.8 (2x F01)
// c2: uzp1 5.16 / uzp1 6.8 (2x F01)
// c3: uzp1 7.16 / uzp1 8.8 / bne (2x F01, B)
__asm__ volatile(
".align 4\n"
"1:\n"
"uzp1 %1.16b, %1.16b, %1.16b\n"
"uzp1 %2.8b, %2.8b, %2.8b\n"
"subs %w0,%w0,#1\n"
"uzp1 %3.16b, %3.16b, %3.16b\n"
"uzp1 %4.8b, %4.8b, %4.8b\n"
"uzp1 %5.16b, %5.16b, %5.16b\n"
"uzp1 %6.8b, %6.8b, %6.8b\n"
"uzp1 %7.16b, %7.16b, %7.16b\n"
"uzp1 %8.8b, %8.8b, %8.8b\n"
"bne 1b\n"
: "+r"(nreps), "+w"(n0), "+w"(n1), "+w"(n2), "+w"(n3), "+w"(n4), "+w"(n5), "+w"(n6), "+w"(n7));
}
static void micro_uzp1_mix2(void *context, int nreps)
{
uint8x16_t n0, n1, n2, n3, n4, n5, n6, n7;
n0 = n1 = n2 = n3 = n4 = n5 = n6 = n7 = vdupq_n_u8(0);
// Benchmarks at 4.57 cycles/iter! (??!)
__asm__ volatile(
".align 4\n"
"1:\n"
"uzp1 %1.16b, %1.16b, %1.16b\n"
"uzp1 %2.16b, %2.16b, %2.16b\n"
"uzp1 %3.16b, %3.16b, %3.16b\n"
"uzp1 %4.16b, %4.16b, %4.16b\n"
"uzp1 %5.8b, %5.8b, %5.8b\n"
"uzp1 %6.8b, %6.8b, %6.8b\n"
"uzp1 %7.8b, %7.8b, %7.8b\n"
"uzp1 %8.8b, %8.8b, %8.8b\n"
"subs %w0,%w0,#1\n"
"bne 1b\n"
: "+r"(nreps), "+w"(n0), "+w"(n1), "+w"(n2), "+w"(n3), "+w"(n4), "+w"(n5), "+w"(n6), "+w"(n7));
}
static void micro_uzp1_mix3(void *context, int nreps)
{
uint8x16_t n0, n1, n2, n3, n4, n5, n6, n7;
n0 = n1 = n2 = n3 = n4 = n5 = n6 = n7 = vdupq_n_u8(0);
// Benchmarks at 4.33 cycles/iter.
__asm__ volatile(
".align 4\n"
"1:\n"
"uzp1 %1.16b, %1.16b, %1.16b\n"
"uzp1 %2.16b, %2.16b, %2.16b\n"
"uzp1 %3.8b, %3.8b, %3.8b\n"
"uzp1 %4.8b, %4.8b, %4.8b\n"
"uzp1 %5.16b, %5.16b, %5.16b\n"
"uzp1 %6.16b, %6.16b, %6.16b\n"
"uzp1 %7.8b, %7.8b, %7.8b\n"
"uzp1 %8.8b, %8.8b, %8.8b\n"
"subs %w0,%w0,#1\n"
"bne 1b\n"
: "+r"(nreps), "+w"(n0), "+w"(n1), "+w"(n2), "+w"(n3), "+w"(n4), "+w"(n5), "+w"(n6), "+w"(n7));
}
static void micro_F0_128_F1_64(void *context, int nreps)
{
uint8x16_t n0, n1, n2, n3, n4, n5, n6, n7;
n0 = n1 = n2 = n3 = n4 = n5 = n6 = n7 = vdupq_n_u8(0);
// Benchmarks at 16.33 cycles/iter.
// (4.33 cycles/iter if I don't unroll the SIMD ops 4x.)
// Wherever those stray 0.33 cycles come from, it's per-iteration overhead.
// (The latency/throughput arithmetic is sketched in code after micro_F0_64_F1_128.)
// SMULL: 128b write, execs in F0 pipe, 1/cycle throughput, 4c latency
// SSHL (D form): 64b write, execs in F1, 1/cycle throughput, 3c latency
__asm__ volatile(
".align 4\n"
"1:\n"
"smull %1.8h, %1.8b, %1.8b\n"
"sshl %2.8b, %2.8b, %2.8b\n"
"smull %3.8h, %3.8b, %3.8b\n"
"sshl %4.8b, %4.8b, %4.8b\n"
"smull %5.8h, %5.8b, %5.8b\n"
"sshl %6.8b, %6.8b, %6.8b\n"
"smull %7.8h, %7.8b, %7.8b\n"
"sshl %8.8b, %8.8b, %8.8b\n"
"smull %1.8h, %1.8b, %1.8b\n"
"sshl %2.8b, %2.8b, %2.8b\n"
"smull %3.8h, %3.8b, %3.8b\n"
"sshl %4.8b, %4.8b, %4.8b\n"
"smull %5.8h, %5.8b, %5.8b\n"
"sshl %6.8b, %6.8b, %6.8b\n"
"smull %7.8h, %7.8b, %7.8b\n"
"sshl %8.8b, %8.8b, %8.8b\n"
"smull %1.8h, %1.8b, %1.8b\n"
"sshl %2.8b, %2.8b, %2.8b\n"
"smull %3.8h, %3.8b, %3.8b\n"
"sshl %4.8b, %4.8b, %4.8b\n"
"smull %5.8h, %5.8b, %5.8b\n"
"sshl %6.8b, %6.8b, %6.8b\n"
"smull %7.8h, %7.8b, %7.8b\n"
"sshl %8.8b, %8.8b, %8.8b\n"
"subs %w0,%w0,#1\n"
"smull %1.8h, %1.8b, %1.8b\n"
"sshl %2.8b, %2.8b, %2.8b\n"
"smull %3.8h, %3.8b, %3.8b\n"
"sshl %4.8b, %4.8b, %4.8b\n"
"smull %5.8h, %5.8b, %5.8b\n"
"sshl %6.8b, %6.8b, %6.8b\n"
"smull %7.8h, %7.8b, %7.8b\n"
"sshl %8.8b, %8.8b, %8.8b\n"
"bne 1b\n"
: "+r"(nreps), "+w"(n0), "+w"(n1), "+w"(n2), "+w"(n3), "+w"(n4), "+w"(n5), "+w"(n6), "+w"(n7));
}
static void micro_F0_64_F1_128(void *context, int nreps)
{
uint8x16_t n0, n1, n2, n3, n4, n5, n6, n7;
n0 = n1 = n2 = n3 = n4 = n5 = n6 = n7 = vdupq_n_u8(0);
// Benchmarks at 16.33 cycles/iter.
// (4.33 cycles/iter if I don't unroll the SIMD ops 4x.)
// Wherever those stray 0.33 cycles come from, it's per-iteration overhead.
// MUL (D form): 64b write, execs in F0 pipe, 1/cycle throughput, 4c latency
// SABAL: 128b write, execs in F1 pipe, 1/cycle throughput, 4c latency
__asm__ volatile(
".align 4\n"
"1:\n"
"mul %1.8b, %1.8b, %1.8b\n"
"sabal %2.8h, %2.8b, %2.8b\n"
"mul %3.8b, %3.8b, %3.8b\n"
"sabal %4.8h, %4.8b, %4.8b\n"
"mul %5.8b, %5.8b, %5.8b\n"
"sabal %6.8h, %6.8b, %6.8b\n"
"mul %7.8b, %7.8b, %7.8b\n"
"sabal %8.8h, %8.8b, %8.8b\n"
"mul %1.8b, %1.8b, %1.8b\n"
"sabal %2.8h, %2.8b, %2.8b\n"
"mul %3.8b, %3.8b, %3.8b\n"
"sabal %4.8h, %4.8b, %4.8b\n"
"mul %5.8b, %5.8b, %5.8b\n"
"sabal %6.8h, %6.8b, %6.8b\n"
"mul %7.8b, %7.8b, %7.8b\n"
"sabal %8.8h, %8.8b, %8.8b\n"
"mul %1.8b, %1.8b, %1.8b\n"
"sabal %2.8h, %2.8b, %2.8b\n"
"mul %3.8b, %3.8b, %3.8b\n"
"sabal %4.8h, %4.8b, %4.8b\n"
"mul %5.8b, %5.8b, %5.8b\n"
"sabal %6.8h, %6.8b, %6.8b\n"
"mul %7.8b, %7.8b, %7.8b\n"
"sabal %8.8h, %8.8b, %8.8b\n"
"subs %w0,%w0,#1\n"
"mul %1.8b, %1.8b, %1.8b\n"
"sabal %2.8h, %2.8b, %2.8b\n"
"mul %3.8b, %3.8b, %3.8b\n"
"sabal %4.8h, %4.8b, %4.8b\n"
"mul %5.8b, %5.8b, %5.8b\n"
"sabal %6.8h, %6.8b, %6.8b\n"
"mul %7.8b, %7.8b, %7.8b\n"
"sabal %8.8h, %8.8b, %8.8b\n"
"bne 1b\n"
: "+r"(nreps), "+w"(n0), "+w"(n1), "+w"(n2), "+w"(n3), "+w"(n4), "+w"(n5), "+w"(n6), "+w"(n7));
}
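// Not part of the original gist: the latency-vs-throughput arithmetic behind
// micro_F0_128_F1_64 and micro_F0_64_F1_128, spelled out. The latencies are
// the ones quoted in the comments above (SMULL/MUL/SABAL 4c, SSHL 3c); the
// rest is my model, and f0_f1_bound is a made-up name.
static double f0_f1_bound(int chain_len, int lat_f0, int lat_f1,
                          int f0_ops, int f1_ops)
{
    // Each register carries a dependent chain of chain_len back-to-back ops
    // per iteration, so the longest-latency chain sets the latency bound.
    double lat_bound = chain_len * (double)(lat_f0 > lat_f1 ? lat_f0 : lat_f1);
    // Each pipe executes its own ops at 1/cycle, so the busier pipe sets the
    // throughput bound.
    double tput_bound = (double)(f0_ops > f1_ops ? f0_ops : f1_ops);
    return lat_bound > tput_bound ? lat_bound : tput_bound;
}
// Unrolled 4x (as written above): f0_f1_bound(4, 4, 3, 16, 16) = 16, observed 16.33
// Without the 4x unroll:          f0_f1_bound(1, 4, 3,  4,  4) =  4, observed  4.33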