Cortex-A57 fun.
static void micro_nothing(void *context, int nreps)
{
    // Benchmarks at 2.0 cycles/iter.
    //
    // Apparently, can't retire more than one taken branch every 2 clocks.
    // (Or, more precisely, it probably can't retire _the same branch_
    // more than once every 2 clocks.)
    __asm__ volatile(
        ".align 4\n"
        "1:\n"
        "subs %w0,%w0,#1\n"
        "bne 1b\n"
        : "+r"(nreps));
}
static void micro_four_indep_adds(void *context, int nreps)
{
    int64_t a=0, b=0, c=0, d=0;
    // Benchmarks at 2.5 cycles/iter. As per dispatch limits, it should go:
    //
    // c0: add 1; add 2      (2x I01)
    // c1: add 3; add 4      (2x I01)
    // c2: subs; bne; add 1  (2x I01, B)
    // c3: add 2; add 3      (2x I01)
    // c4: add 4; subs; bne  (2x I01, B)
    //
    // 5 cycles for every two iters, so that one checks out.
    __asm__ volatile(
        ".align 4\n"
        "1:\n"
        "add %1,%1,#1\n"
        "add %2,%2,#1\n"
        "add %3,%3,#1\n"
        "add %4,%4,#1\n"
        "subs %w0,%w0,#1\n"
        "bne 1b\n"
        : "+r"(nreps), "+r"(a), "+r"(b), "+r"(c), "+r"(d));
}
static void micro_four_indep_adds2(void *context, int nreps)
{
    int64_t a=0, b=0, c=0, d=0;
    int32_t magic=1;
    // Benchmarks at 2.0 cycles/iter, as expected by dispatch limits:
    //
    // c0: add 1; add 2; add 3  (2x I01, M)
    // c1: add 4; subs; bne     (2x I01, B)
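    //
    // (Presumably the extended-register add is the one taking the M slot
    // here; the plain immediate adds only have the two I pipes between them,
    // which is why the "magic" operand with uxtw is in the mix at all.)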
    __asm__ volatile(
        ".align 4\n"
        "1:\n"
        "add %1,%1,#1\n"
        "add %2,%2,%w5,uxtw\n"
        "add %3,%3,#1\n"
        "add %4,%4,#1\n"
        "subs %w0,%w0,#1\n"
        "bne 1b\n"
        : "+r"(nreps), "+r"(a), "+r"(b), "+r"(c), "+r"(d)
        : "r"(magic));
}
static void micro_uzp1_q(void *context, int nreps)
{
    uint8x16_t n0, n1, n2, n3, n4, n5, n6, n7;
    n0 = n1 = n2 = n3 = n4 = n5 = n6 = n7 = vdupq_n_u8(0);
    // Benchmarks at 5.67 cycles/iter, and I don't understand why.
    //
    // From the docs, I would expect:
    //
    // c0: uzp1 1 / uzp1 2 / subs  (I01, 2x F01)
    // c1: uzp1 3 / uzp1 4         (2x F01)
    // c2: uzp1 5 / uzp1 6         (2x F01)
    // c3: uzp1 7 / uzp1 8 / bne   (2x F01, B)
    //
    // NOTE: ARM slides say 3x 64-bit writeback ports. That would mean something like:
    // (the placement of the integer ops in here isn't right; where exactly they go
    // depends on how far the frontend forges ahead, but it doesn't matter much.)
    //
    // c0:  uzp1 1 (F0) / uzp1 2 first half (F1) / subs (I0)
    // c1:  uzp1 3 (F0) / uzp1 2 second half (F1)
    // c2:  uzp1 4 (F0) / uzp1 5 first half (F1)
    // c3:  uzp1 6 (F0) / uzp1 5 second half (F1)
    // c4:  uzp1 7 (F0) / uzp1 8 first half (F1)
    // c5:  uzp1 8 second half (F1) / bne (B) / uzp1 1 (F0)
    // c6:  uzp1 2 (F0) / subs (I0) / uzp1 3 first half (F1)
    // c7:  uzp1 4 (F0) / uzp1 3 second half (F1)
    // c8:  uzp1 5 (F0) / uzp1 6 first half (F1)
    // c9:  uzp1 7 (F0) / uzp1 6 second half (F1)
    // c10: uzp1 8 (F0) / bne (B) / uzp1 1 first half (F1)
    // c11: uzp1 2 (F0) / uzp1 1 second half (F1) / subs (I0)
    // c12: uzp1 3 (F0) / uzp1 4 first half (F1)
    // c13: uzp1 5 (F0) / uzp1 4 second half (F1)
    // c14: uzp1 6 (F0) / uzp1 7 first half (F1)
    // c15: uzp1 8 (F0) / uzp1 7 second half (F1) / bne (B)
    //
    // (and after that it repeats)
    // gives *16* cycles for 3 iters = 16/3 = 5.33 ... not what happens!
    //
    // However, the variant where a branch is always the last uop on
    // that cycle gives:
    //
    // c0: uzp1 1 (F0) / uzp1 2 first half (F1) / subs (I0)
    // c1: uzp1 3 (F0) / uzp1 2 second half (F1)
    // c2: uzp1 4 (F0) / uzp1 5 first half (F1)
    // c3: uzp1 6 (F0) / uzp1 5 second half (F1)
    // c4: uzp1 7 (F0) / uzp1 8 first half (F1)
    // c5: uzp1 8 second half (F1) / bne (B)
    // (repeats after that)
    //
    // for 6.00 cycles per iter, which is too slow by 0.33. As said, I don't get it.
    //
    // Maybe the uops going to F0/F1 are load-balanced in advance with no accounting
    // for the WB port conflict stall cycles, and then the distribution ends up
    // lopsided? It's really hard to say without a more detailed pipeline
    // description.
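    //
    // (For scale: 5.67 cycles/iter is 17 cycles per 3 iterations, which sits
    // right between the 16/3 of the first model and the 18/3 = 6.00 of the
    // second.)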
    __asm__ volatile(
        ".align 4\n"
        "1:\n"
        "uzp1 %1.16B, %1.16B, %1.16B\n"
        "uzp1 %2.16B, %2.16B, %2.16B\n"
        "subs %w0,%w0,#1\n"
        "uzp1 %3.16B, %3.16B, %3.16B\n"
        "uzp1 %4.16B, %4.16B, %4.16B\n"
        "uzp1 %5.16B, %5.16B, %5.16B\n"
        "uzp1 %6.16B, %6.16B, %6.16B\n"
        "uzp1 %7.16B, %7.16B, %7.16B\n"
        "uzp1 %8.16B, %8.16B, %8.16B\n"
        "bne 1b\n"
        : "+r"(nreps), "+w"(n0), "+w"(n1), "+w"(n2), "+w"(n3), "+w"(n4), "+w"(n5), "+w"(n6), "+w"(n7));
}
// (Other tests using, say, "trn1" or "add" elided; same result for all full-throughput 128b ops.)
static void micro_uzp1_q2(void *context, int nreps)
{
    uint8x16_t n0, n1, n2, n3, n4, n5, n6, n7;
    n0 = n1 = n2 = n3 = n4 = n5 = n6 = n7 = vdupq_n_u8(0);
    // The idea is to test whether forwarding makes a difference
    // and/or has more bandwidth. Not as far as I can tell:
    // benchmarks at 9.05 cycles/iter (for 12 unzips).
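    // (That is about 0.75 cycles per uzp1, versus 5.67/8 ~= 0.71 per uzp1 in
    // micro_uzp1_q above.)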
    __asm__ volatile(
        ".align 4\n"
        "1:\n"
        "uzp1 %1.16B, %1.16B, %1.16B\n"
        "uzp1 %2.16B, %2.16B, %2.16B\n"
        "subs %w0,%w0,#1\n"
        "uzp1 %3.16B, %3.16B, %3.16B\n"
        "uzp1 %4.16B, %4.16B, %4.16B\n"
        "uzp1 %5.16B, %5.16B, %5.16B\n"
        "uzp1 %6.16B, %6.16B, %6.16B\n"
        "uzp1 %1.16B, %1.16B, %1.16B\n"
        "uzp1 %2.16B, %2.16B, %2.16B\n"
        "uzp1 %3.16B, %3.16B, %3.16B\n"
        "uzp1 %4.16B, %4.16B, %4.16B\n"
        "uzp1 %5.16B, %5.16B, %5.16B\n"
        "uzp1 %6.16B, %6.16B, %6.16B\n"
        "bne 1b\n"
        : "+r"(nreps), "+w"(n0), "+w"(n1), "+w"(n2), "+w"(n3), "+w"(n4), "+w"(n5), "+w"(n6), "+w"(n7));
}
static void micro_uzp1_d(void *context, int nreps)
{
    uint8x16_t n0, n1, n2, n3, n4, n5, n6, n7;
    n0 = n1 = n2 = n3 = n4 = n5 = n6 = n7 = vdupq_n_u8(0);
    // Benchmarks at 4.00 cycles/iter, as I would expect:
    //
    // c0: uzp1 1 / uzp1 2 / subs  (I01, 2x F01)
    // c1: uzp1 3 / uzp1 4         (2x F01)
    // c2: uzp1 5 / uzp1 6         (2x F01)
    // c3: uzp1 7 / uzp1 8 / bne   (2x F01, B)
    __asm__ volatile(
        ".align 4\n"
        "1:\n"
        "uzp1 %1.8b, %1.8b, %1.8b\n"
        "uzp1 %2.8b, %2.8b, %2.8b\n"
        "subs %w0,%w0,#1\n"
        "uzp1 %3.8b, %3.8b, %3.8b\n"
        "uzp1 %4.8b, %4.8b, %4.8b\n"
        "uzp1 %5.8b, %5.8b, %5.8b\n"
        "uzp1 %6.8b, %6.8b, %6.8b\n"
        "uzp1 %7.8b, %7.8b, %7.8b\n"
        "uzp1 %8.8b, %8.8b, %8.8b\n"
        "bne 1b\n"
        : "+r"(nreps), "+w"(n0), "+w"(n1), "+w"(n2), "+w"(n3), "+w"(n4), "+w"(n5), "+w"(n6), "+w"(n7));
}
static void micro_uzp1_mix1(void *context, int nreps)
{
    uint8x16_t n0, n1, n2, n3, n4, n5, n6, n7;
    n0 = n1 = n2 = n3 = n4 = n5 = n6 = n7 = vdupq_n_u8(0);
    // Benchmarks at 4.33 cycles/iter, not the 4.0 I would expect:
    //
    // c0: uzp1 1.16 / uzp1 2.8 / subs  (I01, 2x F01)
    // c1: uzp1 3.16 / uzp1 4.8         (2x F01)
    // c2: uzp1 5.16 / uzp1 6.8         (2x F01)
    // c3: uzp1 7.16 / uzp1 8.8 / bne   (2x F01, B)
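    //
    // (4.33 is 13 cycles per 3 iterations, i.e. one extra cycle every third
    // iteration over the 4-cycle schedule above.)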
    __asm__ volatile(
        ".align 4\n"
        "1:\n"
        "uzp1 %1.16b, %1.16b, %1.16b\n"
        "uzp1 %2.8b, %2.8b, %2.8b\n"
        "subs %w0,%w0,#1\n"
        "uzp1 %3.16b, %3.16b, %3.16b\n"
        "uzp1 %4.8b, %4.8b, %4.8b\n"
        "uzp1 %5.16b, %5.16b, %5.16b\n"
        "uzp1 %6.8b, %6.8b, %6.8b\n"
        "uzp1 %7.16b, %7.16b, %7.16b\n"
        "uzp1 %8.8b, %8.8b, %8.8b\n"
        "bne 1b\n"
        : "+r"(nreps), "+w"(n0), "+w"(n1), "+w"(n2), "+w"(n3), "+w"(n4), "+w"(n5), "+w"(n6), "+w"(n7));
}
static void micro_uzp1_mix2(void *context, int nreps)
{
    uint8x16_t n0, n1, n2, n3, n4, n5, n6, n7;
    n0 = n1 = n2 = n3 = n4 = n5 = n6 = n7 = vdupq_n_u8(0);
    // Benchmarks at 4.57 cycles/iter! (??!)
    __asm__ volatile(
        ".align 4\n"
        "1:\n"
        "uzp1 %1.16b, %1.16b, %1.16b\n"
        "uzp1 %2.16b, %2.16b, %2.16b\n"
        "uzp1 %3.16b, %3.16b, %3.16b\n"
        "uzp1 %4.16b, %4.16b, %4.16b\n"
        "uzp1 %5.8b, %5.8b, %5.8b\n"
        "uzp1 %6.8b, %6.8b, %6.8b\n"
        "uzp1 %7.8b, %7.8b, %7.8b\n"
        "uzp1 %8.8b, %8.8b, %8.8b\n"
        "subs %w0,%w0,#1\n"
        "bne 1b\n"
        : "+r"(nreps), "+w"(n0), "+w"(n1), "+w"(n2), "+w"(n3), "+w"(n4), "+w"(n5), "+w"(n6), "+w"(n7));
}
static void micro_uzp1_mix3(void *context, int nreps)
{
    uint8x16_t n0, n1, n2, n3, n4, n5, n6, n7;
    n0 = n1 = n2 = n3 = n4 = n5 = n6 = n7 = vdupq_n_u8(0);
    // Benchmarks at 4.33 cycles/iter.
    __asm__ volatile(
        ".align 4\n"
        "1:\n"
        "uzp1 %1.16b, %1.16b, %1.16b\n"
        "uzp1 %2.16b, %2.16b, %2.16b\n"
        "uzp1 %3.8b, %3.8b, %3.8b\n"
        "uzp1 %4.8b, %4.8b, %4.8b\n"
        "uzp1 %5.16b, %5.16b, %5.16b\n"
        "uzp1 %6.16b, %6.16b, %6.16b\n"
        "uzp1 %7.8b, %7.8b, %7.8b\n"
        "uzp1 %8.8b, %8.8b, %8.8b\n"
        "subs %w0,%w0,#1\n"
        "bne 1b\n"
        : "+r"(nreps), "+w"(n0), "+w"(n1), "+w"(n2), "+w"(n3), "+w"(n4), "+w"(n5), "+w"(n6), "+w"(n7));
}
static void micro_F0_128_F1_64(void *context, int nreps)
{
    uint8x16_t n0, n1, n2, n3, n4, n5, n6, n7;
    n0 = n1 = n2 = n3 = n4 = n5 = n6 = n7 = vdupq_n_u8(0);
    // Benchmarks at 16.33 cycles/iter
    // (4.33 cycles/iter if I don't unroll the SIMD ops 4x).
    // Wherever those stray 0.33 cycles come from, it's
    // per-iteration overhead.
    // SMULL: 128b write, execs in F0 pipe, 1/cycle throughput, 4c latency
    // SSHL (D form): 64b write, execs in F1, 1/cycle throughput, 3c latency
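    //
    // (With one SMULL issuing to F0 and one SSHL to F1 per cycle, the 32 SIMD
    // ops of the unrolled body take 16 cycles, so the measured 16.33 is that
    // plus the usual ~0.33 cycles of loop overhead.)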
    __asm__ volatile(
        ".align 4\n"
        "1:\n"
        "smull %1.8h, %1.8b, %1.8b\n"
        "sshl %2.8b, %2.8b, %2.8b\n"
        "smull %3.8h, %3.8b, %3.8b\n"
        "sshl %4.8b, %4.8b, %4.8b\n"
        "smull %5.8h, %5.8b, %5.8b\n"
        "sshl %6.8b, %6.8b, %6.8b\n"
        "smull %7.8h, %7.8b, %7.8b\n"
        "sshl %8.8b, %8.8b, %8.8b\n"
        "smull %1.8h, %1.8b, %1.8b\n"
        "sshl %2.8b, %2.8b, %2.8b\n"
        "smull %3.8h, %3.8b, %3.8b\n"
        "sshl %4.8b, %4.8b, %4.8b\n"
        "smull %5.8h, %5.8b, %5.8b\n"
        "sshl %6.8b, %6.8b, %6.8b\n"
        "smull %7.8h, %7.8b, %7.8b\n"
        "sshl %8.8b, %8.8b, %8.8b\n"
        "smull %1.8h, %1.8b, %1.8b\n"
        "sshl %2.8b, %2.8b, %2.8b\n"
        "smull %3.8h, %3.8b, %3.8b\n"
        "sshl %4.8b, %4.8b, %4.8b\n"
        "smull %5.8h, %5.8b, %5.8b\n"
        "sshl %6.8b, %6.8b, %6.8b\n"
        "smull %7.8h, %7.8b, %7.8b\n"
        "sshl %8.8b, %8.8b, %8.8b\n"
        "subs %w0,%w0,#1\n"
        "smull %1.8h, %1.8b, %1.8b\n"
        "sshl %2.8b, %2.8b, %2.8b\n"
        "smull %3.8h, %3.8b, %3.8b\n"
        "sshl %4.8b, %4.8b, %4.8b\n"
        "smull %5.8h, %5.8b, %5.8b\n"
        "sshl %6.8b, %6.8b, %6.8b\n"
        "smull %7.8h, %7.8b, %7.8b\n"
        "sshl %8.8b, %8.8b, %8.8b\n"
        "bne 1b\n"
        : "+r"(nreps), "+w"(n0), "+w"(n1), "+w"(n2), "+w"(n3), "+w"(n4), "+w"(n5), "+w"(n6), "+w"(n7));
}
static void micro_F0_64_F1_128(void *context, int nreps)
{
    uint8x16_t n0, n1, n2, n3, n4, n5, n6, n7;
    n0 = n1 = n2 = n3 = n4 = n5 = n6 = n7 = vdupq_n_u8(0);
    // Benchmarks at 16.33 cycles/iter
    // (4.33 cycles/iter if I don't unroll the SIMD ops 4x).
    // Wherever those stray 0.33 cycles come from, it's
    // per-iteration overhead.
    // MUL (D form): 64b write, execs in F0 pipe, 1/cycle throughput, 4c latency
    // SABAL: 128b write, execs in F1 pipe, 1/cycle throughput, 4c latency
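    //
    // (Same arithmetic as above: 16 MUL + 16 SABAL per iteration at one per
    // pipe per cycle is 16 cycles, plus ~0.33 overhead. So which pipe gets
    // the 128-bit write doesn't change the sustained rate.)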
    __asm__ volatile(
        ".align 4\n"
        "1:\n"
        "mul %1.8b, %1.8b, %1.8b\n"
        "sabal %2.8h, %2.8b, %2.8b\n"
        "mul %3.8b, %3.8b, %3.8b\n"
        "sabal %4.8h, %4.8b, %4.8b\n"
        "mul %5.8b, %5.8b, %5.8b\n"
        "sabal %6.8h, %6.8b, %6.8b\n"
        "mul %7.8b, %7.8b, %7.8b\n"
        "sabal %8.8h, %8.8b, %8.8b\n"
        "mul %1.8b, %1.8b, %1.8b\n"
        "sabal %2.8h, %2.8b, %2.8b\n"
        "mul %3.8b, %3.8b, %3.8b\n"
        "sabal %4.8h, %4.8b, %4.8b\n"
        "mul %5.8b, %5.8b, %5.8b\n"
        "sabal %6.8h, %6.8b, %6.8b\n"
        "mul %7.8b, %7.8b, %7.8b\n"
        "sabal %8.8h, %8.8b, %8.8b\n"
        "mul %1.8b, %1.8b, %1.8b\n"
        "sabal %2.8h, %2.8b, %2.8b\n"
        "mul %3.8b, %3.8b, %3.8b\n"
        "sabal %4.8h, %4.8b, %4.8b\n"
        "mul %5.8b, %5.8b, %5.8b\n"
        "sabal %6.8h, %6.8b, %6.8b\n"
        "mul %7.8b, %7.8b, %7.8b\n"
        "sabal %8.8h, %8.8b, %8.8b\n"
        "subs %w0,%w0,#1\n"
        "mul %1.8b, %1.8b, %1.8b\n"
        "sabal %2.8h, %2.8b, %2.8b\n"
        "mul %3.8b, %3.8b, %3.8b\n"
        "sabal %4.8h, %4.8b, %4.8b\n"
        "mul %5.8b, %5.8b, %5.8b\n"
        "sabal %6.8h, %6.8b, %6.8b\n"
        "mul %7.8b, %7.8b, %7.8b\n"
        "sabal %8.8h, %8.8b, %8.8b\n"
        "bne 1b\n"
        : "+r"(nreps), "+w"(n0), "+w"(n1), "+w"(n2), "+w"(n3), "+w"(n4), "+w"(n5), "+w"(n6), "+w"(n7));
}
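
// --------------------------------------------------------------------------
// A minimal sketch (not part of the original gist) of the kind of harness
// these micro_* routines assume: each test takes an unused context pointer
// and an iteration count, and "cycles/iter" is elapsed time divided by nreps.
// The names (micro_fn, measure_ticks_per_iter) and the use of the generic
// counter CNTVCT_EL0 are assumptions of this sketch; converting its ticks to
// actual core clock cycles requires CNTFRQ_EL0 and the CPU clock frequency,
// or reading the PMU cycle counter instead. Assumes <stdint.h> is already
// included, as the functions above do.
typedef void (*micro_fn)(void *context, int nreps);

static uint64_t read_virtual_counter(void)
{
    uint64_t t;
    // ISB keeps the counter read from being reordered around the timed code.
    __asm__ volatile("isb\n\tmrs %0, cntvct_el0" : "=r"(t) : : "memory");
    return t;
}

static double measure_ticks_per_iter(micro_fn fn, void *context, int nreps)
{
    fn(context, nreps / 16);  // warm up I-cache and branch predictor
    uint64_t t0 = read_virtual_counter();
    fn(context, nreps);
    uint64_t t1 = read_virtual_counter();
    return (double)(t1 - t0) / (double)nreps;
}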