Skip to content

Instantly share code, notes, and snippets.

@pnkfelix
Last active April 3, 2019 12:50
Show Gist options
  • Save pnkfelix/1b54b3272201d9f096a2289fd5712b52 to your computer and use it in GitHub Desktop.
Save pnkfelix/1b54b3272201d9f096a2289fd5712b52 to your computer and use it in GitHub Desktop.
perf annotate results for PR #59605

rust-lang/rust#59605

All three of the benchmarks here attempt to side-step allocation overhead by making a special target "collection" for the FromIterator traversal that just takes the last element from the iteration. (So it's not really a collection at all.)

  • In practice this means that the code should just compile into stepping through the iteration, cloning each element and saving it in a temporary variable, and then moving that temporary variable into a Some at the end of the iteration.
  • (It's possible that the compiler is avoiding some extra copying in some circumstances. It's also possible that we are just getting lucky with respect to register allocation and/or register pressure in some circumstances. It seems to me like the hot loop in into_last_std may be achieving both...)

Here is an overview of the three benchmarks. (The source code is presented in PR #59605; both the "std" and "new" runs use the same benchmark code, and I just gathered a perf run before and after the reimplementation of the underlying impl FromIterator for Result.)

  • bench_result_from_iter_into_last_old is an inlined re-implementation of the original libstd FromIterator for Result.
    • This takes 665 ns/iter, pretty reliably.
  • bench_result_from_iter_into_last_std measures the original libstd FromIterator for Result.
    • I am seeing this take 369 ns/iter
    • I assume this roughly 2x improvement over bench_result_from_iter_into_last_old is at least in part due to inlining directly into the benchmark driver?
  • bench_result_from_iter_into_last_new measures the new libstd FromIterator for Result.
    • For some reason this takes on the order of 3K ns/iter
    • that is a 5x to 10x slowdown (depending on which case above you compare against).
Percent│ ◆
│ ▒
│ ▒
│ Disassembly of section .text: ▒
│ ▒
│ 000000000000c4e0 <libtest::ns_iter_inner>: ▒
│ _ZN7libtest13ns_iter_inner17h2b6451fa1997b36eE(): ▒
│ push %rbp ▒
│ push %r15 ▒
│ push %r14 ▒
│ push %r13 ▒
│ push %r12 ▒
│ push %rbx ▒
│ sub $0xa8,%rsp ▒
│ mov %rsi,%rbp ▒
│ mov %rdi,%rbx ▒
│ → callq *0x6893b(%rip) # 74e38 <std::time::Instant::now> ▒
│ mov %rax,0x98(%rsp) ▒
│ mov %rdx,0xa0(%rsp) ▒
│ mov %rbp,0x88(%rsp) ▒
│ test %rbp,%rbp ▒
│ ↓ je 15d ▒
│ lea 0x60(%rsp),%r13 ▒
│ xor %r14d,%r14d ▒
│ mov %rbx,0x80(%rsp) ▒
│ xchg %ax,%ax ▒
│ 50: mov (%rbx),%rax ▒
│ mov (%rax),%rbx ▒
0.05 │ mov 0x10(%rax),%r15 ▒
│ movq $0x0,0x18(%rsp) ▒
│ test %r15,%r15 ▒
│ ↓ je f0 ▒
│ shl $0x5,%r15 ▒
0.04 │ add $0x8,%rbx ▒
│ xor %r12d,%r12d ▒
│ nop ▒
│ 80: cmpq $0x1,-0x8(%rbx) ▒
29.05 │ ↓ jne b0 ▒
│ lea 0x30(%rsp),%rdi ▒
│ mov %rbx,%rsi ▒
│ → callq *0x68a7b(%rip) # 74ff0 <<alloc::string::String as core::clone::Clone>::clone> ▒
│ mov 0x30(%rsp),%rax ▒
│ lea 0x38(%rsp),%rcx ▒
│ movups (%rcx),%xmm0 ▒
│ movaps %xmm0,(%rsp) ▒
│ mov $0x1,%ecx ▒
│ ↓ jmp b5 ▒
│ nop ◆
0.06 │ b0: mov (%rbx),%rax ▒
│ xor %ecx,%ecx ▒
│ b5: mov %rcx,0x50(%rsp) ▒
25.47 │ mov %rax,0x58(%rsp) ▒
27.78 │ movaps (%rsp),%xmm0 ▒
5.86 │ movups %xmm0,0x0(%r13) ▒
5.03 │ movups 0x0(%r13),%xmm0 ▒
6.47 │ movaps %xmm0,(%rsp) ▒
0.09 │ test %rcx,%rcx ▒
│ ↓ jne 100 ▒
│ add $0x20,%rbx ▒
│ mov $0x1,%r12d ▒
│ mov %rax,%rbp ▒
│ add $0xffffffffffffffe0,%r15 ▒
│ ↑ jne 80 ▒
│ mov %rax,%rbp ▒
│ ↓ jmp 11f ▒
│ xchg %ax,%ax ▒
│ f0: xor %r12d,%r12d ▒
│ mov %rax,%rbp ▒
│ ↓ jmp 11f ▒
│ nop ▒
│100: movaps (%rsp),%xmm0 ▒
│ movaps %xmm0,0x30(%rsp) ▒
│ mov %rax,0x18(%rsp) ▒
│ lea 0x20(%rsp),%rcx ▒
│ movups %xmm0,(%rcx) ▒
│ test %rax,%rax ▒
│ ↓ jne 204 ▒
0.06 │11f: mov %r12,0x18(%rsp) ▒
│ mov %rbp,0x20(%rsp) ▒
│ mov 0x80(%rsp),%rbx ▒
│ mov 0x8(%rbx),%rax ▒
0.05 │ cmp (%rax),%r12 ▒
│ ↓ jne 188 ▒
│ cmp $0x1,%r12 ▒
│ ↓ jne 146 ▒
│ cmp 0x8(%rax),%rbp ▒
│ ↓ jne 188 ▒
│146: add $0x1,%r14 ▒
│ lea 0x50(%rsp),%rax ▒
│ cmp 0x88(%rsp),%r14 ▒
│ ↑ jb 50 ▒
│15d: lea 0x98(%rsp),%rdi ▒
│ → callq *0x686e5(%rip) # 74d30 <std::time::Instant::elapsed> ▒
│ mov %rax,%rdi ▒
│ mov %edx,%esi ▒
│ → callq *0x6895a(%rip) # 74fb0 <libtest::ns_from_dur> ▒
│ add $0xa8,%rsp ▒
│ pop %rbx ▒
│ pop %r12 ▒
│ pop %r13 ▒
│ pop %r14 ▒
│ pop %r15 ▒
│ pop %rbp ▒
│ ← retq ▒
│188: lea 0x18(%rsp),%rcx ▒
│ mov %rcx,0x90(%rsp) ▒
│ mov %rax,(%rsp) ▒
│ lea 0x90(%rsp),%rax ▒
│ mov %rax,0x30(%rsp) ▒
│ lea <&T as core::fmt::Debug,%rax ▒
│ mov %rax,0x38(%rsp) ▒
│ mov %rsp,%rcx ▒
│ mov %rcx,0x40(%rsp) ▒
│ mov %rax,0x48(%rsp) ▒
│ lea __dso_handle+0x78,%rax ▒
│ mov %rax,0x50(%rsp) ▒
│ movq $0x3,0x58(%rsp) ▒
│ movq $0x0,0x60(%rsp) ▒
│ lea 0x30(%rsp),%rax ▒
│ mov %rax,0x70(%rsp) ▒
│ movq $0x2,0x78(%rsp) ▒
│ lea __dso_handle+0x150,%rsi ▒
│ lea 0x50(%rsp),%rdi ▒
│ → callq *0x6863e(%rip) # 74d20 <std::panicking::begin_panic_fmt> ▒
│ ud2 ▒
│204: movups 0x20(%rsp),%xmm0 ◆
│ mov %rax,0x50(%rsp) ▒
│ movups %xmm0,0x58(%rsp) ▒
│ lea _fini+0x588,%rdi ▒
│ lea 0x50(%rsp),%rdx ▒
│ mov $0x2b,%esi ▒
│ → callq core::result::unwrap_failed ▒
│ ud2 ▒
│ mov %rax,%rbx ▒
│ lea 0x18(%rsp),%rdi ▒
│ → callq core::ptr::real_drop_in_place ▒
│ mov %rbx,%rdi ▒
│ → callq _Unwind_Resume@plt ▒
│ ud2 ▒
Percent│ ▒
│ ▒
│ ▒
│ Disassembly of section .text: ▒
│ ▒
│ 0000000000059a20 <corebenches::iter::old_result_from_iter>: ▒
│ _ZN11corebenches4iter20old_result_from_iter17hbedd0237a9c59ab2E(): ▒
│ push %r15 ▒
0.04 │ push %r14 ▒
│ push %rbx ▒
│ sub $0x60,%rsp ◆
0.22 │ mov %rdi,%r15 ▒
│ mov %rsi,0x18(%rsp) ▒
│ mov %rdx,0x20(%rsp) ▒
│ lea 0x28(%rsp),%r14 ▒
0.13 │ movq $0x0,0x28(%rsp) ▒
│ ┌──cmp %rdx,%rsi ▒
│ ├──je 71 ▒
0.04 │ │ lea 0x20(%rsi),%rcx ▒
│ │ mov %rcx,0x18(%rsp) ▒
0.09 │ │ lea 0x8(%rsi),%rax ▒
│ │ cmpq $0x1,(%rsi) ▒
│ │↓ jne 99 ▒
│ │ mov %rsp,%rdi ▒
│ │ mov %rax,%rsi ▒
│ │→ callq *0x1b588(%rip) # 74ff0 <<alloc::string::String as core::clone::Clone>::clone> ▒
│ │ mov (%rsp),%rax ▒
│ │ movups 0x8(%rsp),%xmm0 ▒
│ │ movaps %xmm0,0x40(%rsp) ▒
│ │ movaps 0x40(%rsp),%xmm0 ▒
│ │ movaps %xmm0,0x50(%rsp) ▒
│ │ mov %rax,0x28(%rsp) ▒
│ │ movaps 0x50(%rsp),%xmm0 ▒
│ │ movups %xmm0,0x30(%rsp) ▒
│ │↓ jmp 73 ▒
│ 71:└─→xor %eax,%eax ▒
│ 73: xor %ecx,%ecx ▒
│ test %rax,%rax ▒
│ ↓ je 138 ▒
│ 7e: mov 0x10(%r14),%rax ▒
│ mov %rax,0x18(%r15) ▒
│ movups (%r14),%xmm0 ▒
│ movups %xmm0,0x8(%r15) ▒
│ mov $0x1,%ecx ▒
│ ↓ jmpq 142 ▒
│ 99: mov (%rax),%rbx ▒
0.13 │ cmp %rdx,%rcx ▒
│ ↓ je 128 ▒
│ lea 0x28(%rsi),%rax ▒
│ cmpq $0x1,0x20(%rsi) ▒
0.04 │ ↓ jne c0 ▒
│ add $0x40,%rsi ▒
│ ↓ jmp e5 ▒
│ nop ▒
5.00 │ c0: mov %rax,%rcx ▒
16.40 │ mov (%rax),%rbx ▒
3.82 │ lea 0x18(%rax),%rax ▒
15.87 │ cmp %rdx,%rax ▒
│ ↓ je 123 ▒
4.95 │ add $0x8,%rax ▒
12.64 │ cmpq $0x1,0x18(%rcx) ▒
40.30 │ ↑ jne c0 ▒
│ add $0x20,%rcx ▒
│ lea 0x18(%rcx),%rsi ▒
│ mov %rcx,%rax ▒
│ e5: mov %rsi,0x18(%rsp) ▒
│ mov %rsp,%rdi ▒
│ mov %rax,%rsi ▒
│ → callq *0x1b4da(%rip) # 74ff0 <<alloc::string::String as core::clone::Clone>::clone> ▒
│ mov (%rsp),%rax ▒
│ movups 0x8(%rsp),%xmm0 ▒
│ movaps %xmm0,0x40(%rsp) ▒
│ movaps 0x40(%rsp),%xmm0 ▒
│ movaps %xmm0,0x50(%rsp) ▒
│ movaps 0x50(%rsp),%xmm0 ◆
│ movaps %xmm0,(%rsp) ▒
│ mov %rax,0x28(%rsp) ▒
│ movups %xmm0,0x30(%rsp) ▒
│ ↓ jmp 12a ▒
│123: mov %rdx,0x18(%rsp) ▒
│128: xor %eax,%eax ▒
│12a: mov $0x1,%ecx ▒
│ test %rax,%rax ▒
│ ↑ jne 7e ▒
│138: mov %rcx,0x8(%r15) ▒
0.22 │ mov %rbx,0x10(%r15) ▒
│ xor %ecx,%ecx ▒
│142: mov %rcx,(%r15) ▒
│ mov %r15,%rax ▒
│ add $0x60,%rsp ▒
0.09 │ pop %rbx ▒
│ pop %r14 ▒
│ pop %r15 ▒
│ ← retq ▒
│ ↓ jmp 154 ▒
│154: mov %rax,%rbx ▒
│ mov %r14,%rdi ▒
│ → callq core::ptr::real_drop_in_place ▒
│ mov %rbx,%rdi ▒
│ → callq _Unwind_Resume@plt ▒
│ ud2 ▒
Percent│ ◆
│ ▒
│ ▒
│ Disassembly of section .text: ▒
│ ▒
│ 000000000000bf80 <libtest::ns_iter_inner>: ▒
│ _ZN7libtest13ns_iter_inner17h20bb9ba29da0ae68E(): ▒
│ push %rbp ▒
│ push %r15 ▒
│ push %r14 ▒
│ push %r13 ▒
│ push %r12 ▒
│ push %rbx ▒
│ sub $0x88,%rsp ▒
│ mov %rsi,%r14 ▒
│ mov %rdi,%rbx ▒
│ → callq *0x68e9b(%rip) # 74e38 <std::time::Instant::now> ▒
│ mov %rax,0x78(%rsp) ▒
│ mov %rdx,0x80(%rsp) ▒
│ test %r14,%r14 ▒
│ ↓ je 13b ▒
│ xor %ebp,%ebp ▒
│ mov %rsp,%r15 ▒
│ nop ▒
│ 40: mov (%rbx),%rax ▒
│ mov (%rax),%rsi ▒
│ mov 0x10(%rax),%rax ▒
│ mov %rax,%rcx ▒
│ shl $0x5,%rcx ▒
0.78 │ add %rsi,%rcx ▒
│ mov %rsi,(%rsp) ▒
0.19 │ mov %rcx,0x8(%rsp) ▒
│ movq $0x0,0x10(%rsp) ▒
│ test %rax,%rax ▒
│ ↓ je 100 ▒
│ lea 0x20(%rsi),%rax ▒
│ cmpq $0x1,(%rsi) ▒
│ ↓ jne 90 ▒
│ add $0x8,%rsi ▒
│ xor %r12d,%r12d ▒
│ ↓ jmp ad ▒
│ nop ▒
│ nop ▒
16.19 │ 90: mov -0x18(%rax),%r13 ▒
18.04 │ cmp %rcx,%rax ▒
0.19 │ ↓ je 105 ▒
24.46 │ cmpq $0x1,(%rax) ▒
20.75 │ lea 0x20(%rax),%rax ▒
19.39 │ ↑ jne 90 ▒
│ mov $0x1,%r12d ▒
│ lea -0x18(%rax),%rsi ▒
│ ad: mov %rax,(%rsp) ▒
│ lea 0x40(%rsp),%rdi ▒
│ → callq *0x68fb4(%rip) # 74ff0 <<alloc::string::String as core::clone::Clone>::clone> ▒
│ mov 0x40(%rsp),%rax ▒
│ lea 0x48(%rsp),%rcx ▒
│ movups (%rcx),%xmm0 ▒
│ movaps %xmm0,0x30(%rsp) ▒
│ movaps 0x30(%rsp),%xmm0 ▒
│ movaps %xmm0,0x60(%rsp) ▒
│ movaps 0x60(%rsp),%xmm0 ▒
│ movaps %xmm0,0x40(%rsp) ▒
│ mov %rax,0x10(%rsp) ▒
│ lea 0x10(%rsp),%rcx ▒
│ movups %xmm0,0x8(%rcx) ▒
│ test %rax,%rax ▒
│ ↓ je 110 ▒
│ ↓ jmpq 1d9 ◆
│ nop ▒
│100: xor %r12d,%r12d ▒
│ ↓ jmp 110 ▒
│105: mov %rcx,(%rsp) ▒
│ mov $0x1,%r12d ▒
│ nop ▒
│110: mov %r12,0x30(%rsp) ▒
│ mov %r13,0x38(%rsp) ▒
│ mov 0x8(%rbx),%rax ▒
│ cmp (%rax),%r12 ▒
│ ↓ jne 163 ▒
│ test %r12,%r12 ▒
│ ↓ je 12e ▒
│ cmp 0x8(%rax),%r13 ▒
│ ↓ jne 163 ▒
│12e: add $0x1,%rbp ▒
│ cmp %r14,%rbp ▒
│ ↑ jb 40 ▒
│13b: lea 0x78(%rsp),%rdi ▒
│ → callq *0x68c6a(%rip) # 74d30 <std::time::Instant::elapsed> ▒
│ mov %rax,%rdi ▒
│ mov %edx,%esi ▒
│ → callq *0x68edf(%rip) # 74fb0 <libtest::ns_from_dur> ▒
│ add $0x88,%rsp ▒
│ pop %rbx ▒
│ pop %r12 ▒
│ pop %r13 ▒
│ pop %r14 ▒
│ pop %r15 ▒
│ pop %rbp ▒
│ ← retq ▒
│163: lea 0x30(%rsp),%rcx ▒
│ mov %rcx,0x70(%rsp) ▒
│ mov %rax,0x60(%rsp) ▒
│ lea 0x70(%rsp),%rax ▒
│ mov %rax,0x40(%rsp) ▒
│ lea <&T as core::fmt::Debug,%rax ▒
│ mov %rax,0x48(%rsp) ▒
│ lea 0x60(%rsp),%rcx ▒
│ mov %rcx,0x50(%rsp) ▒
│ mov %rax,0x58(%rsp) ▒
│ lea __dso_handle+0x78,%rax ▒
│ mov %rax,(%rsp) ▒
│ movq $0x3,0x8(%rsp) ▒
│ movq $0x0,0x10(%rsp) ▒
│ lea 0x40(%rsp),%rax ▒
│ mov %rax,0x20(%rsp) ▒
│ movq $0x2,0x28(%rsp) ▒
│ lea __dso_handle+0x150,%rsi ▒
│ mov %rsp,%rdi ▒
│ → callq *0x68bc9(%rip) # 74d20 <std::panicking::begin_panic_fmt> ▒
│ ud2 ▒
│1d9: movups 0x18(%rsp),%xmm0 ▒
│ mov %rax,(%rsp) ▒
│ movups %xmm0,0x8(%rsp) ▒
│ lea _fini+0x8c8,%rdi ▒
│ mov %rsp,%rdx ▒
│ mov $0x2b,%esi ▒
│ → callq core::result::unwrap_failed ▒
│ ud2 ▒
│ mov %rax,%rbx ▒
│ lea 0x10(%rsp),%rdi ▒
│ → callq core::ptr::real_drop_in_place ▒
│ mov %rbx,%rdi ▒
│ → callq _Unwind_Resume@plt ▒
│ ud2 ▒
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment