Created
January 28, 2015 09:19
-
-
Save babanin/7fffd88475abcd8e768e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.openjdk; | |
import org.openjdk.jmh.annotations.Benchmark; | |
import org.openjdk.jmh.annotations.BenchmarkMode; | |
import org.openjdk.jmh.annotations.Fork; | |
import org.openjdk.jmh.annotations.Measurement; | |
import org.openjdk.jmh.annotations.Mode; | |
import org.openjdk.jmh.annotations.OutputTimeUnit; | |
import org.openjdk.jmh.annotations.Param; | |
import org.openjdk.jmh.annotations.Scope; | |
import org.openjdk.jmh.annotations.Setup; | |
import org.openjdk.jmh.annotations.State; | |
import org.openjdk.jmh.annotations.Warmup; | |
import java.util.ArrayList; | |
import java.util.List; | |
import java.util.concurrent.TimeUnit; | |
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) | |
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) | |
@Fork(3) | |
@BenchmarkMode(Mode.AverageTime) | |
@OutputTimeUnit(TimeUnit.NANOSECONDS) | |
@State(Scope.Benchmark) | |
public class ToArrayBench { | |
@Param({"1", "100", "10000"}) | |
private int size; | |
List<Byte> list; | |
@Setup | |
public void setup() throws Throwable { | |
list = new ArrayList<>(); | |
for (int i = 0; i < size; i++) { | |
list.add((byte) i); | |
} | |
} | |
/* | |
Originally asked by @twillouer here: | |
https://twitter.com/twillouer/status/558745334709362688 | |
Running this benchmark on 1x4x2 i7-4790K, JDK 8u40 EA, Linux x86_64 produces: | |
Benchmark (size) Mode Cnt Score Error Units | |
ToArrayBench.defensive_copy 1 avgt 15 9.781 ± 0.117 ns/op | |
ToArrayBench.simple_toArray 1 avgt 15 7.950 ± 0.428 ns/op | |
ToArrayBench.sized_array_fixed_size 1 avgt 15 18.384 ± 0.844 ns/op | |
ToArrayBench.sized_array_from_list 1 avgt 15 19.548 ± 0.631 ns/op | |
ToArrayBench.zero_sized_array 1 avgt 15 10.499 ± 0.647 ns/op | |
ToArrayBench.defensive_copy 100 avgt 15 49.125 ± 10.232 ns/op | |
ToArrayBench.simple_toArray 100 avgt 15 41.914 ± 1.353 ns/op | |
ToArrayBench.sized_array_fixed_size 100 avgt 15 118.524 ± 1.912 ns/op | |
ToArrayBench.sized_array_from_list 100 avgt 15 123.494 ± 8.427 ns/op | |
ToArrayBench.zero_sized_array 100 avgt 15 106.610 ± 1.730 ns/op | |
ToArrayBench.defensive_copy 10000 avgt 15 3973.125 ± 64.316 ns/op | |
ToArrayBench.simple_toArray 10000 avgt 15 3996.490 ± 185.899 ns/op | |
ToArrayBench.sized_array_fixed_size 10000 avgt 15 11088.298 ± 785.341 ns/op | |
ToArrayBench.sized_array_from_list 10000 avgt 15 11195.440 ± 1195.366 ns/op | |
ToArrayBench.zero_sized_array 10000 avgt 15 9246.457 ± 110.167 ns/op | |
EXERCISE: Try to explain these numbers before moving to the explanation below. | |
*/ | |
@Benchmark | |
public ArrayList<Byte> defensive_copy() { | |
return new ArrayList<>(list); | |
} | |
@Benchmark | |
public Object[] simple_toArray() { | |
return list.toArray(); | |
} | |
@Benchmark | |
public Byte[] zero_sized_array() { | |
return list.toArray(new Byte[0]); | |
} | |
@Benchmark | |
public Byte[] sized_array_from_list() { | |
return list.toArray(new Byte[list.size()]); | |
} | |
@Benchmark | |
public Byte[] sized_array_fixed_size() { | |
return list.toArray(new Byte[size]); | |
} | |
/* | |
============================================== SPOILERS ==================================================== | |
============================================== SPOILERS ==================================================== | |
============================================== SPOILERS ==================================================== | |
============================================== SPOILERS ==================================================== | |
This thing is *VERY* easy to untangle with "-prof perfasm", and JMH 1.5+ that is able to decode VM stubs. | |
Both ArrayList.toArray() and ArrayList::new(ArrayList) do Arrays.copyOf(), that is | |
delegated to VM stub that implements arraycopy (StubRoutines::jint_disjoint_arraycopy). | |
The hottest block there is: | |
(the same for "defensive_copy" and "simple_toArray") | |
2.03% 3.83% 0x00007f57b3fb8460: vmovdqu -0x38(%rdi,%rdx,8),%ymm0 | |
6.19% 10.37% 0x00007f57b3fb8466: vmovdqu %ymm0,-0x38(%rsi,%rdx,8) | |
11.73% 15.67% 0x00007f57b3fb846c: vmovdqu -0x18(%rdi,%rdx,8),%ymm1 | |
28.56% 15.09% 0x00007f57b3fb8472: vmovdqu %ymm1,-0x18(%rsi,%rdx,8) | |
15.06% 20.46% 0x00007f57b3fb8478: add $0x8,%rdx | |
0.05% 0x00007f57b3fb847c: jle Stub::jint_disjoint_arraycopy+64 0x0x7f57b3fb8460 | |
This is the AVX2-enabled arraycopy of the entire array, without fear and remorse, | |
expected to be fast. Notice the result of toArray() is Object[], and you cannot cast it to | |
Byte[]. ArrayList copying also produces the copy of Object[] internally, since type is erased. | |
But if you need a typed array... you fall into the interesting trap, as all other tests that | |
put the values into Byte[] arrays fall into. ArrayList.toArray(T[] dst) needs to check the target | |
element type, and therefore in the end it delegates to type-checking VM stub that implements arraycopy | |
(StubRoutines::checkcast_arraycopy): | |
(almost the same for "zero_sized_array", "sized_array_from_list", "sized_array_fixed_size") | |
; <a bit torn off, this is actually a final stage> | |
; pack and store to destination | |
9.47% 13.70% 0x00007ff76d0529b0: shr $0x3,%rax | |
6.28% 1.96% 0x00007ff76d0529b4: mov %eax,0x0(%r13,%rdx,4) | |
; i++, and termination check | |
2.63% 3.57% 0x00007ff76d0529b9: inc %rdx | |
0x00007ff76d0529bc: je Stub::checkcast_arraycopy+155 0x0x7ff76d052a1b | |
; load and unpack | |
4.89% 5.43% 0x00007ff76d0529c2: mov (%rdi,%rdx,4),%eax | |
10.32% 13.93% 0x00007ff76d0529c5: shl $0x3,%rax | |
; test for null | |
1.60% 2.10% 0x00007ff76d0529c9: test %rax,%rax | |
0x00007ff76d0529cc: je Stub::checkcast_arraycopy+48 0x0x7ff76d0529b0 | |
; type check | |
3.83% 4.55% 0x00007ff76d0529ce: mov 0x8(%rax),%r11d | |
17.64% 21.19% 0x00007ff76d0529d2: shl $0x3,%r11 | |
13.07% 17.12% 0x00007ff76d0529d6: cmp %r8,%r11 | |
0x00007ff76d0529d9: je Stub::checkcast_arraycopy+48 0x0x7ff76d0529b0 | |
This is a per-element copy that for each element does: load the reference, unpack it, null-check it, | |
check the type is correct, put it to the destination. No wonder this thing is significantly slower than | |
a streaming copy. (Actually, VM could be a bit smarter here, and figure out the target array type is | |
correct for all elements) | |
The only "mystery" left is why "zero_sized_array" is slightly faster than pre-sized arrays. If you look into | |
the *second* hottest block in non-zero array copies, you will naturally see the allocation of the | |
Byte[] array: | |
0.30% 0x00007ff76d19e2ad: movl $0xf8011e02,0x8(%r14) ; {metadata('java/lang/Byte'[])} | |
0x00007ff76d19e2b5: mov %r11d,0xc(%r14) | |
0x00007ff76d19e2b9: prefetchnta 0x140(%r8) | |
0.06% 0x00007ff76d19e2c1: mov %r14,%rdi | |
0x00007ff76d19e2c4: add $0x10,%rdi | |
0x00007ff76d19e2c8: prefetchnta 0x180(%r8) | |
0.02% 0.02% 0x00007ff76d19e2d0: shr $0x3,%rcx | |
0x00007ff76d19e2d4: add $0xfffffffffffffffe,%rcx | |
0.02% 0x00007ff76d19e2d8: xor %rax,%rax | |
0x00007ff76d19e2db: shl $0x3,%rcx | |
9.76% 0.10% 0x00007ff76d19e2df: rex.W rep stos %al,%es:(%rdi) ;*anewarray | |
0.37% 0x00007ff76d19e2e2: mov 0x8(%r12,%rbp,8),%r10d ; implicit exception: dispatches to 0x00007ff76d19e5ee | |
Notice this requires *pre-zeroing* the array, see the "rep stos" that consumes around 10% of time. When we | |
are dealing with "zero_sized_array", VM allocates the array for us. But, as far as I can tell, since VM knows it | |
is going to fill the array entirely, it will skip pre-zeroing, and just copy the contents over. This explains | |
the 10% difference between zero and non-zero toArray(...) calls performance. | |
Sometimes you should just trust the VM to do the sane thing. | |
*/ | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment