// Gist by @babanin, created January 28, 2015.
package org.openjdk;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(3)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Benchmark)
public class ToArrayBench {

    @Param({"1", "100", "10000"})
    private int size;

    private List<Byte> list;

    @Setup
    public void setup() {
        list = new ArrayList<>();
        for (int i = 0; i < size; i++) {
            list.add((byte) i);
        }
    }
/*
Originally asked by @twillouer here:
https://twitter.com/twillouer/status/558745334709362688

Running this benchmark on 1x4x2 i7-4790K, JDK 8u40 EA, Linux x86_64 produces:

Benchmark                            (size)  Mode  Cnt      Score      Error  Units
ToArrayBench.defensive_copy               1  avgt   15      9.781 ±    0.117  ns/op
ToArrayBench.simple_toArray               1  avgt   15      7.950 ±    0.428  ns/op
ToArrayBench.sized_array_fixed_size       1  avgt   15     18.384 ±    0.844  ns/op
ToArrayBench.sized_array_from_list        1  avgt   15     19.548 ±    0.631  ns/op
ToArrayBench.zero_sized_array             1  avgt   15     10.499 ±    0.647  ns/op
ToArrayBench.defensive_copy             100  avgt   15     49.125 ±   10.232  ns/op
ToArrayBench.simple_toArray             100  avgt   15     41.914 ±    1.353  ns/op
ToArrayBench.sized_array_fixed_size     100  avgt   15    118.524 ±    1.912  ns/op
ToArrayBench.sized_array_from_list      100  avgt   15    123.494 ±    8.427  ns/op
ToArrayBench.zero_sized_array           100  avgt   15    106.610 ±    1.730  ns/op
ToArrayBench.defensive_copy           10000  avgt   15   3973.125 ±   64.316  ns/op
ToArrayBench.simple_toArray           10000  avgt   15   3996.490 ±  185.899  ns/op
ToArrayBench.sized_array_fixed_size   10000  avgt   15  11088.298 ±  785.341  ns/op
ToArrayBench.sized_array_from_list    10000  avgt   15  11195.440 ± 1195.366  ns/op
ToArrayBench.zero_sized_array         10000  avgt   15   9246.457 ±  110.167  ns/op

EXERCISE: Try to explain these numbers before moving to the explanation below.
*/
    @Benchmark
    public ArrayList<Byte> defensive_copy() {
        return new ArrayList<>(list);
    }

    @Benchmark
    public Object[] simple_toArray() {
        return list.toArray();
    }

    @Benchmark
    public Byte[] zero_sized_array() {
        return list.toArray(new Byte[0]);
    }

    @Benchmark
    public Byte[] sized_array_from_list() {
        return list.toArray(new Byte[list.size()]);
    }

    @Benchmark
    public Byte[] sized_array_fixed_size() {
        return list.toArray(new Byte[size]);
    }
/*
============================================== SPOILERS ====================================================
============================================== SPOILERS ====================================================
============================================== SPOILERS ====================================================
============================================== SPOILERS ====================================================
This thing is *VERY* easy to untangle with "-prof perfasm" and JMH 1.5+, which is able to decode VM stubs.
Both ArrayList.toArray() and the ArrayList(Collection) copy constructor boil down to Arrays.copyOf(),
which is delegated to the VM stub that implements arraycopy (StubRoutines::jint_disjoint_arraycopy).
The hottest block there is (the same for "defensive_copy" and "simple_toArray"):
2.03% 3.83% 0x00007f57b3fb8460: vmovdqu -0x38(%rdi,%rdx,8),%ymm0
6.19% 10.37% 0x00007f57b3fb8466: vmovdqu %ymm0,-0x38(%rsi,%rdx,8)
11.73% 15.67% 0x00007f57b3fb846c: vmovdqu -0x18(%rdi,%rdx,8),%ymm1
28.56% 15.09% 0x00007f57b3fb8472: vmovdqu %ymm1,-0x18(%rsi,%rdx,8)
15.06% 20.46% 0x00007f57b3fb8478: add $0x8,%rdx
0.05% 0x00007f57b3fb847c: jle Stub::jint_disjoint_arraycopy+64 0x7f57b3fb8460
This is the AVX2-enabled arraycopy of the entire array, without fear or remorse, and it is
expected to be fast. Notice the result of toArray() is Object[], and you cannot cast it to
Byte[]. Copying an ArrayList also produces a copy of the internal Object[], since the element
type is erased. But if you need a typed array... you fall into an interesting trap, as do all
the other tests that put the values into Byte[] arrays. ArrayList.toArray(T[] dst) needs to
check the target element type, and therefore it ultimately delegates to the type-checking VM
stub that implements arraycopy (StubRoutines::checkcast_arraycopy):
(almost the same for "zero_sized_array", "sized_array_from_list", "sized_array_fixed_size")
; <a bit torn off, this is actually a final stage>
; pack and store to destination
9.47% 13.70% 0x00007ff76d0529b0: shr $0x3,%rax
6.28% 1.96% 0x00007ff76d0529b4: mov %eax,0x0(%r13,%rdx,4)
; i++, and termination check
2.63% 3.57% 0x00007ff76d0529b9: inc %rdx
0x00007ff76d0529bc: je Stub::checkcast_arraycopy+155 0x7ff76d052a1b
; load and unpack
4.89% 5.43% 0x00007ff76d0529c2: mov (%rdi,%rdx,4),%eax
10.32% 13.93% 0x00007ff76d0529c5: shl $0x3,%rax
; test for null
1.60% 2.10% 0x00007ff76d0529c9: test %rax,%rax
0x00007ff76d0529cc: je Stub::checkcast_arraycopy+48 0x7ff76d0529b0
; type check
3.83% 4.55% 0x00007ff76d0529ce: mov 0x8(%rax),%r11d
17.64% 21.19% 0x00007ff76d0529d2: shl $0x3,%r11
13.07% 17.12% 0x00007ff76d0529d6: cmp %r8,%r11
0x00007ff76d0529d9: je Stub::checkcast_arraycopy+48 0x7ff76d0529b0
This is a per-element copy that, for each element: loads the reference, unpacks it, null-checks it,
checks that the type is correct, and stores it to the destination. No wonder this thing is
significantly slower than a streaming copy. (Actually, the VM could be a bit smarter here and
figure out that the target array type is correct for all elements.)
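In Java terms, the stub does roughly the following per-element loop (a sketch for illustration
only; the helper name checkcastCopy is hypothetical, and the real stub works on compressed oops,
not on Object references):

```java
// Rough Java equivalent of StubRoutines::checkcast_arraycopy (illustration only).
// Returns -1 on success, or the index of the first element that failed the type check.
static int checkcastCopy(Object[] src, Object[] dst, Class<?> elemType) {
    for (int i = 0; i < src.length; i++) {
        Object o = src[i];                        // load the reference
        if (o != null && !elemType.isInstance(o)) {
            return i;                             // type check failed: bail out, like the stub does
        }
        dst[i] = o;                               // null or correctly-typed: store to destination
    }
    return -1;                                    // the whole array was copied
}
```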
The only "mystery" left is why "zero_sized_array" is slightly faster than pre-sized arrays. If you look into
the *second* hottest block in non-zero array copies, you will naturally see the allocation of the
Byte[] array:
0.30% 0x00007ff76d19e2ad: movl $0xf8011e02,0x8(%r14) ; {metadata('java/lang/Byte'[])}
0x00007ff76d19e2b5: mov %r11d,0xc(%r14)
0x00007ff76d19e2b9: prefetchnta 0x140(%r8)
0.06% 0x00007ff76d19e2c1: mov %r14,%rdi
0x00007ff76d19e2c4: add $0x10,%rdi
0x00007ff76d19e2c8: prefetchnta 0x180(%r8)
0.02% 0.02% 0x00007ff76d19e2d0: shr $0x3,%rcx
0x00007ff76d19e2d4: add $0xfffffffffffffffe,%rcx
0.02% 0x00007ff76d19e2d8: xor %rax,%rax
0x00007ff76d19e2db: shl $0x3,%rcx
9.76% 0.10% 0x00007ff76d19e2df: rex.W rep stos %al,%es:(%rdi) ;*anewarray
0.37% 0x00007ff76d19e2e2: mov 0x8(%r12,%rbp,8),%r10d ; implicit exception: dispatches to 0x00007ff76d19e5ee
Notice this requires *pre-zeroing* the array: see the "rep stos" that consumes around 10% of the time.
When we are dealing with "zero_sized_array", the VM allocates the array for us. But, as far as I can
tell, since the VM knows it is going to fill the array entirely, it skips the pre-zeroing and just
copies the contents over. This explains the ~10% performance difference between the zero-sized and
pre-sized toArray(...) calls.
Sometimes you should just trust the VM to do the sane thing.
*/
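As a quick, self-contained illustration of the typing behavior discussed above (a hypothetical
snippet, not part of the benchmark; the class name ToArrayTypes is made up):

```java
import java.util.ArrayList;
import java.util.List;

public class ToArrayTypes {
    public static void main(String[] args) {
        List<Byte> list = new ArrayList<>();
        list.add((byte) 1);
        list.add((byte) 2);

        // No-arg toArray() always returns an array whose runtime type is Object[] ...
        Object[] untyped = list.toArray();
        System.out.println(untyped.getClass().getSimpleName()); // Object[]

        // ... so casting it to Byte[] fails at runtime, even though it compiles.
        try {
            Byte[] broken = (Byte[]) untyped;
        } catch (ClassCastException expected) {
            System.out.println("cannot cast Object[] to Byte[]");
        }

        // The typed overload returns a real Byte[]; with a zero-sized hint,
        // the JDK allocates a right-sized array internally.
        Byte[] typed = list.toArray(new Byte[0]);
        System.out.println(typed.getClass().getSimpleName()); // Byte[]
        System.out.println(typed.length);                     // 2
    }
}
```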
}