-
-
Save Spasi/025febb7325b7b73ab2b90f0280796ce to your computer and use it in GitHub Desktop.
Vectorized floating point reductions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// double version: | |
/* | |
Windows 10, JVM 9.0.4+11, Ryzen 1800X: | |
-------------------------------------- | |
Benchmark (size) Mode Cnt Score Error Units | |
SIMDBenchmark.reduceBuffered 1024 thrpt 3 518,570 ± 27,117 ops/ms | |
SIMDBenchmark.reduceBuffered 65536 thrpt 3 35,050 ± 0,297 ops/ms | |
SIMDBenchmark.reduceBuffered 131072 thrpt 3 17,822 ± 1,511 ops/ms | |
SIMDBenchmark.reduceSimple 1024 thrpt 3 1230,955 ± 24,972 ops/ms | |
SIMDBenchmark.reduceSimple 65536 thrpt 3 18,628 ± 0,725 ops/ms | |
SIMDBenchmark.reduceSimple 131072 thrpt 3 9,340 ± 0,217 ops/ms | |
SIMDBenchmark.reduceUnrolled 1024 thrpt 3 4699,957 ± 121,889 ops/ms | |
SIMDBenchmark.reduceUnrolled 65536 thrpt 3 73,777 ± 0,889 ops/ms | |
SIMDBenchmark.reduceUnrolled 131072 thrpt 3 36,505 ± 3,219 ops/ms | |
SIMDBenchmark.reduceVectorised 1024 thrpt 3 365,681 ± 26,555 ops/ms | |
SIMDBenchmark.reduceVectorised 65536 thrpt 3 27,931 ± 1,371 ops/ms | |
SIMDBenchmark.reduceVectorised 131072 thrpt 3 14,286 ± 0,930 ops/ms | |
Linux, JVM 10+43, Ryzen 1800X: | |
------------------------------ | |
Benchmark (size) Mode Cnt Score Error Units | |
SIMDBenchmark.reduceUnrolled 1024 thrpt 5 4474.529 ± 29.028 ops/ms | |
SIMDBenchmark.reduceUnrolled 65536 thrpt 5 70.467 ± 1.030 ops/ms | |
SIMDBenchmark.reduceUnrolled 131072 thrpt 5 35.435 ± 0.293 ops/ms | |
SIMDBenchmark.reduceVectorised 1024 thrpt 5 381.656 ± 10.818 ops/ms | |
SIMDBenchmark.reduceVectorised 65536 thrpt 5 36.867 ± 1.764 ops/ms | |
SIMDBenchmark.reduceVectorised 131072 thrpt 5 18.645 ± 0.546 ops/ms | |
*/ | |
@Benchmark | |
public double reduceVectorised() { | |
double[] buffer = new double[1024]; | |
double[] temp = new double[1024]; | |
for (int i = 0; i < data.length >>> 10; ++i) { | |
System.arraycopy(data, i * 1024, temp, 0, temp.length); | |
for (int j = 0; j < 1024; ++j) { | |
buffer[j] += temp[j]; | |
} | |
} | |
return reduce(buffer); | |
} | |
@Benchmark | |
public double reduceUnrolled() { | |
double a0 = 0.0; | |
double a1 = 0.0; | |
double a2 = 0.0; | |
double a3 = 0.0; | |
for (int i = 0; i < data.length >> 2; i++) { | |
a0 += data[i * 4 + 0]; | |
a1 += data[i * 4 + 1]; | |
a2 += data[i * 4 + 2]; | |
a3 += data[i * 4 + 3]; | |
} | |
return a0 + a1 + a2 + a3; | |
} | |
// float version: | |
/* | |
Windows 10, JVM 9.0.4+11, Ryzen 1800X: | |
-------------------------------------- | |
Benchmark (size) Mode Cnt Score Error Units | |
SIMDBenchmark.reduceBuffered 1024 thrpt 3 632,734 ± 338,190 ops/ms | |
SIMDBenchmark.reduceBuffered 65536 thrpt 3 35,742 ± 4,967 ops/ms | |
SIMDBenchmark.reduceBuffered 131072 thrpt 3 17,894 ± 2,126 ops/ms | |
SIMDBenchmark.reduceSimple 1024 thrpt 3 1227,272 ± 90,193 ops/ms | |
SIMDBenchmark.reduceSimple 65536 thrpt 3 18,640 ± 0,616 ops/ms | |
SIMDBenchmark.reduceSimple 131072 thrpt 3 9,334 ± 0,171 ops/ms | |
SIMDBenchmark.reduceUnrolled128 1024 thrpt 3 4695,636 ± 98,984 ops/ms | |
SIMDBenchmark.reduceUnrolled128 65536 thrpt 3 74,801 ± 0,211 ops/ms | |
SIMDBenchmark.reduceUnrolled128 131072 thrpt 3 36,835 ± 5,834 ops/ms | |
SIMDBenchmark.reduceUnrolled256 1024 thrpt 3 6750,436 ± 257,662 ops/ms | |
SIMDBenchmark.reduceUnrolled256 65536 thrpt 3 105,726 ± 0,525 ops/ms | |
SIMDBenchmark.reduceUnrolled256 131072 thrpt 3 50,739 ± 42,492 ops/ms | |
SIMDBenchmark.reduceVectorised 1024 thrpt 3 566,206 ± 21,687 ops/ms | |
SIMDBenchmark.reduceVectorised 65536 thrpt 3 55,265 ± 0,757 ops/ms | |
SIMDBenchmark.reduceVectorised 131072 thrpt 3 28,468 ± 2,976 ops/ms | |
Linux, JVM 10+43, Ryzen 1800X: | |
------------------------------ | |
Benchmark (size) Mode Cnt Score Error Units | |
SIMDBenchmarkFloat.reduceUnrolled128 1024 thrpt 5 4614.667 ± 70.817 ops/ms | |
SIMDBenchmarkFloat.reduceUnrolled128 65536 thrpt 5 71.675 ± 1.006 ops/ms | |
SIMDBenchmarkFloat.reduceUnrolled128 131072 thrpt 5 36.169 ± 0.308 ops/ms | |
SIMDBenchmarkFloat.reduceUnrolled256 1024 thrpt 5 5010.649 ± 28.092 ops/ms | |
SIMDBenchmarkFloat.reduceUnrolled256 65536 thrpt 5 78.317 ± 0.749 ops/ms | |
SIMDBenchmarkFloat.reduceUnrolled256 131072 thrpt 5 39.264 ± 0.432 ops/ms | |
SIMDBenchmarkFloat.reduceVectorised 1024 thrpt 5 572.878 ± 32.272 ops/ms | |
SIMDBenchmarkFloat.reduceVectorised 65536 thrpt 5 70.393 ± 1.993 ops/ms | |
SIMDBenchmarkFloat.reduceVectorised 131072 thrpt 5 37.340 ± 1.031 ops/ms | |
*/ | |
@Benchmark | |
public float reduceUnrolled128() { | |
float a0 = 0.0f; | |
float a1 = 0.0f; | |
float a2 = 0.0f; | |
float a3 = 0.0f; | |
for (int i = 0; i < data.length >> 2; i++) { | |
a0 += data[i * 4 + 0]; | |
a1 += data[i * 4 + 1]; | |
a2 += data[i * 4 + 2]; | |
a3 += data[i * 4 + 3]; | |
} | |
return a0 + a1 + a2 + a3; | |
} | |
@Benchmark | |
public float reduceUnrolled256() { | |
float a0 = 0.0f; | |
float a1 = 0.0f; | |
float a2 = 0.0f; | |
float a3 = 0.0f; | |
float a4 = 0.0f; | |
float a5 = 0.0f; | |
float a6 = 0.0f; | |
float a7 = 0.0f; | |
for (int i = 0; i < data.length >> 3; i++) { | |
a0 += data[i * 8 + 0]; | |
a1 += data[i * 8 + 1]; | |
a2 += data[i * 8 + 2]; | |
a3 += data[i * 8 + 3]; | |
a4 += data[i * 8 + 4]; | |
a5 += data[i * 8 + 5]; | |
a6 += data[i * 8 + 6]; | |
a7 += data[i * 8 + 7]; | |
} | |
return a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7; | |
} |
Forgot to mention that for the float versions I have written a separate benchmark that uses a float[] data array, so there shouldn't be any float <-> double conversions.
reduceVectorised on JDK 10:
51.62% c2, level 4 bench.generated.SIMDBenchmark_reduceVectorised_jmhTest::reduceVectorised_thrpt_jmhStub, version 921 (626 bytes)
37.43% runtime stub StubRoutines::jlong_disjoint_arraycopy (28 bytes)
51.62% c2, level 4 bench.generated.SIMDBenchmark_reduceVectorised_jmhTest::reduceVectorised_thrpt_jmhStub, version 921 (626 bytes)
37.43% runtime stub StubRoutines::jlong_disjoint_arraycopy (28 bytes)
↗ 0x00007f996566c4c0: vmovdqu 0x10(%r8,%r11,8),%ymm0
0.13% │ 0x00007f996566c4c7: vaddpd 0x10(%r13,%r11,8),%ymm0,%ymm0
2.32% │ 0x00007f996566c4ce: vmovdqu %ymm0,0x10(%r13,%r11,8)
0.35% │ 0x00007f996566c4d5: vmovdqu 0x30(%r8,%r11,8),%ymm0
0.12% │ 0x00007f996566c4dc: vaddpd 0x30(%r13,%r11,8),%ymm0,%ymm0
2.18% │ 0x00007f996566c4e3: vmovdqu %ymm0,0x30(%r13,%r11,8)
0.48% │ 0x00007f996566c4ea: vmovdqu 0x50(%r8,%r11,8),%ymm0
0.12% │ 0x00007f996566c4f1: vaddpd 0x50(%r13,%r11,8),%ymm0,%ymm0
2.18% │ 0x00007f996566c4f8: vmovdqu %ymm0,0x50(%r13,%r11,8)
0.49% │ 0x00007f996566c4ff: vmovdqu 0x70(%r8,%r11,8),%ymm0
0.16% │ 0x00007f996566c506: vaddpd 0x70(%r13,%r11,8),%ymm0,%ymm0
2.41% │ 0x00007f996566c50d: vmovdqu %ymm0,0x70(%r13,%r11,8)
0.37% │ 0x00007f996566c514: vmovdqu 0x90(%r8,%r11,8),%ymm0
0.09% │ 0x00007f996566c51e: vaddpd 0x90(%r13,%r11,8),%ymm0,%ymm0
2.29% │ 0x00007f996566c528: vmovdqu %ymm0,0x90(%r13,%r11,8)
0.40% │ 0x00007f996566c532: vmovdqu 0xb0(%r8,%r11,8),%ymm0
0.13% │ 0x00007f996566c53c: vaddpd 0xb0(%r13,%r11,8),%ymm0,%ymm0
2.69% │ 0x00007f996566c546: vmovdqu %ymm0,0xb0(%r13,%r11,8)
0.31% │ 0x00007f996566c550: vmovdqu 0xd0(%r8,%r11,8),%ymm0
0.19% │ 0x00007f996566c55a: vaddpd 0xd0(%r13,%r11,8),%ymm0,%ymm0
2.32% │ 0x00007f996566c564: vmovdqu %ymm0,0xd0(%r13,%r11,8)
0.55% │ 0x00007f996566c56e: vmovdqu 0xf0(%r8,%r11,8),%ymm0
0.14% │ 0x00007f996566c578: vaddpd 0xf0(%r13,%r11,8),%ymm0,%ymm0
2.30% │ 0x00007f996566c582: vmovdqu %ymm0,0xf0(%r13,%r11,8)
0.54% │ 0x00007f996566c58c: vmovdqu 0x110(%r8,%r11,8),%ymm0
0.10% │ 0x00007f996566c596: vaddpd 0x110(%r13,%r11,8),%ymm0,%ymm0
2.13% │ 0x00007f996566c5a0: vmovdqu %ymm0,0x110(%r13,%r11,8)
0.45% │ 0x00007f996566c5aa: vmovdqu 0x130(%r8,%r11,8),%ymm0
0.12% │ 0x00007f996566c5b4: vaddpd 0x130(%r13,%r11,8),%ymm0,%ymm0
2.37% │ 0x00007f996566c5be: vmovdqu %ymm0,0x130(%r13,%r11,8)
0.29% │ 0x00007f996566c5c8: vmovdqu 0x150(%r8,%r11,8),%ymm0
0.12% │ 0x00007f996566c5d2: vaddpd 0x150(%r13,%r11,8),%ymm0,%ymm0
2.26% │ 0x00007f996566c5dc: vmovdqu %ymm0,0x150(%r13,%r11,8)
0.31% │ 0x00007f996566c5e6: vmovdqu 0x170(%r8,%r11,8),%ymm0
0.12% │ 0x00007f996566c5f0: vaddpd 0x170(%r13,%r11,8),%ymm0,%ymm0
2.69% │ 0x00007f996566c5fa: vmovdqu %ymm0,0x170(%r13,%r11,8)
0.35% │ 0x00007f996566c604: vmovdqu 0x190(%r8,%r11,8),%ymm0
0.14% │ 0x00007f996566c60e: vaddpd 0x190(%r13,%r11,8),%ymm0,%ymm0
2.18% │ 0x00007f996566c618: vmovdqu %ymm0,0x190(%r13,%r11,8)
0.39% │ 0x00007f996566c622: vmovdqu 0x1b0(%r8,%r11,8),%ymm0
0.10% │ 0x00007f996566c62c: vaddpd 0x1b0(%r13,%r11,8),%ymm0,%ymm0
3.83% │ 0x00007f996566c636: vmovdqu %ymm0,0x1b0(%r13,%r11,8)
0.51% │ 0x00007f996566c640: vmovdqu 0x1d0(%r8,%r11,8),%ymm0
0.12% │ 0x00007f996566c64a: vaddpd 0x1d0(%r13,%r11,8),%ymm0,%ymm0
2.47% │ 0x00007f996566c654: vmovdqu %ymm0,0x1d0(%r13,%r11,8)
0.33% │ 0x00007f996566c65e: vmovdqu 0x1f0(%r8,%r11,8),%ymm0
0.10% │ 0x00007f996566c668: vaddpd 0x1f0(%r13,%r11,8),%ymm0,%ymm0
2.35% │ 0x00007f996566c672: vmovdqu %ymm0,0x1f0(%r13,%r11,8)
│ ;*dastore {reexecute=0 rethrow=0 return_oop=0}
│ ; - bench.SIMDBenchmark::reduceVectorised@63 (line 137)
│ ; - bench.generated.SIMDBenchmark_reduceVectorised_jmhTest::reduceVectorised_thrpt_jmhStub@17 (line 119)
0.48% │ 0x00007f996566c67c: add $0x40,%r11d ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ ; - bench.SIMDBenchmark::reduceVectorised@64 (line 136)
│ ; - bench.generated.SIMDBenchmark_reduceVectorised_jmhTest::reduceVectorised_thrpt_jmhStub@17 (line 119)
0.01% │ 0x00007f996566c680: cmp $0x3c1,%r11d
╰ 0x00007f996566c687: jl 0x00007f996566c4c0 ;*goto {reexecute=0 rethrow=0 return_oop=0}
; - bench.SIMDBenchmark::reduceVectorised@67 (line 136)
; - bench.generated.SIMDBenchmark_reduceVectorised_jmhTest::reduceVectorised_thrpt_jmhStub@17 (line 119)
reduceUnrolled on JDK 10:
↗↗ 0x00007f0801895a90: cmp %ecx,%r10d
0.00% ││ 0x00007f0801895a93: jae 0x00007f0801895d9c
││ 0x00007f0801895a99: vaddsd 0x10(%rsi,%r10,8),%xmm5,%xmm0
││ ;*dadd {reexecute=0 rethrow=0 return_oop=0}
││ ; - bench.SIMDBenchmark::reduceUnrolled@37 (line 215)
││ ; - bench.generated.SIMDBenchmark_reduceUnrolled_jmhTest::reduceUnrolled_thrpt_jmhStub@17 (line 119)
0.20% ││ 0x00007f0801895aa0: mov %r10d,%r11d
││ 0x00007f0801895aa3: add $0x3,%r11d
0.82% ││ 0x00007f0801895aa7: cmp %ecx,%r11d
2.57% ││ 0x00007f0801895aaa: jae 0x00007f0801895dd6
││ 0x00007f0801895ab0: movslq %r10d,%r10
0.51% ││ 0x00007f0801895ab3: vaddsd 0x18(%rsi,%r10,8),%xmm3,%xmm7
5.86% ││ 0x00007f0801895aba: vaddsd 0x28(%rsi,%r10,8),%xmm6,%xmm6
0.04% ││ 0x00007f0801895ac1: vaddsd 0x20(%rsi,%r10,8),%xmm4,%xmm4
0.80% ││ 0x00007f0801895ac8: mov %r8d,%r9d
0.00% ││ 0x00007f0801895acb: shl $0x2,%r9d
││ 0x00007f0801895acf: mov %r9d,%r10d
0.50% ││ 0x00007f0801895ad2: add $0x20,%r10d ;*imul {reexecute=0 rethrow=0 return_oop=0}
││ ; - bench.SIMDBenchmark::reduceUnrolled@33 (line 215)
││ ; - bench.generated.SIMDBenchmark_reduceUnrolled_jmhTest::reduceUnrolled_thrpt_jmhStub@17 (line 119)
0.00% ││ 0x00007f0801895ad6: movslq %r9d,%r11
││ 0x00007f0801895ad9: vaddsd 0x30(%rsi,%r11,8),%xmm0,%xmm0
2.86% ││ 0x00007f0801895ae0: vaddsd 0x50(%rsi,%r11,8),%xmm0,%xmm0
1.87% ││ 0x00007f0801895ae7: vaddsd 0x70(%rsi,%r11,8),%xmm0,%xmm0
6.65% ││ 0x00007f0801895aee: vaddsd 0x90(%rsi,%r11,8),%xmm0,%xmm0
10.53% ││ 0x00007f0801895af8: vaddsd 0xb0(%rsi,%r11,8),%xmm0,%xmm0
11.39% ││ 0x00007f0801895b02: vaddsd 0xd0(%rsi,%r11,8),%xmm0,%xmm0
11.85% ││ 0x00007f0801895b0c: vaddsd 0xf0(%rsi,%r11,8),%xmm0,%xmm5
││ ;*dadd {reexecute=0 rethrow=0 return_oop=0}
││ ; - bench.SIMDBenchmark::reduceUnrolled@37 (line 215)
││ ; - bench.generated.SIMDBenchmark_reduceUnrolled_jmhTest::reduceUnrolled_thrpt_jmhStub@17 (line 119)
11.80% ││ 0x00007f0801895b16: vmovsd 0x108(%rsi,%r11,8),%xmm2
0.06% ││ 0x00007f0801895b20: vmovsd 0xe8(%rsi,%r11,8),%xmm3
0.01% ││ 0x00007f0801895b2a: vmovsd 0xc8(%rsi,%r11,8),%xmm8
││ 0x00007f0801895b34: vmovsd 0xa8(%rsi,%r11,8),%xmm9
0.00% ││ 0x00007f0801895b3e: vmovsd 0x88(%rsi,%r11,8),%xmm10
0.00% ││ 0x00007f0801895b48: vmovsd 0x68(%rsi,%r11,8),%xmm13
0.00% ││ 0x00007f0801895b4f: vmovsd 0x48(%rsi,%r11,8),%xmm0 ;*daload {reexecute=0 rethrow=0 return_oop=0}
││ ; - bench.SIMDBenchmark::reduceUnrolled@81 (line 218)
││ ; - bench.generated.SIMDBenchmark_reduceUnrolled_jmhTest::reduceUnrolled_thrpt_jmhStub@17 (line 119)
││ 0x00007f0801895b56: vmovsd 0x100(%rsi,%r11,8),%xmm11
││ ;*daload {reexecute=0 rethrow=0 return_oop=0}
││ ; - bench.SIMDBenchmark::reduceUnrolled@65 (line 217)
││ ; - bench.generated.SIMDBenchmark_reduceUnrolled_jmhTest::reduceUnrolled_thrpt_jmhStub@17 (line 119)
3.94% ││ 0x00007f0801895b60: vmovsd 0x38(%rsi,%r11,8),%xmm1 ;*daload {reexecute=0 rethrow=0 return_oop=0}
││ ; - bench.SIMDBenchmark::reduceUnrolled@50 (line 216)
││ ; - bench.generated.SIMDBenchmark_reduceUnrolled_jmhTest::reduceUnrolled_thrpt_jmhStub@17 (line 119)
││ 0x00007f0801895b67: vmovsd 0xe0(%rsi,%r11,8),%xmm12
0.00% ││ 0x00007f0801895b71: vaddsd %xmm1,%xmm7,%xmm1
0.01% ││ 0x00007f0801895b75: vmovsd 0xc0(%rsi,%r11,8),%xmm7
││ 0x00007f0801895b7f: vaddsd %xmm0,%xmm6,%xmm0
0.03% ││ 0x00007f0801895b83: vmovsd 0x80(%rsi,%r11,8),%xmm6
0.00% ││ 0x00007f0801895b8d: vaddsd %xmm13,%xmm0,%xmm0
0.01% ││ 0x00007f0801895b92: vmovsd 0x60(%rsi,%r11,8),%xmm13
3.97% ││ 0x00007f0801895b99: vaddsd %xmm10,%xmm0,%xmm10
0.02% ││ 0x00007f0801895b9e: vmovsd 0x40(%rsi,%r11,8),%xmm0
0.00% ││ 0x00007f0801895ba5: vaddsd %xmm0,%xmm4,%xmm0
││ 0x00007f0801895ba9: vmovsd 0xa0(%rsi,%r11,8),%xmm4 ;*daload {reexecute=0 rethrow=0 return_oop=0}
││ ; - bench.SIMDBenchmark::reduceUnrolled@65 (line 217)
││ ; - bench.generated.SIMDBenchmark_reduceUnrolled_jmhTest::reduceUnrolled_thrpt_jmhStub@17 (line 119)
││ 0x00007f0801895bb3: vaddsd %xmm13,%xmm0,%xmm0
0.01% ││ 0x00007f0801895bb8: vmovsd 0xf8(%rsi,%r11,8),%xmm13
0.03% ││ 0x00007f0801895bc2: vaddsd %xmm6,%xmm0,%xmm0
0.12% ││ 0x00007f0801895bc6: vmovsd 0xd8(%rsi,%r11,8),%xmm6
3.79% ││ 0x00007f0801895bd0: vaddsd %xmm4,%xmm0,%xmm0
0.27% ││ 0x00007f0801895bd4: vmovsd 0xb8(%rsi,%r11,8),%xmm4
││ 0x00007f0801895bde: vaddsd %xmm7,%xmm0,%xmm0
0.49% ││ 0x00007f0801895be2: vmovsd 0x98(%rsi,%r11,8),%xmm14
││ 0x00007f0801895bec: vaddsd %xmm12,%xmm0,%xmm12
2.77% ││ 0x00007f0801895bf1: vmovsd 0x78(%rsi,%r11,8),%xmm0
││ 0x00007f0801895bf8: vmovsd 0x58(%rsi,%r11,8),%xmm7 ;*daload {reexecute=0 rethrow=0 return_oop=0}
││ ; - bench.SIMDBenchmark::reduceUnrolled@50 (line 216)
││ ; - bench.generated.SIMDBenchmark_reduceUnrolled_jmhTest::reduceUnrolled_thrpt_jmhStub@17 (line 119)
││ 0x00007f0801895bff: vaddsd %xmm7,%xmm1,%xmm1
2.09% ││ 0x00007f0801895c03: vaddsd %xmm0,%xmm1,%xmm0
0.02% ││ 0x00007f0801895c07: vaddsd %xmm9,%xmm10,%xmm1
││ 0x00007f0801895c0c: vaddsd %xmm14,%xmm0,%xmm0
0.07% ││ 0x00007f0801895c11: vaddsd %xmm8,%xmm1,%xmm1
0.10% ││ 0x00007f0801895c16: vaddsd %xmm4,%xmm0,%xmm0
1.78% ││ 0x00007f0801895c1a: vaddsd %xmm3,%xmm1,%xmm1
0.20% ││ 0x00007f0801895c1e: vaddsd %xmm6,%xmm0,%xmm0
1.41% ││ 0x00007f0801895c22: vaddsd %xmm2,%xmm1,%xmm6 ;*dadd {reexecute=0 rethrow=0 return_oop=0}
││ ; - bench.SIMDBenchmark::reduceUnrolled@82 (line 218)
││ ; - bench.generated.SIMDBenchmark_reduceUnrolled_jmhTest::reduceUnrolled_thrpt_jmhStub@17 (line 119)
2.84% ││ 0x00007f0801895c26: vaddsd %xmm13,%xmm0,%xmm3 ;*dadd {reexecute=0 rethrow=0 return_oop=0}
││ ; - bench.SIMDBenchmark::reduceUnrolled@51 (line 216)
││ ; - bench.generated.SIMDBenchmark_reduceUnrolled_jmhTest::reduceUnrolled_thrpt_jmhStub@17 (line 119)
5.99% ││ 0x00007f0801895c2b: vaddsd %xmm11,%xmm12,%xmm4 ;*dadd {reexecute=0 rethrow=0 return_oop=0}
││ ; - bench.SIMDBenchmark::reduceUnrolled@66 (line 217)
││ ; - bench.generated.SIMDBenchmark_reduceUnrolled_jmhTest::reduceUnrolled_thrpt_jmhStub@17 (line 119)
0.82% ││ 0x00007f0801895c30: add $0x8,%r8d ;*iinc {reexecute=0 rethrow=0 return_oop=0}
││ ; - bench.SIMDBenchmark::reduceUnrolled@85 (line 214)
││ ; - bench.generated.SIMDBenchmark_reduceUnrolled_jmhTest::reduceUnrolled_thrpt_jmhStub@17 (line 119)
││ 0x00007f0801895c34: cmp %edi,%r8d
╰│ 0x00007f0801895c37: jl 0x00007f0801895a90 ;*goto {reexecute=0 rethrow=0 return_oop=0}
│ ; - bench.SIMDBenchmark::reduceUnrolled@88 (line 214)
│ ; - bench.generated.SIMDBenchmark_reduceUnrolled_jmhTest::reduceUnrolled_thrpt_jmhStub@17 (line 119)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi - thanks for posting this! I ran your code with perfasm (Skylake, Windows 10, jdk10-ea-38) and reproduced quite good performance for reduceUnrolled - but it looks like the code is scalar (e.g. vmovsd qword). Still, it's faster than reduceVectorised on my machine. reduceUnrolled256 does quite badly on my setup; it seems to be dominated by float-to-double conversions. Perhaps these are cheaper on Ryzen?