Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import jdk.incubator.foreign.CLinker;
import jdk.incubator.foreign.MemoryAddress;
import jdk.incubator.foreign.MemorySegment;
import jdk.incubator.foreign.ResourceScope;
import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.VectorSpecies;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Warmup;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.concurrent.TimeUnit;
/**
 * JMH benchmark that loads {@link IntVector}s from a source and stores them into a
 * destination, once per benchmark method for each kind of backing storage:
 * plain {@code int[]} arrays, heap {@link ByteBuffer}s, direct {@link ByteBuffer}s,
 * {@link ByteBuffer} views of native {@link MemorySegment}s tied to an implicit
 * {@link ResourceScope}, and {@link ByteBuffer} views of segments created inside a
 * confined scope on every invocation.
 *
 * <p>Every benchmark iterates an int index {@code i} from 0 up to
 * {@code SPECIES.loopBound(srcArray.length)} in steps of {@code SPECIES.length()}.
 * The buffer-based variants pass {@code i << 2} as the byte offset (index times four).
 * The {@code *Add} variants additionally add the loaded vector to itself before storing.
 *
 * <p>Results are reported as average time per invocation in nanoseconds.
 */
@BenchmarkMode(Mode.AverageTime)
@Warmup(iterations = 5, time = 500, timeUnit = TimeUnit.MILLISECONDS)
@Measurement(iterations = 10, time = 500, timeUnit = TimeUnit.MILLISECONDS)
@State(org.openjdk.jmh.annotations.Scope.Thread)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
// One fork; the appended JVM arguments add the two incubator modules imported above
// and enable native access for the unnamed module.
@Fork(value = 1, jvmArgsAppend = {
"--add-modules=jdk.incubator.foreign,jdk.incubator.vector",
"-Dforeign.restricted=permit",
"--enable-native-access", "ALL-UNNAMED"})
public class TestLoadStoreInts {
// Preferred species for int lanes; its lane count is the stride of every benchmark loop.
private static final VectorSpecies<Integer> SPECIES = VectorSpecies.ofPreferred(int.class);
// Working-set size in bytes: used as-is for every buffer/segment/allocation length and
// divided by Integer.BYTES to obtain the int[] length (see setup()).
@Param("1024")
private int size;
// Source and destination for the array-based benchmarks.
private int[] srcArray;
private int[] dstArray;
// Heap-backed byte buffers (ByteBuffer.allocate).
private ByteBuffer srcBufferHeap;
private ByteBuffer dstBufferHeap;
// Direct byte buffers (ByteBuffer.allocateDirect).
private ByteBuffer srcBufferNative;
private ByteBuffer dstBufferNative;
// Implicit scope shared by both native segments below, plus their ByteBuffer views.
private ResourceScope implicitScope;
private MemorySegment srcSegmentImplicit;
private MemorySegment dstSegmentImplicit;
private ByteBuffer srcBufferSegmentImplicit;
private ByteBuffer dstBufferSegmentImplicit;
// Raw native allocations from CLinker.allocateMemory; wrapped into segments inside
// bufferSegmentConfined() and released in tearDown().
private MemoryAddress srcAddress;
private MemoryAddress dstAddress;
/**
 * Allocates all source/destination storage used by the benchmarks.
 *
 * <p>Only {@code srcArray} is filled with values (element {@code i} holds {@code i});
 * no values are written into the buffers, segments or raw allocations here.
 */
@Setup
public void setup() {
// size is a byte count; the arrays hold size / 4 ints so that all storages span the same bytes.
var intSize = size / Integer.BYTES;
srcArray = new int[intSize];
// Cloned before srcArray is filled, so dstArray starts out as a same-length array of zeros.
dstArray = srcArray.clone();
for (int i = 0; i < srcArray.length; i++) {
srcArray[i] = i;
}
srcBufferHeap = ByteBuffer.allocate(size);
dstBufferHeap = ByteBuffer.allocate(size);
srcBufferNative = ByteBuffer.allocateDirect(size);
dstBufferNative = ByteBuffer.allocateDirect(size);
implicitScope = ResourceScope.newImplicitScope();
// Both segments are allocated with SPECIES.vectorByteSize() passed as the alignment argument.
srcSegmentImplicit = MemorySegment.allocateNative(size, SPECIES.vectorByteSize(), implicitScope);
srcBufferSegmentImplicit = srcSegmentImplicit.asByteBuffer();
dstSegmentImplicit = MemorySegment.allocateNative(size, SPECIES.vectorByteSize(), implicitScope);
dstBufferSegmentImplicit = dstSegmentImplicit.asByteBuffer();
srcAddress = CLinker.allocateMemory(size);
dstAddress = CLinker.allocateMemory(size);
}
/**
 * Frees the two raw native allocations obtained from {@code CLinker.allocateMemory}
 * in {@link #setup()}.
 */
@TearDown
public void tearDown() {
CLinker.freeMemory(srcAddress);
CLinker.freeMemory(dstAddress);
}
/** Loads each vector from {@code srcArray} and stores it at the same index in {@code dstArray}. */
@Benchmark
public void array() {
for (int i = 0; i < SPECIES.loopBound(srcArray.length); i += SPECIES.length()) {
var v = IntVector.fromArray(SPECIES, srcArray, i);
v.intoArray(dstArray, i);
}
}
/** Same as {@link #array()}, but adds each loaded vector to itself before storing it. */
@Benchmark
public void arrayAdd() {
for (int i = 0; i < SPECIES.loopBound(srcArray.length); i += SPECIES.length()) {
var v = IntVector.fromArray(SPECIES, srcArray, i);
v = v.add(v);
v.intoArray(dstArray, i);
}
}
/** Load/store between the two heap byte buffers, using native byte order. */
@Benchmark
public void bufferHeap() {
for (int i = 0; i < SPECIES.loopBound(srcArray.length); i += SPECIES.length()) {
// i counts ints; i << 2 is the corresponding byte offset into the buffer.
var v = IntVector.fromByteBuffer(SPECIES, srcBufferHeap, i << 2, ByteOrder.nativeOrder());
v.intoByteBuffer(dstBufferHeap, i << 2, ByteOrder.nativeOrder());
}
}
/** Load/store between the two direct byte buffers, using native byte order. */
@Benchmark
public void bufferNative() {
for (int i = 0; i < SPECIES.loopBound(srcArray.length); i += SPECIES.length()) {
var v = IntVector.fromByteBuffer(SPECIES, srcBufferNative, i << 2, ByteOrder.nativeOrder());
v.intoByteBuffer(dstBufferNative, i << 2, ByteOrder.nativeOrder());
}
}
/** Same as {@link #bufferNative()}, but adds each loaded vector to itself before storing it. */
@Benchmark
public void bufferNativeAdd() {
for (int i = 0; i < SPECIES.loopBound(srcArray.length); i += SPECIES.length()) {
var v = IntVector.fromByteBuffer(SPECIES, srcBufferNative, i << 2, ByteOrder.nativeOrder());
v = v.add(v);
v.intoByteBuffer(dstBufferNative, i << 2, ByteOrder.nativeOrder());
}
}
/**
 * Load/store between the byte-buffer views of the two native segments that were
 * allocated under the implicit scope in {@link #setup()}.
 */
@Benchmark
public void bufferSegmentImplicit() {
for (int i = 0; i < SPECIES.loopBound(srcArray.length); i += SPECIES.length()) {
var v = IntVector.fromByteBuffer(SPECIES, srcBufferSegmentImplicit, i << 2, ByteOrder.nativeOrder());
v.intoByteBuffer(dstBufferSegmentImplicit, i << 2, ByteOrder.nativeOrder());
}
}
/**
 * Load/store between byte-buffer views of segments that wrap {@code srcAddress} and
 * {@code dstAddress} under a freshly created confined scope.
 *
 * <p>The scope, both segments and both buffer views are created on every invocation and
 * the scope is closed when the try-with-resources block exits, so that work is part of
 * the measured time in addition to the load/store loop.
 */
@Benchmark
public void bufferSegmentConfined() {
try (final var scope = ResourceScope.newConfinedScope()) {
final var srcBufferSegmentConfined = srcAddress.asSegment(size, scope).asByteBuffer();
final var dstBufferSegmentConfined = dstAddress.asSegment(size, scope).asByteBuffer();
for (int i = 0; i < SPECIES.loopBound(srcArray.length); i += SPECIES.length()) {
var v = IntVector.fromByteBuffer(SPECIES, srcBufferSegmentConfined, i << 2, ByteOrder.nativeOrder());
v.intoByteBuffer(dstBufferSegmentConfined, i << 2, ByteOrder.nativeOrder());
}
}
}
}
/*
# VM options: -XX:-TieredCompilation -XX:ObjectAlignmentInBytes=32 -Xbatch --add-modules=jdk.incubator.foreign,jdk.incubator.vector -Dforeign.restricted=permit --enable-native-access ALL-UNNAMED
Benchmark (size) Mode Cnt Score Error Units
TestLoadStoreInts.array 1024 avgt 10 17.307 ± 0.515 ns/op
TestLoadStoreInts.array 16384 avgt 10 301.323 ± 9.914 ns/op
TestLoadStoreInts.arrayAdd 1024 avgt 10 20.967 ± 0.265 ns/op
TestLoadStoreInts.arrayAdd 16384 avgt 10 294.017 ± 12.858 ns/op
TestLoadStoreInts.bufferHeap 1024 avgt 10 175.051 ± 4.957 ns/op
TestLoadStoreInts.bufferHeap 16384 avgt 10 2663.099 ± 28.749 ns/op
TestLoadStoreInts.bufferNative 1024 avgt 10 177.113 ± 1.992 ns/op
TestLoadStoreInts.bufferNative 16384 avgt 10 2608.598 ± 79.713 ns/op
TestLoadStoreInts.bufferNativeAdd 1024 avgt 10 171.592 ± 2.619 ns/op
TestLoadStoreInts.bufferNativeAdd 16384 avgt 10 2590.391 ± 35.435 ns/op
TestLoadStoreInts.bufferSegmentConfined 1024 avgt 10 1841.485 ± 83.677 ns/op
TestLoadStoreInts.bufferSegmentConfined 16384 avgt 10 3082.778 ± 172.632 ns/op
TestLoadStoreInts.bufferSegmentImplicit 1024 avgt 10 171.448 ± 1.796 ns/op
TestLoadStoreInts.bufferSegmentImplicit 16384 avgt 10 2612.027 ± 40.755 ns/op
For array, the generated strip-mined loop is unrolled:
││││ │ ;; B48: # out( B48 B49 ) <- in( B47 B48 ) Loop( B48-B48 inner main of N306 strip mined) Freq: 1.35695e+07
4.75% ││││↗│ 0x00000001134c3392: vmovdqu 0x10(%rdx,%r8,4),%ymm0
2.94% ││││││ 0x00000001134c3399: vmovdqu %ymm0,0x10(%rsi,%r8,4)
8.92% ││││││ 0x00000001134c33a0: vmovdqu 0x30(%rdx,%r8,4),%ymm0
2.03% ││││││ 0x00000001134c33a7: vmovdqu %ymm0,0x30(%rsi,%r8,4)
2.24% ││││││ 0x00000001134c33ae: vmovdqu 0x50(%rdx,%r8,4),%ymm0
2.86% ││││││ 0x00000001134c33b5: vmovdqu %ymm0,0x50(%rsi,%r8,4)
14.36% ││││││ 0x00000001134c33bc: vmovdqu 0x70(%rdx,%r8,4),%ymm0
2.07% ││││││ 0x00000001134c33c3: vmovdqu %ymm0,0x70(%rsi,%r8,4)
2.05% ││││││ 0x00000001134c33ca: add $0x20,%r8d
2.40% ││││││ 0x00000001134c33ce: cmp %ebp,%r8d
││││╰│ 0x00000001134c33d1: jl 0x00000001134c3392
----
With patch applied:
diff --git a/src/hotspot/share/opto/vectorIntrinsics.cpp b/src/hotspot/share/opto/vectorIntrinsics.cpp
index c36b2a22caf..bb1cc6f723e 100644
--- a/src/hotspot/share/opto/vectorIntrinsics.cpp
+++ b/src/hotspot/share/opto/vectorIntrinsics.cpp
@@ -827,7 +827,7 @@ bool LibraryCallKit::inline_vector_mem_operation(bool is_store) {
const TypeInstPtr* vbox_type = TypeInstPtr::make_exact(TypePtr::NotNull, vbox_klass);
if (can_access_non_heap) {
- insert_mem_bar(Op_MemBarCPUOrder);
+// insert_mem_bar(Op_MemBarCPUOrder);
}
if (is_store) {
@@ -874,7 +874,7 @@ bool LibraryCallKit::inline_vector_mem_operation(bool is_store) {
old_map->destruct(&_gvn);
if (can_access_non_heap) {
- insert_mem_bar(Op_MemBarCPUOrder);
+// insert_mem_bar(Op_MemBarCPUOrder);
}
C->set_max_vector_size(MAX2(C->max_vector_size(), (uint)(num_elem * type2aelembytes(elem_bt))));
# VM options: -XX:-TieredCompilation -XX:ObjectAlignmentInBytes=32 -Xbatch --add-modules=jdk.incubator.foreign,jdk.incubator.vector -Dforeign.restricted=permit --enable-native-access ALL-UNNAMED
Benchmark (size) Mode Cnt Score Error Units
TestLoadStoreInts.array 1024 avgt 10 17.222 ± 0.147 ns/op
TestLoadStoreInts.array 16384 avgt 10 290.514 ± 12.998 ns/op
TestLoadStoreInts.arrayAdd 1024 avgt 10 20.944 ± 0.278 ns/op
TestLoadStoreInts.arrayAdd 16384 avgt 10 303.965 ± 18.701 ns/op
TestLoadStoreInts.bufferHeap 1024 avgt 10 18.897 ± 0.188 ns/op
TestLoadStoreInts.bufferHeap 16384 avgt 10 304.835 ± 13.266 ns/op
TestLoadStoreInts.bufferNative 1024 avgt 10 18.591 ± 0.338 ns/op
TestLoadStoreInts.bufferNative 16384 avgt 10 225.034 ± 10.370 ns/op
TestLoadStoreInts.bufferNativeAdd 1024 avgt 10 22.067 ± 0.572 ns/op
TestLoadStoreInts.bufferNativeAdd 16384 avgt 10 234.426 ± 13.257 ns/op
TestLoadStoreInts.bufferSegmentConfined 1024 avgt 10 2047.616 ± 96.892 ns/op
TestLoadStoreInts.bufferSegmentConfined 16384 avgt 10 1649.711 ± 57.005 ns/op
TestLoadStoreInts.bufferSegmentImplicit 1024 avgt 10 18.595 ± 0.147 ns/op
TestLoadStoreInts.bufferSegmentImplicit 16384 avgt 10 239.499 ± 11.620 ns/op
array hot loop:
││││ │ ;; B48: # out( B48 B49 ) <- in( B47 B48 ) Loop( B48-B48 inner main of N306 strip mined) Freq: 3.36034e+09
2.72% ││││↗│ 0x0000000108cddc92: vmovdqu 0x10(%rdx,%r8,4),%ymm0
1.96% ││││││ 0x0000000108cddc99: vmovdqu %ymm0,0x10(%rsi,%r8,4)
8.95% ││││││ 0x0000000108cddca0: vmovdqu 0x30(%rdx,%r8,4),%ymm0
8.35% ││││││ 0x0000000108cddca7: vmovdqu %ymm0,0x30(%rsi,%r8,4)
11.27% ││││││ 0x0000000108cddcae: vmovdqu 0x50(%rdx,%r8,4),%ymm0
3.41% ││││││ 0x0000000108cddcb5: vmovdqu %ymm0,0x50(%rsi,%r8,4)
29.06% ││││││ 0x0000000108cddcbc: vmovdqu 0x70(%rdx,%r8,4),%ymm0
15.99% ││││││ 0x0000000108cddcc3: vmovdqu %ymm0,0x70(%rsi,%r8,4)
13.23% ││││││ 0x0000000108cddcca: add $0x20,%r8d
0.22% ││││││ 0x0000000108cddcce: cmp %ebp,%r8d
││││╰│ 0x0000000108cddcd1: jl 0x0000000108cddc92
bufferNative hot loop:
│ │ ;; B33: # out( B33 B34 ) <- in( B32 B33 ) Loop( B33-B33 inner main of N354 strip mined) Freq: 3.2761e+09
3.02% │↗│ 0x0000000111edae44: mov %rbx,%rax
0.20% │││ 0x0000000111edae47: mov %r9,%rbp
7.56% │││ 0x0000000111edae4a: mov %esi,%ecx
0.50% │││ 0x0000000111edae4c: shl $0x2,%ecx
3.51% │││ 0x0000000111edae4f: movslq %ecx,%rcx
0.12% │││ 0x0000000111edae52: mov %rcx,%r13
7.50% │││ 0x0000000111edae55: add %rdi,%r13
0.38% │││ 0x0000000111edae58: add %r13,%rbp
3.12% │││ 0x0000000111edae5b: add %rdx,%rcx
0.16% │││ 0x0000000111edae5e: add %rcx,%rax
8.79% │││ 0x0000000111edae61: vmovdqu (%rax),%ymm4
7.32% │││ 0x0000000111edae65: vmovdqu %ymm4,0x0(%rbp)
11.97% │││ 0x0000000111edae6a: vmovdqu 0x20(%rax),%ymm4
2.40% │││ 0x0000000111edae6f: vmovdqu %ymm4,0x20(%rbp)
11.35% │││ 0x0000000111edae74: vmovdqu 0x40(%rax),%ymm4
3.29% │││ 0x0000000111edae79: vmovdqu %ymm4,0x40(%rbp)
8.02% │││ 0x0000000111edae7e: vmovdqu 0x60(%rax),%ymm4
0.83% │││ 0x0000000111edae83: vmovdqu %ymm4,0x60(%rbp)
9.94% │││ 0x0000000111edae88: add $0x20,%esi
0.60% │││ 0x0000000111edae8b: cmp %r8d,%esi
│╰│ 0x0000000111edae8e: jl 0x0000000111edae44
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment