This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package jmh; | |
import jdk.incubator.foreign.CLinker; | |
import jdk.incubator.foreign.MemoryAccess; | |
import jdk.incubator.foreign.MemoryAddress; | |
import jdk.incubator.foreign.MemorySegment; | |
import jdk.incubator.foreign.ResourceScope; | |
import jdk.incubator.vector.ByteVector; | |
import jdk.incubator.vector.IntVector; | |
import jdk.incubator.vector.VectorSpecies; | |
import org.openjdk.jmh.annotations.Benchmark; | |
import org.openjdk.jmh.annotations.BenchmarkMode; | |
import org.openjdk.jmh.annotations.Fork; | |
import org.openjdk.jmh.annotations.Measurement; | |
import org.openjdk.jmh.annotations.Mode; | |
import org.openjdk.jmh.annotations.OutputTimeUnit; | |
import org.openjdk.jmh.annotations.Param; | |
import org.openjdk.jmh.annotations.Setup; | |
import org.openjdk.jmh.annotations.State; | |
import org.openjdk.jmh.annotations.Warmup; | |
import java.nio.ByteBuffer; | |
import java.nio.ByteOrder; | |
import java.util.concurrent.TimeUnit; | |
@BenchmarkMode(Mode.AverageTime) | |
@Warmup(iterations = 5, time = 500, timeUnit = TimeUnit.MILLISECONDS) | |
@Measurement(iterations = 10, time = 500, timeUnit = TimeUnit.MILLISECONDS) | |
@State(org.openjdk.jmh.annotations.Scope.Thread) | |
@OutputTimeUnit(TimeUnit.NANOSECONDS) | |
@Fork(value = 1, jvmArgsAppend = { | |
"--add-modules=jdk.incubator.foreign,jdk.incubator.vector", | |
"-Dforeign.restricted=permit", | |
"--enable-native-access", "ALL-UNNAMED"}) | |
public class TestLoadStoreBytes { | |
private static final VectorSpecies<Byte> SPECIES = VectorSpecies.ofPreferred(byte.class); | |
@Param("1024") | |
private int size; | |
private byte[] srcArray; | |
private byte[] dstArray; | |
private ByteBuffer srcBufferHeap; | |
private ByteBuffer dstBufferHeap; | |
private ByteBuffer srcBufferNative; | |
private ByteBuffer dstBufferNative; | |
private ResourceScope implicitScope; | |
private MemorySegment srcSegmentImplicit; | |
private MemorySegment dstSegmentImplicit; | |
private ByteBuffer srcBufferSegmentImplicit; | |
private ByteBuffer dstBufferSegmentImplicit; | |
private MemoryAddress srcAddress; | |
private MemoryAddress dstAddress; | |
@Setup | |
public void setup() { | |
srcArray = new byte[size]; | |
dstArray = srcArray.clone(); | |
for (int i = 0; i < srcArray.length; i++) { | |
srcArray[i] = (byte) i; | |
} | |
srcBufferHeap = ByteBuffer.allocate(size); | |
dstBufferHeap = ByteBuffer.allocate(size); | |
srcBufferNative = ByteBuffer.allocateDirect(size); | |
dstBufferNative = ByteBuffer.allocateDirect(size); | |
implicitScope = ResourceScope.newImplicitScope(); | |
srcSegmentImplicit = MemorySegment.allocateNative(size, SPECIES.vectorByteSize(), implicitScope); | |
srcBufferSegmentImplicit = srcSegmentImplicit.asByteBuffer(); | |
dstSegmentImplicit = MemorySegment.allocateNative(size, SPECIES.vectorByteSize(), implicitScope); | |
dstBufferSegmentImplicit = dstSegmentImplicit.asByteBuffer(); | |
srcAddress = CLinker.allocateMemory(size); | |
dstAddress = CLinker.allocateMemory(size); | |
} | |
@Benchmark | |
public void array() { | |
for (int i = 0; i < SPECIES.loopBound(srcArray.length); i += SPECIES.length()) { | |
var v = ByteVector.fromArray(SPECIES, srcArray, i); | |
v.intoArray(dstArray, i); | |
} | |
} | |
@Benchmark | |
public void arrayScalar() { | |
for (int i = 0; i < SPECIES.loopBound(srcArray.length); i ++) { | |
var v = srcArray[i]; | |
dstArray[i] = v; | |
} | |
} | |
@Benchmark | |
public void bufferHeap() { | |
for (int i = 0; i < SPECIES.loopBound(srcArray.length); i += SPECIES.length()) { | |
var v = ByteVector.fromByteBuffer(SPECIES, srcBufferHeap, i, ByteOrder.nativeOrder()); | |
v.intoByteBuffer(dstBufferHeap, i, ByteOrder.nativeOrder()); | |
} | |
} | |
@Benchmark | |
public void bufferHeapScalar() { | |
for (int i = 0; i < SPECIES.loopBound(srcArray.length); i++) { | |
var v = srcBufferHeap.get(i); | |
dstBufferHeap.put(i, v); | |
} | |
} | |
@Benchmark | |
public void bufferNative() { | |
for (int i = 0; i < SPECIES.loopBound(srcArray.length); i += SPECIES.length()) { | |
var v = ByteVector.fromByteBuffer(SPECIES, srcBufferNative, i, ByteOrder.nativeOrder()); | |
v.intoByteBuffer(dstBufferNative, i, ByteOrder.nativeOrder()); | |
} | |
} | |
@Benchmark | |
public void bufferNativeScalar() { | |
for (int i = 0; i < SPECIES.loopBound(srcArray.length); i++) { | |
var v = srcBufferNative.get(i); | |
dstBufferNative.put(i, v); | |
} | |
} | |
@Benchmark | |
public void bufferSegmentImplicit() { | |
for (int i = 0; i < SPECIES.loopBound(srcArray.length); i += SPECIES.length()) { | |
var v = ByteVector.fromByteBuffer(SPECIES, srcBufferSegmentImplicit, i, ByteOrder.nativeOrder()); | |
v.intoByteBuffer(dstBufferSegmentImplicit, i, ByteOrder.nativeOrder()); | |
} | |
} | |
@Benchmark | |
public void segmentImplicitScalar() { | |
for (int i = 0; i < SPECIES.loopBound(srcArray.length); i++) { | |
var v = MemoryAccess.getByteAtOffset(srcSegmentImplicit, i); | |
MemoryAccess.setByteAtOffset(dstSegmentImplicit, i, v); | |
} | |
} | |
@Benchmark | |
public void bufferSegmentConfined() { | |
try (final var scope = ResourceScope.newConfinedScope()) { | |
final var srcBufferSegmentConfined = srcAddress.asSegment(size, scope).asByteBuffer(); | |
final var dstBufferSegmentConfined = dstAddress.asSegment(size, scope).asByteBuffer(); | |
for (int i = 0; i < SPECIES.loopBound(srcArray.length); i += SPECIES.length()) { | |
var v = ByteVector.fromByteBuffer(SPECIES, srcBufferSegmentConfined, i, ByteOrder.nativeOrder()); | |
v.intoByteBuffer(dstBufferSegmentConfined, i, ByteOrder.nativeOrder()); | |
} | |
} | |
} | |
} | |
/* | |
git@github.com:openjdk/jdk.git | |
cd20c01942dd8559a31e51ef2a595c6eba44b8ad refs/remotes/origin/HEAD | |
# VM options: -XX:-TieredCompilation -XX:ObjectAlignmentInBytes=32 --add-modules=jdk.incubator.foreign,jdk.incubator.vector -Dforeign.restricted=permit --enable-native-access ALL-UNNAMED | |
Benchmark (size) Mode Cnt Score Error Units | |
TestLoadStoreBytes.array 1024 avgt 10 13.767 ± 0.215 ns/op | |
TestLoadStoreBytes.array 16384 avgt 10 279.825 ± 5.764 ns/op | |
TestLoadStoreBytes.arrayScalar 1024 avgt 10 23.255 ± 0.684 ns/op | |
TestLoadStoreBytes.arrayScalar 16384 avgt 10 195.704 ± 6.330 ns/op | |
TestLoadStoreBytes.bufferHeap 1024 avgt 10 78.539 ± 0.431 ns/op | |
TestLoadStoreBytes.bufferHeap 16384 avgt 10 1303.619 ± 31.830 ns/op | |
TestLoadStoreBytes.bufferHeapScalar 1024 avgt 10 268.601 ± 8.998 ns/op | |
TestLoadStoreBytes.bufferHeapScalar 16384 avgt 10 4217.291 ± 28.522 ns/op | |
TestLoadStoreBytes.bufferNative 1024 avgt 10 73.032 ± 0.611 ns/op | |
TestLoadStoreBytes.bufferNative 16384 avgt 10 1282.294 ± 13.322 ns/op | |
TestLoadStoreBytes.bufferNativeScalar 1024 avgt 10 257.354 ± 13.796 ns/op | |
TestLoadStoreBytes.bufferNativeScalar 16384 avgt 10 4064.541 ± 53.939 ns/op | |
TestLoadStoreBytes.bufferSegmentConfined 1024 avgt 10 439.680 ± 8.065 ns/op | |
TestLoadStoreBytes.bufferSegmentConfined 16384 avgt 10 1388.378 ± 16.244 ns/op | |
TestLoadStoreBytes.bufferSegmentImplicit 1024 avgt 10 77.520 ± 0.753 ns/op | |
TestLoadStoreBytes.bufferSegmentImplicit 16384 avgt 10 1266.489 ± 11.080 ns/op | |
TestLoadStoreBytes.segmentImplicitScalar 1024 avgt 10 749.856 ± 8.769 ns/op | |
TestLoadStoreBytes.segmentImplicitScalar 16384 avgt 10 11846.255 ± 130.863 ns/op | |
# VM options: -XX:+TieredCompilation -XX:ObjectAlignmentInBytes=32 --add-modules=jdk.incubator.foreign,jdk.incubator.vector -Dforeign.restricted=permit --enable-native-access ALL-UNNAMED | |
Benchmark (size) Mode Cnt Score Error Units | |
TestLoadStoreBytes.array 1024 avgt 10 13.728 ± 0.818 ns/op | |
TestLoadStoreBytes.array 16384 avgt 10 260.386 ± 11.354 ns/op | |
TestLoadStoreBytes.arrayScalar 1024 avgt 10 23.388 ± 0.703 ns/op | |
TestLoadStoreBytes.arrayScalar 16384 avgt 10 190.860 ± 11.174 ns/op | |
TestLoadStoreBytes.bufferHeap 1024 avgt 10 71.069 ± 0.883 ns/op | |
TestLoadStoreBytes.bufferHeap 16384 avgt 10 1282.913 ± 11.387 ns/op | |
TestLoadStoreBytes.bufferHeapScalar 1024 avgt 10 266.658 ± 6.184 ns/op | |
TestLoadStoreBytes.bufferHeapScalar 16384 avgt 10 4165.673 ± 68.071 ns/op | |
TestLoadStoreBytes.bufferNative 1024 avgt 10 70.646 ± 1.210 ns/op | |
TestLoadStoreBytes.bufferNative 16384 avgt 10 1239.830 ± 12.737 ns/op | |
TestLoadStoreBytes.bufferNativeScalar 1024 avgt 10 251.330 ± 2.522 ns/op | |
TestLoadStoreBytes.bufferNativeScalar 16384 avgt 10 4042.816 ± 50.017 ns/op | |
TestLoadStoreBytes.bufferSegmentConfined 1024 avgt 10 50.773 ± 0.413 ns/op | |
TestLoadStoreBytes.bufferSegmentConfined 16384 avgt 10 535.652 ± 3.546 ns/op | |
TestLoadStoreBytes.bufferSegmentImplicit 1024 avgt 10 70.323 ± 0.748 ns/op | |
TestLoadStoreBytes.bufferSegmentImplicit 16384 avgt 10 1265.497 ± 9.701 ns/op | |
TestLoadStoreBytes.segmentImplicitScalar 1024 avgt 10 747.202 ± 9.150 ns/op | |
TestLoadStoreBytes.segmentImplicitScalar 16384 avgt 10 11903.696 ± 353.461 ns/op | |
---- | |
With patch applied: | |
diff --git a/src/hotspot/share/opto/vectorIntrinsics.cpp b/src/hotspot/share/opto/vectorIntrinsics.cpp | |
index c36b2a22caf..bb1cc6f723e 100644 | |
--- a/src/hotspot/share/opto/vectorIntrinsics.cpp | |
+++ b/src/hotspot/share/opto/vectorIntrinsics.cpp | |
@@ -827,7 +827,7 @@ bool LibraryCallKit::inline_vector_mem_operation(bool is_store) { | |
const TypeInstPtr* vbox_type = TypeInstPtr::make_exact(TypePtr::NotNull, vbox_klass); | |
if (can_access_non_heap) { | |
- insert_mem_bar(Op_MemBarCPUOrder); | |
+// insert_mem_bar(Op_MemBarCPUOrder); | |
} | |
if (is_store) { | |
@@ -874,7 +874,7 @@ bool LibraryCallKit::inline_vector_mem_operation(bool is_store) { | |
old_map->destruct(&_gvn); | |
if (can_access_non_heap) { | |
- insert_mem_bar(Op_MemBarCPUOrder); | |
+// insert_mem_bar(Op_MemBarCPUOrder); | |
} | |
C->set_max_vector_size(MAX2(C->max_vector_size(), (uint)(num_elem * type2aelembytes(elem_bt)))); | |
# VM options: -XX:-TieredCompilation -XX:ObjectAlignmentInBytes=32 --add-modules=jdk.incubator.foreign,jdk.incubator.vector -Dforeign.restricted=permit --enable-native-access ALL-UNNAMED | |
Benchmark (size) Mode Cnt Score Error Units | |
TestLoadStoreBytes.array 1024 avgt 10 13.713 ± 0.135 ns/op | |
TestLoadStoreBytes.array 16384 avgt 10 283.524 ± 4.238 ns/op | |
TestLoadStoreBytes.arrayScalar 1024 avgt 10 23.123 ± 0.283 ns/op | |
TestLoadStoreBytes.arrayScalar 16384 avgt 10 200.712 ± 8.645 ns/op | |
TestLoadStoreBytes.bufferHeap 1024 avgt 10 25.664 ± 0.316 ns/op | |
TestLoadStoreBytes.bufferHeap 16384 avgt 10 385.964 ± 12.450 ns/op | |
TestLoadStoreBytes.bufferHeapScalar 1024 avgt 10 265.677 ± 3.302 ns/op | |
TestLoadStoreBytes.bufferHeapScalar 16384 avgt 10 4163.773 ± 72.358 ns/op | |
TestLoadStoreBytes.bufferNative 1024 avgt 10 25.556 ± 0.390 ns/op | |
TestLoadStoreBytes.bufferNative 16384 avgt 10 349.823 ± 4.298 ns/op | |
TestLoadStoreBytes.bufferNativeScalar 1024 avgt 10 252.061 ± 3.130 ns/op | |
TestLoadStoreBytes.bufferNativeScalar 16384 avgt 10 4062.956 ± 70.301 ns/op | |
TestLoadStoreBytes.bufferSegmentConfined 1024 avgt 10 63.998 ± 0.425 ns/op | |
TestLoadStoreBytes.bufferSegmentConfined 16384 avgt 10 806.393 ± 10.080 ns/op | |
TestLoadStoreBytes.bufferSegmentImplicit 1024 avgt 10 25.539 ± 0.268 ns/op | |
TestLoadStoreBytes.bufferSegmentImplicit 16384 avgt 10 349.869 ± 4.381 ns/op | |
TestLoadStoreBytes.segmentImplicitScalar 1024 avgt 10 754.664 ± 21.680 ns/op | |
TestLoadStoreBytes.segmentImplicitScalar 16384 avgt 10 11887.237 ± 189.345 ns/op | |
# VM options: -XX:+TieredCompilation -XX:ObjectAlignmentInBytes=32 --add-modules=jdk.incubator.foreign,jdk.incubator.vector -Dforeign.restricted=permit --enable-native-access ALL-UNNAMED | |
Benchmark (size) Mode Cnt Score Error Units | |
TestLoadStoreBytes.array 1024 avgt 10 14.034 ± 0.176 ns/op | |
TestLoadStoreBytes.array 16384 avgt 10 252.624 ± 10.333 ns/op | |
TestLoadStoreBytes.arrayAdd 1024 avgt 10 15.911 ± 0.523 ns/op | |
TestLoadStoreBytes.arrayAdd 16384 avgt 10 271.969 ± 5.597 ns/op | |
TestLoadStoreBytes.arrayScalar 1024 avgt 10 23.344 ± 0.535 ns/op | |
TestLoadStoreBytes.arrayScalar 16384 avgt 10 195.957 ± 10.636 ns/op | |
TestLoadStoreBytes.bufferHeap 1024 avgt 10 25.644 ± 0.281 ns/op | |
TestLoadStoreBytes.bufferHeap 16384 avgt 10 388.989 ± 4.021 ns/op | |
TestLoadStoreBytes.bufferHeapScalar 1024 avgt 10 266.135 ± 3.611 ns/op | |
TestLoadStoreBytes.bufferHeapScalar 16384 avgt 10 4158.670 ± 63.424 ns/op | |
TestLoadStoreBytes.bufferNative 1024 avgt 10 25.678 ± 0.332 ns/op | |
TestLoadStoreBytes.bufferNative 16384 avgt 10 349.329 ± 3.850 ns/op | |
TestLoadStoreBytes.bufferNativeScalar 1024 avgt 10 251.668 ± 3.311 ns/op | |
TestLoadStoreBytes.bufferNativeScalar 16384 avgt 10 4045.838 ± 129.280 ns/op | |
TestLoadStoreBytes.bufferSegmentConfined 1024 avgt 10 34.934 ± 0.257 ns/op | |
TestLoadStoreBytes.bufferSegmentConfined 16384 avgt 10 286.946 ± 6.120 ns/op | |
TestLoadStoreBytes.bufferSegmentImplicit 1024 avgt 10 25.681 ± 0.501 ns/op | |
TestLoadStoreBytes.bufferSegmentImplicit 16384 avgt 10 351.099 ± 10.846 ns/op | |
TestLoadStoreBytes.segmentImplicitScalar 1024 avgt 10 750.972 ± 9.326 ns/op | |
TestLoadStoreBytes.segmentImplicitScalar 16384 avgt 10 11864.784 ± 135.626 ns/op | |
*/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment