Skip to content

Instantly share code, notes, and snippets.

@SirYwell
Created April 29, 2024 16:48
Show Gist options
  • Save SirYwell/d9ae4b5393de135ec15429c54d031820 to your computer and use it in GitHub Desktop.
Save SirYwell/d9ae4b5393de135ec15429c54d031820 to your computer and use it in GitHub Desktop.
TornadoVM output
WARNING: Using incubator modules: jdk.incubator.vector
Computing MxM of 512x512
[INFO] Loading Backend: uk.ac.manchester.tornado.drivers.opencl.OCLTornadoDriverProvider@36f0f1be
TornadoGraph dependency matrix...
+----+---------------+
| 5 [data]| <none>
|----+---------------+
| 6 [data]| <none>
|----+---------------+
| 7 [data]| <none>
|----+---------------+
| 8 [data]| <none>
|----+---------------+
| 9 [data]| 10
|----+---------------+
| 10 [task]| 6 7 8
|----+---------------+
| 11 [data]| 10
|----+---------------+
| 12 [data]| 11
|----+---------------+
| 13 [data]| 11
|----+---------------+
| 14 [data]| 11
|----+---------------+
-----------------------------------
Device Table:
[0]: [NVIDIA CUDA] -- NVIDIA GeForce RTX 2070 SUPER
Constant Table:
[0]: 512
Object Table:
[0]: 0x4cf4d528 MatrixFloat <512 x 512>
[1]: 0x73a8da0f MatrixFloat <512 x 512>
[2]: 0x7fbdb894 MatrixFloat <512 x 512>
Task Table:
[0]: task s0.t0 - matrixMultiplication
-----------------------------------
-----------------------------------
TaskGraph:
[0]: constant 0
[1]: object 0
[2]: object 1
[3]: object 2
[4]: context device=0, [ 5 6 7 8 10 11 12 13 14 ]
[5]: persist node
[6]: copy in object 0
[7]: copy in object 1
[8]: copy in object 2
[9]: dependent write on object 2 by task 10
[10]: task=0, args=[ 6 7 8 0 ]
[11]: copy out object 2 after task 10
[12]: deallocate object 1 after 11
[13]: deallocate object 2 after 11
[14]: deallocate object 3 after 11
-----------------------------------
// Extension pragmas emitted ahead of the kernel. The kernel body below only
// declares ulong/long/int/float values and calls no atomic functions, so
// nothing visible here uses double, half or 64-bit atomics.
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
// matrixMultiplication: for every (i_10, i_13) pair assigned to this work-item,
// accumulates 512 fused multiply-adds of floats read through A and B and stores
// the resulting float through C.
//
// Parameters:
//   A, B  - byte pointers to the two input buffers (only read here).
//   C     - byte pointer to the output buffer (written once per (i_10, i_13)).
//   _kernel_context, _constant_region, _local_region, _atomics, size
//         - never referenced in the body; every loop bound is the literal 512
//           rather than the `size` argument.
//
// NOTE(review): the log line printed right after this kernel reports
// clBuildProgram returning -11 (CL_BUILD_PROGRAM_FAILURE). The OpenCL build log
// is not shown, so the cause cannot be determined from this text alone.
__kernel void matrixMultiplication(__global long *_kernel_context, __constant uchar *_constant_region, __local uchar *_local_region, __global int *_atomics, __global uchar *A, __global uchar *B, __global uchar *C, __private int size)
{
ulong ul_18, ul_25, ul_24, ul_22, ul_17, ul_5, ul_35, ul_2, ul_34, ul_7, ul_39, ul_6, ul_1, ul_0, ul_30;
long l_20, l_21, l_37, l_38, l_28, l_29;
int i_33, i_3, i_4, i_36, i_9, i_41, i_8, i_40, i_11, i_10, i_13, i_12, i_14, i_16, i_19, i_27, i_26;
float f_32, f_15, f_31, f_23;
// BLOCK 0
// Buffer addresses are kept as plain 64-bit integers; all later addressing is
// integer arithmetic followed by a cast back to a __global pointer.
ul_0 = (ulong) A;
ul_1 = (ulong) B;
ul_2 = (ulong) C;
// Global sizes of dimensions 0 and 1; used below as the loop increments.
i_3 = get_global_size(0);
i_4 = get_global_size(1);
// Address of the 64-bit word located 32 bytes into each buffer. The word is
// loaded later and added to the buffer's base address.
// NOTE(review): presumably a header field of the MatrixFloat object that
// locates its float storage -- confirm against the TornadoVM object layout.
ul_5 = ul_2 + 32L;
ul_6 = ul_1 + 32L;
ul_7 = ul_0 + 32L;
i_8 = get_global_id(0);
i_9 = get_global_id(1);
// BLOCK 1 MERGES [0 8 ]
// Outer loop: starts at this work-item's id in dimension 1 and advances by the
// global size of dimension 1 (see BLOCK 8).
i_10 = i_9;
for(;i_10 < 512;)
{
// BLOCK 2
// i_10 << 9 is i_10 * 512: the element offset of row i_10 in a 512-wide layout.
i_11 = i_10 << 9;
// Constant bias of 6 elements (24 bytes of float data).
// NOTE(review): presumably skips an array header -- confirm.
i_12 = i_11 + 6;
// BLOCK 3 MERGES [2 7 ]
// Middle loop: starts at this work-item's id in dimension 0 and advances by
// the global size of dimension 0 (see BLOCK 7).
i_13 = i_8;
for(;i_13 < 512;)
{
// BLOCK 4
// Column index with the same 6-element bias, used when indexing through B.
i_14 = i_13 + 6;
// BLOCK 5 MERGES [4 6 ]
// f_15 is the running sum; i_16 walks the 512 terms of the dot product.
f_15 = 0.0F;
i_16 = 0;
for(;i_16 < 512;)
{
// BLOCK 6
// Load the 64-bit word at A+32 and add it to A's base address. This load is
// repeated on every inner iteration.
ul_17 = *((__global ulong *) ul_7);
ul_18 = ul_0 + ul_17;
// Element index i_10*512 + 6 + i_16; << 2 scales it by 4 bytes per float.
i_19 = i_12 + i_16;
l_20 = (long) i_19;
l_21 = l_20 << 2;
ul_22 = ul_18 + l_21;
f_23 = *((__global float *) ul_22);
// Same addressing scheme for B, also re-loaded on every iteration.
ul_24 = *((__global ulong *) ul_6);
ul_25 = ul_1 + ul_24;
// Element index i_16*512 + i_13 + 6.
i_26 = i_16 << 9;
i_27 = i_26 + i_14;
l_28 = (long) i_27;
l_29 = l_28 << 2;
ul_30 = ul_25 + l_29;
f_31 = *((__global float *) ul_30);
// sum = f_23 * f_31 + sum, computed as a single fused multiply-add.
f_32 = fma(f_23, f_31, f_15);
i_33 = i_16 + 1;
f_15 = f_32;
i_16 = i_33;
} // B6
// BLOCK 7
// Store the accumulated sum through C at element index i_10*512 + 6 + i_13,
// using the same base-plus-loaded-word addressing as for A and B.
ul_34 = *((__global ulong *) ul_5);
ul_35 = ul_2 + ul_34;
i_36 = i_13 + i_12;
l_37 = (long) i_36;
l_38 = l_37 << 2;
ul_39 = ul_35 + l_38;
*((__global float *) ul_39) = f_15;
// Advance the middle loop by the global size of dimension 0.
i_40 = i_3 + i_13;
i_13 = i_40;
} // B7
// BLOCK 8
// Advance the outer loop by the global size of dimension 1.
i_41 = i_4 + i_10;
i_10 = i_41;
} // B8
// BLOCK 9
return;
} // kernel
[TornadoVM-OCL-JNI] ERROR : clBuildProgram -> Returned: -11
// Extension pragmas emitted ahead of the kernel. The kernel body below only
// declares ulong/long/int/float values and calls no atomic functions, so
// nothing visible here uses double, half or 64-bit atomics.
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
// matrixMultiplication: for every (i_10, i_13) pair assigned to this work-item,
// accumulates 512 fused multiply-adds of floats read through A and B and stores
// the resulting float through C.
//
// Parameters:
//   A, B  - byte pointers to the two input buffers (only read here).
//   C     - byte pointer to the output buffer (written once per (i_10, i_13)).
//   _kernel_context, _constant_region, _local_region, _atomics, size
//         - never referenced in the body; every loop bound is the literal 512
//           rather than the `size` argument.
//
// NOTE(review): the log line printed right after this kernel reports
// clBuildProgram returning -11 (CL_BUILD_PROGRAM_FAILURE). The OpenCL build log
// is not shown, so the cause cannot be determined from this text alone.
__kernel void matrixMultiplication(__global long *_kernel_context, __constant uchar *_constant_region, __local uchar *_local_region, __global int *_atomics, __global uchar *A, __global uchar *B, __global uchar *C, __private int size)
{
ulong ul_18, ul_25, ul_24, ul_22, ul_17, ul_5, ul_35, ul_2, ul_34, ul_7, ul_39, ul_6, ul_1, ul_0, ul_30;
long l_20, l_21, l_37, l_38, l_28, l_29;
int i_33, i_3, i_4, i_36, i_9, i_41, i_8, i_40, i_11, i_10, i_13, i_12, i_14, i_16, i_19, i_27, i_26;
float f_32, f_15, f_31, f_23;
// BLOCK 0
// Buffer addresses are kept as plain 64-bit integers; all later addressing is
// integer arithmetic followed by a cast back to a __global pointer.
ul_0 = (ulong) A;
ul_1 = (ulong) B;
ul_2 = (ulong) C;
// Global sizes of dimensions 0 and 1; used below as the loop increments.
i_3 = get_global_size(0);
i_4 = get_global_size(1);
// Address of the 64-bit word located 32 bytes into each buffer. The word is
// loaded later and added to the buffer's base address.
// NOTE(review): presumably a header field of the MatrixFloat object that
// locates its float storage -- confirm against the TornadoVM object layout.
ul_5 = ul_2 + 32L;
ul_6 = ul_1 + 32L;
ul_7 = ul_0 + 32L;
i_8 = get_global_id(0);
i_9 = get_global_id(1);
// BLOCK 1 MERGES [0 8 ]
// Outer loop: starts at this work-item's id in dimension 1 and advances by the
// global size of dimension 1 (see BLOCK 8).
i_10 = i_9;
for(;i_10 < 512;)
{
// BLOCK 2
// i_10 << 9 is i_10 * 512: the element offset of row i_10 in a 512-wide layout.
i_11 = i_10 << 9;
// Constant bias of 6 elements (24 bytes of float data).
// NOTE(review): presumably skips an array header -- confirm.
i_12 = i_11 + 6;
// BLOCK 3 MERGES [2 7 ]
// Middle loop: starts at this work-item's id in dimension 0 and advances by
// the global size of dimension 0 (see BLOCK 7).
i_13 = i_8;
for(;i_13 < 512;)
{
// BLOCK 4
// Column index with the same 6-element bias, used when indexing through B.
i_14 = i_13 + 6;
// BLOCK 5 MERGES [4 6 ]
// f_15 is the running sum; i_16 walks the 512 terms of the dot product.
f_15 = 0.0F;
i_16 = 0;
for(;i_16 < 512;)
{
// BLOCK 6
// Load the 64-bit word at A+32 and add it to A's base address. This load is
// repeated on every inner iteration.
ul_17 = *((__global ulong *) ul_7);
ul_18 = ul_0 + ul_17;
// Element index i_10*512 + 6 + i_16; << 2 scales it by 4 bytes per float.
i_19 = i_12 + i_16;
l_20 = (long) i_19;
l_21 = l_20 << 2;
ul_22 = ul_18 + l_21;
f_23 = *((__global float *) ul_22);
// Same addressing scheme for B, also re-loaded on every iteration.
ul_24 = *((__global ulong *) ul_6);
ul_25 = ul_1 + ul_24;
// Element index i_16*512 + i_13 + 6.
i_26 = i_16 << 9;
i_27 = i_26 + i_14;
l_28 = (long) i_27;
l_29 = l_28 << 2;
ul_30 = ul_25 + l_29;
f_31 = *((__global float *) ul_30);
// sum = f_23 * f_31 + sum, computed as a single fused multiply-add.
f_32 = fma(f_23, f_31, f_15);
i_33 = i_16 + 1;
f_15 = f_32;
i_16 = i_33;
} // B6
// BLOCK 7
// Store the accumulated sum through C at element index i_10*512 + 6 + i_13,
// using the same base-plus-loaded-word addressing as for A and B.
ul_34 = *((__global ulong *) ul_5);
ul_35 = ul_2 + ul_34;
i_36 = i_13 + i_12;
l_37 = (long) i_36;
l_38 = l_37 << 2;
ul_39 = ul_35 + l_38;
*((__global float *) ul_39) = f_15;
// Advance the middle loop by the global size of dimension 0.
i_40 = i_3 + i_13;
i_13 = i_40;
} // B7
// BLOCK 8
// Advance the outer loop by the global size of dimension 1.
i_41 = i_4 + i_10;
i_10 = i_41;
} // B8
// BLOCK 9
return;
} // kernel
[TornadoVM-OCL-JNI] ERROR : clBuildProgram -> Returned: -11
uk.ac.manchester.tornado.api.exceptions.TornadoRuntimeException: [ERROR] Generated Kernel is NULL.
Please report this issue to https://github.com/beehive-lab/TornadoVM
at tornado.drivers.opencl@1.0.3/uk.ac.manchester.tornado.drivers.opencl.graal.OCLInstalledCode.checkKernelNotNull(OCLInstalledCode.java:346)
at tornado.drivers.opencl@1.0.3/uk.ac.manchester.tornado.drivers.opencl.graal.OCLInstalledCode.submitWithoutEvents(OCLInstalledCode.java:352)
at tornado.drivers.opencl@1.0.3/uk.ac.manchester.tornado.drivers.opencl.graal.OCLInstalledCode.launchWithoutDependencies(OCLInstalledCode.java:392)
at tornado.runtime@1.0.3/uk.ac.manchester.tornado.runtime.interpreter.TornadoVMInterpreter.executeLaunch(TornadoVMInterpreter.java:747)
at tornado.runtime@1.0.3/uk.ac.manchester.tornado.runtime.interpreter.TornadoVMInterpreter.execute(TornadoVMInterpreter.java:338)
at tornado.runtime@1.0.3/uk.ac.manchester.tornado.runtime.interpreter.TornadoVMInterpreter.execute(TornadoVMInterpreter.java:855)
at java.base/java.util.Spliterators$ArraySpliterator.forEachRemaining(Spliterators.java:1024)
at java.base/java.util.stream.ReferencePipeline$Head.forEach(ReferencePipeline.java:762)
at tornado.runtime@1.0.3/uk.ac.manchester.tornado.runtime.TornadoVM.executeInterpreterSingleThreaded(TornadoVM.java:123)
at tornado.runtime@1.0.3/uk.ac.manchester.tornado.runtime.TornadoVM.execute(TornadoVM.java:110)
at tornado.runtime@1.0.3/uk.ac.manchester.tornado.runtime.tasks.TornadoTaskGraph.scheduleInner(TornadoTaskGraph.java:858)
at tornado.runtime@1.0.3/uk.ac.manchester.tornado.runtime.tasks.TornadoTaskGraph.execute(TornadoTaskGraph.java:1338)
at tornado.runtime@1.0.3/uk.ac.manchester.tornado.runtime.tasks.TornadoTaskGraph.execute(TornadoTaskGraph.java:1350)
at tornado.api@1.0.3/uk.ac.manchester.tornado.api.TaskGraph.execute(TaskGraph.java:777)
at tornado.api@1.0.3/uk.ac.manchester.tornado.api.ImmutableTaskGraph.execute(ImmutableTaskGraph.java:49)
at tornado.api@1.0.3/uk.ac.manchester.tornado.api.TornadoExecutionPlan$TornadoExecutor.lambda$execute$0(TornadoExecutionPlan.java:400)
at java.base/java.util.ArrayList.forEach(ArrayList.java:1596)
at tornado.api@1.0.3/uk.ac.manchester.tornado.api.TornadoExecutionPlan$TornadoExecutor.execute(TornadoExecutionPlan.java:400)
at tornado.api@1.0.3/uk.ac.manchester.tornado.api.TornadoExecutionPlan.execute(TornadoExecutionPlan.java:116)
at example.MatrixMultiplication.main(MatrixMultiplication.java:95)
Bailout from LAUNCH Bytecode:
Reason: uk.ac.manchester.tornado.api.exceptions.TornadoRuntimeException: [ERROR] Generated Kernel is NULL.
Please report this issue to https://github.com/beehive-lab/TornadoVM
tornado.runtime@1.0.3/uk.ac.manchester.tornado.runtime.interpreter.TornadoVMInterpreter.executeLaunch(TornadoVMInterpreter.java:755)
tornado.runtime@1.0.3/uk.ac.manchester.tornado.runtime.interpreter.TornadoVMInterpreter.execute(TornadoVMInterpreter.java:338)
tornado.runtime@1.0.3/uk.ac.manchester.tornado.runtime.interpreter.TornadoVMInterpreter.execute(TornadoVMInterpreter.java:855)
java.base/java.util.Spliterators$ArraySpliterator.forEachRemaining(Spliterators.java:1024)
java.base/java.util.stream.ReferencePipeline$Head.forEach(ReferencePipeline.java:762)
tornado.runtime@1.0.3/uk.ac.manchester.tornado.runtime.TornadoVM.executeInterpreterSingleThreaded(TornadoVM.java:123)
tornado.runtime@1.0.3/uk.ac.manchester.tornado.runtime.TornadoVM.execute(TornadoVM.java:110)
tornado.runtime@1.0.3/uk.ac.manchester.tornado.runtime.tasks.TornadoTaskGraph.scheduleInner(TornadoTaskGraph.java:858)
tornado.runtime@1.0.3/uk.ac.manchester.tornado.runtime.tasks.TornadoTaskGraph.execute(TornadoTaskGraph.java:1338)
tornado.runtime@1.0.3/uk.ac.manchester.tornado.runtime.tasks.TornadoTaskGraph.execute(TornadoTaskGraph.java:1350)
tornado.api@1.0.3/uk.ac.manchester.tornado.api.TaskGraph.execute(TaskGraph.java:777)
tornado.api@1.0.3/uk.ac.manchester.tornado.api.ImmutableTaskGraph.execute(ImmutableTaskGraph.java:49)
tornado.api@1.0.3/uk.ac.manchester.tornado.api.TornadoExecutionPlan$TornadoExecutor.lambda$execute$0(TornadoExecutionPlan.java:400)
java.base/java.util.ArrayList.forEach(ArrayList.java:1596)
tornado.api@1.0.3/uk.ac.manchester.tornado.api.TornadoExecutionPlan$TornadoExecutor.execute(TornadoExecutionPlan.java:400)
tornado.api@1.0.3/uk.ac.manchester.tornado.api.TornadoExecutionPlan.execute(TornadoExecutionPlan.java:116)
example.MatrixMultiplication.main(MatrixMultiplication.java:95)
Single Threaded CPU Execution: 1.35 GFlops, Total time = 199 ms
Streams Execution: 7.46 GFlops, Total time = 36 ms
TornadoVM Execution on GPU (Accelerated): 268.44 GFlops, Total Time = 1 ms
Speedup: 199.0x
Verification false
cleanup: programs ..........0.000540105 s
cleanup: context ..........0.000007084 s
cleanup: total ..........0.000547189 s
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment