Last active
July 17, 2017 14:27
-
-
Save bollu/9fddc42a9998d6d5ae37204a43c3a67e to your computer and use it in GitHub Desktop.
Testing to make sure that bollu/polly branch 07-17-break-arrray-access-for-invariant-load-in-blockgen generates sensible code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
16:25 $ make runbatch | |
rm *.optimised.ll | |
rm *.out | |
rm *.bench | |
rm: cannot remove '*.bench': No such file or directory | |
makefile:23: recipe for target 'clean' failed | |
make: [clean] Error 1 (ignored) | |
rm *.s | |
/users/siddhart/llvm-install/bin/opt -S -polly-canonicalize -polly-process-unprofitable -polly-invariant-load-hoisting -polly-codegen-ppcg \ | |
-polly-acc-mincompute=0 program.ll -o program.optimised.ll | |
/users/siddhart/llvm-install/bin/llc program.optimised.ll -o program.s | |
/users/siddhart/llvm-install/bin/clang program.s -lcudart -lGPURuntime -ldl -lOpenCL -lgfortran -lstdc++ -o program.out -L/opt/nvidia/cudatoolkit8.0/8.0.54_2.2.8_ga620558-2.1/lib64/ | |
export POLLY_DEBUG=1 | |
POLLY_DEBUG=1 srun -n 1 -Cgpu --partition=debug nvprof ./program.out | |
srun: job 2387457 queued and waiting for resources | |
srun: job 2387457 has been allocated resources | |
-> polly_initContext | |
-> initContextCUDA | |
==29336== NVPROF is profiling process 29336, command: ./program.out | |
> Running on GPU device 0 : Tesla P100-PCIE-16GB. | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
CUDA Link Completed in 0.000000ms. Linker Output: | |
ptxas info : 0 bytes gmem | |
ptxas info : Compiling entry function 'FUNC_inita_SCOP_0_KERNEL_0' for 'sm_60' | |
ptxas info : Function properties for FUNC_inita_SCOP_0_KERNEL_0 | |
ptxas . 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads | |
ptxas info : Used 4 registers, 328 bytes cmem[0] | |
info : 0 bytes gmem | |
info : Function properties for 'FUNC_inita_SCOP_0_KERNEL_0': | |
info : used 4 registers, 0 stack, 0 bytes smem, 328 bytes cmem[0], 0 bytes lmem | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
-> polly_initContext | |
-> initContextCUDA | |
-> polly_allocateMemoryForDevice | |
-> allocateMemoryForDeviceCUDA | |
-> polly_getDevicePtr | |
-> getDevicePtrCUDA | |
-> polly_getKernel | |
-> getKernelCUDA | |
-> using cached kernel | |
-> polly_launchKernel | |
-> launchKernelCUDA | |
-> polly_freeKernel | |
-> freeKernelCUDA | |
-> polly_copyFromDeviceToHost | |
-> copyFromDeviceToHostCUDA | |
-> polly_freeDeviceMemory | |
-> freeDeviceMemoryCUDA | |
-> polly_freeContext | |
==29336== Profiling application: ./program.out | |
* Control: 3 | |
----- | |
b: 0 | e: 1 | |
A: 0 0 0 0 0 0 | |
----- | |
b: 0 | e: 2 | |
A: 0 0 0 0 0 0 | |
----- | |
b: 0 | e: 3 | |
A: 0 0 0 0 0 0 | |
----- | |
b: 0 | e: 4 | |
A: 0 0 0 0 0 0 | |
----- | |
b: 0 | e: 5 | |
A: 0 0 0 0 0 0 | |
----- | |
b: 1 | e: 2 | |
A: 0 0 0 0 0 0 | |
----- | |
b: 1 | e: 3 | |
A: 0 0 0 0 0 0 | |
----- | |
b: 1 | e: 4 | |
A: 0 0 0 0 0 0 | |
----- | |
b: 1 | e: 5 | |
A: 0 0 0 0 0 0 | |
----- | |
b: 2 | e: 3 | |
A: 0 0 0 0 0 0 | |
----- | |
b: 2 | e: 4 | |
A: 0 0 0 0 0 0 | |
----- | |
b: 2 | e: 5 | |
A: 0 0 0 0 0 0 | |
----- | |
b: 3 | e: 4 | |
A: 0 0 0 0 0 0 | |
----- | |
b: 3 | e: 5 | |
A: 0 0 0 0 0 0 | |
----- | |
b: 4 | e: 5 | |
A: 0 0 0 0 0 0 | |
* Control: 4 | |
----- | |
b: 0 | e: 1 | |
A: 10 0 0 0 0 0 | |
----- | |
b: 0 | e: 2 | |
A: 10 10 0 0 0 0 | |
----- | |
b: 0 | e: 3 | |
A: 10 10 10 0 0 0 | |
----- | |
b: 0 | e: 4 | |
A: 10 10 10 10 0 0 | |
----- | |
b: 0 | e: 5 | |
A: 10 10 10 10 10 0 | |
----- | |
b: 1 | e: 2 | |
A: 0 10 0 0 0 0 | |
----- | |
b: 1 | e: 3 | |
A: 0 10 10 0 0 0 | |
----- | |
b: 1 | e: 4 | |
A: 0 10 10 10 0 0 | |
----- | |
b: 1 | e: 5 | |
A: 0 10 10 10 10 0 | |
----- | |
b: 2 | e: 3 | |
A: 0 0 10 0 0 0 | |
----- | |
b: 2 | e: 4 | |
A: 0 0 10 10 0 0 | |
----- | |
b: 2 | e: 5 | |
A: 0 0 10 10 10 0 | |
----- | |
b: 3 | e: 4 | |
A: 0 0 0 10 0 0 | |
----- | |
b: 3 | e: 5 | |
A: 0 0 0 10 10 0 | |
----- | |
b: 4 | e: 5 | |
A: 0 0 0 0 10 0 | |
==29336== Profiling result: | |
Time(%) Time Calls Avg Min Max Name | |
72.49% 53.280us 30 1.7760us 1.6640us 3.2640us FUNC_inita_SCOP_0_KERNEL_0 | |
27.51% 20.224us 30 674ns 640ns 832ns [CUDA memcpy DtoH] | |
==29336== API calls: | |
Time(%) Time Calls Avg Min Max Name | |
91.74% 265.27ms 1 265.27ms 265.27ms 265.27ms cuCtxCreate | |
5.67% 16.381ms 30 546.03us 499.71us 580.99us cuMemAlloc | |
1.50% 4.3289ms 1 4.3289ms 4.3289ms 4.3289ms cuLinkAddData | |
0.73% 2.1040ms 30 70.134us 66.598us 95.121us cuMemFree | |
0.17% 480.85us 30 16.028us 15.338us 20.152us cuMemcpyDtoH | |
0.13% 366.10us 30 12.203us 10.963us 23.069us cuLaunchKernel | |
0.03% 97.328us 1 97.328us 97.328us 97.328us cuLinkComplete | |
0.03% 73.145us 1 73.145us 73.145us 73.145us cuModuleLoadData | |
0.01% 24.702us 1 24.702us 24.702us 24.702us cuLinkCreate | |
0.01% 17.288us 1 17.288us 17.288us 17.288us cuDeviceGetName | |
0.00% 2.2970us 3 765ns 180ns 1.7200us cuDeviceGetCount | |
0.00% 2.2530us 1 2.2530us 2.2530us 2.2530us cuLinkDestroy | |
0.00% 874ns 3 291ns 184ns 484ns cuDeviceGet | |
0.00% 855ns 4 213ns 135ns 324ns cuDeviceGetAttribute | |
0.00% 678ns 1 678ns 678ns 678ns cuModuleGetFunction | |
0.00% 654ns 1 654ns 654ns 654ns cuDeviceComputeCapability |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// [siddhart@greina0 compile-cpp-to-gpu]$ cat program.c | |
#include <stdio.h> | |
/*
 * Write one value into every element arr[i] for *begin <= i < *end.
 *
 * The value stored is *readarr when *control > 3, and 0 otherwise.
 * Nothing is written when *begin >= *end.
 *
 * begin   - pointer to the first index to write (read once, at loop entry)
 * end     - pointer to the exclusive upper index (re-read on every iteration)
 * arr     - destination array; the caller must make indices
 *           [*begin, *end) valid
 * control - pointer to the selector; re-read on every iteration
 * readarr - pointer to the value stored when the selector is > 3; only
 *           dereferenced on iterations where *control > 3
 */
void f(int *begin, int *end, int *arr, int *control, int *readarr) {
    for (int i = *begin; i < *end; i++) {
        int t = 0;
        if (*control > 3) {
            t += *readarr;
        }
        arr[i] = t;
    }
}
/*
 * Zero the first six elements of A.
 *
 * A - destination array; must have room for at least 6 ints. Elements
 *     past index 5 are not touched.
 */
void inita(int *A) {
    for (int i = 0; i < 6; i++) {
        A[i] = 0;
    }
}
/*
 * Print the first six elements of A to stdout on a single line, in the
 * form "A: v0 v1 v2 v3 v4 v5 " followed by a newline.
 *
 * A - array to print; must hold at least 6 ints.
 */
void printarr(int *A) {
    printf("A: ");
    for (int i = 0; i < 6; i++) {
        printf("%d ", A[i]);
    }
    printf("\n");
}
/*
 * Test driver: for control = 3 and control = 4, and for every index pair
 * (b, e) with 0 <= b < e < 6, reset A with inita(), run f() over [b, e)
 * and print the resulting array.
 *
 * With control == 3 the condition in f() (*control > 3) is false, so all
 * printed arrays are expected to be zero; with control == 4 the elements
 * in [b, e) are expected to be 10 (the value of readarr).
 *
 * Returns 0.
 */
int main() {
    int A[6];
    int readarr = 10;
    /* b, e and control are passed to f() by address, so they must be
     * addressable objects rather than plain loop counters. */
    for (int control = 3; control <= 4; control++) {
        printf("* Control: %d\n", control);
        for (int b = 0; b < 6; b++) {
            for (int e = b + 1; e < 6; e++) {
                printf("-----\n");
                printf("b: %d | e: %d\n", b, e);
                inita(A);
                f(&b, &e, A, &control, &readarr);
                printarr(A);
            }
        }
    }
    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[siddhart@greina0 compile-cpp-to-gpu]$ cat program.canonical.ll | |
; ModuleID = 'program.ll' | |
source_filename = "program.c" | |
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" | |
@.str = private unnamed_addr constant [4 x i8] c"A: \00", align 1 | |
@.str.1 = private unnamed_addr constant [4 x i8] c"%d \00", align 1 | |
@.str.2 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1 | |
@.str.3 = private unnamed_addr constant [15 x i8] c"* Control: %d\0A\00", align 1 | |
@.str.4 = private unnamed_addr constant [7 x i8] c"-----\0A\00", align 1 | |
@.str.5 = private unnamed_addr constant [15 x i8] c"b: %d | e: %d\0A\00", align 1 | |
@str = private unnamed_addr constant [6 x i8] c"-----\00" | |
; @f: stores one i32 into %arr[i] for each i starting at the value loaded
; from %begin, for as long as i is less than the value loaded from %end.
; The stored value is the i32 loaded from %readarr when the i32 loaded
; from %control is > 3, otherwise 0.
define void @f(i32* %begin, i32* %end, i32* %arr, i32* %control, i32* %readarr) { | |
entry: | |
br label %entry.split | |
entry.split: ; preds = %entry | |
; Load both bounds once and skip the loop entirely when begin >= end.
%tmp1 = load i32, i32* %begin, align 4 | |
%tmp41 = load i32, i32* %end, align 4 | |
%cmp2 = icmp slt i32 %tmp1, %tmp41 | |
br i1 %cmp2, label %for.body.lr.ph, label %for.end | |
for.body.lr.ph: ; preds = %entry.split | |
; The induction variable is 64-bit; sign-extend the starting index.
%0 = sext i32 %tmp1 to i64 | |
br label %for.body | |
for.body: ; preds = %for.body.lr.ph, %if.end | |
%indvars.iv = phi i64 [ %0, %for.body.lr.ph ], [ %indvars.iv.next, %if.end ] | |
; %control is loaded inside the loop, i.e. on every iteration.
%tmp6 = load i32, i32* %control, align 4 | |
%cmp1 = icmp sgt i32 %tmp6, 3 | |
br i1 %cmp1, label %if.then, label %if.end | |
if.then: ; preds = %for.body | |
; %readarr is only loaded on the path where control > 3.
%tmp8 = load i32, i32* %readarr, align 4 | |
br label %if.end | |
if.end: ; preds = %if.then, %for.body | |
; %t.0 is the loaded %readarr value when coming from if.then, else 0.
%t.0 = phi i32 [ %tmp8, %if.then ], [ 0, %for.body ] | |
%arrayidx = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv | |
store i32 %t.0, i32* %arrayidx, align 4 | |
%indvars.iv.next = add i64 %indvars.iv, 1 | |
; %end is reloaded after the store for the loop-exit comparison.
%tmp4 = load i32, i32* %end, align 4 | |
%1 = sext i32 %tmp4 to i64 | |
%cmp = icmp slt i64 %indvars.iv.next, %1 | |
br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge | |
for.cond.for.end_crit_edge: ; preds = %if.end | |
br label %for.end | |
for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split | |
ret void | |
} | |
; @inita: stores i32 0 into %A[0] through %A[5].
define void @inita(i32* %A) { | |
entry: | |
br label %entry.split | |
entry.split: ; preds = %entry | |
br label %for.body | |
for.body: ; preds = %entry.split, %for.body | |
; Single-block loop: %indvars.iv counts 0..5.
%indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] | |
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv | |
store i32 0, i32* %arrayidx, align 4 | |
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 | |
; Loop again until the incremented index reaches 6.
%exitcond = icmp ne i64 %indvars.iv.next, 6 | |
br i1 %exitcond, label %for.body, label %for.end | |
for.end: ; preds = %for.body | |
ret void | |
} | |
; @printarr: prints "A: ", then %A[0] through %A[5] each formatted with
; "%d ", then a newline character.
define void @printarr(i32* %A) { | |
entry: | |
br label %entry.split | |
entry.split: ; preds = %entry | |
; @.str is the "A: " prefix.
%call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0)) | |
br label %for.body | |
for.body: ; preds = %entry.split, %for.body | |
%indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] | |
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv | |
%tmp3 = load i32, i32* %arrayidx, align 4 | |
; @.str.1 is the "%d " element format.
%call1 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.1, i64 0, i64 0), i32 %tmp3) | |
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 | |
; Loop again until the incremented index reaches 6.
%exitcond = icmp ne i64 %indvars.iv.next, 6 | |
br i1 %exitcond, label %for.body, label %for.end | |
for.end: ; preds = %for.body | |
; Character code 10 is '\n'; this call stands in for printf of @.str.2.
%putchar = tail call i32 @putchar(i32 10) | |
ret void | |
} | |
declare i32 @printf(i8*, ...) | |
; @main: test driver. Keeps readarr, control, b and e in stack slots whose
; addresses are passed to @f. For control = 3 and 4, and for each b/e pair
; produced by the nested loops below, it calls @inita, @f and @printarr on
; the 6-element array %A. Returns 0.
define i32 @main() { | |
entry: | |
%A = alloca [6 x i32], align 16 | |
%readarr = alloca i32, align 4 | |
%control1 = alloca i32, align 4 | |
%b = alloca i32, align 4 | |
%e = alloca i32, align 4 | |
br label %entry.split | |
entry.split: ; preds = %entry | |
; readarr = 10, control = 3.
store i32 10, i32* %readarr, align 4 | |
store i32 3, i32* %control1, align 4 | |
br label %for.body | |
for.body: ; preds = %entry.split, %for.inc15 | |
; Outer loop over control: print the "* Control: %d" header, then b = 0.
%tmp1 = load i32, i32* %control1, align 4 | |
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.3, i64 0, i64 0), i32 %tmp1) | |
store i32 0, i32* %b, align 4 | |
br label %for.cond5.preheader | |
for.cond5.preheader: ; preds = %for.body, %for.inc12 | |
; e = b + 1; the inner loop is entered only when b < 5.
%storemerge2.in3 = load i32, i32* %b, align 4 | |
%storemerge24 = add nsw i32 %storemerge2.in3, 1 | |
store i32 %storemerge24, i32* %e, align 4 | |
%cmp65 = icmp slt i32 %storemerge2.in3, 5 | |
br i1 %cmp65, label %for.body7.lr.ph, label %for.inc12 | |
for.body7.lr.ph: ; preds = %for.cond5.preheader | |
br label %for.body7 | |
for.body7: ; preds = %for.body7.lr.ph, %for.body7 | |
; Inner loop body: print the separator and the current b/e values.
%puts = call i32 @puts(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @str, i64 0, i64 0)) | |
%tmp5 = load i32, i32* %b, align 4 | |
%tmp6 = load i32, i32* %e, align 4 | |
%call9 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 %tmp5, i32 %tmp6) | |
; Reset A, run @f with the addresses of b, e, control and readarr, print A.
%arraydecay = getelementptr inbounds [6 x i32], [6 x i32]* %A, i64 0, i64 0 | |
call void @inita(i32* %arraydecay) | |
%arraydecay10 = getelementptr inbounds [6 x i32], [6 x i32]* %A, i64 0, i64 0 | |
call void @f(i32* nonnull %b, i32* nonnull %e, i32* %arraydecay10, i32* nonnull %control1, i32* nonnull %readarr) | |
%arraydecay11 = getelementptr inbounds [6 x i32], [6 x i32]* %A, i64 0, i64 0 | |
call void @printarr(i32* %arraydecay11) | |
; e is reloaded from memory after the calls, incremented and stored back;
; the loop repeats while the pre-increment value is < 5.
%storemerge2.in = load i32, i32* %e, align 4 | |
%storemerge2 = add nsw i32 %storemerge2.in, 1 | |
store i32 %storemerge2, i32* %e, align 4 | |
%cmp6 = icmp slt i32 %storemerge2.in, 5 | |
br i1 %cmp6, label %for.body7, label %for.cond5.for.inc12_crit_edge | |
for.cond5.for.inc12_crit_edge: ; preds = %for.body7 | |
br label %for.inc12 | |
for.inc12: ; preds = %for.cond5.for.inc12_crit_edge, %for.cond5.preheader | |
; b++; repeat the b loop while the pre-increment value is < 5.
%tmp8 = load i32, i32* %b, align 4 | |
%inc13 = add nsw i32 %tmp8, 1 | |
store i32 %inc13, i32* %b, align 4 | |
%cmp3 = icmp slt i32 %tmp8, 5 | |
br i1 %cmp3, label %for.cond5.preheader, label %for.inc15 | |
for.inc15: ; preds = %for.inc12 | |
; control++; repeat the outer loop while the pre-increment value is < 4.
%tmp9 = load i32, i32* %control1, align 4 | |
%inc16 = add nsw i32 %tmp9, 1 | |
store i32 %inc16, i32* %control1, align 4 | |
%cmp = icmp slt i32 %tmp9, 4 | |
br i1 %cmp, label %for.body, label %for.end17 | |
for.end17: ; preds = %for.inc15 | |
ret i32 0 | |
} | |
; Function Attrs: nounwind | |
declare i32 @putchar(i32) #0 | |
; Function Attrs: nounwind | |
declare i32 @puts(i8* nocapture readonly) #0 | |
attributes #0 = { nounwind } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
16:27 $ cat program.optimised.ll | |
; ModuleID = 'program.ll' | |
source_filename = "program.c" | |
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" | |
@.str = private unnamed_addr constant [4 x i8] c"A: \00", align 1 | |
@.str.1 = private unnamed_addr constant [4 x i8] c"%d \00", align 1 | |
@.str.2 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1 | |
@.str.3 = private unnamed_addr constant [15 x i8] c"* Control: %d\0A\00", align 1 | |
@.str.4 = private unnamed_addr constant [7 x i8] c"-----\0A\00", align 1 | |
@.str.5 = private unnamed_addr constant [15 x i8] c"b: %d | e: %d\0A\00", align 1 | |
@str = private unnamed_addr constant [6 x i8] c"-----\00" | |
; Embedded GPU kernels. Each kernel is stored as a NUL-terminated string of PTX
; assembly text (the header inside the string reads "Generated by LLVM NVPTX
; Back-End", ".version 3.2", ".target sm_30") plus a second NUL-terminated
; string holding the kernel's entry-point name. Both strings are handed to
; @polly_getKernel by the host functions further down.
;
; NOTE(review): every line of this dump ends in " | |", which is not LLVM IR
; syntax -- presumably residue from the page the text was copied from. Strip it
; before feeding the file to opt/llc.
;
; @FUNC_f_SCOP_0_KERNEL_0: PTX for entry FUNC_f_SCOP_0_KERNEL_0. The entry
; declares nine parameters (two .u64 followed by seven .u32) and
; ".maxntid 32, 1, 1".
; NOTE(review): the initializer below is split over two physical lines in the
; middle of the "add.s64 %rd23, %rd23, 4194304" instruction. This looks like a
; wrapping artifact of the paste rather than part of the string -- confirm
; against the original .ll (the array type says 1802 bytes).
@FUNC_f_SCOP_0_KERNEL_0 = private unnamed_addr constant [1802 x i8] c"//\0A// Generated by LLVM NVPTX Back-End\0A//\0A\0A.version 3.2\0A.target sm_30\0A.address_size 64\0A\0A\09// .globl\09FUNC_f_SCOP_0_KERNEL_0\0A\0A.visible .entry FUNC_f_SCOP_0_KERNEL_0(\0A\09.param .u64 FUNC_f_SCOP_0_KERNEL_0_param_0,\0A\09.param .u64 FUNC_f_SCOP_0_KERNEL_0_param_1,\0A\09.param .u32 FUNC_f_SCOP_0_KERNEL_0_param_2,\0A\09.param .u32 FUNC_f_SCOP_0_KERNEL_0_param_3,\0A\09.param .u32 FUNC_f_SCOP_0_KERNEL_0_param_4,\0A\09.param .u32 FUNC_f_SCOP_0_KERNEL_0_param_5,\0A\09.param .u32 FUNC_f_SCOP_0_KERNEL_0_param_6,\0A\09.param .u32 FUNC_f_SCOP_0_KERNEL_0_param_7,\0A\09.param .u32 FUNC_f_SCOP_0_KERNEL_0_param_8\0A)\0A.maxntid 32, 1, 1\0A{\0A\09.reg .pred \09%p<4>;\0A\09.reg .b32 \09%r<11>;\0A\09.reg .b64 \09%rd<25>;\0A\0A\09ld.param.u64 \09%rd11, [FUNC_f_SCOP_0_KERNEL_0_param_0];\0A\09ld.param.u64 \09%rd12, [FUNC_f_SCOP_0_KERNEL_0_param_1];\0A\09mov.u32 \09%r5, %ctaid.x;\0A\09mov.u32 \09%r6, %tid.x;\0A\09cvt.u64.u32 \09%rd13, %r6;\0A\09ld.param.u32 \09%r7, [FUNC_f_SCOP_0_KERNEL_0_param_4];\0A\09ld.param.u32 \09%r8, [FUNC_f_SCOP_0_KERNEL_0_param_5];\0A\09ld.param.s32 \09%rd14, [FUNC_f_SCOP_0_KERNEL_0_param_2];\0A\09ld.param.s32 \09%rd1, [FUNC_f_SCOP_0_KERNEL_0_param_3];\0A\09mul.wide.u32 \09%rd15, %r5, 32;\0A\09not.b64 \09%rd16, %rd14;\0A\09add.s64 \09%rd17, %rd16, %rd1;\0A\09sub.s64 \09%rd18, %rd17, %rd15;\0A\09shr.u64 \09%rd19, %rd18, 20;\0A\09ld.global.u32 \09%r10, [%rd11];\0A\09add.s64 \09%rd20, %rd15, %rd14;\0A\09add.s64 \09%rd22, %rd20, %rd13;\0A\09setp.gt.s32 \09%p1, %r7, 3;\0A\09selp.b32 \09%r2, %r8, 0, %p1;\0A\09add.s64 \09%rd24, %rd19, 1;\0A\09shl.b64 \09%rd21, %rd22, 2;\0A\09add.s64 \09%rd23, %rd12, %rd21;\0ALBB0_1:\0A\09setp.lt.s64 \09%p2, %rd22, %rd1;\0A\09@%p2 bra \09LBB0_4;\0A\09bra.uni \09LBB0_2;\0ALBB0_4:\0A\09st.global.u32 \09[%rd23], %r2;\0A\09mov.u32 \09%r10, %r2;\0ALBB0_2:\0A\09bar.sync \090;\0A\09add.s64 \09%rd24, %rd24, -1;\0A\09add.s64 \09%rd23, %rd23, 
4194304;\0A\09add.s64 \09%rd22, %rd22, 1048576;\0A\09setp.eq.s64 \09%p3, %rd24, 0;\0A\09@%p3 bra \09LBB0_3;\0A\09bra.uni \09LBB0_1;\0ALBB0_3:\0A\09st.global.u32 \09[%rd11], %r10;\0A\09ret;\0A}\0A\0A\0A\00" | |
; Entry-point name of the kernel above (22 characters + NUL).
@FUNC_f_SCOP_0_KERNEL_0_name = private unnamed_addr constant [23 x i8] c"FUNC_f_SCOP_0_KERNEL_0\00" | |
; @FUNC_inita_SCOP_0_KERNEL_0: PTX for entry FUNC_inita_SCOP_0_KERNEL_0. The
; entry takes a single .u64 parameter, declares ".maxntid 6, 1, 1", and each
; thread stores the 32-bit value 0 at (param_0 + 4 * %tid.x).
@FUNC_inita_SCOP_0_KERNEL_0 = private unnamed_addr constant [477 x i8] c"//\0A// Generated by LLVM NVPTX Back-End\0A//\0A\0A.version 3.2\0A.target sm_30\0A.address_size 64\0A\0A\09// .globl\09FUNC_inita_SCOP_0_KERNEL_0\0A\0A.visible .entry FUNC_inita_SCOP_0_KERNEL_0(\0A\09.param .u64 FUNC_inita_SCOP_0_KERNEL_0_param_0\0A)\0A.maxntid 6, 1, 1\0A{\0A\09.reg .b32 \09%r<3>;\0A\09.reg .b64 \09%rd<4>;\0A\0A\09ld.param.u64 \09%rd1, [FUNC_inita_SCOP_0_KERNEL_0_param_0];\0A\09mov.u32 \09%r1, %tid.x;\0A\09mul.wide.u32 \09%rd2, %r1, 4;\0A\09add.s64 \09%rd3, %rd1, %rd2;\0A\09mov.u32 \09%r2, 0;\0A\09st.global.u32 \09[%rd3], %r2;\0A\09ret;\0A}\0A\0A\0A\00" | |
; Entry-point name of the kernel above (26 characters + NUL).
@FUNC_inita_SCOP_0_KERNEL_0_name = private unnamed_addr constant [27 x i8] c"FUNC_inita_SCOP_0_KERNEL_0\00" | |
; @f -- versioned host function. It contains two implementations of the same
; loop and a branch that selects between them:
;   * the original loop (entry.split .. for.end.region_exiting), which for every
;     index i starting at *begin stores (*control > 3 ? *readarr : 0) into
;     arr[i] and continues while i + 1 < *end (with *end reloaded each trip);
;   * a generated GPU path (polly.start .. polly.exiting) that calls the polly_*
;     runtime to launch FUNC_f_SCOP_0_KERNEL_0 and copy the result back.
;
; Parameters:
;   %begin   - pointer to the i32 lower loop bound
;   %end     - pointer to the i32 upper loop bound (exclusive)
;   %arr     - destination array of i32
;   %control - pointer to the i32 that is compared against 3
;   %readarr - pointer to the i32 value stored when *control > 3
;
; NOTE(review): the branch at the end of polly.preload.merge is "br i1 false",
; so as written the GPU path is never entered and %polly.rtc.result is
; computed but unused. Presumably the code generator disabled the optimized
; version on purpose -- confirm before relying on the GPU path.
define void @f(i32* %begin, i32* %end, i32* %arr, i32* %control, i32* %readarr) { | |
entry: | |
; Stack slots that receive the preloaded values of *readarr, *control, *end and
; *begin (in that order). They are only written in this function; no load from
; them appears below.
%tmp8.preload.s2a = alloca i32 | |
%tmp6.preload.s2a = alloca i32 | |
%tmp4.preload.s2a = alloca i32 | |
%tmp1.preload.s2a = alloca i32 | |
; Kernel launch argument table: 18 pointer slots. polly.then fills slots 0-8
; with pointers to the nine parameter values and slots 9-17 with pointers to
; the nine parameter sizes.
%polly_launch_0_params = alloca [18 x i8*] | |
; Storage for the nine kernel parameter values (two pointers, seven i32).
%polly_launch_0_param_0 = alloca i8* | |
%polly_launch_0_param_1 = alloca i8* | |
%polly_launch_0_param_2 = alloca i32 | |
%polly_launch_0_param_3 = alloca i32 | |
%polly_launch_0_param_4 = alloca i32 | |
%polly_launch_0_param_5 = alloca i32 | |
%polly_launch_0_param_6 = alloca i32 | |
%polly_launch_0_param_7 = alloca i32 | |
%polly_launch_0_param_8 = alloca i32 | |
; Storage for the nine per-parameter size values.
%polly_launch_0_param_size_0 = alloca i32 | |
%polly_launch_0_param_size_1 = alloca i32 | |
%polly_launch_0_param_size_2 = alloca i32 | |
%polly_launch_0_param_size_3 = alloca i32 | |
%polly_launch_0_param_size_4 = alloca i32 | |
%polly_launch_0_param_size_5 = alloca i32 | |
%polly_launch_0_param_size_6 = alloca i32 | |
%polly_launch_0_param_size_7 = alloca i32 | |
%polly_launch_0_param_size_8 = alloca i32 | |
%polly_launch_0_params_i8ptr = bitcast [18 x i8*]* %polly_launch_0_params to i8* | |
br label %polly.split_new_and_old | |
polly.split_new_and_old: ; preds = %entry | |
; Preload *begin and *end once, up front.
%polly.access.begin = getelementptr i32, i32* %begin, i64 0 | |
%polly.access.begin.load = load i32, i32* %polly.access.begin, align 4, !alias.scope !0, !noalias !2 | |
store i32 %polly.access.begin.load, i32* %tmp1.preload.s2a | |
%polly.access.end = getelementptr i32, i32* %end, i64 0 | |
%polly.access.end.load = load i32, i32* %polly.access.end, align 4, !alias.scope !3, !noalias !8 | |
store i32 %polly.access.end.load, i32* %tmp4.preload.s2a | |
; Condition guarding the preload of *control:
;   (i64)end >= (i64)begin + 1, with the addition checked for signed overflow.
%0 = sext i32 %polly.access.begin.load to i64 | |
%1 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %0, i64 1) | |
%.obit = extractvalue { i64, i1 } %1, 1 | |
%polly.overflow.state = or i1 false, %.obit | |
%.res = extractvalue { i64, i1 } %1, 0 | |
%2 = sext i32 %polly.access.end.load to i64 | |
%3 = icmp sge i64 %2, %.res | |
%polly.preload.cond.overflown = xor i1 %polly.overflow.state, true | |
%polly.preload.cond.result = and i1 %3, %polly.preload.cond.overflown | |
br label %polly.preload.cond | |
polly.preload.cond: ; preds = %polly.split_new_and_old | |
br i1 %polly.preload.cond.result, label %polly.preload.exec, label %polly.preload.merge | |
polly.preload.merge: ; preds = %polly.preload.exec, %polly.preload.cond | |
; Preloaded *control, or 0 when the guarding condition was false.
%polly.preload.tmp6.merge = phi i32 [ %polly.access.control.load, %polly.preload.exec ], [ 0, %polly.preload.cond ] | |
store i32 %polly.preload.tmp6.merge, i32* %tmp6.preload.s2a | |
; Run-time alias checks. For each of begin, control, end and readarr the code
; tests that the one-element range [&x[0], &x[1]) does not overlap
; [&arr[begin], &arr[end)):   &x[1] <= &arr[begin]  ||  &arr[end] <= &x[0].
; The four results are AND-ed together into %35.
;
; Check 1: begin vs. arr.
%polly.access.begin4 = getelementptr i32, i32* %begin, i64 1 | |
%polly.access.arr = getelementptr i32, i32* %arr, i32 %polly.access.begin.load | |
%4 = ptrtoint i32* %polly.access.begin4 to i64 | |
%5 = ptrtoint i32* %polly.access.arr to i64 | |
%6 = icmp ule i64 %4, %5 | |
%polly.access.arr5 = getelementptr i32, i32* %arr, i32 %polly.access.end.load | |
%polly.access.begin6 = getelementptr i32, i32* %begin, i64 0 | |
%7 = ptrtoint i32* %polly.access.arr5 to i64 | |
%8 = ptrtoint i32* %polly.access.begin6 to i64 | |
%9 = icmp ule i64 %7, %8 | |
%10 = or i1 %6, %9 | |
%11 = and i1 true, %10 | |
; Check 2: control vs. arr.
%polly.access.control7 = getelementptr i32, i32* %control, i64 1 | |
%polly.access.arr8 = getelementptr i32, i32* %arr, i32 %polly.access.begin.load | |
%12 = ptrtoint i32* %polly.access.control7 to i64 | |
%13 = ptrtoint i32* %polly.access.arr8 to i64 | |
%14 = icmp ule i64 %12, %13 | |
%polly.access.arr9 = getelementptr i32, i32* %arr, i32 %polly.access.end.load | |
%polly.access.control10 = getelementptr i32, i32* %control, i64 0 | |
%15 = ptrtoint i32* %polly.access.arr9 to i64 | |
%16 = ptrtoint i32* %polly.access.control10 to i64 | |
%17 = icmp ule i64 %15, %16 | |
%18 = or i1 %14, %17 | |
%19 = and i1 %11, %18 | |
; Check 3: end vs. arr.
%polly.access.end11 = getelementptr i32, i32* %end, i64 1 | |
%polly.access.arr12 = getelementptr i32, i32* %arr, i32 %polly.access.begin.load | |
%20 = ptrtoint i32* %polly.access.end11 to i64 | |
%21 = ptrtoint i32* %polly.access.arr12 to i64 | |
%22 = icmp ule i64 %20, %21 | |
%polly.access.arr13 = getelementptr i32, i32* %arr, i32 %polly.access.end.load | |
%polly.access.end14 = getelementptr i32, i32* %end, i64 0 | |
%23 = ptrtoint i32* %polly.access.arr13 to i64 | |
%24 = ptrtoint i32* %polly.access.end14 to i64 | |
%25 = icmp ule i64 %23, %24 | |
%26 = or i1 %22, %25 | |
%27 = and i1 %19, %26 | |
; Check 4: readarr vs. arr.
%polly.access.readarr = getelementptr i32, i32* %readarr, i64 1 | |
%polly.access.arr15 = getelementptr i32, i32* %arr, i32 %polly.access.begin.load | |
%28 = ptrtoint i32* %polly.access.readarr to i64 | |
%29 = ptrtoint i32* %polly.access.arr15 to i64 | |
%30 = icmp ule i64 %28, %29 | |
%polly.access.arr16 = getelementptr i32, i32* %arr, i32 %polly.access.end.load | |
%polly.access.readarr17 = getelementptr i32, i32* %readarr, i64 0 | |
%31 = ptrtoint i32* %polly.access.arr16 to i64 | |
%32 = ptrtoint i32* %polly.access.readarr17 to i64 | |
%33 = icmp ule i64 %31, %32 | |
%34 = or i1 %30, %33 | |
%35 = and i1 %27, %34 | |
; Overflow-tracked evaluation of
;   4 * (end - begin) + 2 * (end - begin) + 8 * (end - begin)
; in i64. Every step ORs its overflow bit into the running
; %polly.overflow.state* chain. The final sum (%.res62) must be >= 0.
;
; Term 1: 0 + 4 * (1 * (end - begin))  -> %.res32
%36 = sext i32 %polly.access.begin.load to i64 | |
%37 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 0, i64 %36) | |
%.obit18 = extractvalue { i64, i1 } %37, 1 | |
%polly.overflow.state19 = or i1 false, %.obit18 | |
%.res20 = extractvalue { i64, i1 } %37, 0 | |
%38 = sext i32 %polly.access.end.load to i64 | |
%39 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %.res20, i64 %38) | |
%.obit21 = extractvalue { i64, i1 } %39, 1 | |
%polly.overflow.state22 = or i1 %polly.overflow.state19, %.obit21 | |
%.res23 = extractvalue { i64, i1 } %39, 0 | |
%40 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %.res23) | |
%.obit24 = extractvalue { i64, i1 } %40, 1 | |
%polly.overflow.state25 = or i1 %polly.overflow.state22, %.obit24 | |
%.res26 = extractvalue { i64, i1 } %40, 0 | |
%41 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 4, i64 %.res26) | |
%.obit27 = extractvalue { i64, i1 } %41, 1 | |
%polly.overflow.state28 = or i1 %polly.overflow.state25, %.obit27 | |
%.res29 = extractvalue { i64, i1 } %41, 0 | |
%42 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 0, i64 %.res29) | |
%.obit30 = extractvalue { i64, i1 } %42, 1 | |
%polly.overflow.state31 = or i1 %polly.overflow.state28, %.obit30 | |
%.res32 = extractvalue { i64, i1 } %42, 0 | |
; Term 2: %.res32 + 2 * (1 * (end - begin))  -> %.res47
%43 = sext i32 %polly.access.begin.load to i64 | |
%44 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 0, i64 %43) | |
%.obit33 = extractvalue { i64, i1 } %44, 1 | |
%polly.overflow.state34 = or i1 %polly.overflow.state31, %.obit33 | |
%.res35 = extractvalue { i64, i1 } %44, 0 | |
%45 = sext i32 %polly.access.end.load to i64 | |
%46 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %.res35, i64 %45) | |
%.obit36 = extractvalue { i64, i1 } %46, 1 | |
%polly.overflow.state37 = or i1 %polly.overflow.state34, %.obit36 | |
%.res38 = extractvalue { i64, i1 } %46, 0 | |
%47 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %.res38) | |
%.obit39 = extractvalue { i64, i1 } %47, 1 | |
%polly.overflow.state40 = or i1 %polly.overflow.state37, %.obit39 | |
%.res41 = extractvalue { i64, i1 } %47, 0 | |
%48 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 2, i64 %.res41) | |
%.obit42 = extractvalue { i64, i1 } %48, 1 | |
%polly.overflow.state43 = or i1 %polly.overflow.state40, %.obit42 | |
%.res44 = extractvalue { i64, i1 } %48, 0 | |
%49 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %.res32, i64 %.res44) | |
%.obit45 = extractvalue { i64, i1 } %49, 1 | |
%polly.overflow.state46 = or i1 %polly.overflow.state43, %.obit45 | |
%.res47 = extractvalue { i64, i1 } %49, 0 | |
; Term 3: %.res47 + 8 * (1 * (end - begin))  -> %.res62
%50 = sext i32 %polly.access.begin.load to i64 | |
%51 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 0, i64 %50) | |
%.obit48 = extractvalue { i64, i1 } %51, 1 | |
%polly.overflow.state49 = or i1 %polly.overflow.state46, %.obit48 | |
%.res50 = extractvalue { i64, i1 } %51, 0 | |
%52 = sext i32 %polly.access.end.load to i64 | |
%53 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %.res50, i64 %52) | |
%.obit51 = extractvalue { i64, i1 } %53, 1 | |
%polly.overflow.state52 = or i1 %polly.overflow.state49, %.obit51 | |
%.res53 = extractvalue { i64, i1 } %53, 0 | |
%54 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %.res53) | |
%.obit54 = extractvalue { i64, i1 } %54, 1 | |
%polly.overflow.state55 = or i1 %polly.overflow.state52, %.obit54 | |
%.res56 = extractvalue { i64, i1 } %54, 0 | |
%55 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 8, i64 %.res56) | |
%.obit57 = extractvalue { i64, i1 } %55, 1 | |
%polly.overflow.state58 = or i1 %polly.overflow.state55, %.obit57 | |
%.res59 = extractvalue { i64, i1 } %55, 0 | |
%56 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %.res47, i64 %.res59) | |
%.obit60 = extractvalue { i64, i1 } %56, 1 | |
%polly.overflow.state61 = or i1 %polly.overflow.state58, %.obit60 | |
%.res62 = extractvalue { i64, i1 } %56, 0 | |
; Final run-time check: alias checks hold, the sum is non-negative, and no step
; overflowed.
%57 = icmp sge i64 %.res62, 0 | |
%58 = and i1 %35, %57 | |
%polly.rtc.overflown = xor i1 %polly.overflow.state61, true | |
%polly.rtc.result = and i1 %58, %polly.rtc.overflown | |
; NOTE(review): the condition is the constant false, not %polly.rtc.result, so
; control always continues to the original loop below.
br i1 false, label %polly.start, label %entry.split.pre_entry_bb | |
entry.split.pre_entry_bb: ; preds = %polly.preload.merge | |
br label %entry.split | |
; ---- Original (unoptimized) loop -------------------------------------------
entry.split: ; preds = %entry.split.pre_entry_bb | |
; Skip the loop entirely unless *begin < *end.
%tmp1 = load i32, i32* %begin, align 4 | |
%tmp41 = load i32, i32* %end, align 4 | |
%cmp2 = icmp slt i32 %tmp1, %tmp41 | |
br i1 %cmp2, label %for.body.lr.ph, label %for.end.region_exiting | |
for.body.lr.ph: ; preds = %entry.split | |
%59 = sext i32 %tmp1 to i64 | |
br label %for.body | |
for.body: ; preds = %for.body.lr.ph, %if.end | |
; Induction variable starts at (i64)*begin.
%indvars.iv = phi i64 [ %59, %for.body.lr.ph ], [ %indvars.iv.next, %if.end ] | |
; *control is reloaded on every iteration.
%tmp6 = load i32, i32* %control, align 4 | |
%cmp1 = icmp sgt i32 %tmp6, 3 | |
br i1 %cmp1, label %if.then, label %if.end | |
if.then: ; preds = %for.body | |
; *readarr is only read when *control > 3.
%tmp8 = load i32, i32* %readarr, align 4 | |
br label %if.end | |
if.end: ; preds = %if.then, %for.body | |
; Value to store: *readarr when *control > 3, otherwise 0.
%t.0 = phi i32 [ %tmp8, %if.then ], [ 0, %for.body ] | |
%arrayidx = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv | |
store i32 %t.0, i32* %arrayidx, align 4 | |
%indvars.iv.next = add i64 %indvars.iv, 1 | |
; *end is reloaded on every iteration for the exit test.
%tmp4 = load i32, i32* %end, align 4 | |
%60 = sext i32 %tmp4 to i64 | |
%cmp = icmp slt i64 %indvars.iv.next, %60 | |
br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge | |
for.cond.for.end_crit_edge: ; preds = %if.end | |
br label %for.end.region_exiting | |
for.end.region_exiting: ; preds = %entry.split, %for.cond.for.end_crit_edge | |
br label %polly.merge_new_and_old | |
; Join point of the original loop and the generated GPU path.
polly.merge_new_and_old: ; preds = %polly.exiting, %for.end.region_exiting | |
br label %for.end | |
for.end: ; preds = %polly.merge_new_and_old | |
ret void | |
; ---- Generated GPU path ------------------------------------------------------
polly.start: ; preds = %polly.preload.merge | |
br label %polly.acc.initialize | |
polly.acc.initialize: ; preds = %polly.start | |
; Create the runtime context and allocate one device buffer per MemRef.
; MemRef0/1/2/4 are requested with size 0 and MemRef3__phi with 4 bytes.
%61 = call i8* @polly_initContextCUDA() | |
%p_dev_array_MemRef0 = call i8* @polly_allocateMemoryForDevice(i64 0) | |
%p_dev_array_MemRef1 = call i8* @polly_allocateMemoryForDevice(i64 0) | |
%p_dev_array_MemRef2 = call i8* @polly_allocateMemoryForDevice(i64 0) | |
%p_dev_array_MemRef3__phi = call i8* @polly_allocateMemoryForDevice(i64 4) | |
%p_dev_array_MemRef4 = call i8* @polly_allocateMemoryForDevice(i64 0) | |
; MemRef5 size in bytes: 4 * end - 4 * begin.
%62 = sext i32 %polly.access.end.load to i64 | |
%63 = mul i64 4, %62 | |
%64 = sext i32 %polly.access.begin.load to i64 | |
%65 = add nsw i64 0, %64 | |
%66 = mul i64 %65, 4 | |
%67 = sub i64 %63, %66 | |
%p_dev_array_MemRef5 = call i8* @polly_allocateMemoryForDevice(i64 %67) | |
br label %polly.preload.begin | |
polly.preload.begin: ; preds = %polly.acc.initialize | |
; Condition guarding the preload of *readarr:
;   end >= begin + 1 (overflow-checked)  &&  preloaded *control >= 4.
%68 = sext i32 %polly.access.begin.load to i64 | |
%69 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %68, i64 1) | |
%.obit63 = extractvalue { i64, i1 } %69, 1 | |
%polly.overflow.state64 = or i1 false, %.obit63 | |
%.res65 = extractvalue { i64, i1 } %69, 0 | |
%70 = sext i32 %polly.access.end.load to i64 | |
%71 = icmp sge i64 %70, %.res65 | |
%72 = sext i32 %polly.preload.tmp6.merge to i64 | |
%73 = icmp sge i64 %72, 4 | |
%74 = and i1 %71, %73 | |
%polly.preload.cond.overflown66 = xor i1 %polly.overflow.state64, true | |
%polly.preload.cond.result67 = and i1 %74, %polly.preload.cond.overflown66 | |
br label %polly.preload.cond68 | |
polly.preload.cond68: ; preds = %polly.preload.begin | |
br i1 %polly.preload.cond.result67, label %polly.preload.exec70, label %polly.preload.merge69 | |
polly.preload.merge69: ; preds = %polly.preload.exec70, %polly.preload.cond68 | |
; Preloaded *readarr, or 0 when the guarding condition was false.
%polly.preload.tmp8.merge = phi i32 [ %polly.access.readarr71.load, %polly.preload.exec70 ], [ 0, %polly.preload.cond68 ] | |
store i32 %polly.preload.tmp8.merge, i32* %tmp8.preload.s2a | |
br label %polly.cond | |
polly.cond: ; preds = %polly.preload.merge69 | |
; Launch the kernel only when end >= begin + 1.
%75 = sext i32 %polly.access.begin.load to i64 | |
%76 = add nsw i64 %75, 1 | |
%77 = sext i32 %polly.access.end.load to i64 | |
%78 = icmp sge i64 %77, %76 | |
br i1 %78, label %polly.then, label %polly.else | |
polly.merge: ; preds = %polly.else, %polly.merge73 | |
; Tear-down: release all six device buffers, then the context.
call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef0) | |
call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef1) | |
call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef2) | |
call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef3__phi) | |
call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef4) | |
call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef5) | |
call void @polly_freeContext(i8* %61) | |
br label %polly.exiting | |
polly.exiting: ; preds = %polly.merge | |
br label %polly.merge_new_and_old | |
polly.preload.exec: ; preds = %polly.preload.cond | |
; Conditional preload of *control (feeds %polly.preload.tmp6.merge).
%polly.access.control = getelementptr i32, i32* %control, i64 0 | |
%polly.access.control.load = load i32, i32* %polly.access.control, align 4, !alias.scope !4, !noalias !9 | |
br label %polly.preload.merge | |
polly.preload.exec70: ; preds = %polly.preload.cond68 | |
; Conditional preload of *readarr (feeds %polly.preload.tmp8.merge).
%polly.access.readarr71 = getelementptr i32, i32* %readarr, i64 0 | |
%polly.access.readarr71.load = load i32, i32* %polly.access.readarr71, align 4, !alias.scope !6, !noalias !10 | |
br label %polly.preload.merge69 | |
polly.then: ; preds = %polly.cond | |
; Build the launch argument table. Slot k (0-8) receives a pointer to the
; stack cell holding parameter k.
;
; Parameter 0: device pointer of MemRef3__phi.
%79 = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef3__phi) | |
%80 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 0 | |
store i8* %79, i8** %polly_launch_0_param_0 | |
%81 = bitcast i8** %polly_launch_0_param_0 to i8* | |
store i8* %81, i8** %80 | |
; Parameter 1: device pointer of MemRef5 moved back by `begin` i32 elements
; (GEP with index 0 - begin), so element index `begin` addresses the start of
; the device buffer.
%82 = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef5) | |
%83 = sext i32 %polly.access.begin.load to i64 | |
%84 = add nsw i64 0, %83 | |
%85 = bitcast i8* %82 to i32* | |
%86 = sub i64 0, %84 | |
%87 = getelementptr i32, i32* %85, i64 %86 | |
%88 = bitcast i32* %87 to i8* | |
%89 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 1 | |
store i8* %88, i8** %polly_launch_0_param_1 | |
%90 = bitcast i8** %polly_launch_0_param_1 to i8* | |
store i8* %90, i8** %89 | |
; Parameter 2: preloaded *begin.
store i32 %polly.access.begin.load, i32* %polly_launch_0_param_2 | |
%91 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 2 | |
%92 = bitcast i32* %polly_launch_0_param_2 to i8* | |
store i8* %92, i8** %91 | |
; Parameter 3: preloaded *end.
store i32 %polly.access.end.load, i32* %polly_launch_0_param_3 | |
%93 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 3 | |
%94 = bitcast i32* %polly_launch_0_param_3 to i8* | |
store i8* %94, i8** %93 | |
; Parameter 4: preloaded *control.
store i32 %polly.preload.tmp6.merge, i32* %polly_launch_0_param_4 | |
%95 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 4 | |
%96 = bitcast i32* %polly_launch_0_param_4 to i8* | |
store i8* %96, i8** %95 | |
; Parameter 5: preloaded *readarr.
store i32 %polly.preload.tmp8.merge, i32* %polly_launch_0_param_5 | |
%97 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 5 | |
%98 = bitcast i32* %polly_launch_0_param_5 to i8* | |
store i8* %98, i8** %97 | |
; Parameters 6-8 repeat *begin, *control and *end.
store i32 %polly.access.begin.load, i32* %polly_launch_0_param_6 | |
%99 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 6 | |
%100 = bitcast i32* %polly_launch_0_param_6 to i8* | |
store i8* %100, i8** %99 | |
store i32 %polly.preload.tmp6.merge, i32* %polly_launch_0_param_7 | |
%101 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 7 | |
%102 = bitcast i32* %polly_launch_0_param_7 to i8* | |
store i8* %102, i8** %101 | |
store i32 %polly.access.end.load, i32* %polly_launch_0_param_8 | |
%103 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 8 | |
%104 = bitcast i32* %polly_launch_0_param_8 to i8* | |
store i8* %104, i8** %103 | |
; Slots 9-17: pointers to the per-parameter size cells, each set to 4.
; NOTE(review): sizes 0 and 1 are also recorded as 4 although parameters 0 and
; 1 are i8* values (declared .u64 in the PTX) -- confirm what the runtime
; expects here.
store i32 4, i32* %polly_launch_0_param_size_0 | |
%105 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 9 | |
%106 = bitcast i32* %polly_launch_0_param_size_0 to i8* | |
store i8* %106, i8** %105 | |
store i32 4, i32* %polly_launch_0_param_size_1 | |
%107 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 10 | |
%108 = bitcast i32* %polly_launch_0_param_size_1 to i8* | |
store i8* %108, i8** %107 | |
store i32 4, i32* %polly_launch_0_param_size_2 | |
%109 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 11 | |
%110 = bitcast i32* %polly_launch_0_param_size_2 to i8* | |
store i8* %110, i8** %109 | |
store i32 4, i32* %polly_launch_0_param_size_3 | |
%111 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 12 | |
%112 = bitcast i32* %polly_launch_0_param_size_3 to i8* | |
store i8* %112, i8** %111 | |
store i32 4, i32* %polly_launch_0_param_size_4 | |
%113 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 13 | |
%114 = bitcast i32* %polly_launch_0_param_size_4 to i8* | |
store i8* %114, i8** %113 | |
store i32 4, i32* %polly_launch_0_param_size_5 | |
%115 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 14 | |
%116 = bitcast i32* %polly_launch_0_param_size_5 to i8* | |
store i8* %116, i8** %115 | |
store i32 4, i32* %polly_launch_0_param_size_6 | |
%117 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 15 | |
%118 = bitcast i32* %polly_launch_0_param_size_6 to i8* | |
store i8* %118, i8** %117 | |
store i32 4, i32* %polly_launch_0_param_size_7 | |
%119 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 16 | |
%120 = bitcast i32* %polly_launch_0_param_size_7 to i8* | |
store i8* %120, i8** %119 | |
store i32 4, i32* %polly_launch_0_param_size_8 | |
%121 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 17 | |
%122 = bitcast i32* %polly_launch_0_param_size_8 to i8* | |
store i8* %122, i8** %121 | |
; Obtain a kernel handle from the embedded PTX text and the entry-point name.
%123 = call i8* @polly_getKernel(i8* getelementptr inbounds ([1802 x i8], [1802 x i8]* @FUNC_f_SCOP_0_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([23 x i8], [23 x i8]* @FUNC_f_SCOP_0_KERNEL_0_name, i32 0, i32 0)) | |
; First launch dimension:
;   end >= begin + 1048545 ? 32768
;                          : -begin + ((31 * begin + end + 31) >> 5)
; (arithmetic shift), truncated to i32.
%124 = sext i32 %polly.access.begin.load to i64 | |
%125 = add nsw i64 %124, 1048545 | |
%126 = sext i32 %polly.access.end.load to i64 | |
%127 = icmp sge i64 %126, %125 | |
%128 = sext i32 %polly.access.begin.load to i64 | |
%129 = sub nsw i64 0, %128 | |
%130 = sext i32 %polly.access.begin.load to i64 | |
%131 = mul nsw i64 31, %130 | |
%132 = sext i32 %polly.access.end.load to i64 | |
%133 = add nsw i64 %131, %132 | |
%134 = add nsw i64 %133, 31 | |
%polly.fdiv_q.shr = ashr i64 %134, 5 | |
%135 = add nsw i64 %129, %polly.fdiv_q.shr | |
%136 = select i1 %127, i64 32768, i64 %135 | |
%137 = trunc i64 %136 to i32 | |
; Presumably (grid x, grid y, block x, block y, block z) = (%137, 1, 32, 1, 1);
; the 32 matches ".maxntid 32, 1, 1" in the PTX -- TODO confirm the argument
; order against the runtime's polly_launchKernel definition.
call void @polly_launchKernel(i8* %123, i32 %137, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr) | |
call void @polly_freeKernel(i8* %123) | |
br label %polly.cond72 | |
polly.cond72: ; preds = %polly.then | |
; Copy results back only when end >= 1.
%138 = sext i32 %polly.access.end.load to i64 | |
%139 = icmp sge i64 %138, 1 | |
br i1 %139, label %polly.then74, label %polly.else75 | |
polly.merge73: ; preds = %polly.else75, %polly.then74 | |
br label %polly.merge | |
polly.else: ; preds = %polly.cond | |
br label %polly.merge | |
polly.then74: ; preds = %polly.cond72 | |
; Copy (4 * end - 4 * begin) bytes from device buffer MemRef5 to &arr[begin].
%140 = sext i32 %polly.access.end.load to i64 | |
%141 = mul i64 4, %140 | |
%142 = sext i32 %polly.access.begin.load to i64 | |
%143 = add nsw i64 0, %142 | |
%144 = getelementptr i32, i32* %arr, i64 %143 | |
%145 = bitcast i32* %144 to i8* | |
%146 = mul i64 %143, 4 | |
%147 = sub i64 %141, %146 | |
call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef5, i8* %145, i64 %147) | |
br label %polly.merge73 | |
polly.else75: ; preds = %polly.cond72 | |
br label %polly.merge73 | |
} | |
; @inita -- sets the six i32 elements A[0..5] to 0.
; The function is versioned: a run-time check selects between a generated GPU
; path (polly.start .. polly.exiting) that launches FUNC_inita_SCOP_0_KERNEL_0
; and copies 24 bytes back into %A, and the original scalar loop (for.body).
;
; Parameter:
;   %A - pointer to the i32 array; indices 0 through 5 are written.
define void @inita(i32* %A) { | |
entry: | |
; Launch argument table: slot 0 -> pointer to the parameter value,
; slot 1 -> pointer to the parameter size.
%polly_launch_0_params = alloca [2 x i8*] | |
%polly_launch_0_param_0 = alloca i8* | |
%polly_launch_0_param_size_0 = alloca i32 | |
%polly_launch_0_params_i8ptr = bitcast [2 x i8*]* %polly_launch_0_params to i8* | |
br label %entry.split | |
entry.split: ; preds = %entry | |
br label %polly.split_new_and_old | |
polly.split_new_and_old: ; preds = %entry.split | |
; Run-time check: overflow-tracked evaluation of 0 + 6 * (1 * 6), which must be
; >= 0 with no overflow. All operands are constants, so the check evaluates to
; true (36 >= 0, no overflow) and the GPU path is taken.
%0 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 6) | |
%.obit = extractvalue { i64, i1 } %0, 1 | |
%polly.overflow.state = or i1 false, %.obit | |
%.res = extractvalue { i64, i1 } %0, 0 | |
%1 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 6, i64 %.res) | |
%.obit2 = extractvalue { i64, i1 } %1, 1 | |
%polly.overflow.state3 = or i1 %polly.overflow.state, %.obit2 | |
%.res4 = extractvalue { i64, i1 } %1, 0 | |
%2 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 0, i64 %.res4) | |
%.obit5 = extractvalue { i64, i1 } %2, 1 | |
%polly.overflow.state6 = or i1 %polly.overflow.state3, %.obit5 | |
%.res7 = extractvalue { i64, i1 } %2, 0 | |
%3 = icmp sge i64 %.res7, 0 | |
%4 = and i1 true, %3 | |
%polly.rtc.overflown = xor i1 %polly.overflow.state6, true | |
%polly.rtc.result = and i1 %4, %polly.rtc.overflown | |
br i1 %polly.rtc.result, label %polly.start, label %for.body.pre_entry_bb | |
for.body.pre_entry_bb: ; preds = %polly.split_new_and_old | |
br label %for.body | |
; ---- Original loop: for (i = 0; i != 6; ++i) A[i] = 0 -----------------------
for.body: ; preds = %for.body.pre_entry_bb, %for.body | |
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.pre_entry_bb ] | |
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv | |
store i32 0, i32* %arrayidx, align 4 | |
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 | |
%exitcond = icmp ne i64 %indvars.iv.next, 6 | |
br i1 %exitcond, label %for.body, label %polly.merge_new_and_old | |
; Join point of the original loop and the generated GPU path.
polly.merge_new_and_old: ; preds = %polly.exiting, %for.body | |
br label %for.end | |
for.end: ; preds = %polly.merge_new_and_old | |
ret void | |
; ---- Generated GPU path ------------------------------------------------------
polly.start: ; preds = %polly.split_new_and_old | |
br label %polly.acc.initialize | |
polly.acc.initialize: ; preds = %polly.start | |
; Create the runtime context and a 24-byte device buffer (6 x i32).
%5 = call i8* @polly_initContextCUDA() | |
%p_dev_array_MemRef0 = call i8* @polly_allocateMemoryForDevice(i64 24) | |
; Parameter 0: device pointer of the buffer; its recorded size is 4.
%6 = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef0) | |
%7 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0 | |
store i8* %6, i8** %polly_launch_0_param_0 | |
%8 = bitcast i8** %polly_launch_0_param_0 to i8* | |
store i8* %8, i8** %7 | |
store i32 4, i32* %polly_launch_0_param_size_0 | |
%9 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1 | |
%10 = bitcast i32* %polly_launch_0_param_size_0 to i8* | |
store i8* %10, i8** %9 | |
; Obtain the kernel handle and launch with dimensions (1, 1, 6, 1, 1); the 6
; matches ".maxntid 6, 1, 1" in the PTX. Presumably grid x/y then block x/y/z
; -- TODO confirm against the runtime's polly_launchKernel definition.
%11 = call i8* @polly_getKernel(i8* getelementptr inbounds ([477 x i8], [477 x i8]* @FUNC_inita_SCOP_0_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([27 x i8], [27 x i8]* @FUNC_inita_SCOP_0_KERNEL_0_name, i32 0, i32 0)) | |
call void @polly_launchKernel(i8* %11, i32 1, i32 1, i32 6, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr) | |
call void @polly_freeKernel(i8* %11) | |
; Copy the 24 result bytes into %A, then release the buffer and the context.
%12 = bitcast i32* %A to i8* | |
call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef0, i8* %12, i64 24) | |
call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef0) | |
call void @polly_freeContext(i8* %5) | |
br label %polly.exiting | |
polly.exiting: ; preds = %polly.acc.initialize | |
br label %polly.merge_new_and_old | |
} | |
; @printarr -- prints the six i32 elements A[0..5].
; Emits the string @.str once, then calls printf with format @.str.1 for each
; element, and finishes with putchar(10) (a newline). The contents of @.str and
; @.str.1 are defined outside this part of the file.
;
; Parameter:
;   %A - pointer to the i32 array; indices 0 through 5 are read.
define void @printarr(i32* %A) { | |
entry: | |
br label %entry.split | |
entry.split: ; preds = %entry | |
; Prefix printed once before the elements.
%call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0)) | |
br label %for.body | |
; for (i = 0; i != 6; ++i) printf(@.str.1, A[i])
for.body: ; preds = %entry.split, %for.body | |
%indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] | |
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv | |
%tmp3 = load i32, i32* %arrayidx, align 4 | |
%call1 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.1, i64 0, i64 0), i32 %tmp3) | |
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 | |
%exitcond = icmp ne i64 %indvars.iv.next, 6 | |
br i1 %exitcond, label %for.body, label %for.end | |
for.end: ; preds = %for.body | |
; Character code 10 is '\n'.
%putchar = tail call i32 @putchar(i32 10) | |
ret void | |
} | |
; External variadic printf, called by @printarr and @main.
declare i32 @printf(i8*, ...) | |
; @main -- test driver. Three nested loops, all keeping their counters in
; memory:
;   outer  : control1 starts at 3; repeats while the pre-increment value < 4
;            (so control1 takes the values 3 and 4);
;   middle : b starts at 0; repeats while the pre-increment value < 5;
;   inner  : e starts at b + 1 and is entered only when b < 5; repeats while
;            the pre-increment value of e < 5.
; Each inner iteration zeroes A with @inita, calls @f with pointers to b, e,
; control1 and readarr, then prints A with @printarr.
; Returns 0.
define i32 @main() { | |
entry: | |
%A = alloca [6 x i32], align 16 | |
%readarr = alloca i32, align 4 | |
%control1 = alloca i32, align 4 | |
%b = alloca i32, align 4 | |
%e = alloca i32, align 4 | |
br label %entry.split | |
entry.split: ; preds = %entry | |
; Initial values: readarr = 10, control1 = 3.
store i32 10, i32* %readarr, align 4 | |
store i32 3, i32* %control1, align 4 | |
br label %for.body | |
; Outer loop body: print control1 (format @.str.3) and reset b to 0.
for.body: ; preds = %entry.split, %for.inc15 | |
%tmp1 = load i32, i32* %control1, align 4 | |
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.3, i64 0, i64 0), i32 %tmp1) | |
store i32 0, i32* %b, align 4 | |
br label %for.cond5.preheader | |
; Middle loop head: e = b + 1; skip the inner loop unless b < 5.
for.cond5.preheader: ; preds = %for.body, %for.inc12 | |
%storemerge2.in3 = load i32, i32* %b, align 4 | |
%storemerge24 = add nsw i32 %storemerge2.in3, 1 | |
store i32 %storemerge24, i32* %e, align 4 | |
%cmp65 = icmp slt i32 %storemerge2.in3, 5 | |
br i1 %cmp65, label %for.body7.lr.ph, label %for.inc12 | |
for.body7.lr.ph: ; preds = %for.cond5.preheader | |
br label %for.body7 | |
; Inner loop body.
for.body7: ; preds = %for.body7.lr.ph, %for.body7 | |
; Print the string @str, then b and e with format @.str.5.
%puts = call i32 @puts(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @str, i64 0, i64 0)) | |
%tmp5 = load i32, i32* %b, align 4 | |
%tmp6 = load i32, i32* %e, align 4 | |
%call9 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 %tmp5, i32 %tmp6) | |
; Zero A, run the function under test, print the result.
%arraydecay = getelementptr inbounds [6 x i32], [6 x i32]* %A, i64 0, i64 0 | |
call void @inita(i32* %arraydecay) | |
%arraydecay10 = getelementptr inbounds [6 x i32], [6 x i32]* %A, i64 0, i64 0 | |
call void @f(i32* nonnull %b, i32* nonnull %e, i32* %arraydecay10, i32* nonnull %control1, i32* nonnull %readarr) | |
%arraydecay11 = getelementptr inbounds [6 x i32], [6 x i32]* %A, i64 0, i64 0 | |
call void @printarr(i32* %arraydecay11) | |
; e is reloaded after the calls (its address was passed to @f), incremented,
; and the loop repeats while the value before the increment was < 5.
%storemerge2.in = load i32, i32* %e, align 4 | |
%storemerge2 = add nsw i32 %storemerge2.in, 1 | |
store i32 %storemerge2, i32* %e, align 4 | |
%cmp6 = icmp slt i32 %storemerge2.in, 5 | |
br i1 %cmp6, label %for.body7, label %for.cond5.for.inc12_crit_edge | |
for.cond5.for.inc12_crit_edge: ; preds = %for.body7 | |
br label %for.inc12 | |
; Middle loop latch: ++b; repeat while the old value was < 5.
for.inc12: ; preds = %for.cond5.for.inc12_crit_edge, %for.cond5.preheader | |
%tmp8 = load i32, i32* %b, align 4 | |
%inc13 = add nsw i32 %tmp8, 1 | |
store i32 %inc13, i32* %b, align 4 | |
%cmp3 = icmp slt i32 %tmp8, 5 | |
br i1 %cmp3, label %for.cond5.preheader, label %for.inc15 | |
; Outer loop latch: ++control1; repeat while the old value was < 4.
for.inc15: ; preds = %for.inc12 | |
%tmp9 = load i32, i32* %control1, align 4 | |
%inc16 = add nsw i32 %tmp9, 1 | |
store i32 %inc16, i32* %control1, align 4 | |
%cmp = icmp slt i32 %tmp9, 4 | |
br i1 %cmp, label %for.body, label %for.end17 | |
for.end17: ; preds = %for.inc15 | |
ret i32 0 | |
} | |
; ---- External declarations ---------------------------------------------------
; C library output routines used by @printarr and @main.
; Function Attrs: nounwind | |
declare i32 @putchar(i32) #0 | |
; Function Attrs: nounwind | |
declare i32 @puts(i8* nocapture readonly) #0 | |
; Overflow-checking i64 arithmetic intrinsics. Each returns { result, overflow
; bit }; the run-time checks in @f and @inita OR the overflow bits together.
; Function Attrs: nounwind readnone speculatable | |
declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) #1 | |
; Function Attrs: nounwind readnone speculatable | |
declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) #1 | |
; Function Attrs: nounwind readnone speculatable | |
declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64) #1 | |
; GPU runtime entry points called from the generated paths in @f and @inita.
; Their definitions are not part of this file (the build line links
; -lGPURuntime).
declare i8* @polly_initContextCUDA() | |
declare i8* @polly_allocateMemoryForDevice(i64) | |
declare i8* @polly_getDevicePtr(i8*) | |
declare i8* @polly_getKernel(i8*, i8*) | |
declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) | |
declare void @polly_freeKernel(i8*) | |
declare void @polly_copyFromDeviceToHost(i8*, i8*, i64) | |
declare void @polly_freeDeviceMemory(i8*) | |
declare void @polly_freeContext(i8*) | |
; Attribute groups referenced above as #0 and #1.
attributes #0 = { nounwind } | |
attributes #1 = { nounwind readnone speculatable } | |
; ---- Alias-scope metadata ------------------------------------------------------
; !1 is the scope domain. !0 and !3..!7 are one scope per MemRef (MemRef0,
; MemRef1, MemRef2, MemRef3__phi, MemRef4, MemRef5), all in domain !1.
; !2, !8, !9 and !10 are scope lists used as !noalias operands on the preload
; loads in @f; each list contains five of the six scopes and omits the scope
; named by the same load's !alias.scope (!0, !3, !4 and !6 respectively).
!0 = distinct !{!0, !1, !"polly.alias.scope.MemRef0"} | |
!1 = distinct !{!1, !"polly.alias.scope.domain"} | |
!2 = !{!3, !4, !5, !6, !7} | |
!3 = distinct !{!3, !1, !"polly.alias.scope.MemRef1"} | |
!4 = distinct !{!4, !1, !"polly.alias.scope.MemRef2"} | |
!5 = distinct !{!5, !1, !"polly.alias.scope.MemRef3__phi"} | |
!6 = distinct !{!6, !1, !"polly.alias.scope.MemRef4"} | |
!7 = distinct !{!7, !1, !"polly.alias.scope.MemRef5"} | |
!8 = !{!0, !4, !5, !6, !7} | |
!9 = !{!0, !3, !5, !6, !7} | |
!10 = !{!0, !3, !4, !5, !7} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment