Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Testing to make sure that bollu/polly branch 07-17-break-arrray-access-for-invariant-load-in-blockgen generates sensible code
16:25 $ make runbatch
rm *.optimised.ll
rm *.out
rm *.bench
rm: cannot remove '*.bench': No such file or directory
makefile:23: recipe for target 'clean' failed
make: [clean] Error 1 (ignored)
rm *.s
/users/siddhart/llvm-install/bin/opt -S -polly-canonicalize -polly-process-unprofitable -polly-invariant-load-hoisting -polly-codegen-ppcg \
-polly-acc-mincompute=0 program.ll -o program.optimised.ll
/users/siddhart/llvm-install/bin/llc program.optimised.ll -o program.s
/users/siddhart/llvm-install/bin/clang program.s -lcudart -lGPURuntime -ldl -lOpenCL -lgfortran -lstdc++ -o program.out -L/opt/nvidia/cudatoolkit8.0/8.0.54_2.2.8_ga620558-2.1/lib64/
export POLLY_DEBUG=1
POLLY_DEBUG=1 srun -n 1 -Cgpu --partition=debug nvprof ./program.out
srun: job 2387457 queued and waiting for resources
srun: job 2387457 has been allocated resources
-> polly_initContext
-> initContextCUDA
==29336== NVPROF is profiling process 29336, command: ./program.out
> Running on GPU device 0 : Tesla P100-PCIE-16GB.
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
CUDA Link Completed in 0.000000ms. Linker Output:
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function 'FUNC_inita_SCOP_0_KERNEL_0' for 'sm_60'
ptxas info : Function properties for FUNC_inita_SCOP_0_KERNEL_0
ptxas . 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 4 registers, 328 bytes cmem[0]
info : 0 bytes gmem
info : Function properties for 'FUNC_inita_SCOP_0_KERNEL_0':
info : used 4 registers, 0 stack, 0 bytes smem, 328 bytes cmem[0], 0 bytes lmem
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
-> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
==29336== Profiling application: ./program.out
* Control: 3
-----
b: 0 | e: 1
A: 0 0 0 0 0 0
-----
b: 0 | e: 2
A: 0 0 0 0 0 0
-----
b: 0 | e: 3
A: 0 0 0 0 0 0
-----
b: 0 | e: 4
A: 0 0 0 0 0 0
-----
b: 0 | e: 5
A: 0 0 0 0 0 0
-----
b: 1 | e: 2
A: 0 0 0 0 0 0
-----
b: 1 | e: 3
A: 0 0 0 0 0 0
-----
b: 1 | e: 4
A: 0 0 0 0 0 0
-----
b: 1 | e: 5
A: 0 0 0 0 0 0
-----
b: 2 | e: 3
A: 0 0 0 0 0 0
-----
b: 2 | e: 4
A: 0 0 0 0 0 0
-----
b: 2 | e: 5
A: 0 0 0 0 0 0
-----
b: 3 | e: 4
A: 0 0 0 0 0 0
-----
b: 3 | e: 5
A: 0 0 0 0 0 0
-----
b: 4 | e: 5
A: 0 0 0 0 0 0
* Control: 4
-----
b: 0 | e: 1
A: 10 0 0 0 0 0
-----
b: 0 | e: 2
A: 10 10 0 0 0 0
-----
b: 0 | e: 3
A: 10 10 10 0 0 0
-----
b: 0 | e: 4
A: 10 10 10 10 0 0
-----
b: 0 | e: 5
A: 10 10 10 10 10 0
-----
b: 1 | e: 2
A: 0 10 0 0 0 0
-----
b: 1 | e: 3
A: 0 10 10 0 0 0
-----
b: 1 | e: 4
A: 0 10 10 10 0 0
-----
b: 1 | e: 5
A: 0 10 10 10 10 0
-----
b: 2 | e: 3
A: 0 0 10 0 0 0
-----
b: 2 | e: 4
A: 0 0 10 10 0 0
-----
b: 2 | e: 5
A: 0 0 10 10 10 0
-----
b: 3 | e: 4
A: 0 0 0 10 0 0
-----
b: 3 | e: 5
A: 0 0 0 10 10 0
-----
b: 4 | e: 5
A: 0 0 0 0 10 0
==29336== Profiling result:
Time(%) Time Calls Avg Min Max Name
72.49% 53.280us 30 1.7760us 1.6640us 3.2640us FUNC_inita_SCOP_0_KERNEL_0
27.51% 20.224us 30 674ns 640ns 832ns [CUDA memcpy DtoH]
==29336== API calls:
Time(%) Time Calls Avg Min Max Name
91.74% 265.27ms 1 265.27ms 265.27ms 265.27ms cuCtxCreate
5.67% 16.381ms 30 546.03us 499.71us 580.99us cuMemAlloc
1.50% 4.3289ms 1 4.3289ms 4.3289ms 4.3289ms cuLinkAddData
0.73% 2.1040ms 30 70.134us 66.598us 95.121us cuMemFree
0.17% 480.85us 30 16.028us 15.338us 20.152us cuMemcpyDtoH
0.13% 366.10us 30 12.203us 10.963us 23.069us cuLaunchKernel
0.03% 97.328us 1 97.328us 97.328us 97.328us cuLinkComplete
0.03% 73.145us 1 73.145us 73.145us 73.145us cuModuleLoadData
0.01% 24.702us 1 24.702us 24.702us 24.702us cuLinkCreate
0.01% 17.288us 1 17.288us 17.288us 17.288us cuDeviceGetName
0.00% 2.2970us 3 765ns 180ns 1.7200us cuDeviceGetCount
0.00% 2.2530us 1 2.2530us 2.2530us 2.2530us cuLinkDestroy
0.00% 874ns 3 291ns 184ns 484ns cuDeviceGet
0.00% 855ns 4 213ns 135ns 324ns cuDeviceGetAttribute
0.00% 678ns 1 678ns 678ns 678ns cuModuleGetFunction
0.00% 654ns 1 654ns 654ns 654ns cuDeviceComputeCapability
// [siddhart@greina0 compile-cpp-to-gpu]$ cat program.c
#include <stdio.h>
/*
 * Kernel under test: for every i in [*begin, *end), store into arr[i]
 * the value *readarr when *control > 3, and 0 otherwise.
 *
 * Every scalar input is passed by pointer, so the bounds, the control
 * flag and the value to write are all memory reads. As written, *end is
 * read in the loop condition and *control / *readarr in the loop body,
 * i.e. on each iteration; *readarr is only read when *control > 3.
 *
 * NOTE(review): the exact statement shape here is presumably what the
 * invariant-load-hoisting experiment depends on -- confirm before
 * restructuring.
 */
void f(int *begin, int *end, int *arr, int *control, int *readarr) {
for(int i = *begin; i < *end; i++) {
/* t stays 0 unless the guarded load below runs. */
int t = 0;
if (*control > 3) {
t += *readarr;
}
arr[i] = t;
}
}
/*
 * Zero the first six ints of A.
 * The length 6 is hard-coded; the caller in main() passes an int[6].
 */
void inita(int *A) {
for(int i = 0; i < 6; i++) { A[i] = 0; }
}
/*
 * Print the first six ints of A on a single line: the prefix "A: ",
 * then each value followed by one space, then a newline.
 */
void printarr(int *A) {
    const int *cursor = A;
    const int *const stop = A + 6;

    printf("A: ");
    while (cursor != stop) {
        printf("%d ", *cursor);
        ++cursor;
    }
    printf("\n");
}
/*
 * Test driver.
 *
 * For control = 3 and control = 4, and for every pair 0 <= b < e <= 5,
 * zero the buffer A, run f over the index range [b, e) and print A.
 * With control = 3 the guard in f (*control > 3) is false, so A is
 * expected to stay all zeros; with control = 4 the elements in [b, e)
 * are expected to hold readarr (10).
 *
 * Always returns 0.
 */
int main(void) {
    int A[6];
    int readarr = 10;

    for (int control = 3; control <= 4; control++) {
        printf("* Control: %d\n", control);
        for (int b = 0; b < 6; b++) {
            for (int e = b + 1; e < 6; e++) {
                printf("-----\n");
                printf("b: %d | e: %d\n", b, e);
                inita(A);
                /* f takes every scalar by pointer, so the loop counters
                 * themselves are passed by address. */
                f(&b, &e, A, &control, &readarr);
                printarr(A);
            }
        }
    }
    return 0;
}
[siddhart@greina0 compile-cpp-to-gpu]$ cat program.canonical.ll
; ModuleID = 'program.ll'
source_filename = "program.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; String constants for the printf/puts calls below.
; @.str and @.str.1 are used by @printarr; @.str.3 and @.str.5 by @main.
; @.str.2 ("\n") and @.str.4 ("-----\n") are not referenced by any function
; in this listing: @printarr calls putchar(10) and @main calls puts(@str).
@.str = private unnamed_addr constant [4 x i8] c"A: \00", align 1
@.str.1 = private unnamed_addr constant [4 x i8] c"%d \00", align 1
@.str.2 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1
@.str.3 = private unnamed_addr constant [15 x i8] c"* Control: %d\0A\00", align 1
@.str.4 = private unnamed_addr constant [7 x i8] c"-----\0A\00", align 1
@.str.5 = private unnamed_addr constant [15 x i8] c"b: %d | e: %d\0A\00", align 1
@str = private unnamed_addr constant [6 x i8] c"-----\00"
; f: for i starting at the value loaded from %begin and while i is less than
; the value loaded from %end, store into %arr[i] either the value loaded from
; %readarr (when the value loaded from %control is > 3) or 0.
define void @f(i32* %begin, i32* %end, i32* %arr, i32* %control, i32* %readarr) {
entry:
br label %entry.split
entry.split: ; preds = %entry
; Initial loads of *begin and *end; the loop is bypassed when *begin >= *end.
%tmp1 = load i32, i32* %begin, align 4
%tmp41 = load i32, i32* %end, align 4
%cmp2 = icmp slt i32 %tmp1, %tmp41
br i1 %cmp2, label %for.body.lr.ph, label %for.end
for.body.lr.ph: ; preds = %entry.split
; The induction variable is 64-bit and starts at the sign-extended *begin.
%0 = sext i32 %tmp1 to i64
br label %for.body
for.body: ; preds = %for.body.lr.ph, %if.end
%indvars.iv = phi i64 [ %0, %for.body.lr.ph ], [ %indvars.iv.next, %if.end ]
; *control is loaded inside the loop body, i.e. once per iteration.
%tmp6 = load i32, i32* %control, align 4
%cmp1 = icmp sgt i32 %tmp6, 3
br i1 %cmp1, label %if.then, label %if.end
if.then: ; preds = %for.body
; *readarr is loaded only on the path where *control > 3.
%tmp8 = load i32, i32* %readarr, align 4
br label %if.end
if.end: ; preds = %if.then, %for.body
; %t.0 is the value to store: *readarr when coming from if.then, else 0.
%t.0 = phi i32 [ %tmp8, %if.then ], [ 0, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv
store i32 %t.0, i32* %arrayidx, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
; *end is loaded again for the exit test of every iteration.
%tmp4 = load i32, i32* %end, align 4
%1 = sext i32 %tmp4 to i64
%cmp = icmp slt i64 %indvars.iv.next, %1
br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge: ; preds = %if.end
br label %for.end
for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split
ret void
}
; inita: store 0 into %A[0] .. %A[5].
define void @inita(i32* %A) {
entry:
br label %entry.split
entry.split: ; preds = %entry
br label %for.body
for.body: ; preds = %entry.split, %for.body
; Single-block loop; the index runs from 0 and the loop exits once the
; incremented index equals 6.
%indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
store i32 0, i32* %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp ne i64 %indvars.iv.next, 6
br i1 %exitcond, label %for.body, label %for.end
for.end: ; preds = %for.body
ret void
}
; printarr: print "A: ", then %A[0] .. %A[5] each formatted with "%d ",
; then a newline.
define void @printarr(i32* %A) {
entry:
br label %entry.split
entry.split: ; preds = %entry
; Prefix "A: " (@.str).
%call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0))
br label %for.body
for.body: ; preds = %entry.split, %for.body
; One printf("%d ", A[i]) per element; the loop exits once the incremented
; index equals 6.
%indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
%tmp3 = load i32, i32* %arrayidx, align 4
%call1 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.1, i64 0, i64 0), i32 %tmp3)
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp ne i64 %indvars.iv.next, 6
br i1 %exitcond, label %for.body, label %for.end
for.end: ; preds = %for.body
; The trailing newline is emitted as putchar(10) rather than a printf call.
%putchar = tail call i32 @putchar(i32 10)
ret void
}
declare i32 @printf(i8*, ...)
; main: test driver. Three nested loops over control (3..4), b and e; each
; innermost iteration prints a header, calls @inita, @f and @printarr on %A.
define i32 @main() {
entry:
; %readarr, %control1, %b and %e are stack slots; their addresses are passed
; to @f below, so the loop counters are kept in memory and reloaded around
; the calls.
%A = alloca [6 x i32], align 16
%readarr = alloca i32, align 4
%control1 = alloca i32, align 4
%b = alloca i32, align 4
%e = alloca i32, align 4
br label %entry.split
entry.split: ; preds = %entry
; readarr = 10, control = 3.
store i32 10, i32* %readarr, align 4
store i32 3, i32* %control1, align 4
br label %for.body
for.body: ; preds = %entry.split, %for.inc15
; Outer loop body: print "* Control: <control>" and reset b to 0.
%tmp1 = load i32, i32* %control1, align 4
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.3, i64 0, i64 0), i32 %tmp1)
store i32 0, i32* %b, align 4
br label %for.cond5.preheader
for.cond5.preheader: ; preds = %for.body, %for.inc12
; e = b + 1; the e-loop is entered only when b < 5.
%storemerge2.in3 = load i32, i32* %b, align 4
%storemerge24 = add nsw i32 %storemerge2.in3, 1
store i32 %storemerge24, i32* %e, align 4
%cmp65 = icmp slt i32 %storemerge2.in3, 5
br i1 %cmp65, label %for.body7.lr.ph, label %for.inc12
for.body7.lr.ph: ; preds = %for.cond5.preheader
br label %for.body7
for.body7: ; preds = %for.body7.lr.ph, %for.body7
; Innermost body: separator line via puts, then "b: <b> | e: <e>".
%puts = call i32 @puts(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @str, i64 0, i64 0))
%tmp5 = load i32, i32* %b, align 4
%tmp6 = load i32, i32* %e, align 4
%call9 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 %tmp5, i32 %tmp6)
; Zero A, run the kernel with pointers to b, e, control and readarr, print A.
%arraydecay = getelementptr inbounds [6 x i32], [6 x i32]* %A, i64 0, i64 0
call void @inita(i32* %arraydecay)
%arraydecay10 = getelementptr inbounds [6 x i32], [6 x i32]* %A, i64 0, i64 0
call void @f(i32* nonnull %b, i32* nonnull %e, i32* %arraydecay10, i32* nonnull %control1, i32* nonnull %readarr)
%arraydecay11 = getelementptr inbounds [6 x i32], [6 x i32]* %A, i64 0, i64 0
call void @printarr(i32* %arraydecay11)
; e is reloaded from memory after the calls, incremented and stored back;
; the loop repeats while the pre-increment value is < 5.
%storemerge2.in = load i32, i32* %e, align 4
%storemerge2 = add nsw i32 %storemerge2.in, 1
store i32 %storemerge2, i32* %e, align 4
%cmp6 = icmp slt i32 %storemerge2.in, 5
br i1 %cmp6, label %for.body7, label %for.cond5.for.inc12_crit_edge
for.cond5.for.inc12_crit_edge: ; preds = %for.body7
br label %for.inc12
for.inc12: ; preds = %for.cond5.for.inc12_crit_edge, %for.cond5.preheader
; b++; repeat the b-loop while the pre-increment value is < 5.
%tmp8 = load i32, i32* %b, align 4
%inc13 = add nsw i32 %tmp8, 1
store i32 %inc13, i32* %b, align 4
%cmp3 = icmp slt i32 %tmp8, 5
br i1 %cmp3, label %for.cond5.preheader, label %for.inc15
for.inc15: ; preds = %for.inc12
; control++; repeat the outer loop while the pre-increment value is < 4.
%tmp9 = load i32, i32* %control1, align 4
%inc16 = add nsw i32 %tmp9, 1
store i32 %inc16, i32* %control1, align 4
%cmp = icmp slt i32 %tmp9, 4
br i1 %cmp, label %for.body, label %for.end17
for.end17: ; preds = %for.inc15
ret i32 0
}
; Function Attrs: nounwind
declare i32 @putchar(i32) #0
; Function Attrs: nounwind
declare i32 @puts(i8* nocapture readonly) #0
attributes #0 = { nounwind }
16:27 $ cat program.optimised.ll
; ModuleID = 'program.ll'
source_filename = "program.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@.str = private unnamed_addr constant [4 x i8] c"A: \00", align 1
@.str.1 = private unnamed_addr constant [4 x i8] c"%d \00", align 1
@.str.2 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1
@.str.3 = private unnamed_addr constant [15 x i8] c"* Control: %d\0A\00", align 1
@.str.4 = private unnamed_addr constant [7 x i8] c"-----\0A\00", align 1
@.str.5 = private unnamed_addr constant [15 x i8] c"b: %d | e: %d\0A\00", align 1
@str = private unnamed_addr constant [6 x i8] c"-----\00"
; Embedded PTX source for the GPU kernel launched from @f, stored as a
; NUL-terminated byte string. The kernel declares nine parameters (two .u64,
; seven .u32) and .maxntid 32, 1, 1.
; The string literal must stay on a single physical line: a raw line break
; inside c"..." would become an extra byte and no longer match [1802 x i8].
@FUNC_f_SCOP_0_KERNEL_0 = private unnamed_addr constant [1802 x i8] c"//\0A// Generated by LLVM NVPTX Back-End\0A//\0A\0A.version 3.2\0A.target sm_30\0A.address_size 64\0A\0A\09// .globl\09FUNC_f_SCOP_0_KERNEL_0\0A\0A.visible .entry FUNC_f_SCOP_0_KERNEL_0(\0A\09.param .u64 FUNC_f_SCOP_0_KERNEL_0_param_0,\0A\09.param .u64 FUNC_f_SCOP_0_KERNEL_0_param_1,\0A\09.param .u32 FUNC_f_SCOP_0_KERNEL_0_param_2,\0A\09.param .u32 FUNC_f_SCOP_0_KERNEL_0_param_3,\0A\09.param .u32 FUNC_f_SCOP_0_KERNEL_0_param_4,\0A\09.param .u32 FUNC_f_SCOP_0_KERNEL_0_param_5,\0A\09.param .u32 FUNC_f_SCOP_0_KERNEL_0_param_6,\0A\09.param .u32 FUNC_f_SCOP_0_KERNEL_0_param_7,\0A\09.param .u32 FUNC_f_SCOP_0_KERNEL_0_param_8\0A)\0A.maxntid 32, 1, 1\0A{\0A\09.reg .pred \09%p<4>;\0A\09.reg .b32 \09%r<11>;\0A\09.reg .b64 \09%rd<25>;\0A\0A\09ld.param.u64 \09%rd11, [FUNC_f_SCOP_0_KERNEL_0_param_0];\0A\09ld.param.u64 \09%rd12, [FUNC_f_SCOP_0_KERNEL_0_param_1];\0A\09mov.u32 \09%r5, %ctaid.x;\0A\09mov.u32 \09%r6, %tid.x;\0A\09cvt.u64.u32 \09%rd13, %r6;\0A\09ld.param.u32 \09%r7, [FUNC_f_SCOP_0_KERNEL_0_param_4];\0A\09ld.param.u32 \09%r8, [FUNC_f_SCOP_0_KERNEL_0_param_5];\0A\09ld.param.s32 \09%rd14, [FUNC_f_SCOP_0_KERNEL_0_param_2];\0A\09ld.param.s32 \09%rd1, [FUNC_f_SCOP_0_KERNEL_0_param_3];\0A\09mul.wide.u32 \09%rd15, %r5, 32;\0A\09not.b64 \09%rd16, %rd14;\0A\09add.s64 \09%rd17, %rd16, %rd1;\0A\09sub.s64 \09%rd18, %rd17, %rd15;\0A\09shr.u64 \09%rd19, %rd18, 20;\0A\09ld.global.u32 \09%r10, [%rd11];\0A\09add.s64 \09%rd20, %rd15, %rd14;\0A\09add.s64 \09%rd22, %rd20, %rd13;\0A\09setp.gt.s32 \09%p1, %r7, 3;\0A\09selp.b32 \09%r2, %r8, 0, %p1;\0A\09add.s64 \09%rd24, %rd19, 1;\0A\09shl.b64 \09%rd21, %rd22, 2;\0A\09add.s64 \09%rd23, %rd12, %rd21;\0ALBB0_1:\0A\09setp.lt.s64 \09%p2, %rd22, %rd1;\0A\09@%p2 bra \09LBB0_4;\0A\09bra.uni \09LBB0_2;\0ALBB0_4:\0A\09st.global.u32 \09[%rd23], %r2;\0A\09mov.u32 \09%r10, %r2;\0ALBB0_2:\0A\09bar.sync \090;\0A\09add.s64 \09%rd24, %rd24, -1;\0A\09add.s64 \09%rd23, %rd23, 4194304;\0A\09add.s64 \09%rd22, %rd22, 1048576;\0A\09setp.eq.s64 \09%p3, %rd24, 0;\0A\09@%p3 bra \09LBB0_3;\0A\09bra.uni \09LBB0_1;\0ALBB0_3:\0A\09st.global.u32 \09[%rd11], %r10;\0A\09ret;\0A}\0A\0A\0A\00"
; Entry-point name of the kernel above; passed to polly_getKernel together
; with the PTX text.
@FUNC_f_SCOP_0_KERNEL_0_name = private unnamed_addr constant [23 x i8] c"FUNC_f_SCOP_0_KERNEL_0\00"
; Embedded PTX source for the GPU kernel launched from @inita. It takes one
; .u64 pointer parameter, declares .maxntid 6, 1, 1, and each thread stores
; the 32-bit value 0 at param_0 + 4 * %tid.x.
@FUNC_inita_SCOP_0_KERNEL_0 = private unnamed_addr constant [477 x i8] c"//\0A// Generated by LLVM NVPTX Back-End\0A//\0A\0A.version 3.2\0A.target sm_30\0A.address_size 64\0A\0A\09// .globl\09FUNC_inita_SCOP_0_KERNEL_0\0A\0A.visible .entry FUNC_inita_SCOP_0_KERNEL_0(\0A\09.param .u64 FUNC_inita_SCOP_0_KERNEL_0_param_0\0A)\0A.maxntid 6, 1, 1\0A{\0A\09.reg .b32 \09%r<3>;\0A\09.reg .b64 \09%rd<4>;\0A\0A\09ld.param.u64 \09%rd1, [FUNC_inita_SCOP_0_KERNEL_0_param_0];\0A\09mov.u32 \09%r1, %tid.x;\0A\09mul.wide.u32 \09%rd2, %r1, 4;\0A\09add.s64 \09%rd3, %rd1, %rd2;\0A\09mov.u32 \09%r2, 0;\0A\09st.global.u32 \09[%rd3], %r2;\0A\09ret;\0A}\0A\0A\0A\00"
; Entry-point name of the @inita kernel; passed to polly_getKernel.
@FUNC_inita_SCOP_0_KERNEL_0_name = private unnamed_addr constant [27 x i8] c"FUNC_inita_SCOP_0_KERNEL_0\00"
; @f: stores a value into arr[*begin .. *end). The stored value is *readarr
; when *control > 3 and 0 otherwise (see the original loop under %entry.split).
; The function contains two versions of that loop: the original scalar code
; (%entry.split .. %for.end.region_exiting) and a Polly-generated GPU offload
; path (%polly.start onwards) that launches @FUNC_f_SCOP_0_KERNEL_0.
; NOTE(review): the version-selecting branch at the end of %polly.preload.merge
; uses the constant `false`, so as written only the original loop ever runs.
define void @f(i32* %begin, i32* %end, i32* %arr, i32* %control, i32* %readarr) {
entry:
; Stack slots for the hoisted (preloaded) invariant loads.
%tmp8.preload.s2a = alloca i32
%tmp6.preload.s2a = alloca i32
%tmp4.preload.s2a = alloca i32
%tmp1.preload.s2a = alloca i32
; Kernel launch argument table: 18 pointer slots, filled in %polly.then with
; 9 parameter addresses (slots 0-8) followed by 9 size addresses (slots 9-17).
%polly_launch_0_params = alloca [18 x i8*]
%polly_launch_0_param_0 = alloca i8*
%polly_launch_0_param_1 = alloca i8*
%polly_launch_0_param_2 = alloca i32
%polly_launch_0_param_3 = alloca i32
%polly_launch_0_param_4 = alloca i32
%polly_launch_0_param_5 = alloca i32
%polly_launch_0_param_6 = alloca i32
%polly_launch_0_param_7 = alloca i32
%polly_launch_0_param_8 = alloca i32
%polly_launch_0_param_size_0 = alloca i32
%polly_launch_0_param_size_1 = alloca i32
%polly_launch_0_param_size_2 = alloca i32
%polly_launch_0_param_size_3 = alloca i32
%polly_launch_0_param_size_4 = alloca i32
%polly_launch_0_param_size_5 = alloca i32
%polly_launch_0_param_size_6 = alloca i32
%polly_launch_0_param_size_7 = alloca i32
%polly_launch_0_param_size_8 = alloca i32
%polly_launch_0_params_i8ptr = bitcast [18 x i8*]* %polly_launch_0_params to i8*
br label %polly.split_new_and_old
polly.split_new_and_old: ; preds = %entry
; Preload *begin and *end unconditionally and spill them to their slots.
%polly.access.begin = getelementptr i32, i32* %begin, i64 0
%polly.access.begin.load = load i32, i32* %polly.access.begin, align 4, !alias.scope !0, !noalias !2
store i32 %polly.access.begin.load, i32* %tmp1.preload.s2a
%polly.access.end = getelementptr i32, i32* %end, i64 0
%polly.access.end.load = load i32, i32* %polly.access.end, align 4, !alias.scope !3, !noalias !8
store i32 %polly.access.end.load, i32* %tmp4.preload.s2a
; Preload condition for *control: end >= begin + 1 (computed in i64) and the
; addition did not overflow.
%0 = sext i32 %polly.access.begin.load to i64
%1 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %0, i64 1)
%.obit = extractvalue { i64, i1 } %1, 1
%polly.overflow.state = or i1 false, %.obit
%.res = extractvalue { i64, i1 } %1, 0
%2 = sext i32 %polly.access.end.load to i64
%3 = icmp sge i64 %2, %.res
%polly.preload.cond.overflown = xor i1 %polly.overflow.state, true
%polly.preload.cond.result = and i1 %3, %polly.preload.cond.overflown
br label %polly.preload.cond
polly.preload.cond: ; preds = %polly.split_new_and_old
br i1 %polly.preload.cond.result, label %polly.preload.exec, label %polly.preload.merge
polly.preload.merge: ; preds = %polly.preload.exec, %polly.preload.cond
; Preloaded *control, or 0 when the preload condition was false.
%polly.preload.tmp6.merge = phi i32 [ %polly.access.control.load, %polly.preload.exec ], [ 0, %polly.preload.cond ]
store i32 %polly.preload.tmp6.merge, i32* %tmp6.preload.s2a
; Run-time alias checks. For each of begin, control, end and readarr (each
; treated as a one-element range [p, p+1)) require that it does not overlap
; [&arr[begin], &arr[end]):  (p + 1 <= &arr[begin]) or (&arr[end] <= p).
; The four results are and-ed together into %35.
; -- begin vs arr
%polly.access.begin4 = getelementptr i32, i32* %begin, i64 1
%polly.access.arr = getelementptr i32, i32* %arr, i32 %polly.access.begin.load
%4 = ptrtoint i32* %polly.access.begin4 to i64
%5 = ptrtoint i32* %polly.access.arr to i64
%6 = icmp ule i64 %4, %5
%polly.access.arr5 = getelementptr i32, i32* %arr, i32 %polly.access.end.load
%polly.access.begin6 = getelementptr i32, i32* %begin, i64 0
%7 = ptrtoint i32* %polly.access.arr5 to i64
%8 = ptrtoint i32* %polly.access.begin6 to i64
%9 = icmp ule i64 %7, %8
%10 = or i1 %6, %9
%11 = and i1 true, %10
; -- control vs arr
%polly.access.control7 = getelementptr i32, i32* %control, i64 1
%polly.access.arr8 = getelementptr i32, i32* %arr, i32 %polly.access.begin.load
%12 = ptrtoint i32* %polly.access.control7 to i64
%13 = ptrtoint i32* %polly.access.arr8 to i64
%14 = icmp ule i64 %12, %13
%polly.access.arr9 = getelementptr i32, i32* %arr, i32 %polly.access.end.load
%polly.access.control10 = getelementptr i32, i32* %control, i64 0
%15 = ptrtoint i32* %polly.access.arr9 to i64
%16 = ptrtoint i32* %polly.access.control10 to i64
%17 = icmp ule i64 %15, %16
%18 = or i1 %14, %17
%19 = and i1 %11, %18
; -- end vs arr
%polly.access.end11 = getelementptr i32, i32* %end, i64 1
%polly.access.arr12 = getelementptr i32, i32* %arr, i32 %polly.access.begin.load
%20 = ptrtoint i32* %polly.access.end11 to i64
%21 = ptrtoint i32* %polly.access.arr12 to i64
%22 = icmp ule i64 %20, %21
%polly.access.arr13 = getelementptr i32, i32* %arr, i32 %polly.access.end.load
%polly.access.end14 = getelementptr i32, i32* %end, i64 0
%23 = ptrtoint i32* %polly.access.arr13 to i64
%24 = ptrtoint i32* %polly.access.end14 to i64
%25 = icmp ule i64 %23, %24
%26 = or i1 %22, %25
%27 = and i1 %19, %26
; -- readarr vs arr
%polly.access.readarr = getelementptr i32, i32* %readarr, i64 1
%polly.access.arr15 = getelementptr i32, i32* %arr, i32 %polly.access.begin.load
%28 = ptrtoint i32* %polly.access.readarr to i64
%29 = ptrtoint i32* %polly.access.arr15 to i64
%30 = icmp ule i64 %28, %29
%polly.access.arr16 = getelementptr i32, i32* %arr, i32 %polly.access.end.load
%polly.access.readarr17 = getelementptr i32, i32* %readarr, i64 0
%31 = ptrtoint i32* %polly.access.arr16 to i64
%32 = ptrtoint i32* %polly.access.readarr17 to i64
%33 = icmp ule i64 %31, %32
%34 = or i1 %30, %33
%35 = and i1 %27, %34
; Overflow-tracked arithmetic: computes 4*(end-begin) + 2*(end-begin) +
; 8*(end-begin) in i64, accumulating every overflow bit into
; %polly.overflow.state61. The sum (%.res62) must be >= 0.
; -- term 1: 4 * (end - begin)
%36 = sext i32 %polly.access.begin.load to i64
%37 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 0, i64 %36)
%.obit18 = extractvalue { i64, i1 } %37, 1
%polly.overflow.state19 = or i1 false, %.obit18
%.res20 = extractvalue { i64, i1 } %37, 0
%38 = sext i32 %polly.access.end.load to i64
%39 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %.res20, i64 %38)
%.obit21 = extractvalue { i64, i1 } %39, 1
%polly.overflow.state22 = or i1 %polly.overflow.state19, %.obit21
%.res23 = extractvalue { i64, i1 } %39, 0
%40 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %.res23)
%.obit24 = extractvalue { i64, i1 } %40, 1
%polly.overflow.state25 = or i1 %polly.overflow.state22, %.obit24
%.res26 = extractvalue { i64, i1 } %40, 0
%41 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 4, i64 %.res26)
%.obit27 = extractvalue { i64, i1 } %41, 1
%polly.overflow.state28 = or i1 %polly.overflow.state25, %.obit27
%.res29 = extractvalue { i64, i1 } %41, 0
%42 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 0, i64 %.res29)
%.obit30 = extractvalue { i64, i1 } %42, 1
%polly.overflow.state31 = or i1 %polly.overflow.state28, %.obit30
%.res32 = extractvalue { i64, i1 } %42, 0
; -- term 2: 2 * (end - begin), added to term 1
%43 = sext i32 %polly.access.begin.load to i64
%44 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 0, i64 %43)
%.obit33 = extractvalue { i64, i1 } %44, 1
%polly.overflow.state34 = or i1 %polly.overflow.state31, %.obit33
%.res35 = extractvalue { i64, i1 } %44, 0
%45 = sext i32 %polly.access.end.load to i64
%46 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %.res35, i64 %45)
%.obit36 = extractvalue { i64, i1 } %46, 1
%polly.overflow.state37 = or i1 %polly.overflow.state34, %.obit36
%.res38 = extractvalue { i64, i1 } %46, 0
%47 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %.res38)
%.obit39 = extractvalue { i64, i1 } %47, 1
%polly.overflow.state40 = or i1 %polly.overflow.state37, %.obit39
%.res41 = extractvalue { i64, i1 } %47, 0
%48 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 2, i64 %.res41)
%.obit42 = extractvalue { i64, i1 } %48, 1
%polly.overflow.state43 = or i1 %polly.overflow.state40, %.obit42
%.res44 = extractvalue { i64, i1 } %48, 0
%49 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %.res32, i64 %.res44)
%.obit45 = extractvalue { i64, i1 } %49, 1
%polly.overflow.state46 = or i1 %polly.overflow.state43, %.obit45
%.res47 = extractvalue { i64, i1 } %49, 0
; -- term 3: 8 * (end - begin), added to the running sum
%50 = sext i32 %polly.access.begin.load to i64
%51 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 0, i64 %50)
%.obit48 = extractvalue { i64, i1 } %51, 1
%polly.overflow.state49 = or i1 %polly.overflow.state46, %.obit48
%.res50 = extractvalue { i64, i1 } %51, 0
%52 = sext i32 %polly.access.end.load to i64
%53 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %.res50, i64 %52)
%.obit51 = extractvalue { i64, i1 } %53, 1
%polly.overflow.state52 = or i1 %polly.overflow.state49, %.obit51
%.res53 = extractvalue { i64, i1 } %53, 0
%54 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %.res53)
%.obit54 = extractvalue { i64, i1 } %54, 1
%polly.overflow.state55 = or i1 %polly.overflow.state52, %.obit54
%.res56 = extractvalue { i64, i1 } %54, 0
%55 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 8, i64 %.res56)
%.obit57 = extractvalue { i64, i1 } %55, 1
%polly.overflow.state58 = or i1 %polly.overflow.state55, %.obit57
%.res59 = extractvalue { i64, i1 } %55, 0
%56 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %.res47, i64 %.res59)
%.obit60 = extractvalue { i64, i1 } %56, 1
%polly.overflow.state61 = or i1 %polly.overflow.state58, %.obit60
%.res62 = extractvalue { i64, i1 } %56, 0
%57 = icmp sge i64 %.res62, 0
%58 = and i1 %35, %57
; Final run-time check: alias checks hold, sum >= 0, and nothing overflowed.
%polly.rtc.overflown = xor i1 %polly.overflow.state61, true
%polly.rtc.result = and i1 %58, %polly.rtc.overflown
; NOTE(review): %polly.rtc.result is computed but not used here; the condition
; is the literal `false`, so %polly.start is never reached from this branch.
; Presumably the code generator disabled the optimized version - confirm.
br i1 false, label %polly.start, label %entry.split.pre_entry_bb
entry.split.pre_entry_bb: ; preds = %polly.preload.merge
br label %entry.split
; ---- Original (non-offloaded) code ----
entry.split: ; preds = %entry.split.pre_entry_bb
; Skip the loop entirely unless *begin < *end.
%tmp1 = load i32, i32* %begin, align 4
%tmp41 = load i32, i32* %end, align 4
%cmp2 = icmp slt i32 %tmp1, %tmp41
br i1 %cmp2, label %for.body.lr.ph, label %for.end.region_exiting
for.body.lr.ph: ; preds = %entry.split
%59 = sext i32 %tmp1 to i64
br label %for.body
for.body: ; preds = %for.body.lr.ph, %if.end
; Induction variable starts at *begin (sign-extended to i64).
%indvars.iv = phi i64 [ %59, %for.body.lr.ph ], [ %indvars.iv.next, %if.end ]
; *control is re-read every iteration; *readarr is only read when it is > 3.
%tmp6 = load i32, i32* %control, align 4
%cmp1 = icmp sgt i32 %tmp6, 3
br i1 %cmp1, label %if.then, label %if.end
if.then: ; preds = %for.body
%tmp8 = load i32, i32* %readarr, align 4
br label %if.end
if.end: ; preds = %if.then, %for.body
; Value to store: *readarr if *control > 3, otherwise 0.
%t.0 = phi i32 [ %tmp8, %if.then ], [ 0, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv
store i32 %t.0, i32* %arrayidx, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
; *end is re-read every iteration for the loop bound.
%tmp4 = load i32, i32* %end, align 4
%60 = sext i32 %tmp4 to i64
%cmp = icmp slt i64 %indvars.iv.next, %60
br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge: ; preds = %if.end
br label %for.end.region_exiting
for.end.region_exiting: ; preds = %entry.split, %for.cond.for.end_crit_edge
br label %polly.merge_new_and_old
; Join point of the original code and the generated GPU path.
polly.merge_new_and_old: ; preds = %polly.exiting, %for.end.region_exiting
br label %for.end
for.end: ; preds = %polly.merge_new_and_old
ret void
; ---- Polly-generated GPU offload path ----
polly.start: ; preds = %polly.preload.merge
br label %polly.acc.initialize
polly.acc.initialize: ; preds = %polly.start
; Create the CUDA context and allocate one device buffer per MemRef.
; MemRef0, 1, 2 and 4 are requested with size 0, MemRef3__phi with 4 bytes.
%61 = call i8* @polly_initContextCUDA()
%p_dev_array_MemRef0 = call i8* @polly_allocateMemoryForDevice(i64 0)
%p_dev_array_MemRef1 = call i8* @polly_allocateMemoryForDevice(i64 0)
%p_dev_array_MemRef2 = call i8* @polly_allocateMemoryForDevice(i64 0)
%p_dev_array_MemRef3__phi = call i8* @polly_allocateMemoryForDevice(i64 4)
%p_dev_array_MemRef4 = call i8* @polly_allocateMemoryForDevice(i64 0)
; MemRef5 size in bytes: 4 * end - 4 * begin.
%62 = sext i32 %polly.access.end.load to i64
%63 = mul i64 4, %62
%64 = sext i32 %polly.access.begin.load to i64
%65 = add nsw i64 0, %64
%66 = mul i64 %65, 4
%67 = sub i64 %63, %66
%p_dev_array_MemRef5 = call i8* @polly_allocateMemoryForDevice(i64 %67)
br label %polly.preload.begin
polly.preload.begin: ; preds = %polly.acc.initialize
; Preload condition for *readarr: end >= begin + 1 (no overflow) and the
; preloaded *control value is >= 4.
%68 = sext i32 %polly.access.begin.load to i64
%69 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %68, i64 1)
%.obit63 = extractvalue { i64, i1 } %69, 1
%polly.overflow.state64 = or i1 false, %.obit63
%.res65 = extractvalue { i64, i1 } %69, 0
%70 = sext i32 %polly.access.end.load to i64
%71 = icmp sge i64 %70, %.res65
%72 = sext i32 %polly.preload.tmp6.merge to i64
%73 = icmp sge i64 %72, 4
%74 = and i1 %71, %73
%polly.preload.cond.overflown66 = xor i1 %polly.overflow.state64, true
%polly.preload.cond.result67 = and i1 %74, %polly.preload.cond.overflown66
br label %polly.preload.cond68
polly.preload.cond68: ; preds = %polly.preload.begin
br i1 %polly.preload.cond.result67, label %polly.preload.exec70, label %polly.preload.merge69
polly.preload.merge69: ; preds = %polly.preload.exec70, %polly.preload.cond68
; Preloaded *readarr, or 0 when the preload condition was false.
%polly.preload.tmp8.merge = phi i32 [ %polly.access.readarr71.load, %polly.preload.exec70 ], [ 0, %polly.preload.cond68 ]
store i32 %polly.preload.tmp8.merge, i32* %tmp8.preload.s2a
br label %polly.cond
polly.cond: ; preds = %polly.preload.merge69
; Only launch the kernel when end >= begin + 1.
%75 = sext i32 %polly.access.begin.load to i64
%76 = add nsw i64 %75, 1
%77 = sext i32 %polly.access.end.load to i64
%78 = icmp sge i64 %77, %76
br i1 %78, label %polly.then, label %polly.else
polly.merge: ; preds = %polly.else, %polly.merge73
; Release all device buffers and the context.
call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef0)
call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef1)
call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef2)
call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef3__phi)
call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef4)
call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef5)
call void @polly_freeContext(i8* %61)
br label %polly.exiting
polly.exiting: ; preds = %polly.merge
br label %polly.merge_new_and_old
polly.preload.exec: ; preds = %polly.preload.cond
; Conditional preload of *control (taken from %polly.preload.cond).
%polly.access.control = getelementptr i32, i32* %control, i64 0
%polly.access.control.load = load i32, i32* %polly.access.control, align 4, !alias.scope !4, !noalias !9
br label %polly.preload.merge
polly.preload.exec70: ; preds = %polly.preload.cond68
; Conditional preload of *readarr (taken from %polly.preload.cond68).
%polly.access.readarr71 = getelementptr i32, i32* %readarr, i64 0
%polly.access.readarr71.load = load i32, i32* %polly.access.readarr71, align 4, !alias.scope !6, !noalias !10
br label %polly.preload.merge69
polly.then: ; preds = %polly.cond
; Fill the launch table. Each slot holds the address of a stack cell that
; contains the actual argument value.
; Slot 0: device pointer of MemRef3__phi.
%79 = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef3__phi)
%80 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 0
store i8* %79, i8** %polly_launch_0_param_0
%81 = bitcast i8** %polly_launch_0_param_0 to i8*
store i8* %81, i8** %80
; Slot 1: device pointer of MemRef5 moved back by `begin` i32 elements, so
; that indexing it with i in [begin, end) lands inside the device buffer.
%82 = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef5)
%83 = sext i32 %polly.access.begin.load to i64
%84 = add nsw i64 0, %83
%85 = bitcast i8* %82 to i32*
%86 = sub i64 0, %84
%87 = getelementptr i32, i32* %85, i64 %86
%88 = bitcast i32* %87 to i8*
%89 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 1
store i8* %88, i8** %polly_launch_0_param_1
%90 = bitcast i8** %polly_launch_0_param_1 to i8*
store i8* %90, i8** %89
; Slots 2-8 (i32 values): begin, end, preloaded *control, preloaded *readarr,
; begin, preloaded *control, end.
store i32 %polly.access.begin.load, i32* %polly_launch_0_param_2
%91 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 2
%92 = bitcast i32* %polly_launch_0_param_2 to i8*
store i8* %92, i8** %91
store i32 %polly.access.end.load, i32* %polly_launch_0_param_3
%93 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 3
%94 = bitcast i32* %polly_launch_0_param_3 to i8*
store i8* %94, i8** %93
store i32 %polly.preload.tmp6.merge, i32* %polly_launch_0_param_4
%95 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 4
%96 = bitcast i32* %polly_launch_0_param_4 to i8*
store i8* %96, i8** %95
store i32 %polly.preload.tmp8.merge, i32* %polly_launch_0_param_5
%97 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 5
%98 = bitcast i32* %polly_launch_0_param_5 to i8*
store i8* %98, i8** %97
store i32 %polly.access.begin.load, i32* %polly_launch_0_param_6
%99 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 6
%100 = bitcast i32* %polly_launch_0_param_6 to i8*
store i8* %100, i8** %99
store i32 %polly.preload.tmp6.merge, i32* %polly_launch_0_param_7
%101 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 7
%102 = bitcast i32* %polly_launch_0_param_7 to i8*
store i8* %102, i8** %101
store i32 %polly.access.end.load, i32* %polly_launch_0_param_8
%103 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 8
%104 = bitcast i32* %polly_launch_0_param_8 to i8*
store i8* %104, i8** %103
; Slots 9-17: per-parameter size cells. Every one is set to 4, including the
; two that correspond to the pointer parameters in slots 0 and 1.
store i32 4, i32* %polly_launch_0_param_size_0
%105 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 9
%106 = bitcast i32* %polly_launch_0_param_size_0 to i8*
store i8* %106, i8** %105
store i32 4, i32* %polly_launch_0_param_size_1
%107 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 10
%108 = bitcast i32* %polly_launch_0_param_size_1 to i8*
store i8* %108, i8** %107
store i32 4, i32* %polly_launch_0_param_size_2
%109 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 11
%110 = bitcast i32* %polly_launch_0_param_size_2 to i8*
store i8* %110, i8** %109
store i32 4, i32* %polly_launch_0_param_size_3
%111 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 12
%112 = bitcast i32* %polly_launch_0_param_size_3 to i8*
store i8* %112, i8** %111
store i32 4, i32* %polly_launch_0_param_size_4
%113 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 13
%114 = bitcast i32* %polly_launch_0_param_size_4 to i8*
store i8* %114, i8** %113
store i32 4, i32* %polly_launch_0_param_size_5
%115 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 14
%116 = bitcast i32* %polly_launch_0_param_size_5 to i8*
store i8* %116, i8** %115
store i32 4, i32* %polly_launch_0_param_size_6
%117 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 15
%118 = bitcast i32* %polly_launch_0_param_size_6 to i8*
store i8* %118, i8** %117
store i32 4, i32* %polly_launch_0_param_size_7
%119 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 16
%120 = bitcast i32* %polly_launch_0_param_size_7 to i8*
store i8* %120, i8** %119
store i32 4, i32* %polly_launch_0_param_size_8
%121 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 17
%122 = bitcast i32* %polly_launch_0_param_size_8 to i8*
store i8* %122, i8** %121
; Look up the kernel from its embedded PTX text and entry-point name.
%123 = call i8* @polly_getKernel(i8* getelementptr inbounds ([1802 x i8], [1802 x i8]* @FUNC_f_SCOP_0_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([23 x i8], [23 x i8]* @FUNC_f_SCOP_0_KERNEL_0_name, i32 0, i32 0))
; First launch dimension: 32768 when end >= begin + 1048545, otherwise
; -begin + ((31 * begin + end + 31) >> 5), truncated to i32.
%124 = sext i32 %polly.access.begin.load to i64
%125 = add nsw i64 %124, 1048545
%126 = sext i32 %polly.access.end.load to i64
%127 = icmp sge i64 %126, %125
%128 = sext i32 %polly.access.begin.load to i64
%129 = sub nsw i64 0, %128
%130 = sext i32 %polly.access.begin.load to i64
%131 = mul nsw i64 31, %130
%132 = sext i32 %polly.access.end.load to i64
%133 = add nsw i64 %131, %132
%134 = add nsw i64 %133, 31
%polly.fdiv_q.shr = ashr i64 %134, 5
%135 = add nsw i64 %129, %polly.fdiv_q.shr
%136 = select i1 %127, i64 32768, i64 %135
%137 = trunc i64 %136 to i32
; The five i32 arguments are presumably grid x/y followed by block x/y/z
; (32, 1, 1 matches the kernel's .maxntid) - confirm against the runtime.
call void @polly_launchKernel(i8* %123, i32 %137, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
call void @polly_freeKernel(i8* %123)
br label %polly.cond72
polly.cond72: ; preds = %polly.then
; Copy results back only when end >= 1.
%138 = sext i32 %polly.access.end.load to i64
%139 = icmp sge i64 %138, 1
br i1 %139, label %polly.then74, label %polly.else75
polly.merge73: ; preds = %polly.else75, %polly.then74
br label %polly.merge
polly.else: ; preds = %polly.cond
br label %polly.merge
polly.then74: ; preds = %polly.cond72
; Copy 4 * end - 4 * begin bytes from device MemRef5 to host &arr[begin].
%140 = sext i32 %polly.access.end.load to i64
%141 = mul i64 4, %140
%142 = sext i32 %polly.access.begin.load to i64
%143 = add nsw i64 0, %142
%144 = getelementptr i32, i32* %arr, i64 %143
%145 = bitcast i32* %144 to i8*
%146 = mul i64 %143, 4
%147 = sub i64 %141, %146
call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef5, i8* %145, i64 %147)
br label %polly.merge73
polly.else75: ; preds = %polly.cond72
br label %polly.merge73
}
; @inita: sets A[0..5] to 0. A run-time check selects between the original
; scalar loop (%for.body) and a GPU path (%polly.acc.initialize) that launches
; @FUNC_inita_SCOP_0_KERNEL_0 and copies 24 bytes back into A.
define void @inita(i32* %A) {
entry:
; Launch table with two slots: one parameter address and one size address.
%polly_launch_0_params = alloca [2 x i8*]
%polly_launch_0_param_0 = alloca i8*
%polly_launch_0_param_size_0 = alloca i32
%polly_launch_0_params_i8ptr = bitcast [2 x i8*]* %polly_launch_0_params to i8*
br label %entry.split
entry.split: ; preds = %entry
br label %polly.split_new_and_old
polly.split_new_and_old: ; preds = %entry.split
; Run-time check: overflow-tracked computation of 0 + 6 * (1 * 6); passes when
; the result is >= 0 and no step overflowed. All operands are constants.
%0 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 6)
%.obit = extractvalue { i64, i1 } %0, 1
%polly.overflow.state = or i1 false, %.obit
%.res = extractvalue { i64, i1 } %0, 0
%1 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 6, i64 %.res)
%.obit2 = extractvalue { i64, i1 } %1, 1
%polly.overflow.state3 = or i1 %polly.overflow.state, %.obit2
%.res4 = extractvalue { i64, i1 } %1, 0
%2 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 0, i64 %.res4)
%.obit5 = extractvalue { i64, i1 } %2, 1
%polly.overflow.state6 = or i1 %polly.overflow.state3, %.obit5
%.res7 = extractvalue { i64, i1 } %2, 0
%3 = icmp sge i64 %.res7, 0
%4 = and i1 true, %3
%polly.rtc.overflown = xor i1 %polly.overflow.state6, true
%polly.rtc.result = and i1 %4, %polly.rtc.overflown
; Take the GPU path when the check passes, else fall back to the scalar loop.
br i1 %polly.rtc.result, label %polly.start, label %for.body.pre_entry_bb
for.body.pre_entry_bb: ; preds = %polly.split_new_and_old
br label %for.body
; ---- Original code: for (i = 0; i != 6; ++i) A[i] = 0 ----
for.body: ; preds = %for.body.pre_entry_bb, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.pre_entry_bb ]
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
store i32 0, i32* %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp ne i64 %indvars.iv.next, 6
br i1 %exitcond, label %for.body, label %polly.merge_new_and_old
polly.merge_new_and_old: ; preds = %polly.exiting, %for.body
br label %for.end
for.end: ; preds = %polly.merge_new_and_old
ret void
; ---- Polly-generated GPU offload path ----
polly.start: ; preds = %polly.split_new_and_old
br label %polly.acc.initialize
polly.acc.initialize: ; preds = %polly.start
; Create the context and a 24-byte device buffer (6 x i32).
%5 = call i8* @polly_initContextCUDA()
%p_dev_array_MemRef0 = call i8* @polly_allocateMemoryForDevice(i64 24)
; Slot 0: address of the cell holding the device pointer.
%6 = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef0)
%7 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
store i8* %6, i8** %polly_launch_0_param_0
%8 = bitcast i8** %polly_launch_0_param_0 to i8*
store i8* %8, i8** %7
; Slot 1: address of the size cell, which is set to 4.
store i32 4, i32* %polly_launch_0_param_size_0
%9 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
%10 = bitcast i32* %polly_launch_0_param_size_0 to i8*
store i8* %10, i8** %9
%11 = call i8* @polly_getKernel(i8* getelementptr inbounds ([477 x i8], [477 x i8]* @FUNC_inita_SCOP_0_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([27 x i8], [27 x i8]* @FUNC_inita_SCOP_0_KERNEL_0_name, i32 0, i32 0))
; Launch with dimensions (1, 1, 6, 1, 1); presumably grid x/y then block
; x/y/z, with 6 matching the kernel's .maxntid - confirm against the runtime.
call void @polly_launchKernel(i8* %11, i32 1, i32 1, i32 6, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
call void @polly_freeKernel(i8* %11)
; Copy the 24 result bytes back into A, then release device resources.
%12 = bitcast i32* %A to i8*
call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef0, i8* %12, i64 24)
call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef0)
call void @polly_freeContext(i8* %5)
br label %polly.exiting
polly.exiting: ; preds = %polly.acc.initialize
br label %polly.merge_new_and_old
}
; @printarr: prints the format string @.str once, then A[0] .. A[5] (one
; printf call per element using @.str.1), and finally a newline via putchar.
; The contents of @.str and @.str.1 are defined outside this excerpt.
define void @printarr(i32* %A) {
entry:
br label %entry.split
entry.split: ; preds = %entry
; Leading text printed before the elements.
%call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0))
br label %for.body
for.body: ; preds = %entry.split, %for.body
; Index runs from 0 up to (but not including) 6.
%indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
%tmp3 = load i32, i32* %arrayidx, align 4
%call1 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.1, i64 0, i64 0), i32 %tmp3)
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp ne i64 %indvars.iv.next, 6
br i1 %exitcond, label %for.body, label %for.end
for.end: ; preds = %for.body
; Character code 10 is '\n'.
%putchar = tail call i32 @putchar(i32 10)
ret void
}
; External variadic printf, called from @printarr and @main.
declare i32 @printf(i8*, ...)
; @main: test driver. Keeps readarr, control1, b and e in stack slots and
; passes their addresses to @f, so @f reads them through pointers. Three
; nested loops: control1 starts at 3 (outer), b starts at 0 (middle), and e
; starts at b + 1 (inner). Each inner iteration zeroes A with @inita, calls
; @f on the range [b, e), and prints A with @printarr.
define i32 @main() {
entry:
%A = alloca [6 x i32], align 16
%readarr = alloca i32, align 4
%control1 = alloca i32, align 4
%b = alloca i32, align 4
%e = alloca i32, align 4
br label %entry.split
entry.split: ; preds = %entry
; Initial state: readarr = 10, control1 = 3.
store i32 10, i32* %readarr, align 4
store i32 3, i32* %control1, align 4
br label %for.body
for.body: ; preds = %entry.split, %for.inc15
; Outer loop body: print control1 and reset b to 0.
%tmp1 = load i32, i32* %control1, align 4
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.3, i64 0, i64 0), i32 %tmp1)
store i32 0, i32* %b, align 4
br label %for.cond5.preheader
for.cond5.preheader: ; preds = %for.body, %for.inc12
; Middle loop head: e = b + 1; enter the inner loop only when b < 5.
%storemerge2.in3 = load i32, i32* %b, align 4
%storemerge24 = add nsw i32 %storemerge2.in3, 1
store i32 %storemerge24, i32* %e, align 4
%cmp65 = icmp slt i32 %storemerge2.in3, 5
br i1 %cmp65, label %for.body7.lr.ph, label %for.inc12
for.body7.lr.ph: ; preds = %for.cond5.preheader
br label %for.body7
for.body7: ; preds = %for.body7.lr.ph, %for.body7
; Inner loop body: log b and e, then run inita -> f -> printarr on A.
%puts = call i32 @puts(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @str, i64 0, i64 0))
%tmp5 = load i32, i32* %b, align 4
%tmp6 = load i32, i32* %e, align 4
%call9 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 %tmp5, i32 %tmp6)
%arraydecay = getelementptr inbounds [6 x i32], [6 x i32]* %A, i64 0, i64 0
call void @inita(i32* %arraydecay)
%arraydecay10 = getelementptr inbounds [6 x i32], [6 x i32]* %A, i64 0, i64 0
call void @f(i32* nonnull %b, i32* nonnull %e, i32* %arraydecay10, i32* nonnull %control1, i32* nonnull %readarr)
%arraydecay11 = getelementptr inbounds [6 x i32], [6 x i32]* %A, i64 0, i64 0
call void @printarr(i32* %arraydecay11)
; e is incremented in memory; the loop repeats while the pre-increment value
; is < 5.
%storemerge2.in = load i32, i32* %e, align 4
%storemerge2 = add nsw i32 %storemerge2.in, 1
store i32 %storemerge2, i32* %e, align 4
%cmp6 = icmp slt i32 %storemerge2.in, 5
br i1 %cmp6, label %for.body7, label %for.cond5.for.inc12_crit_edge
for.cond5.for.inc12_crit_edge: ; preds = %for.body7
br label %for.inc12
for.inc12: ; preds = %for.cond5.for.inc12_crit_edge, %for.cond5.preheader
; b is incremented; the middle loop repeats while the pre-increment value
; is < 5.
%tmp8 = load i32, i32* %b, align 4
%inc13 = add nsw i32 %tmp8, 1
store i32 %inc13, i32* %b, align 4
%cmp3 = icmp slt i32 %tmp8, 5
br i1 %cmp3, label %for.cond5.preheader, label %for.inc15
for.inc15: ; preds = %for.inc12
; control1 is incremented; the outer loop repeats while the pre-increment
; value is < 4.
%tmp9 = load i32, i32* %control1, align 4
%inc16 = add nsw i32 %tmp9, 1
store i32 %inc16, i32* %control1, align 4
%cmp = icmp slt i32 %tmp9, 4
br i1 %cmp, label %for.body, label %for.end17
for.end17: ; preds = %for.inc15
ret i32 0
}
; ---- External C library functions used by @printarr and @main ----
; Function Attrs: nounwind
declare i32 @putchar(i32) #0
; Function Attrs: nounwind
declare i32 @puts(i8* nocapture readonly) #0
; ---- Overflow-checking arithmetic intrinsics used by the run-time checks ----
; Each returns { result, overflow-bit }.
; Function Attrs: nounwind readnone speculatable
declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) #1
; Function Attrs: nounwind readnone speculatable
declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) #1
; Function Attrs: nounwind readnone speculatable
declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64) #1
; ---- GPU runtime entry points called by the generated offload code ----
; Defined outside this module; presumably supplied by the GPURuntime library
; that the build links against - confirm.
declare i8* @polly_initContextCUDA()
declare i8* @polly_allocateMemoryForDevice(i64)
declare i8* @polly_getDevicePtr(i8*)
declare i8* @polly_getKernel(i8*, i8*)
declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
declare void @polly_freeKernel(i8*)
declare void @polly_copyFromDeviceToHost(i8*, i8*, i64)
declare void @polly_freeDeviceMemory(i8*)
declare void @polly_freeContext(i8*)
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone speculatable }
; ---- Alias-scope metadata attached to the preloads in @f ----
; !1 is the scope domain; !0 and !3..!7 are one scope per MemRef0..MemRef5.
; !2, !8, !9 and !10 are !noalias lists, each naming every scope except the
; one used as !alias.scope on the same load (!0, !3, !4 and !6 respectively).
!0 = distinct !{!0, !1, !"polly.alias.scope.MemRef0"}
!1 = distinct !{!1, !"polly.alias.scope.domain"}
!2 = !{!3, !4, !5, !6, !7}
!3 = distinct !{!3, !1, !"polly.alias.scope.MemRef1"}
!4 = distinct !{!4, !1, !"polly.alias.scope.MemRef2"}
!5 = distinct !{!5, !1, !"polly.alias.scope.MemRef3__phi"}
!6 = distinct !{!6, !1, !"polly.alias.scope.MemRef4"}
!7 = distinct !{!7, !1, !"polly.alias.scope.MemRef5"}
!8 = !{!0, !4, !5, !6, !7}
!9 = !{!0, !3, !5, !6, !7}
!10 = !{!0, !3, !4, !5, !7}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment