bollu/debug output on run

## debug output on run
16:25 $ make runbatch
rm *.optimised.ll
rm *.out
rm *.bench
rm: cannot remove '*.bench': No such file or directory
makefile:23: recipe for target 'clean' failed
make: [clean] Error 1 (ignored)
rm *.s
/users/siddhart/llvm-install/bin/opt -S -polly-canonicalize -polly-process-unprofitable -polly-invariant-load-hoisting -polly-codegen-ppcg \
	-polly-acc-mincompute=0 program.ll -o program.optimised.ll
/users/siddhart/llvm-install/bin/llc program.optimised.ll -o program.s
/users/siddhart/llvm-install/bin/clang program.s  -lcudart -lGPURuntime -ldl -lOpenCL -lgfortran -lstdc++ -o program.out -L/opt/nvidia/cudatoolkit8.0/8.0.54_2.2.8_ga620558-2.1/lib64/
export POLLY_DEBUG=1
POLLY_DEBUG=1 srun -n 1 -Cgpu --partition=debug nvprof ./program.out
srun: job 2387457 queued and waiting for resources
srun: job 2387457 has been allocated resources
-> polly_initContext
-> initContextCUDA
==29336== NVPROF is profiling process 29336, command: ./program.out
> Running on GPU device 0 : Tesla P100-PCIE-16GB.

-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
CUDA Link Completed in 0.000000ms. Linker Output:
ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function 'FUNC_inita_SCOP_0_KERNEL_0' for 'sm_60'
ptxas info    : Function properties for FUNC_inita_SCOP_0_KERNEL_0
ptxas         .     0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 4 registers, 328 bytes cmem[0]
info    : 0 bytes gmem
info    : Function properties for 'FUNC_inita_SCOP_0_KERNEL_0':
info    : used 4 registers, 0 stack, 0 bytes smem, 328 bytes cmem[0], 0 bytes lmem
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
-> polly_initContext
-> initContextCUDA
-> polly_allocateMemoryForDevice
-> allocateMemoryForDeviceCUDA
-> polly_getDevicePtr
-> getDevicePtrCUDA
-> polly_getKernel
-> getKernelCUDA
  -> using cached kernel
-> polly_launchKernel
-> launchKernelCUDA
-> polly_freeKernel
-> freeKernelCUDA
-> polly_copyFromDeviceToHost
-> copyFromDeviceToHostCUDA
-> polly_freeDeviceMemory
-> freeDeviceMemoryCUDA
-> polly_freeContext
==29336== Profiling application: ./program.out
* Control: 3
-----
b: 0 | e: 1
A: 0 0 0 0 0 0
-----
b: 0 | e: 2
A: 0 0 0 0 0 0
-----
b: 0 | e: 3
A: 0 0 0 0 0 0
-----
b: 0 | e: 4
A: 0 0 0 0 0 0
-----
b: 0 | e: 5
A: 0 0 0 0 0 0
-----
b: 1 | e: 2
A: 0 0 0 0 0 0
-----
b: 1 | e: 3
A: 0 0 0 0 0 0
-----
b: 1 | e: 4
A: 0 0 0 0 0 0
-----
b: 1 | e: 5
A: 0 0 0 0 0 0
-----
b: 2 | e: 3
A: 0 0 0 0 0 0
-----
b: 2 | e: 4
A: 0 0 0 0 0 0
-----
b: 2 | e: 5
A: 0 0 0 0 0 0
-----
b: 3 | e: 4
A: 0 0 0 0 0 0
-----
b: 3 | e: 5
A: 0 0 0 0 0 0
-----
b: 4 | e: 5
A: 0 0 0 0 0 0
* Control: 4
-----
b: 0 | e: 1
A: 10 0 0 0 0 0
-----
b: 0 | e: 2
A: 10 10 0 0 0 0
-----
b: 0 | e: 3
A: 10 10 10 0 0 0
-----
b: 0 | e: 4
A: 10 10 10 10 0 0
-----
b: 0 | e: 5
A: 10 10 10 10 10 0
-----
b: 1 | e: 2
A: 0 10 0 0 0 0
-----
b: 1 | e: 3
A: 0 10 10 0 0 0
-----
b: 1 | e: 4
A: 0 10 10 10 0 0
-----
b: 1 | e: 5
A: 0 10 10 10 10 0
-----
b: 2 | e: 3
A: 0 0 10 0 0 0
-----
b: 2 | e: 4
A: 0 0 10 10 0 0
-----
b: 2 | e: 5
A: 0 0 10 10 10 0
-----
b: 3 | e: 4
A: 0 0 0 10 0 0
-----
b: 3 | e: 5
A: 0 0 0 10 10 0
-----
b: 4 | e: 5
A: 0 0 0 0 10 0
==29336== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 72.49%  53.280us        30  1.7760us  1.6640us  3.2640us  FUNC_inita_SCOP_0_KERNEL_0
 27.51%  20.224us        30     674ns     640ns     832ns  [CUDA memcpy DtoH]

==29336== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
 91.74%  265.27ms         1  265.27ms  265.27ms  265.27ms  cuCtxCreate
  5.67%  16.381ms        30  546.03us  499.71us  580.99us  cuMemAlloc
  1.50%  4.3289ms         1  4.3289ms  4.3289ms  4.3289ms  cuLinkAddData
  0.73%  2.1040ms        30  70.134us  66.598us  95.121us  cuMemFree
  0.17%  480.85us        30  16.028us  15.338us  20.152us  cuMemcpyDtoH
  0.13%  366.10us        30  12.203us  10.963us  23.069us  cuLaunchKernel
  0.03%  97.328us         1  97.328us  97.328us  97.328us  cuLinkComplete
  0.03%  73.145us         1  73.145us  73.145us  73.145us  cuModuleLoadData
  0.01%  24.702us         1  24.702us  24.702us  24.702us  cuLinkCreate
  0.01%  17.288us         1  17.288us  17.288us  17.288us  cuDeviceGetName
  0.00%  2.2970us         3     765ns     180ns  1.7200us  cuDeviceGetCount
  0.00%  2.2530us         1  2.2530us  2.2530us  2.2530us  cuLinkDestroy
  0.00%     874ns         3     291ns     184ns     484ns  cuDeviceGet
  0.00%     855ns         4     213ns     135ns     324ns  cuDeviceGetAttribute
  0.00%     678ns         1     678ns     678ns     678ns  cuModuleGetFunction
  0.00%     654ns         1     654ns     654ns     654ns  cuDeviceComputeCapability

## program.c
// [siddhart@greina0 compile-cpp-to-gpu]$ cat program.c
#include <stdio.h>
/*
 * Fill arr[*begin .. *end) with a single value per element:
 * *readarr when *control is greater than 3, otherwise 0.
 *
 * All inputs are taken by pointer and are dereferenced inside the loop,
 * so *end and *control are re-read on every iteration (this matters if
 * any of them alias arr).
 */
void f(int *begin, int *end, int *arr, int *control, int *readarr) {
    int idx = *begin;
    while (idx < *end) {
        /* *readarr is only read when the control test passes. */
        arr[idx] = (*control > 3) ? *readarr : 0;
        ++idx;
    }
}


/* Zero the first six elements of A. */
void inita(int *A) {
    int *cursor = A;
    int *const stop = A + 6;
    while (cursor != stop) {
        *cursor++ = 0;
    }
}

/*
 * Print the first six elements of A on one line, in the form
 * "A: v0 v1 v2 v3 v4 v5 " followed by a newline.
 */
void printarr(int *A) {
    int idx = 0;

    printf("A: ");
    while (idx < 6) {
        printf("%d ", A[idx]);
        ++idx;
    }
    printf("\n");
}

/*
 * Test driver.
 *
 * For control = 3 and control = 4, and for every pair 0 <= b < e < 6,
 * zero the 6-element array with inita(), run f() over the half-open
 * index range [b, e), and print the resulting array.
 *
 * b, e, control and readarr are handed to f() by address, so f() reads
 * their current values through the pointers.
 *
 * Returns 0.
 */
int main() {
    int A[6];
    int readarr = 10;

    /* The loop declares its own `control`; no outer declaration is needed. */
    for(int control = 3; control <= 4; control++) {
        printf("* Control: %d\n", control);
        for(int b = 0; b < 6; b++) {
            for(int e = b + 1; e < 6; e++) {
                printf("-----\n");
                printf("b: %d | e: %d\n", b, e);
                inita(A);
                f(&b, &e, A, &control, &readarr);
                printarr(A);
            }
        }
    }
    return 0;
}

## program.canonical.ll
[siddhart@greina0 compile-cpp-to-gpu]$ cat program.canonical.ll
; ModuleID = 'program.ll'
source_filename = "program.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

; String constants for the module.
;   @.str   - "A: " prefix printed by @printarr.
;   @.str.1 - "%d " per-element format used by @printarr.
;   @.str.3 - "* Control: %d\n" banner format used by @main.
;   @.str.5 - "b: %d | e: %d\n" format used by @main.
;   @str    - "-----" separator that @main passes to @puts.
; @.str.2 and @.str.4 are not referenced by any function in this module dump:
; @printarr calls @putchar(10) and @main calls @puts on @str instead.
@.str = private unnamed_addr constant [4 x i8] c"A: \00", align 1
@.str.1 = private unnamed_addr constant [4 x i8] c"%d \00", align 1
@.str.2 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1
@.str.3 = private unnamed_addr constant [15 x i8] c"* Control: %d\0A\00", align 1
@.str.4 = private unnamed_addr constant [7 x i8] c"-----\0A\00", align 1
@.str.5 = private unnamed_addr constant [15 x i8] c"b: %d | e: %d\0A\00", align 1
@str = private unnamed_addr constant [6 x i8] c"-----\00"

; @f: for each index i starting at *begin and while i < *end, store into
; arr[i] either *readarr (when *control > 3) or 0.
;   %begin, %end - pointers to the i32 loop bounds
;   %arr         - destination i32 array
;   %control     - pointer to the i32 tested against 3
;   %readarr     - pointer to the i32 value stored when the test passes
define void @f(i32* %begin, i32* %end, i32* %arr, i32* %control, i32* %readarr) {
entry:
  br label %entry.split

; Loop guard: load both bounds once and skip the loop when *begin >= *end
; (signed compare).
entry.split:                                      ; preds = %entry
  %tmp1 = load i32, i32* %begin, align 4
  %tmp41 = load i32, i32* %end, align 4
  %cmp2 = icmp slt i32 %tmp1, %tmp41
  br i1 %cmp2, label %for.body.lr.ph, label %for.end

; Loop preheader: widen the start index to i64 for the induction variable.
for.body.lr.ph:                                   ; preds = %entry.split
  %0 = sext i32 %tmp1 to i64
  br label %for.body

; Loop header: %indvars.iv is the 64-bit index. *control is loaded and
; compared against 3 on every iteration.
for.body:                                         ; preds = %for.body.lr.ph, %if.end
  %indvars.iv = phi i64 [ %0, %for.body.lr.ph ], [ %indvars.iv.next, %if.end ]
  %tmp6 = load i32, i32* %control, align 4
  %cmp1 = icmp sgt i32 %tmp6, 3
  br i1 %cmp1, label %if.then, label %if.end

; Only reached when *control > 3: this is the sole load of *readarr.
if.then:                                          ; preds = %for.body
  %tmp8 = load i32, i32* %readarr, align 4
  br label %if.end

; %t.0 is the value to store: the loaded *readarr, or 0 when if.then was
; skipped. After the store, the index is incremented and *end is loaded
; again for the back-edge test.
if.end:                                           ; preds = %if.then, %for.body
  %t.0 = phi i32 [ %tmp8, %if.then ], [ 0, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv
  store i32 %t.0, i32* %arrayidx, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %tmp4 = load i32, i32* %end, align 4
  %1 = sext i32 %tmp4 to i64
  %cmp = icmp slt i64 %indvars.iv.next, %1
  br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge

; Dedicated loop-exit block; it only branches to the common return.
for.cond.for.end_crit_edge:                       ; preds = %if.end
  br label %for.end

for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry.split
  ret void
}

; @inita: store 0 into A[0] .. A[5].
;   %A - pointer to the first i32 element to clear
define void @inita(i32* %A) {
entry:
  br label %entry.split

entry.split:                                      ; preds = %entry
  br label %for.body

; Single-block loop: %indvars.iv starts at 0, one element is zeroed per
; iteration, and the loop exits once the incremented index equals 6.
for.body:                                         ; preds = %entry.split, %for.body
  %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  store i32 0, i32* %arrayidx, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp ne i64 %indvars.iv.next, 6
  br i1 %exitcond, label %for.body, label %for.end

for.end:                                          ; preds = %for.body
  ret void
}

; @printarr: print "A: ", then A[0] .. A[5] each formatted with "%d ",
; then a newline character.
;   %A - pointer to the first i32 element to print
define void @printarr(i32* %A) {
entry:
  br label %entry.split

; Emit the "A: " prefix (@.str) before entering the element loop.
entry.split:                                      ; preds = %entry
  %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0))
  br label %for.body

; Single-block loop over indices 0..5: load A[i] and print it with the
; "%d " format (@.str.1); exits once the incremented index equals 6.
for.body:                                         ; preds = %entry.split, %for.body
  %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp3 = load i32, i32* %arrayidx, align 4
  %call1 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.1, i64 0, i64 0), i32 %tmp3)
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp ne i64 %indvars.iv.next, 6
  br i1 %exitcond, label %for.body, label %for.end

; The trailing newline is written as character code 10 via @putchar.
for.end:                                          ; preds = %for.body
  %putchar = tail call i32 @putchar(i32 10)
  ret void
}

declare i32 @printf(i8*, ...)

; @main: test driver. Three nested loops over %control1 (outer), %b (middle)
; and %e (inner); each inner iteration prints a separator and the current
; b/e pair, clears %A with @inita, runs @f, and prints %A with @printarr.
; Returns 0.
define i32 @main() {
entry:
  ; %readarr, %control1, %b and %e live in stack slots; their addresses are
  ; the pointer arguments passed to @f below.
  %A = alloca [6 x i32], align 16
  %readarr = alloca i32, align 4
  %control1 = alloca i32, align 4
  %b = alloca i32, align 4
  %e = alloca i32, align 4
  br label %entry.split

; Initial values: readarr = 10, control1 = 3.
entry.split:                                      ; preds = %entry
  store i32 10, i32* %readarr, align 4
  store i32 3, i32* %control1, align 4
  br label %for.body

; Outer loop body: print the "* Control: %d" banner and reset b to 0.
for.body:                                         ; preds = %entry.split, %for.inc15
  %tmp1 = load i32, i32* %control1, align 4
  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.3, i64 0, i64 0), i32 %tmp1)
  store i32 0, i32* %b, align 4
  br label %for.cond5.preheader

; Middle loop body: set e = b + 1 and enter the inner loop only when b < 5.
for.cond5.preheader:                              ; preds = %for.body, %for.inc12
  %storemerge2.in3 = load i32, i32* %b, align 4
  %storemerge24 = add nsw i32 %storemerge2.in3, 1
  store i32 %storemerge24, i32* %e, align 4
  %cmp65 = icmp slt i32 %storemerge2.in3, 5
  br i1 %cmp65, label %for.body7.lr.ph, label %for.inc12

for.body7.lr.ph:                                  ; preds = %for.cond5.preheader
  br label %for.body7

; Inner loop body: print the "-----" separator with @puts and the b/e pair
; with @printf, then call @inita, @f and @printarr on %A. Afterwards e is
; reloaded from memory, incremented and stored back; the loop repeats while
; the pre-increment value of e is < 5.
for.body7:                                        ; preds = %for.body7.lr.ph, %for.body7
  %puts = call i32 @puts(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @str, i64 0, i64 0))
  %tmp5 = load i32, i32* %b, align 4
  %tmp6 = load i32, i32* %e, align 4
  %call9 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 %tmp5, i32 %tmp6)
  %arraydecay = getelementptr inbounds [6 x i32], [6 x i32]* %A, i64 0, i64 0
  call void @inita(i32* %arraydecay)
  %arraydecay10 = getelementptr inbounds [6 x i32], [6 x i32]* %A, i64 0, i64 0
  call void @f(i32* nonnull %b, i32* nonnull %e, i32* %arraydecay10, i32* nonnull %control1, i32* nonnull %readarr)
  %arraydecay11 = getelementptr inbounds [6 x i32], [6 x i32]* %A, i64 0, i64 0
  call void @printarr(i32* %arraydecay11)
  %storemerge2.in = load i32, i32* %e, align 4
  %storemerge2 = add nsw i32 %storemerge2.in, 1
  store i32 %storemerge2, i32* %e, align 4
  %cmp6 = icmp slt i32 %storemerge2.in, 5
  br i1 %cmp6, label %for.body7, label %for.cond5.for.inc12_crit_edge

for.cond5.for.inc12_crit_edge:                    ; preds = %for.body7
  br label %for.inc12

; Middle loop latch: b is reloaded, incremented and stored back; the loop
; repeats while the pre-increment value of b is < 5.
for.inc12:                                        ; preds = %for.cond5.for.inc12_crit_edge, %for.cond5.preheader
  %tmp8 = load i32, i32* %b, align 4
  %inc13 = add nsw i32 %tmp8, 1
  store i32 %inc13, i32* %b, align 4
  %cmp3 = icmp slt i32 %tmp8, 5
  br i1 %cmp3, label %for.cond5.preheader, label %for.inc15

; Outer loop latch: control1 is incremented; the loop repeats while the
; pre-increment value is < 4, so the body runs for control1 = 3 and 4.
for.inc15:                                        ; preds = %for.inc12
  %tmp9 = load i32, i32* %control1, align 4
  %inc16 = add nsw i32 %tmp9, 1
  store i32 %inc16, i32* %control1, align 4
  %cmp = icmp slt i32 %tmp9, 4
  br i1 %cmp, label %for.body, label %for.end17

for.end17:                                        ; preds = %for.inc15
  ret i32 0
}

; Function Attrs: nounwind
declare i32 @putchar(i32) #0

; Function Attrs: nounwind
declare i32 @puts(i8* nocapture readonly) #0

attributes #0 = { nounwind }

## program.optimised.ll
16:27 $ cat program.optimised.ll
; ModuleID = 'program.ll'
source_filename = "program.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

@.str = private unnamed_addr constant [4 x i8] c"A: \00", align 1
@.str.1 = private unnamed_addr constant [4 x i8] c"%d \00", align 1
@.str.2 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1
@.str.3 = private unnamed_addr constant [15 x i8] c"* Control: %d\0A\00", align 1
@.str.4 = private unnamed_addr constant [7 x i8] c"-----\0A\00", align 1
@.str.5 = private unnamed_addr constant [15 x i8] c"b: %d | e: %d\0A\00", align 1
@str = private unnamed_addr constant [6 x i8] c"-----\00"
; Embedded device code. Each kernel is stored as a NUL-terminated PTX text
; blob plus a separate NUL-terminated entry-point name; both are handed to
; @polly_getKernel by the host code below.
;
; FUNC_f_SCOP_0_KERNEL_0 (used by @f): PTX 3.2 for sm_30, .maxntid 32,1,1,
; two .u64 parameters followed by seven .u32 parameters. The body loops,
; storing a selected value (param_5 if param_4 > 3, else 0) through a pointer
; derived from param_1 while an index derived from %ctaid.x, %tid.x and
; param_2 stays below param_3, advancing the index by 1048576 per trip with a
; bar.sync in every trip. It finally writes a 32-bit value back to the
; address in param_0.
@FUNC_f_SCOP_0_KERNEL_0 = private unnamed_addr constant [1802 x i8] c"//\0A// Generated by LLVM NVPTX Back-End\0A//\0A\0A.version 3.2\0A.target sm_30\0A.address_size 64\0A\0A\09// .globl\09FUNC_f_SCOP_0_KERNEL_0\0A\0A.visible .entry FUNC_f_SCOP_0_KERNEL_0(\0A\09.param .u64 FUNC_f_SCOP_0_KERNEL_0_param_0,\0A\09.param .u64 FUNC_f_SCOP_0_KERNEL_0_param_1,\0A\09.param .u32 FUNC_f_SCOP_0_KERNEL_0_param_2,\0A\09.param .u32 FUNC_f_SCOP_0_KERNEL_0_param_3,\0A\09.param .u32 FUNC_f_SCOP_0_KERNEL_0_param_4,\0A\09.param .u32 FUNC_f_SCOP_0_KERNEL_0_param_5,\0A\09.param .u32 FUNC_f_SCOP_0_KERNEL_0_param_6,\0A\09.param .u32 FUNC_f_SCOP_0_KERNEL_0_param_7,\0A\09.param .u32 FUNC_f_SCOP_0_KERNEL_0_param_8\0A)\0A.maxntid 32, 1, 1\0A{\0A\09.reg .pred \09%p<4>;\0A\09.reg .b32 \09%r<11>;\0A\09.reg .b64 \09%rd<25>;\0A\0A\09ld.param.u64 \09%rd11, [FUNC_f_SCOP_0_KERNEL_0_param_0];\0A\09ld.param.u64 \09%rd12, [FUNC_f_SCOP_0_KERNEL_0_param_1];\0A\09mov.u32 \09%r5, %ctaid.x;\0A\09mov.u32 \09%r6, %tid.x;\0A\09cvt.u64.u32 \09%rd13, %r6;\0A\09ld.param.u32 \09%r7, [FUNC_f_SCOP_0_KERNEL_0_param_4];\0A\09ld.param.u32 \09%r8, [FUNC_f_SCOP_0_KERNEL_0_param_5];\0A\09ld.param.s32 \09%rd14, [FUNC_f_SCOP_0_KERNEL_0_param_2];\0A\09ld.param.s32 \09%rd1, [FUNC_f_SCOP_0_KERNEL_0_param_3];\0A\09mul.wide.u32 \09%rd15, %r5, 32;\0A\09not.b64 \09%rd16, %rd14;\0A\09add.s64 \09%rd17, %rd16, %rd1;\0A\09sub.s64 \09%rd18, %rd17, %rd15;\0A\09shr.u64 \09%rd19, %rd18, 20;\0A\09ld.global.u32 \09%r10, [%rd11];\0A\09add.s64 \09%rd20, %rd15, %rd14;\0A\09add.s64 \09%rd22, %rd20, %rd13;\0A\09setp.gt.s32 \09%p1, %r7, 3;\0A\09selp.b32 \09%r2, %r8, 0, %p1;\0A\09add.s64 \09%rd24, %rd19, 1;\0A\09shl.b64 \09%rd21, %rd22, 2;\0A\09add.s64 \09%rd23, %rd12, %rd21;\0ALBB0_1:\0A\09setp.lt.s64 \09%p2, %rd22, %rd1;\0A\09@%p2 bra \09LBB0_4;\0A\09bra.uni \09LBB0_2;\0ALBB0_4:\0A\09st.global.u32 \09[%rd23], %r2;\0A\09mov.u32 \09%r10, %r2;\0ALBB0_2:\0A\09bar.sync \090;\0A\09add.s64 \09%rd24, %rd24, -1;\0A\09add.s64 \09%rd23, %rd23, 
4194304;\0A\09add.s64 \09%rd22, %rd22, 1048576;\0A\09setp.eq.s64 \09%p3, %rd24, 0;\0A\09@%p3 bra \09LBB0_3;\0A\09bra.uni \09LBB0_1;\0ALBB0_3:\0A\09st.global.u32 \09[%rd11], %r10;\0A\09ret;\0A}\0A\0A\0A\00"
@FUNC_f_SCOP_0_KERNEL_0_name = private unnamed_addr constant [23 x i8] c"FUNC_f_SCOP_0_KERNEL_0\00"
; FUNC_inita_SCOP_0_KERNEL_0 (used by @inita): PTX 3.2 for sm_30,
; .maxntid 6,1,1, one .u64 pointer parameter. Each thread stores a 32-bit 0
; at param_0 + 4 * %tid.x.
@FUNC_inita_SCOP_0_KERNEL_0 = private unnamed_addr constant [477 x i8] c"//\0A// Generated by LLVM NVPTX Back-End\0A//\0A\0A.version 3.2\0A.target sm_30\0A.address_size 64\0A\0A\09// .globl\09FUNC_inita_SCOP_0_KERNEL_0\0A\0A.visible .entry FUNC_inita_SCOP_0_KERNEL_0(\0A\09.param .u64 FUNC_inita_SCOP_0_KERNEL_0_param_0\0A)\0A.maxntid 6, 1, 1\0A{\0A\09.reg .b32 \09%r<3>;\0A\09.reg .b64 \09%rd<4>;\0A\0A\09ld.param.u64 \09%rd1, [FUNC_inita_SCOP_0_KERNEL_0_param_0];\0A\09mov.u32 \09%r1, %tid.x;\0A\09mul.wide.u32 \09%rd2, %r1, 4;\0A\09add.s64 \09%rd3, %rd1, %rd2;\0A\09mov.u32 \09%r2, 0;\0A\09st.global.u32 \09[%rd3], %r2;\0A\09ret;\0A}\0A\0A\0A\00"
@FUNC_inita_SCOP_0_KERNEL_0_name = private unnamed_addr constant [27 x i8] c"FUNC_inita_SCOP_0_KERNEL_0\00"

; f(begin, end, arr, control, readarr)
;
; Versioned function with two code paths that rejoin in
; %polly.merge_new_and_old:
;
;   * Host loop (entry.split .. for.end.region_exiting):
;       for (i = *begin; i < *end; ++i)
;         arr[i] = (*control > 3) ? *readarr : 0;
;     *control, *readarr and *end are re-loaded from memory on every trip.
;
;   * Accelerator path (polly.start .. polly.exiting): allocates device
;     buffers, launches @FUNC_f_SCOP_0_KERNEL_0 and copies the
;     arr[*begin .. *end) range back to the host.
;
; NOTE: the versioning branch that ends %polly.preload.merge is written as
; `br i1 false`, so control always goes to the host loop and the accelerator
; path is unreachable as written. %polly.rtc.result is still computed but has
; no user.
; NOTE(review): a hard-wired false condition here presumably means the code
; generator rejected its own accelerator build -- confirm against the Polly
; version that produced this file before re-enabling it.
define void @f(i32* %begin, i32* %end, i32* %arr, i32* %control, i32* %readarr) {
entry:
  ; Stack slots for the hoisted ("preloaded") loads. They are only ever
  ; stored to inside this function.
  %tmp8.preload.s2a = alloca i32
  %tmp6.preload.s2a = alloca i32
  %tmp4.preload.s2a = alloca i32
  %tmp1.preload.s2a = alloca i32
  ; Kernel launch argument block: 18 pointer slots. polly.then fills slots
  ; 0-8 with pointers to the nine parameter values and slots 9-17 with
  ; pointers to the nine i32 size values.
  %polly_launch_0_params = alloca [18 x i8*]
  %polly_launch_0_param_0 = alloca i8*
  %polly_launch_0_param_1 = alloca i8*
  %polly_launch_0_param_2 = alloca i32
  %polly_launch_0_param_3 = alloca i32
  %polly_launch_0_param_4 = alloca i32
  %polly_launch_0_param_5 = alloca i32
  %polly_launch_0_param_6 = alloca i32
  %polly_launch_0_param_7 = alloca i32
  %polly_launch_0_param_8 = alloca i32
  %polly_launch_0_param_size_0 = alloca i32
  %polly_launch_0_param_size_1 = alloca i32
  %polly_launch_0_param_size_2 = alloca i32
  %polly_launch_0_param_size_3 = alloca i32
  %polly_launch_0_param_size_4 = alloca i32
  %polly_launch_0_param_size_5 = alloca i32
  %polly_launch_0_param_size_6 = alloca i32
  %polly_launch_0_param_size_7 = alloca i32
  %polly_launch_0_param_size_8 = alloca i32
  %polly_launch_0_params_i8ptr = bitcast [18 x i8*]* %polly_launch_0_params to i8*
  br label %polly.split_new_and_old

; Preload *begin and *end, then decide whether *control may be preloaded:
; only when *end >= *begin + 1 and that addition did not overflow.
polly.split_new_and_old:                          ; preds = %entry
  %polly.access.begin = getelementptr i32, i32* %begin, i64 0
  %polly.access.begin.load = load i32, i32* %polly.access.begin, align 4, !alias.scope !0, !noalias !2
  store i32 %polly.access.begin.load, i32* %tmp1.preload.s2a
  %polly.access.end = getelementptr i32, i32* %end, i64 0
  %polly.access.end.load = load i32, i32* %polly.access.end, align 4, !alias.scope !3, !noalias !8
  store i32 %polly.access.end.load, i32* %tmp4.preload.s2a
  %0 = sext i32 %polly.access.begin.load to i64
  %1 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %0, i64 1)
  %.obit = extractvalue { i64, i1 } %1, 1
  %polly.overflow.state = or i1 false, %.obit
  %.res = extractvalue { i64, i1 } %1, 0
  %2 = sext i32 %polly.access.end.load to i64
  %3 = icmp sge i64 %2, %.res
  %polly.preload.cond.overflown = xor i1 %polly.overflow.state, true
  %polly.preload.cond.result = and i1 %3, %polly.preload.cond.overflown
  br label %polly.preload.cond

polly.preload.cond:                               ; preds = %polly.split_new_and_old
  br i1 %polly.preload.cond.result, label %polly.preload.exec, label %polly.preload.merge

; Run-time check construction. %polly.preload.tmp6.merge is the preloaded
; *control (0 when the preload was skipped).
polly.preload.merge:                              ; preds = %polly.preload.exec, %polly.preload.cond
  %polly.preload.tmp6.merge = phi i32 [ %polly.access.control.load, %polly.preload.exec ], [ 0, %polly.preload.cond ]
  store i32 %polly.preload.tmp6.merge, i32* %tmp6.preload.s2a
  ; Alias checks: for each of begin, control, end and readarr, require that
  ; the one-element range [p, p + 1) does not overlap
  ; [arr + *begin, arr + *end). The four results are AND-ed into %35.
  %polly.access.begin4 = getelementptr i32, i32* %begin, i64 1
  %polly.access.arr = getelementptr i32, i32* %arr, i32 %polly.access.begin.load
  %4 = ptrtoint i32* %polly.access.begin4 to i64
  %5 = ptrtoint i32* %polly.access.arr to i64
  %6 = icmp ule i64 %4, %5
  %polly.access.arr5 = getelementptr i32, i32* %arr, i32 %polly.access.end.load
  %polly.access.begin6 = getelementptr i32, i32* %begin, i64 0
  %7 = ptrtoint i32* %polly.access.arr5 to i64
  %8 = ptrtoint i32* %polly.access.begin6 to i64
  %9 = icmp ule i64 %7, %8
  %10 = or i1 %6, %9
  %11 = and i1 true, %10
  %polly.access.control7 = getelementptr i32, i32* %control, i64 1
  %polly.access.arr8 = getelementptr i32, i32* %arr, i32 %polly.access.begin.load
  %12 = ptrtoint i32* %polly.access.control7 to i64
  %13 = ptrtoint i32* %polly.access.arr8 to i64
  %14 = icmp ule i64 %12, %13
  %polly.access.arr9 = getelementptr i32, i32* %arr, i32 %polly.access.end.load
  %polly.access.control10 = getelementptr i32, i32* %control, i64 0
  %15 = ptrtoint i32* %polly.access.arr9 to i64
  %16 = ptrtoint i32* %polly.access.control10 to i64
  %17 = icmp ule i64 %15, %16
  %18 = or i1 %14, %17
  %19 = and i1 %11, %18
  %polly.access.end11 = getelementptr i32, i32* %end, i64 1
  %polly.access.arr12 = getelementptr i32, i32* %arr, i32 %polly.access.begin.load
  %20 = ptrtoint i32* %polly.access.end11 to i64
  %21 = ptrtoint i32* %polly.access.arr12 to i64
  %22 = icmp ule i64 %20, %21
  %polly.access.arr13 = getelementptr i32, i32* %arr, i32 %polly.access.end.load
  %polly.access.end14 = getelementptr i32, i32* %end, i64 0
  %23 = ptrtoint i32* %polly.access.arr13 to i64
  %24 = ptrtoint i32* %polly.access.end14 to i64
  %25 = icmp ule i64 %23, %24
  %26 = or i1 %22, %25
  %27 = and i1 %19, %26
  %polly.access.readarr = getelementptr i32, i32* %readarr, i64 1
  %polly.access.arr15 = getelementptr i32, i32* %arr, i32 %polly.access.begin.load
  %28 = ptrtoint i32* %polly.access.readarr to i64
  %29 = ptrtoint i32* %polly.access.arr15 to i64
  %30 = icmp ule i64 %28, %29
  %polly.access.arr16 = getelementptr i32, i32* %arr, i32 %polly.access.end.load
  %polly.access.readarr17 = getelementptr i32, i32* %readarr, i64 0
  %31 = ptrtoint i32* %polly.access.arr16 to i64
  %32 = ptrtoint i32* %polly.access.readarr17 to i64
  %33 = icmp ule i64 %31, %32
  %34 = or i1 %30, %33
  %35 = and i1 %27, %34
  ; Overflow-tracked arithmetic: with d = *end - *begin, evaluate
  ; 4*d + 2*d + 8*d (each d is recomputed) and require the sum to be >= 0.
  ; Every overflow bit is OR-ed into the running polly.overflow.state chain.
  %36 = sext i32 %polly.access.begin.load to i64
  %37 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 0, i64 %36)
  %.obit18 = extractvalue { i64, i1 } %37, 1
  %polly.overflow.state19 = or i1 false, %.obit18
  %.res20 = extractvalue { i64, i1 } %37, 0
  %38 = sext i32 %polly.access.end.load to i64
  %39 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %.res20, i64 %38)
  %.obit21 = extractvalue { i64, i1 } %39, 1
  %polly.overflow.state22 = or i1 %polly.overflow.state19, %.obit21
  %.res23 = extractvalue { i64, i1 } %39, 0
  %40 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %.res23)
  %.obit24 = extractvalue { i64, i1 } %40, 1
  %polly.overflow.state25 = or i1 %polly.overflow.state22, %.obit24
  %.res26 = extractvalue { i64, i1 } %40, 0
  %41 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 4, i64 %.res26)
  %.obit27 = extractvalue { i64, i1 } %41, 1
  %polly.overflow.state28 = or i1 %polly.overflow.state25, %.obit27
  %.res29 = extractvalue { i64, i1 } %41, 0
  %42 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 0, i64 %.res29)
  %.obit30 = extractvalue { i64, i1 } %42, 1
  %polly.overflow.state31 = or i1 %polly.overflow.state28, %.obit30
  %.res32 = extractvalue { i64, i1 } %42, 0
  %43 = sext i32 %polly.access.begin.load to i64
  %44 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 0, i64 %43)
  %.obit33 = extractvalue { i64, i1 } %44, 1
  %polly.overflow.state34 = or i1 %polly.overflow.state31, %.obit33
  %.res35 = extractvalue { i64, i1 } %44, 0
  %45 = sext i32 %polly.access.end.load to i64
  %46 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %.res35, i64 %45)
  %.obit36 = extractvalue { i64, i1 } %46, 1
  %polly.overflow.state37 = or i1 %polly.overflow.state34, %.obit36
  %.res38 = extractvalue { i64, i1 } %46, 0
  %47 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %.res38)
  %.obit39 = extractvalue { i64, i1 } %47, 1
  %polly.overflow.state40 = or i1 %polly.overflow.state37, %.obit39
  %.res41 = extractvalue { i64, i1 } %47, 0
  %48 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 2, i64 %.res41)
  %.obit42 = extractvalue { i64, i1 } %48, 1
  %polly.overflow.state43 = or i1 %polly.overflow.state40, %.obit42
  %.res44 = extractvalue { i64, i1 } %48, 0
  %49 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %.res32, i64 %.res44)
  %.obit45 = extractvalue { i64, i1 } %49, 1
  %polly.overflow.state46 = or i1 %polly.overflow.state43, %.obit45
  %.res47 = extractvalue { i64, i1 } %49, 0
  %50 = sext i32 %polly.access.begin.load to i64
  %51 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 0, i64 %50)
  %.obit48 = extractvalue { i64, i1 } %51, 1
  %polly.overflow.state49 = or i1 %polly.overflow.state46, %.obit48
  %.res50 = extractvalue { i64, i1 } %51, 0
  %52 = sext i32 %polly.access.end.load to i64
  %53 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %.res50, i64 %52)
  %.obit51 = extractvalue { i64, i1 } %53, 1
  %polly.overflow.state52 = or i1 %polly.overflow.state49, %.obit51
  %.res53 = extractvalue { i64, i1 } %53, 0
  %54 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %.res53)
  %.obit54 = extractvalue { i64, i1 } %54, 1
  %polly.overflow.state55 = or i1 %polly.overflow.state52, %.obit54
  %.res56 = extractvalue { i64, i1 } %54, 0
  %55 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 8, i64 %.res56)
  %.obit57 = extractvalue { i64, i1 } %55, 1
  %polly.overflow.state58 = or i1 %polly.overflow.state55, %.obit57
  %.res59 = extractvalue { i64, i1 } %55, 0
  %56 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %.res47, i64 %.res59)
  %.obit60 = extractvalue { i64, i1 } %56, 1
  %polly.overflow.state61 = or i1 %polly.overflow.state58, %.obit60
  %.res62 = extractvalue { i64, i1 } %56, 0
  %57 = icmp sge i64 %.res62, 0
  %58 = and i1 %35, %57
  %polly.rtc.overflown = xor i1 %polly.overflow.state61, true
  %polly.rtc.result = and i1 %58, %polly.rtc.overflown
  ; The condition is the constant false, not %polly.rtc.result: the host
  ; loop below is always selected and %polly.start is never entered.
  br i1 false, label %polly.start, label %entry.split.pre_entry_bb

entry.split.pre_entry_bb:                         ; preds = %polly.preload.merge
  br label %entry.split

; ---- Host version of the loop ----
entry.split:                                      ; preds = %entry.split.pre_entry_bb
  %tmp1 = load i32, i32* %begin, align 4
  %tmp41 = load i32, i32* %end, align 4
  ; Skip the loop entirely when *begin >= *end.
  %cmp2 = icmp slt i32 %tmp1, %tmp41
  br i1 %cmp2, label %for.body.lr.ph, label %for.end.region_exiting

for.body.lr.ph:                                   ; preds = %entry.split
  %59 = sext i32 %tmp1 to i64
  br label %for.body

for.body:                                         ; preds = %for.body.lr.ph, %if.end
  %indvars.iv = phi i64 [ %59, %for.body.lr.ph ], [ %indvars.iv.next, %if.end ]
  %tmp6 = load i32, i32* %control, align 4
  %cmp1 = icmp sgt i32 %tmp6, 3
  br i1 %cmp1, label %if.then, label %if.end

; *readarr is only read when *control > 3.
if.then:                                          ; preds = %for.body
  %tmp8 = load i32, i32* %readarr, align 4
  br label %if.end

if.end:                                           ; preds = %if.then, %for.body
  %t.0 = phi i32 [ %tmp8, %if.then ], [ 0, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv
  store i32 %t.0, i32* %arrayidx, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
  ; The upper bound is re-read from *end after every store.
  %tmp4 = load i32, i32* %end, align 4
  %60 = sext i32 %tmp4 to i64
  %cmp = icmp slt i64 %indvars.iv.next, %60
  br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge

for.cond.for.end_crit_edge:                       ; preds = %if.end
  br label %for.end.region_exiting

for.end.region_exiting:                           ; preds = %entry.split, %for.cond.for.end_crit_edge
  br label %polly.merge_new_and_old

; Join point of the host loop and the accelerator path.
polly.merge_new_and_old:                          ; preds = %polly.exiting, %for.end.region_exiting
  br label %for.end

for.end:                                          ; preds = %polly.merge_new_and_old
  ret void

; ---- Accelerator version (unreachable while the branch above is false) ----
polly.start:                                      ; preds = %polly.preload.merge
  br label %polly.acc.initialize

; Create the context and six device buffers: four of 0 bytes, one of 4 bytes
; (MemRef3__phi) and MemRef5 of 4 * (*end) - 4 * (*begin) bytes. No
; host-to-device copy is issued anywhere in this function.
polly.acc.initialize:                             ; preds = %polly.start
  %61 = call i8* @polly_initContextCUDA()
  %p_dev_array_MemRef0 = call i8* @polly_allocateMemoryForDevice(i64 0)
  %p_dev_array_MemRef1 = call i8* @polly_allocateMemoryForDevice(i64 0)
  %p_dev_array_MemRef2 = call i8* @polly_allocateMemoryForDevice(i64 0)
  %p_dev_array_MemRef3__phi = call i8* @polly_allocateMemoryForDevice(i64 4)
  %p_dev_array_MemRef4 = call i8* @polly_allocateMemoryForDevice(i64 0)
  %62 = sext i32 %polly.access.end.load to i64
  %63 = mul i64 4, %62
  %64 = sext i32 %polly.access.begin.load to i64
  %65 = add nsw i64 0, %64
  %66 = mul i64 %65, 4
  %67 = sub i64 %63, %66
  %p_dev_array_MemRef5 = call i8* @polly_allocateMemoryForDevice(i64 %67)
  br label %polly.preload.begin

; Decide whether *readarr may be preloaded: requires *end >= *begin + 1
; (no overflow) and preloaded *control >= 4.
polly.preload.begin:                              ; preds = %polly.acc.initialize
  %68 = sext i32 %polly.access.begin.load to i64
  %69 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %68, i64 1)
  %.obit63 = extractvalue { i64, i1 } %69, 1
  %polly.overflow.state64 = or i1 false, %.obit63
  %.res65 = extractvalue { i64, i1 } %69, 0
  %70 = sext i32 %polly.access.end.load to i64
  %71 = icmp sge i64 %70, %.res65
  %72 = sext i32 %polly.preload.tmp6.merge to i64
  %73 = icmp sge i64 %72, 4
  %74 = and i1 %71, %73
  %polly.preload.cond.overflown66 = xor i1 %polly.overflow.state64, true
  %polly.preload.cond.result67 = and i1 %74, %polly.preload.cond.overflown66
  br label %polly.preload.cond68

polly.preload.cond68:                             ; preds = %polly.preload.begin
  br i1 %polly.preload.cond.result67, label %polly.preload.exec70, label %polly.preload.merge69

; %polly.preload.tmp8.merge is the preloaded *readarr (0 when skipped).
polly.preload.merge69:                            ; preds = %polly.preload.exec70, %polly.preload.cond68
  %polly.preload.tmp8.merge = phi i32 [ %polly.access.readarr71.load, %polly.preload.exec70 ], [ 0, %polly.preload.cond68 ]
  store i32 %polly.preload.tmp8.merge, i32* %tmp8.preload.s2a
  br label %polly.cond

; Launch the kernel only when the iteration range is non-empty
; (*end >= *begin + 1).
polly.cond:                                       ; preds = %polly.preload.merge69
  %75 = sext i32 %polly.access.begin.load to i64
  %76 = add nsw i64 %75, 1
  %77 = sext i32 %polly.access.end.load to i64
  %78 = icmp sge i64 %77, %76
  br i1 %78, label %polly.then, label %polly.else

; Release all six device buffers and the context.
polly.merge:                                      ; preds = %polly.else, %polly.merge73
  call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef0)
  call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef1)
  call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef2)
  call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef3__phi)
  call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef4)
  call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef5)
  call void @polly_freeContext(i8* %61)
  br label %polly.exiting

polly.exiting:                                    ; preds = %polly.merge
  br label %polly.merge_new_and_old

; Guarded preload of *control (feeds %polly.preload.tmp6.merge).
polly.preload.exec:                               ; preds = %polly.preload.cond
  %polly.access.control = getelementptr i32, i32* %control, i64 0
  %polly.access.control.load = load i32, i32* %polly.access.control, align 4, !alias.scope !4, !noalias !9
  br label %polly.preload.merge

; Guarded preload of *readarr (feeds %polly.preload.tmp8.merge).
polly.preload.exec70:                             ; preds = %polly.preload.cond68
  %polly.access.readarr71 = getelementptr i32, i32* %readarr, i64 0
  %polly.access.readarr71.load = load i32, i32* %polly.access.readarr71, align 4, !alias.scope !6, !noalias !10
  br label %polly.preload.merge69

; Fill the launch argument block and launch the kernel.
polly.then:                                       ; preds = %polly.cond
  ; Parameter 0: device pointer of MemRef3__phi.
  %79 = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef3__phi)
  %80 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 0
  store i8* %79, i8** %polly_launch_0_param_0
  %81 = bitcast i8** %polly_launch_0_param_0 to i8*
  store i8* %81, i8** %80
  ; Parameter 1: device pointer of MemRef5 moved back by *begin i32 elements,
  ; so that indexing it with i in [*begin, *end) lands inside the buffer.
  %82 = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef5)
  %83 = sext i32 %polly.access.begin.load to i64
  %84 = add nsw i64 0, %83
  %85 = bitcast i8* %82 to i32*
  %86 = sub i64 0, %84
  %87 = getelementptr i32, i32* %85, i64 %86
  %88 = bitcast i32* %87 to i8*
  %89 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 1
  store i8* %88, i8** %polly_launch_0_param_1
  %90 = bitcast i8** %polly_launch_0_param_1 to i8*
  store i8* %90, i8** %89
  ; Parameters 2-8, in order: *begin, *end, *control, *readarr, *begin,
  ; *control, *end (all taken from the preloaded values).
  store i32 %polly.access.begin.load, i32* %polly_launch_0_param_2
  %91 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 2
  %92 = bitcast i32* %polly_launch_0_param_2 to i8*
  store i8* %92, i8** %91
  store i32 %polly.access.end.load, i32* %polly_launch_0_param_3
  %93 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 3
  %94 = bitcast i32* %polly_launch_0_param_3 to i8*
  store i8* %94, i8** %93
  store i32 %polly.preload.tmp6.merge, i32* %polly_launch_0_param_4
  %95 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 4
  %96 = bitcast i32* %polly_launch_0_param_4 to i8*
  store i8* %96, i8** %95
  store i32 %polly.preload.tmp8.merge, i32* %polly_launch_0_param_5
  %97 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 5
  %98 = bitcast i32* %polly_launch_0_param_5 to i8*
  store i8* %98, i8** %97
  store i32 %polly.access.begin.load, i32* %polly_launch_0_param_6
  %99 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 6
  %100 = bitcast i32* %polly_launch_0_param_6 to i8*
  store i8* %100, i8** %99
  store i32 %polly.preload.tmp6.merge, i32* %polly_launch_0_param_7
  %101 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 7
  %102 = bitcast i32* %polly_launch_0_param_7 to i8*
  store i8* %102, i8** %101
  store i32 %polly.access.end.load, i32* %polly_launch_0_param_8
  %103 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 8
  %104 = bitcast i32* %polly_launch_0_param_8 to i8*
  store i8* %104, i8** %103
  ; Slots 9-17: one i32 size per parameter. Every size is stored as 4,
  ; including the two pointer parameters.
  ; NOTE(review): whether the runtime reads these sizes is not visible here.
  store i32 4, i32* %polly_launch_0_param_size_0
  %105 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 9
  %106 = bitcast i32* %polly_launch_0_param_size_0 to i8*
  store i8* %106, i8** %105
  store i32 4, i32* %polly_launch_0_param_size_1
  %107 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 10
  %108 = bitcast i32* %polly_launch_0_param_size_1 to i8*
  store i8* %108, i8** %107
  store i32 4, i32* %polly_launch_0_param_size_2
  %109 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 11
  %110 = bitcast i32* %polly_launch_0_param_size_2 to i8*
  store i8* %110, i8** %109
  store i32 4, i32* %polly_launch_0_param_size_3
  %111 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 12
  %112 = bitcast i32* %polly_launch_0_param_size_3 to i8*
  store i8* %112, i8** %111
  store i32 4, i32* %polly_launch_0_param_size_4
  %113 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 13
  %114 = bitcast i32* %polly_launch_0_param_size_4 to i8*
  store i8* %114, i8** %113
  store i32 4, i32* %polly_launch_0_param_size_5
  %115 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 14
  %116 = bitcast i32* %polly_launch_0_param_size_5 to i8*
  store i8* %116, i8** %115
  store i32 4, i32* %polly_launch_0_param_size_6
  %117 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 15
  %118 = bitcast i32* %polly_launch_0_param_size_6 to i8*
  store i8* %118, i8** %117
  store i32 4, i32* %polly_launch_0_param_size_7
  %119 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 16
  %120 = bitcast i32* %polly_launch_0_param_size_7 to i8*
  store i8* %120, i8** %119
  store i32 4, i32* %polly_launch_0_param_size_8
  %121 = getelementptr [18 x i8*], [18 x i8*]* %polly_launch_0_params, i64 0, i64 17
  %122 = bitcast i32* %polly_launch_0_param_size_8 to i8*
  store i8* %122, i8** %121
  %123 = call i8* @polly_getKernel(i8* getelementptr inbounds ([1802 x i8], [1802 x i8]* @FUNC_f_SCOP_0_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([23 x i8], [23 x i8]* @FUNC_f_SCOP_0_KERNEL_0_name, i32 0, i32 0))
  ; First launch dimension: 32768 when *end >= *begin + 1048545, otherwise
  ; -(*begin) + floor((31 * (*begin) + *end + 31) / 32), truncated to i32.
  %124 = sext i32 %polly.access.begin.load to i64
  %125 = add nsw i64 %124, 1048545
  %126 = sext i32 %polly.access.end.load to i64
  %127 = icmp sge i64 %126, %125
  %128 = sext i32 %polly.access.begin.load to i64
  %129 = sub nsw i64 0, %128
  %130 = sext i32 %polly.access.begin.load to i64
  %131 = mul nsw i64 31, %130
  %132 = sext i32 %polly.access.end.load to i64
  %133 = add nsw i64 %131, %132
  %134 = add nsw i64 %133, 31
  %polly.fdiv_q.shr = ashr i64 %134, 5
  %135 = add nsw i64 %129, %polly.fdiv_q.shr
  %136 = select i1 %127, i64 32768, i64 %135
  %137 = trunc i64 %136 to i32
  ; NOTE(review): the five i32 arguments are presumably grid x/y followed by
  ; block x/y/z (the 32 matches the kernel's .maxntid) -- confirm against
  ; the runtime's polly_launchKernel definition.
  call void @polly_launchKernel(i8* %123, i32 %137, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
  call void @polly_freeKernel(i8* %123)
  br label %polly.cond72

; Copy results back only when *end >= 1.
polly.cond72:                                     ; preds = %polly.then
  %138 = sext i32 %polly.access.end.load to i64
  %139 = icmp sge i64 %138, 1
  br i1 %139, label %polly.then74, label %polly.else75

polly.merge73:                                    ; preds = %polly.else75, %polly.then74
  br label %polly.merge

polly.else:                                       ; preds = %polly.cond
  br label %polly.merge

; Copy 4 * (*end) - 4 * (*begin) bytes from MemRef5 to &arr[*begin].
polly.then74:                                     ; preds = %polly.cond72
  %140 = sext i32 %polly.access.end.load to i64
  %141 = mul i64 4, %140
  %142 = sext i32 %polly.access.begin.load to i64
  %143 = add nsw i64 0, %142
  %144 = getelementptr i32, i32* %arr, i64 %143
  %145 = bitcast i32* %144 to i8*
  %146 = mul i64 %143, 4
  %147 = sub i64 %141, %146
  call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef5, i8* %145, i64 %147)
  br label %polly.merge73

polly.else75:                                     ; preds = %polly.cond72
  br label %polly.merge73
}

; inita(A): store 0 into A[0] .. A[5].
;
; Versioned function: %polly.split_new_and_old evaluates a run-time check
; and selects either
;   * the accelerator path (polly.start .. polly.exiting), which launches
;     @FUNC_inita_SCOP_0_KERNEL_0 on a 24-byte device buffer and copies the
;     24 bytes back into A, or
;   * the host loop (%for.body), which performs the six stores directly.
; Unlike @f, the branch here really uses %polly.rtc.result.
define void @inita(i32* %A) {
entry:
  ; Launch argument block: slot 0 -> pointer to the device pointer,
  ; slot 1 -> pointer to its i32 size value.
  %polly_launch_0_params = alloca [2 x i8*]
  %polly_launch_0_param_0 = alloca i8*
  %polly_launch_0_param_size_0 = alloca i32
  %polly_launch_0_params_i8ptr = bitcast [2 x i8*]* %polly_launch_0_params to i8*
  br label %entry.split

entry.split:                                      ; preds = %entry
  br label %polly.split_new_and_old

; Run-time check on constants only: 0 + 6 * (1 * 6) must be >= 0 and none of
; the overflow-checked operations may report overflow.
polly.split_new_and_old:                          ; preds = %entry.split
  %0 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 6)
  %.obit = extractvalue { i64, i1 } %0, 1
  %polly.overflow.state = or i1 false, %.obit
  %.res = extractvalue { i64, i1 } %0, 0
  %1 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 6, i64 %.res)
  %.obit2 = extractvalue { i64, i1 } %1, 1
  %polly.overflow.state3 = or i1 %polly.overflow.state, %.obit2
  %.res4 = extractvalue { i64, i1 } %1, 0
  %2 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 0, i64 %.res4)
  %.obit5 = extractvalue { i64, i1 } %2, 1
  %polly.overflow.state6 = or i1 %polly.overflow.state3, %.obit5
  %.res7 = extractvalue { i64, i1 } %2, 0
  %3 = icmp sge i64 %.res7, 0
  %4 = and i1 true, %3
  %polly.rtc.overflown = xor i1 %polly.overflow.state6, true
  %polly.rtc.result = and i1 %4, %polly.rtc.overflown
  br i1 %polly.rtc.result, label %polly.start, label %for.body.pre_entry_bb

for.body.pre_entry_bb:                            ; preds = %polly.split_new_and_old
  br label %for.body

; ---- Host version: six iterations, A[i] = 0 ----
for.body:                                         ; preds = %for.body.pre_entry_bb, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.pre_entry_bb ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  store i32 0, i32* %arrayidx, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp ne i64 %indvars.iv.next, 6
  br i1 %exitcond, label %for.body, label %polly.merge_new_and_old

; Join point of the host loop and the accelerator path.
polly.merge_new_and_old:                          ; preds = %polly.exiting, %for.body
  br label %for.end

for.end:                                          ; preds = %polly.merge_new_and_old
  ret void

; ---- Accelerator version ----
polly.start:                                      ; preds = %polly.split_new_and_old
  br label %polly.acc.initialize

; Sequence: create context, allocate 24 bytes on the device, build the
; argument block, fetch and launch the kernel, release the kernel, copy the
; 24 bytes back into A, then free the buffer and the context.
polly.acc.initialize:                             ; preds = %polly.start
  %5 = call i8* @polly_initContextCUDA()
  %p_dev_array_MemRef0 = call i8* @polly_allocateMemoryForDevice(i64 24)
  %6 = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef0)
  %7 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
  store i8* %6, i8** %polly_launch_0_param_0
  %8 = bitcast i8** %polly_launch_0_param_0 to i8*
  store i8* %8, i8** %7
  ; The recorded size of the single (pointer) parameter is 4.
  store i32 4, i32* %polly_launch_0_param_size_0
  %9 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
  %10 = bitcast i32* %polly_launch_0_param_size_0 to i8*
  store i8* %10, i8** %9
  %11 = call i8* @polly_getKernel(i8* getelementptr inbounds ([477 x i8], [477 x i8]* @FUNC_inita_SCOP_0_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([27 x i8], [27 x i8]* @FUNC_inita_SCOP_0_KERNEL_0_name, i32 0, i32 0))
  ; NOTE(review): arguments are presumably grid 1x1 and block 6x1x1 (the 6
  ; matches the kernel's .maxntid) -- confirm against the runtime.
  call void @polly_launchKernel(i8* %11, i32 1, i32 1, i32 6, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
  call void @polly_freeKernel(i8* %11)
  %12 = bitcast i32* %A to i8*
  call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef0, i8* %12, i64 24)
  call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef0)
  call void @polly_freeContext(i8* %5)
  br label %polly.exiting

polly.exiting:                                    ; preds = %polly.acc.initialize
  br label %polly.merge_new_and_old
}

; printarr(A): print the string @.str once, then each of A[0] .. A[5] with
; the format string @.str.1, and finish with putchar(10) (a newline).
define void @printarr(i32* %A) {
entry:
  %banner = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0))
  br label %print.elem

; One trip per element; %i counts 0 .. 5.
print.elem:                                       ; preds = %print.elem, %entry
  %i = phi i64 [ 0, %entry ], [ %i.next, %print.elem ]
  %slot = getelementptr inbounds i32, i32* %A, i64 %i
  %value = load i32, i32* %slot, align 4
  %shown = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.1, i64 0, i64 0), i32 %value)
  %i.next = add nuw nsw i64 %i, 1
  ; Leave the loop once six elements have been printed.
  %at.end = icmp eq i64 %i.next, 6
  br i1 %at.end, label %finish, label %print.elem

finish:                                           ; preds = %print.elem
  %newline = tail call i32 @putchar(i32 10)
  ret void
}

; Variadic C printf; called from @printarr and @main.
declare i32 @printf(i8*, ...)

; main: test driver.
;
; Stores 10 to readarr and 3 to control1, then runs three nested loops whose
; counters live in stack slots because they are passed to @f by pointer:
;   control1 : 3, 4          outer loop; the value is printed with @.str.3
;   b        : 0 .. 5        middle loop; e is reset to b + 1 on every trip
;   e        : b + 1 .. 5    inner loop; skipped entirely when b >= 5
; Every inner trip prints @str and the (b, e) pair with @.str.5, zeroes A
; through @inita, calls @f(&b, &e, A, &control1, &readarr) and prints A
; through @printarr. Always returns 0.
define i32 @main() {
entry:
  %A = alloca [6 x i32], align 16
  %readarr = alloca i32, align 4
  %control1 = alloca i32, align 4
  %b = alloca i32, align 4
  %e = alloca i32, align 4
  ; Pointer to A[0], shared by the three callees below.
  %A.base = getelementptr inbounds [6 x i32], [6 x i32]* %A, i64 0, i64 0
  store i32 10, i32* %readarr, align 4
  store i32 3, i32* %control1, align 4
  br label %control.head

; Outer loop: one trip per value of control1.
control.head:                                     ; preds = %control.latch, %entry
  %control.val = load i32, i32* %control1, align 4
  %printed.control = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.3, i64 0, i64 0), i32 %control.val)
  store i32 0, i32* %b, align 4
  br label %b.head

; Middle loop header: e = b + 1; enter the inner loop only while b < 5.
b.head:                                           ; preds = %b.latch, %control.head
  %b.val = load i32, i32* %b, align 4
  %e.first = add nsw i32 %b.val, 1
  store i32 %e.first, i32* %e, align 4
  %b.in.range = icmp slt i32 %b.val, 5
  br i1 %b.in.range, label %e.body, label %b.latch

; Inner loop body: b and e are re-read from memory before and after the
; calls.
e.body:                                           ; preds = %e.body, %b.head
  %printed.tag = call i32 @puts(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @str, i64 0, i64 0))
  %b.arg = load i32, i32* %b, align 4
  %e.arg = load i32, i32* %e, align 4
  %printed.pair = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 %b.arg, i32 %e.arg)
  call void @inita(i32* %A.base)
  call void @f(i32* nonnull %b, i32* nonnull %e, i32* %A.base, i32* nonnull %control1, i32* nonnull %readarr)
  call void @printarr(i32* %A.base)
  ; ++e; repeat while the pre-increment value was below 5.
  %e.prev = load i32, i32* %e, align 4
  %e.bumped = add nsw i32 %e.prev, 1
  store i32 %e.bumped, i32* %e, align 4
  %e.again = icmp slt i32 %e.prev, 5
  br i1 %e.again, label %e.body, label %b.latch

; ++b; repeat while the pre-increment value was below 5.
b.latch:                                          ; preds = %e.body, %b.head
  %b.prev = load i32, i32* %b, align 4
  %b.bumped = add nsw i32 %b.prev, 1
  store i32 %b.bumped, i32* %b, align 4
  %b.again = icmp slt i32 %b.prev, 5
  br i1 %b.again, label %b.head, label %control.latch

; ++control1; repeat while the pre-increment value was below 4.
control.latch:                                    ; preds = %b.latch
  %control.prev = load i32, i32* %control1, align 4
  %control.bumped = add nsw i32 %control.prev, 1
  store i32 %control.bumped, i32* %control1, align 4
  %control.again = icmp slt i32 %control.prev, 4
  br i1 %control.again, label %control.head, label %exit

exit:                                             ; preds = %control.latch
  ret i32 0
}

; ---- C library output routines (putchar is used by @printarr, puts by @main) ----

; Function Attrs: nounwind
declare i32 @putchar(i32) #0

; Function Attrs: nounwind
declare i32 @puts(i8* nocapture readonly) #0

; ---- Overflow-checked i64 arithmetic intrinsics ----
; Each returns { result, overflow bit }; the run-time checks in @f and @inita
; OR the overflow bits together.

; Function Attrs: nounwind readnone speculatable
declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) #1

; Function Attrs: nounwind readnone speculatable
declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) #1

; Function Attrs: nounwind readnone speculatable
declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64) #1

; ---- Accelerator runtime entry points ----
; Called only from the polly.* blocks of @f and @inita.
; NOTE(review): presumably resolved from the -lGPURuntime library named on
; the link line in the build log -- confirm.

; Returns an opaque context handle, later passed to @polly_freeContext.
declare i8* @polly_initContextCUDA()

; Argument: size in bytes. Returns an opaque device-buffer handle.
declare i8* @polly_allocateMemoryForDevice(i64)

; Maps a device-buffer handle to the pointer passed as a kernel argument.
declare i8* @polly_getDevicePtr(i8*)

; Arguments: kernel text blob, entry-point name. Returns a kernel handle.
declare i8* @polly_getKernel(i8*, i8*)

; Arguments: kernel handle, five i32 launch dimensions, pointer to the
; launch argument block.
declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)

declare void @polly_freeKernel(i8*)

; Arguments: device-buffer handle, host destination, size in bytes.
declare void @polly_copyFromDeviceToHost(i8*, i8*, i64)

declare void @polly_freeDeviceMemory(i8*)

declare void @polly_freeContext(i8*)

; Attribute groups: #0 is attached to putchar/puts, #1 to the
; *.with.overflow intrinsics.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone speculatable }

; Alias-scope metadata attached to the preloads in @f. All scopes share the
; single domain !1. Scopes in use: !0 (MemRef0) on the load of *begin,
; !3 (MemRef1) on *end, !4 (MemRef2) on *control, !6 (MemRef4) on *readarr.
; !2, !8, !9 and !10 are the matching !noalias lists: every scope except the
; load's own one.
!0 = distinct !{!0, !1, !"polly.alias.scope.MemRef0"}
!1 = distinct !{!1, !"polly.alias.scope.domain"}
!2 = !{!3, !4, !5, !6, !7}
!3 = distinct !{!3, !1, !"polly.alias.scope.MemRef1"}
!4 = distinct !{!4, !1, !"polly.alias.scope.MemRef2"}
!5 = distinct !{!5, !1, !"polly.alias.scope.MemRef3__phi"}
!6 = distinct !{!6, !1, !"polly.alias.scope.MemRef4"}
!7 = distinct !{!7, !1, !"polly.alias.scope.MemRef5"}
!8 = !{!0, !4, !5, !6, !7}
!9 = !{!0, !3, !5, !6, !7}
!10 = !{!0, !3, !4, !5, !7}
	16:25 $ make runbatch
	rm *.optimised.ll
	rm *.out
	rm *.bench
	rm: cannot remove '*.bench': No such file or directory
	makefile:23: recipe for target 'clean' failed
	make: [clean] Error 1 (ignored)
	rm *.s
	/users/siddhart/llvm-install/bin/opt -S -polly-canonicalize -polly-process-unprofitable -polly-invariant-load-hoisting -polly-codegen-ppcg \
	-polly-acc-mincompute=0 program.ll -o program.optimised.ll
	/users/siddhart/llvm-install/bin/llc program.optimised.ll -o program.s
	/users/siddhart/llvm-install/bin/clang program.s -lcudart -lGPURuntime -ldl -lOpenCL -lgfortran -lstdc++ -o program.out -L/opt/nvidia/cudatoolkit8.0/8.0.54_2.2.8_ga620558-2.1/lib64/
	export POLLY_DEBUG=1
	POLLY_DEBUG=1 srun -n 1 -Cgpu --partition=debug nvprof ./program.out
	srun: job 2387457 queued and waiting for resources
	srun: job 2387457 has been allocated resources
	-> polly_initContext
	-> initContextCUDA
	==29336== NVPROF is profiling process 29336, command: ./program.out
	> Running on GPU device 0 : Tesla P100-PCIE-16GB.

	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	CUDA Link Completed in 0.000000ms. Linker Output:
	ptxas info : 0 bytes gmem
	ptxas info : Compiling entry function 'FUNC_inita_SCOP_0_KERNEL_0' for 'sm_60'
	ptxas info : Function properties for FUNC_inita_SCOP_0_KERNEL_0
	ptxas . 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
	ptxas info : Used 4 registers, 328 bytes cmem[0]
	info : 0 bytes gmem
	info : Function properties for 'FUNC_inita_SCOP_0_KERNEL_0':
	info : used 4 registers, 0 stack, 0 bytes smem, 328 bytes cmem[0], 0 bytes lmem
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	-> polly_initContext
	-> initContextCUDA
	-> polly_allocateMemoryForDevice
	-> allocateMemoryForDeviceCUDA
	-> polly_getDevicePtr
	-> getDevicePtrCUDA
	-> polly_getKernel
	-> getKernelCUDA
	-> using cached kernel
	-> polly_launchKernel
	-> launchKernelCUDA
	-> polly_freeKernel
	-> freeKernelCUDA
	-> polly_copyFromDeviceToHost
	-> copyFromDeviceToHostCUDA
	-> polly_freeDeviceMemory
	-> freeDeviceMemoryCUDA
	-> polly_freeContext
	==29336== Profiling application: ./program.out
	* Control: 3
	-----
	b: 0 | e: 1
	A: 0 0 0 0 0 0
	-----
	b: 0 | e: 2
	A: 0 0 0 0 0 0
	-----
	b: 0 | e: 3
	A: 0 0 0 0 0 0
	-----
	b: 0 | e: 4
	A: 0 0 0 0 0 0
	-----
	b: 0 | e: 5
	A: 0 0 0 0 0 0
	-----
	b: 1 | e: 2
	A: 0 0 0 0 0 0
	-----
	b: 1 | e: 3
	A: 0 0 0 0 0 0
	-----
	b: 1 | e: 4
	A: 0 0 0 0 0 0
	-----
	b: 1 | e: 5
	A: 0 0 0 0 0 0
	-----
	b: 2 | e: 3
	A: 0 0 0 0 0 0
	-----
	b: 2 | e: 4
	A: 0 0 0 0 0 0
	-----
	b: 2 | e: 5
	A: 0 0 0 0 0 0
	-----
	b: 3 | e: 4
	A: 0 0 0 0 0 0
	-----
	b: 3 | e: 5
	A: 0 0 0 0 0 0
	-----
	b: 4 | e: 5
	A: 0 0 0 0 0 0
	* Control: 4
	-----
	b: 0 | e: 1
	A: 10 0 0 0 0 0
	-----
	b: 0 | e: 2
	A: 10 10 0 0 0 0
	-----
	b: 0 | e: 3
	A: 10 10 10 0 0 0
	-----
	b: 0 | e: 4
	A: 10 10 10 10 0 0
	-----
	b: 0 | e: 5
	A: 10 10 10 10 10 0
	-----
	b: 1 | e: 2
	A: 0 10 0 0 0 0
	-----
	b: 1 | e: 3
	A: 0 10 10 0 0 0
	-----
	b: 1 | e: 4
	A: 0 10 10 10 0 0
	-----
	b: 1 | e: 5
	A: 0 10 10 10 10 0
	-----
	b: 2 | e: 3
	A: 0 0 10 0 0 0
	-----
	b: 2 | e: 4
	A: 0 0 10 10 0 0
	-----
	b: 2 | e: 5
	A: 0 0 10 10 10 0
	-----
	b: 3 | e: 4
	A: 0 0 0 10 0 0
	-----
	b: 3 | e: 5
	A: 0 0 0 10 10 0
	-----
	b: 4 | e: 5
	A: 0 0 0 0 10 0
	==29336== Profiling result:
	Time(%) Time Calls Avg Min Max Name
	72.49% 53.280us 30 1.7760us 1.6640us 3.2640us FUNC_inita_SCOP_0_KERNEL_0
	27.51% 20.224us 30 674ns 640ns 832ns [CUDA memcpy DtoH]

	==29336== API calls:
	Time(%) Time Calls Avg Min Max Name
	91.74% 265.27ms 1 265.27ms 265.27ms 265.27ms cuCtxCreate
	5.67% 16.381ms 30 546.03us 499.71us 580.99us cuMemAlloc
	1.50% 4.3289ms 1 4.3289ms 4.3289ms 4.3289ms cuLinkAddData
	0.73% 2.1040ms 30 70.134us 66.598us 95.121us cuMemFree
	0.17% 480.85us 30 16.028us 15.338us 20.152us cuMemcpyDtoH
	0.13% 366.10us 30 12.203us 10.963us 23.069us cuLaunchKernel
	0.03% 97.328us 1 97.328us 97.328us 97.328us cuLinkComplete
	0.03% 73.145us 1 73.145us 73.145us 73.145us cuModuleLoadData
	0.01% 24.702us 1 24.702us 24.702us 24.702us cuLinkCreate
	0.01% 17.288us 1 17.288us 17.288us 17.288us cuDeviceGetName
	0.00% 2.2970us 3 765ns 180ns 1.7200us cuDeviceGetCount
	0.00% 2.2530us 1 2.2530us 2.2530us 2.2530us cuLinkDestroy
	0.00% 874ns 3 291ns 184ns 484ns cuDeviceGet
	0.00% 855ns 4 213ns 135ns 324ns cuDeviceGetAttribute
	0.00% 678ns 1 678ns 678ns 678ns cuModuleGetFunction
	0.00% 654ns 1 654ns 654ns 654ns cuDeviceComputeCapability
	// [siddhart@greina0 compile-cpp-to-gpu]$ cat program.c
	#include <stdio.h>
	/*
	 * Write one value into every element arr[i] for *begin <= i < *end.
	 *
	 * The value is *readarr when *control > 3 and 0 otherwise. All scalar
	 * inputs are passed by pointer; *end and *control are re-read on every
	 * iteration, *begin only once at loop entry.
	 *
	 *   begin, end  - pointers to the half-open index range [*begin, *end)
	 *   arr         - destination array; must be valid for every index in range
	 *   control     - pointer to the switch that enables the read of *readarr
	 *   readarr     - pointer to the value stored when *control > 3
	 */
	void f(int *begin, int *end, int *arr, int *control, int *readarr) {
	    for (int i = *begin; i < *end; i++) {
	        int t = 0;
	        if (*control > 3) {
	            t += *readarr;
	        }
	        arr[i] = t;
	    }
	}


	/* Reset the first six elements of A to zero. */
	void inita(int *A) {
	    int *cursor = A;
	    int *const stop = A + 6;
	    while (cursor != stop) {
	        *cursor++ = 0;
	    }
	}

	/* Print "A: ", then the six elements of A each followed by a space, then a newline. */
	void printarr(int *A) {
	    int idx = 0;

	    printf("A: ");
	    while (idx < 6) {
	        printf("%d ", A[idx]);
	        ++idx;
	    }
	    printf("\n");
	}

	/*
	 * Test driver: for control = 3 and control = 4, and for every index range
	 * 0 <= b < e < 6, zero A, run f over [b, e) and print the resulting array.
	 * b, e, control and readarr are handed to f by address.
	 */
	int main() {
	    int A[6];
	    int readarr = 10;
	    /* NOTE(review): this outer `control` is never used; the loop variable
	     * below shadows it. Left in place because the IR dump's %control1 name
	     * presumably derives from it - confirm before removing. */
	    int control;

	    for (int control = 3; control <= 4; control++) {
	        printf("* Control: %d\n", control);
	        for (int b = 0; b < 6; b++) {
	            for (int e = b + 1; e < 6; e++) {
	                printf("-----\n");
	                printf("b: %d | e: %d\n", b, e);
	                inita(A);
	                f(&b, &e, A, &control, &readarr);
	                printarr(A);
	            }
	        }
	    }
	    return 0;
	}
	[siddhart@greina0 compile-cpp-to-gpu]$ cat program.canonical.ll
	; ModuleID = 'program.ll'
	source_filename = "program.c"
	target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

	; NUL-terminated string constants for the printf/puts calls in @printarr and @main.
	; Each array length counts the trailing \00 (and \0A where present).
	@.str = private unnamed_addr constant [4 x i8] c"A: \00", align 1
	@.str.1 = private unnamed_addr constant [4 x i8] c"%d \00", align 1
	@.str.2 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1
	@.str.3 = private unnamed_addr constant [15 x i8] c"* Control: %d\0A\00", align 1
	@.str.4 = private unnamed_addr constant [7 x i8] c"-----\0A\00", align 1
	@.str.5 = private unnamed_addr constant [15 x i8] c"b: %d | e: %d\0A\00", align 1
	@str = private unnamed_addr constant [6 x i8] c"-----\00"

	; @f: for each index i in [*begin, *end), store into arr[i] either *readarr
	; (when *control > 3) or 0. All five arguments are pointers to i32.
	define void @f(i32* %begin, i32* %end, i32* %arr, i32* %control, i32* %readarr) {
	entry:
	br label %entry.split

	; Loop guard: load *begin and *end once and skip the loop when *begin >= *end.
	entry.split: ; preds = %entry
	%tmp1 = load i32, i32* %begin, align 4
	%tmp41 = load i32, i32* %end, align 4
	%cmp2 = icmp slt i32 %tmp1, %tmp41
	br i1 %cmp2, label %for.body.lr.ph, label %for.end

	; Loop preheader: widen the start index to i64 for the induction variable.
	for.body.lr.ph: ; preds = %entry.split
	%0 = sext i32 %tmp1 to i64
	br label %for.body

	; Loop header: *control is reloaded on every iteration and compared against 3.
	for.body: ; preds = %for.body.lr.ph, %if.end
	%indvars.iv = phi i64 [ %0, %for.body.lr.ph ], [ %indvars.iv.next, %if.end ]
	%tmp6 = load i32, i32* %control, align 4
	%cmp1 = icmp sgt i32 %tmp6, 3
	br i1 %cmp1, label %if.then, label %if.end

	; Only reached when *control > 3: *readarr is loaded conditionally.
	if.then: ; preds = %for.body
	%tmp8 = load i32, i32* %readarr, align 4
	br label %if.end

	; %t.0 is *readarr when coming from if.then, 0 otherwise; store it to
	; arr[indvars.iv], advance the index, then reload *end for the exit test.
	if.end: ; preds = %if.then, %for.body
	%t.0 = phi i32 [ %tmp8, %if.then ], [ 0, %for.body ]
	%arrayidx = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv
	store i32 %t.0, i32* %arrayidx, align 4
	%indvars.iv.next = add i64 %indvars.iv, 1
	%tmp4 = load i32, i32* %end, align 4
	%1 = sext i32 %tmp4 to i64
	%cmp = icmp slt i64 %indvars.iv.next, %1
	br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge

	for.cond.for.end_crit_edge: ; preds = %if.end
	br label %for.end

	for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split
	ret void
	}

	; @inita: store 0 into A[0] .. A[5].
	define void @inita(i32* %A) {
	entry:
	br label %entry.split

	entry.split: ; preds = %entry
	br label %for.body

	; Single-block loop: %indvars.iv counts 0..5; the loop exits once the
	; incremented index equals 6.
	for.body: ; preds = %entry.split, %for.body
	%indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ]
	%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
	store i32 0, i32* %arrayidx, align 4
	%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
	%exitcond = icmp ne i64 %indvars.iv.next, 6
	br i1 %exitcond, label %for.body, label %for.end

	for.end: ; preds = %for.body
	ret void
	}

	; @printarr: print "A: ", then A[0] .. A[5] each via the "%d " format,
	; then a newline (emitted as putchar(10)).
	define void @printarr(i32* %A) {
	entry:
	br label %entry.split

	; Print the "A: " prefix. printf is variadic, so the call spells out its
	; full function type; the format argument is an i8* into @.str.
	entry.split: ; preds = %entry
	%call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0))
	br label %for.body

	; Single-block loop over indices 0..5: load A[i] and print it with "%d ".
	for.body: ; preds = %entry.split, %for.body
	%indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ]
	%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
	%tmp3 = load i32, i32* %arrayidx, align 4
	%call1 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.1, i64 0, i64 0), i32 %tmp3)
	%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
	%exitcond = icmp ne i64 %indvars.iv.next, 6
	br i1 %exitcond, label %for.body, label %for.end

	; Trailing newline: character code 10.
	for.end: ; preds = %for.body
	%putchar = tail call i32 @putchar(i32 10)
	ret void
	}

	declare i32 @printf(i8*, ...)

	; @main: test driver. For control = 3 and 4, and for every pair b < e with
	; 0 <= b and e <= 5, it prints the pair, zeroes A with @inita, runs @f over
	; [b, e) and prints A with @printarr. %b, %e, %control1 and %readarr are
	; stack slots because their addresses are passed to @f.
	define i32 @main() {
	entry:
	%A = alloca [6 x i32], align 16
	%readarr = alloca i32, align 4
	%control1 = alloca i32, align 4
	%b = alloca i32, align 4
	%e = alloca i32, align 4
	br label %entry.split

	; Initialise readarr = 10 and control = 3.
	entry.split: ; preds = %entry
	store i32 10, i32* %readarr, align 4
	store i32 3, i32* %control1, align 4
	br label %for.body

	; Outer (control) loop body: print the "* Control: %d" banner and reset b to 0.
	for.body: ; preds = %entry.split, %for.inc15
	%tmp1 = load i32, i32* %control1, align 4
	%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.3, i64 0, i64 0), i32 %tmp1)
	store i32 0, i32* %b, align 4
	br label %for.cond5.preheader

	; Middle (b) loop: set e = b + 1 and enter the inner loop only when b < 5.
	for.cond5.preheader: ; preds = %for.body, %for.inc12
	%storemerge2.in3 = load i32, i32* %b, align 4
	%storemerge24 = add nsw i32 %storemerge2.in3, 1
	store i32 %storemerge24, i32* %e, align 4
	%cmp65 = icmp slt i32 %storemerge2.in3, 5
	br i1 %cmp65, label %for.body7.lr.ph, label %for.inc12

	for.body7.lr.ph: ; preds = %for.cond5.preheader
	br label %for.body7

	; Inner (e) loop body: print the separator and the b/e pair, then run
	; inita -> f -> printarr on A. Afterwards e is incremented in memory and the
	; loop repeats while the pre-increment value of e was < 5.
	for.body7: ; preds = %for.body7.lr.ph, %for.body7
	%puts = call i32 @puts(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @str, i64 0, i64 0))
	%tmp5 = load i32, i32* %b, align 4
	%tmp6 = load i32, i32* %e, align 4
	%call9 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 %tmp5, i32 %tmp6)
	%arraydecay = getelementptr inbounds [6 x i32], [6 x i32]* %A, i64 0, i64 0
	call void @inita(i32* %arraydecay)
	%arraydecay10 = getelementptr inbounds [6 x i32], [6 x i32]* %A, i64 0, i64 0
	call void @f(i32* nonnull %b, i32* nonnull %e, i32* %arraydecay10, i32* nonnull %control1, i32* nonnull %readarr)
	%arraydecay11 = getelementptr inbounds [6 x i32], [6 x i32]* %A, i64 0, i64 0
	call void @printarr(i32* %arraydecay11)
	%storemerge2.in = load i32, i32* %e, align 4
	%storemerge2 = add nsw i32 %storemerge2.in, 1
	store i32 %storemerge2, i32* %e, align 4
	%cmp6 = icmp slt i32 %storemerge2.in, 5
	br i1 %cmp6, label %for.body7, label %for.cond5.for.inc12_crit_edge

	for.cond5.for.inc12_crit_edge: ; preds = %for.body7
	br label %for.inc12

	; b++ ; repeat the middle loop while the pre-increment value of b was < 5.
	for.inc12: ; preds = %for.cond5.for.inc12_crit_edge, %for.cond5.preheader
	%tmp8 = load i32, i32* %b, align 4
	%inc13 = add nsw i32 %tmp8, 1
	store i32 %inc13, i32* %b, align 4
	%cmp3 = icmp slt i32 %tmp8, 5
	br i1 %cmp3, label %for.cond5.preheader, label %for.inc15

	; control++ ; repeat the outer loop while the pre-increment value was < 4.
	for.inc15: ; preds = %for.inc12
	%tmp9 = load i32, i32* %control1, align 4
	%inc16 = add nsw i32 %tmp9, 1
	store i32 %inc16, i32* %control1, align 4
	%cmp = icmp slt i32 %tmp9, 4
	br i1 %cmp, label %for.body, label %for.end17

	for.end17: ; preds = %for.inc15
	ret i32 0
	}

	; Function Attrs: nounwind
	declare i32 @putchar(i32) #0

	; Function Attrs: nounwind
	declare i32 @puts(i8* nocapture readonly) #0

	attributes #0 = { nounwind }