Skip to content

Instantly share code, notes, and snippets.

@BeMg
Last active May 14, 2020 02:15
Show Gist options
  • Save BeMg/c9603e554a889138fa6cf99398aa6b43 to your computer and use it in GitHub Desktop.
Save BeMg/c9603e554a889138fa6cf99398aa6b43 to your computer and use it in GitHub Desktop.
TI-opencl clocl example
__kernel void Conv2D( __global int * image_in, //image input
__global int * filter_in, //filter input
int K, //filter kernel size
__global int * image_out) //feature map output
{
int W; //work group global size
int Wn; //padded image width
int x; //global id x
int y; //global id y
int ki, kj; //filter coordinate,(kj, ki)
int sum = 0; //multiply and sum of filter and data
W = get_global_size(0);
x = get_global_id(0);
y = get_global_id(1);
Wn = W + (K - 1);
for(ki=0; ki<K; ki++)
for(kj=0; kj<K; kj++)
{
sum = sum + filter_in[ki*K + kj] * image_in[Wn*(y+ki) + x + kj];
}
image_out[y*W + x] = sum;
barrier(CLK_GLOBAL_MEM_FENCE);
for(ki=0; ki<K; ki++)
for(kj=0; kj<K; kj++)
{
sum = sum + filter_in[ki*K + kj] * image_in[Wn*(y+ki) + x + kj];
}
image_out[y*W + x] = sum;
}
; ModuleID = 'conv2d.bc'
source_filename = "conv2d.bc"
target datalayout = "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "c6000-unknown-unknown-unknown"
@kernel_config_l2 = external addrspace(2) constant [32 x i32]
; Function Attrs: noduplicate
declare linkonce protected void @barrier(i32) #0
; Function Attrs: noinline nounwind
define void @Conv2D(i32 addrspace(1)* nocapture readonly %image_in, i32 addrspace(1)* nocapture readonly %filter_in, i32 %K, i32 addrspace(1)* nocapture %image_out) #1 {
pregion_for_init24:
%0 = call i32 @__core_num()
%1 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 15), align 4
%2 = mul i32 %0, %1
%3 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 14), align 4
%4 = add i32 %3, %2
%5 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 4), align 4
%6 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 5), align 4
%7 = mul i32 %5, %6
%8 = mul i32 4, %7
%9 = add i32 %4, %8
%10 = add i32 %9, 7
%11 = and i32 %10, -8
%.1.pocl_context = inttoptr i32 %4 to i32 addrspace(1)**, !ocl.restrict !3
%12 = mul i32 4, %7
%13 = add i32 %11, %12
%14 = add i32 %13, 7
%15 = and i32 %14, -8
%.0.pocl_context = inttoptr i32 %11 to i32*, !ocl.restrict !3
call void @barrier(i32 0) #2
%16 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 1), align 4, !tbaa !4
%17 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 10), align 4, !tbaa !4
%18 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 11), align 4, !tbaa !4
%19 = add nsw i32 %K, -1
%20 = add nsw i32 %19, %16
%21 = icmp sgt i32 %K, 0
br label %pregion_for_entry.pregion_for_init21
pregion_for_entry.pregion_for_init21: ; preds = %pregion_for_cond26, %pregion_for_init24
%22 = phi i32 [ 0, %pregion_for_init24 ], [ %56, %pregion_for_cond26 ]
%23 = add i32 %22, %18
%24 = mul i32 %22, %5
%25 = mul nsw i32 %23, %16
br label %.r_entry
.r_entry: ; preds = %pregion_for_cond23, %pregion_for_entry.pregion_for_init21
%26 = phi i32 [ 0, %pregion_for_entry.pregion_for_init21 ], [ %54, %pregion_for_cond23 ]
%27 = add i32 %26, %17
br i1 %21, label %.lr.ph10.preheader, label %pregion_for_cond23
.lr.ph10.preheader: ; preds = %.r_entry
br label %.lr.ph10
.lr.ph10: ; preds = %._crit_edge11, %.lr.ph10.preheader
%28 = phi i32 [ %46, %._crit_edge11 ], [ 0, %.lr.ph10.preheader ]
%.lcssa114 = phi i32 [ %43, %._crit_edge11 ], [ 0, %.lr.ph10.preheader ]
%29 = mul nsw i32 %28, %K
%30 = add nsw i32 %28, %23
%31 = mul nsw i32 %30, %20
%32 = add nsw i32 %31, %27
br label %33
33: ; preds = %33, %.lr.ph10
%34 = phi i32 [ %44, %33 ], [ 0, %.lr.ph10 ]
%35 = phi i32 [ %43, %33 ], [ %.lcssa114, %.lr.ph10 ]
%36 = add nsw i32 %34, %29
%37 = getelementptr inbounds i32, i32 addrspace(1)* %filter_in, i32 %36
%38 = load i32, i32 addrspace(1)* %37, align 4, !tbaa !4, !llvm.mem.parallel_loop_access !8
%39 = add nsw i32 %32, %34
%40 = getelementptr inbounds i32, i32 addrspace(1)* %image_in, i32 %39
%41 = load i32, i32 addrspace(1)* %40, align 4, !tbaa !4, !llvm.mem.parallel_loop_access !8
%42 = mul nsw i32 %41, %38
%43 = add nsw i32 %42, %35
%44 = add nuw nsw i32 %34, 1
%45 = icmp slt i32 %44, %K
br i1 %45, label %33, label %._crit_edge11
._crit_edge11: ; preds = %33
%46 = add nuw nsw i32 %28, 1
%47 = icmp slt i32 %46, %K
br i1 %47, label %.lr.ph10, label %pregion_for_cond23.loopexit
pregion_for_cond23.loopexit: ; preds = %._crit_edge11
br label %pregion_for_cond23
pregion_for_cond23: ; preds = %pregion_for_cond23.loopexit, %.r_entry
%48 = phi i32 [ 0, %.r_entry ], [ %43, %pregion_for_cond23.loopexit ]
%49 = add i32 %26, %24
%50 = getelementptr i32, i32* %.0.pocl_context, i32 %49
store i32 %48, i32* %50, align 4, !llvm.mem.parallel_loop_access !8
%51 = add nsw i32 %25, %27
%52 = getelementptr inbounds i32, i32 addrspace(1)* %image_out, i32 %51
%53 = getelementptr i32 addrspace(1)*, i32 addrspace(1)** %.1.pocl_context, i32 %49
store i32 addrspace(1)* %52, i32 addrspace(1)** %53, align 4, !llvm.mem.parallel_loop_access !8
store i32 %48, i32 addrspace(1)* %52, align 4, !tbaa !4, !llvm.mem.parallel_loop_access !8
%54 = add i32 %26, 1
%55 = icmp slt i32 %54, %5
br i1 %55, label %.r_entry, label %pregion_for_cond26, !llvm.loop !9
pregion_for_cond26: ; preds = %pregion_for_cond23
%56 = add i32 %22, 1
%57 = icmp slt i32 %56, %6
br i1 %57, label %pregion_for_entry.pregion_for_init21, label %pregion_for_init, !llvm.loop !10
pregion_for_init: ; preds = %pregion_for_cond26
tail call void @barrier(i32 2) #3
br label %pregion_for_entry.pregion_for_init
pregion_for_entry.pregion_for_init: ; preds = %pregion_for_cond20, %pregion_for_init
%58 = phi i32 [ 0, %pregion_for_init ], [ %91, %pregion_for_cond20 ]
%59 = mul i32 %58, %5
%60 = add i32 %58, %18
br label %._crit_edge15.r_entry
._crit_edge15.r_entry: ; preds = %pregion_for_cond, %pregion_for_entry.pregion_for_init
%61 = phi i32 [ 0, %pregion_for_entry.pregion_for_init ], [ %89, %pregion_for_cond ]
%62 = add i32 %61, %59
%63 = getelementptr i32, i32* %.0.pocl_context, i32 %62
%64 = load i32, i32* %63, align 4, !llvm.mem.parallel_loop_access !11
br i1 %21, label %.lr.ph.preheader, label %pregion_for_cond
.lr.ph.preheader: ; preds = %._crit_edge15.r_entry
%65 = add i32 %61, %17
br label %.lr.ph
.lr.ph: ; preds = %._crit_edge, %.lr.ph.preheader
%66 = phi i32 [ %84, %._crit_edge ], [ 0, %.lr.ph.preheader ]
%.lcssa6 = phi i32 [ %81, %._crit_edge ], [ %64, %.lr.ph.preheader ]
%67 = mul nsw i32 %66, %K
%68 = add nsw i32 %66, %60
%69 = mul nsw i32 %68, %20
%70 = add nsw i32 %69, %65
br label %71
71: ; preds = %71, %.lr.ph
%72 = phi i32 [ %82, %71 ], [ 0, %.lr.ph ]
%73 = phi i32 [ %81, %71 ], [ %.lcssa6, %.lr.ph ]
%74 = add nsw i32 %72, %67
%75 = getelementptr inbounds i32, i32 addrspace(1)* %filter_in, i32 %74
%76 = load i32, i32 addrspace(1)* %75, align 4, !tbaa !4, !llvm.mem.parallel_loop_access !11
%77 = add nsw i32 %70, %72
%78 = getelementptr inbounds i32, i32 addrspace(1)* %image_in, i32 %77
%79 = load i32, i32 addrspace(1)* %78, align 4, !tbaa !4, !llvm.mem.parallel_loop_access !11
%80 = mul nsw i32 %79, %76
%81 = add nsw i32 %80, %73
%82 = add nuw nsw i32 %72, 1
%83 = icmp slt i32 %82, %K
br i1 %83, label %71, label %._crit_edge
._crit_edge: ; preds = %71
%84 = add nuw nsw i32 %66, 1
%85 = icmp slt i32 %84, %K
br i1 %85, label %.lr.ph, label %pregion_for_cond.loopexit
pregion_for_cond.loopexit: ; preds = %._crit_edge
br label %pregion_for_cond
pregion_for_cond: ; preds = %pregion_for_cond.loopexit, %._crit_edge15.r_entry
%86 = phi i32 [ %64, %._crit_edge15.r_entry ], [ %81, %pregion_for_cond.loopexit ]
%87 = getelementptr i32 addrspace(1)*, i32 addrspace(1)** %.1.pocl_context, i32 %62
%88 = load i32 addrspace(1)*, i32 addrspace(1)** %87, align 4, !llvm.mem.parallel_loop_access !11
store i32 %86, i32 addrspace(1)* %88, align 4, !tbaa !4, !llvm.mem.parallel_loop_access !11
%89 = add i32 %61, 1
%90 = icmp slt i32 %89, %5
br i1 %90, label %._crit_edge15.r_entry, label %pregion_for_cond20, !llvm.loop !12
pregion_for_cond20: ; preds = %pregion_for_cond
%91 = add i32 %58, 1
%92 = icmp slt i32 %91, %6
br i1 %92, label %pregion_for_entry.pregion_for_init, label %exit.barrier, !llvm.loop !13
exit.barrier: ; preds = %pregion_for_cond20
call void @barrier(i32 0) #2
ret void
}
declare i32 @__core_num()
attributes #0 = { noduplicate "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { noinline nounwind "_kernel_local_size"="0" "_wi_alloca_size"="12" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind }
attributes #3 = { noduplicate nounwind }
!llvm.ident = !{!0}
!ocl.restrict = !{!1}
!opencl.kernels = !{!2}
!0 = !{!"clang version 3.6.0 (git://git.ti.com/opencl/clang.git 5b006f07bdc22b5ae6917eecdfe243908dd7b029) (git://git.ti.com/opencl/llvm.git 09780c6750b30da81e4a0a805aedf1699fbc37c7)"}
!1 = distinct !{!1}
!2 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32, i32 addrspace(1)*)* @Conv2D}
!3 = distinct !{!3}
!4 = !{!5, !5, i64 0}
!5 = !{!"int", !6, i64 0}
!6 = !{!"omnipotent char", !7, i64 0}
!7 = !{!"Simple C/C++ TBAA"}
!8 = !{!9, !10}
!9 = distinct !{!9}
!10 = distinct !{!10}
!11 = !{!12, !13}
!12 = distinct !{!12}
!13 = distinct !{!13}
#define PADDING (32)
#define GROUP_DIMX (32)
#define LOG_GROUP_DIMX (5)
#define GROUP_DIMY (2)
#define WIDTH (256)
#define HEIGHT (4096)
__kernel void
matrix_transpose(__global float *output,
__global float *input,
__local float *tile)
{
int block_x = get_group_id(0);
int block_y = get_group_id(1);
int local_x = get_local_id(0) & (GROUP_DIMX - 1);
int local_y = get_local_id(0) >> LOG_GROUP_DIMX;
int local_input = mad24(local_y, GROUP_DIMX + 1, local_x);
int local_output = mad24(local_x, GROUP_DIMX + 1, local_y);
int in_x = mad24(block_x, GROUP_DIMX, local_x);
int in_y = mad24(block_y, GROUP_DIMX, local_y);
int input_index = mad24(in_y, WIDTH, in_x);
int out_x = mad24(block_y, GROUP_DIMX, local_x);
int out_y = mad24(block_x, GROUP_DIMX, local_y);
int output_index = mad24(out_y, HEIGHT + PADDING, out_x);
int global_input_stride = WIDTH * GROUP_DIMY;
int global_output_stride = (HEIGHT + PADDING) * GROUP_DIMY;
int local_input_stride = GROUP_DIMY * (GROUP_DIMX + 1);
int local_output_stride = GROUP_DIMY;
tile[local_input] = input[input_index];
local_input += local_input_stride;
input_index += global_input_stride;
tile[local_input] = input[input_index];
local_input += local_input_stride;
input_index += global_input_stride;
tile[local_input] = input[input_index];
local_input += local_input_stride;
input_index += global_input_stride;
tile[local_input] = input[input_index];
local_input += local_input_stride;
input_index += global_input_stride;
tile[local_input] = input[input_index];
local_input += local_input_stride;
input_index += global_input_stride;
tile[local_input] = input[input_index];
local_input += local_input_stride;
input_index += global_input_stride;
tile[local_input] = input[input_index];
local_input += local_input_stride;
input_index += global_input_stride;
tile[local_input] = input[input_index];
local_input += local_input_stride;
input_index += global_input_stride;
tile[local_input] = input[input_index];
local_input += local_input_stride;
input_index += global_input_stride;
tile[local_input] = input[input_index];
local_input += local_input_stride;
input_index += global_input_stride;
tile[local_input] = input[input_index];
local_input += local_input_stride;
input_index += global_input_stride;
tile[local_input] = input[input_index];
local_input += local_input_stride;
input_index += global_input_stride;
tile[local_input] = input[input_index];
local_input += local_input_stride;
input_index += global_input_stride;
tile[local_input] = input[input_index];
local_input += local_input_stride;
input_index += global_input_stride;
tile[local_input] = input[input_index];
local_input += local_input_stride;
input_index += global_input_stride;
tile[local_input] = input[input_index];
barrier(CLK_LOCAL_MEM_FENCE);
output[output_index] = tile[local_output];
local_output += local_output_stride;
output_index += global_output_stride;
output[output_index] = tile[local_output];
local_output += local_output_stride;
output_index += global_output_stride;
output[output_index] = tile[local_output];
local_output += local_output_stride;
output_index += global_output_stride;
output[output_index] = tile[local_output];
local_output += local_output_stride;
output_index += global_output_stride;
output[output_index] = tile[local_output];
local_output += local_output_stride;
output_index += global_output_stride;
output[output_index] = tile[local_output];
local_output += local_output_stride;
output_index += global_output_stride;
output[output_index] = tile[local_output];
local_output += local_output_stride;
output_index += global_output_stride;
output[output_index] = tile[local_output];
local_output += local_output_stride;
output_index += global_output_stride;
output[output_index] = tile[local_output];
local_output += local_output_stride;
output_index += global_output_stride;
output[output_index] = tile[local_output];
local_output += local_output_stride;
output_index += global_output_stride;
output[output_index] = tile[local_output];
local_output += local_output_stride;
output_index += global_output_stride;
output[output_index] = tile[local_output];
local_output += local_output_stride;
output_index += global_output_stride;
output[output_index] = tile[local_output];
local_output += local_output_stride;
output_index += global_output_stride;
output[output_index] = tile[local_output];
local_output += local_output_stride;
output_index += global_output_stride;
output[output_index] = tile[local_output];
local_output += local_output_stride;
output_index += global_output_stride;
output[output_index] = tile[local_output];
}
; ModuleID = 'example2.bc'
source_filename = "example2.bc"
target datalayout = "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "c6000-unknown-unknown-unknown"
@kernel_config_l2 = external addrspace(2) constant [32 x i32]
; Function Attrs: noduplicate
declare linkonce protected void @barrier(i32) #0
; Function Attrs: noinline nounwind
define void @matrix_transpose(float addrspace(1)* nocapture %output, float addrspace(1)* nocapture readonly %input, float addrspace(3)* %tile) #1 {
pregion_for_init1:
%0 = call i32 @__core_num()
%1 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 15), align 4
%2 = mul i32 %0, %1
%3 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 14), align 4
%4 = add i32 %3, %2
%5 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 4), align 4
%6 = mul i32 4, %5
%7 = add i32 %4, %6
%8 = add i32 %7, 7
%9 = and i32 %8, -8
%.0.pocl_context = inttoptr i32 %4 to i32*, !ocl.restrict !3
call void @barrier(i32 0) #2
%10 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 10), align 4, !tbaa !4
%11 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 7), align 4, !tbaa !4
%12 = sub i32 %10, %11
%13 = udiv i32 %12, %5
%14 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 11), align 4, !tbaa !4
%15 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 8), align 4, !tbaa !4
%16 = sub i32 %14, %15
%17 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 5), align 4, !tbaa !4
%18 = udiv i32 %16, %17
%19 = shl nsw i32 %13, 5
%20 = shl nsw i32 %18, 5
br label %pregion_for_cond3
pregion_for_cond3: ; preds = %pregion_for_cond3, %pregion_for_init1
%21 = phi i32 [ 0, %pregion_for_init1 ], [ %113, %pregion_for_cond3 ]
%22 = and i32 %21, 31
%23 = lshr i32 %21, 5
%24 = mul nsw i32 %23, 33
%25 = add nuw nsw i32 %24, %22
%26 = or i32 %22, %19
%27 = add nsw i32 %23, %20
%28 = shl nsw i32 %27, 8
%29 = add nsw i32 %28, %26
%30 = or i32 %22, %20
%31 = add nsw i32 %23, %19
%32 = mul nsw i32 %31, 4128
%33 = add nsw i32 %32, %30
%34 = getelementptr i32, i32* %.0.pocl_context, i32 %21
store i32 %33, i32* %34, align 4, !llvm.mem.parallel_loop_access !8
%35 = getelementptr inbounds float, float addrspace(1)* %input, i32 %29
%36 = load float, float addrspace(1)* %35, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%37 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %25
store float %36, float addrspace(3)* %37, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%38 = add nuw nsw i32 %25, 66
%39 = add nsw i32 %29, 512
%40 = getelementptr inbounds float, float addrspace(1)* %input, i32 %39
%41 = load float, float addrspace(1)* %40, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%42 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %38
store float %41, float addrspace(3)* %42, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%43 = add nuw nsw i32 %25, 132
%44 = add nsw i32 %29, 1024
%45 = getelementptr inbounds float, float addrspace(1)* %input, i32 %44
%46 = load float, float addrspace(1)* %45, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%47 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %43
store float %46, float addrspace(3)* %47, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%48 = add nuw nsw i32 %25, 198
%49 = add nsw i32 %29, 1536
%50 = getelementptr inbounds float, float addrspace(1)* %input, i32 %49
%51 = load float, float addrspace(1)* %50, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%52 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %48
store float %51, float addrspace(3)* %52, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%53 = add nuw nsw i32 %25, 264
%54 = add nsw i32 %29, 2048
%55 = getelementptr inbounds float, float addrspace(1)* %input, i32 %54
%56 = load float, float addrspace(1)* %55, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%57 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %53
store float %56, float addrspace(3)* %57, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%58 = add nuw nsw i32 %25, 330
%59 = add nsw i32 %29, 2560
%60 = getelementptr inbounds float, float addrspace(1)* %input, i32 %59
%61 = load float, float addrspace(1)* %60, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%62 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %58
store float %61, float addrspace(3)* %62, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%63 = add nuw nsw i32 %25, 396
%64 = add nsw i32 %29, 3072
%65 = getelementptr inbounds float, float addrspace(1)* %input, i32 %64
%66 = load float, float addrspace(1)* %65, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%67 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %63
store float %66, float addrspace(3)* %67, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%68 = add nuw nsw i32 %25, 462
%69 = add nsw i32 %29, 3584
%70 = getelementptr inbounds float, float addrspace(1)* %input, i32 %69
%71 = load float, float addrspace(1)* %70, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%72 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %68
store float %71, float addrspace(3)* %72, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%73 = add nuw nsw i32 %25, 528
%74 = add nsw i32 %29, 4096
%75 = getelementptr inbounds float, float addrspace(1)* %input, i32 %74
%76 = load float, float addrspace(1)* %75, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%77 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %73
store float %76, float addrspace(3)* %77, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%78 = add nuw nsw i32 %25, 594
%79 = add nsw i32 %29, 4608
%80 = getelementptr inbounds float, float addrspace(1)* %input, i32 %79
%81 = load float, float addrspace(1)* %80, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%82 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %78
store float %81, float addrspace(3)* %82, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%83 = add nuw nsw i32 %25, 660
%84 = add nsw i32 %29, 5120
%85 = getelementptr inbounds float, float addrspace(1)* %input, i32 %84
%86 = load float, float addrspace(1)* %85, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%87 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %83
store float %86, float addrspace(3)* %87, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%88 = add nuw nsw i32 %25, 726
%89 = add nsw i32 %29, 5632
%90 = getelementptr inbounds float, float addrspace(1)* %input, i32 %89
%91 = load float, float addrspace(1)* %90, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%92 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %88
store float %91, float addrspace(3)* %92, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%93 = add nuw nsw i32 %25, 792
%94 = add nsw i32 %29, 6144
%95 = getelementptr inbounds float, float addrspace(1)* %input, i32 %94
%96 = load float, float addrspace(1)* %95, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%97 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %93
store float %96, float addrspace(3)* %97, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%98 = add nuw nsw i32 %25, 858
%99 = add nsw i32 %29, 6656
%100 = getelementptr inbounds float, float addrspace(1)* %input, i32 %99
%101 = load float, float addrspace(1)* %100, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%102 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %98
store float %101, float addrspace(3)* %102, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%103 = add nuw nsw i32 %25, 924
%104 = add nsw i32 %29, 7168
%105 = getelementptr inbounds float, float addrspace(1)* %input, i32 %104
%106 = load float, float addrspace(1)* %105, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%107 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %103
store float %106, float addrspace(3)* %107, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%108 = add nuw nsw i32 %25, 990
%109 = add nsw i32 %29, 7680
%110 = getelementptr inbounds float, float addrspace(1)* %input, i32 %109
%111 = load float, float addrspace(1)* %110, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%112 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %108
store float %111, float addrspace(3)* %112, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8
%113 = add i32 %21, 1
%114 = icmp slt i32 %113, %5
br i1 %114, label %pregion_for_cond3, label %pregion_for_init, !llvm.loop !9
pregion_for_init: ; preds = %pregion_for_cond3
tail call void @barrier(i32 1) #3
br label %pregion_for_cond
pregion_for_cond: ; preds = %pregion_for_cond, %pregion_for_init
%115 = phi i32 [ 0, %pregion_for_init ], [ %200, %pregion_for_cond ]
%116 = and i32 %115, 31
%117 = mul nuw nsw i32 %116, 33
%118 = lshr i32 %115, 5
%119 = add nuw nsw i32 %117, %118
%120 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %119
%121 = load float, float addrspace(3)* %120, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%122 = getelementptr i32, i32* %.0.pocl_context, i32 %115
%123 = load i32, i32* %122, align 4
%124 = getelementptr inbounds float, float addrspace(1)* %output, i32 %123
store float %121, float addrspace(1)* %124, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%125 = add nuw nsw i32 %119, 2
%126 = add nsw i32 %123, 8256
%127 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %125
%128 = load float, float addrspace(3)* %127, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%129 = getelementptr inbounds float, float addrspace(1)* %output, i32 %126
store float %128, float addrspace(1)* %129, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%130 = add nuw nsw i32 %119, 4
%131 = add nsw i32 %123, 16512
%132 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %130
%133 = load float, float addrspace(3)* %132, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%134 = getelementptr inbounds float, float addrspace(1)* %output, i32 %131
store float %133, float addrspace(1)* %134, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%135 = add nuw nsw i32 %119, 6
%136 = add nsw i32 %123, 24768
%137 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %135
%138 = load float, float addrspace(3)* %137, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%139 = getelementptr inbounds float, float addrspace(1)* %output, i32 %136
store float %138, float addrspace(1)* %139, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%140 = add nuw nsw i32 %119, 8
%141 = add nsw i32 %123, 33024
%142 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %140
%143 = load float, float addrspace(3)* %142, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%144 = getelementptr inbounds float, float addrspace(1)* %output, i32 %141
store float %143, float addrspace(1)* %144, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%145 = add nuw nsw i32 %119, 10
%146 = add nsw i32 %123, 41280
%147 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %145
%148 = load float, float addrspace(3)* %147, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%149 = getelementptr inbounds float, float addrspace(1)* %output, i32 %146
store float %148, float addrspace(1)* %149, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%150 = add nuw nsw i32 %119, 12
%151 = add nsw i32 %123, 49536
%152 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %150
%153 = load float, float addrspace(3)* %152, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%154 = getelementptr inbounds float, float addrspace(1)* %output, i32 %151
store float %153, float addrspace(1)* %154, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%155 = add nuw nsw i32 %119, 14
%156 = add nsw i32 %123, 57792
%157 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %155
%158 = load float, float addrspace(3)* %157, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%159 = getelementptr inbounds float, float addrspace(1)* %output, i32 %156
store float %158, float addrspace(1)* %159, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%160 = add nuw nsw i32 %119, 16
%161 = add nsw i32 %123, 66048
%162 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %160
%163 = load float, float addrspace(3)* %162, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%164 = getelementptr inbounds float, float addrspace(1)* %output, i32 %161
store float %163, float addrspace(1)* %164, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%165 = add nuw nsw i32 %119, 18
%166 = add nsw i32 %123, 74304
%167 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %165
%168 = load float, float addrspace(3)* %167, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%169 = getelementptr inbounds float, float addrspace(1)* %output, i32 %166
store float %168, float addrspace(1)* %169, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%170 = add nuw nsw i32 %119, 20
%171 = add nsw i32 %123, 82560
%172 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %170
%173 = load float, float addrspace(3)* %172, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%174 = getelementptr inbounds float, float addrspace(1)* %output, i32 %171
store float %173, float addrspace(1)* %174, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%175 = add nuw nsw i32 %119, 22
%176 = add nsw i32 %123, 90816
%177 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %175
%178 = load float, float addrspace(3)* %177, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%179 = getelementptr inbounds float, float addrspace(1)* %output, i32 %176
store float %178, float addrspace(1)* %179, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%180 = add nuw nsw i32 %119, 24
%181 = add nsw i32 %123, 99072
%182 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %180
%183 = load float, float addrspace(3)* %182, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%184 = getelementptr inbounds float, float addrspace(1)* %output, i32 %181
store float %183, float addrspace(1)* %184, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%185 = add nuw nsw i32 %119, 26
%186 = add nsw i32 %123, 107328
%187 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %185
%188 = load float, float addrspace(3)* %187, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%189 = getelementptr inbounds float, float addrspace(1)* %output, i32 %186
store float %188, float addrspace(1)* %189, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%190 = add nuw nsw i32 %119, 28
%191 = add nsw i32 %123, 115584
%192 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %190
%193 = load float, float addrspace(3)* %192, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%194 = getelementptr inbounds float, float addrspace(1)* %output, i32 %191
store float %193, float addrspace(1)* %194, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%195 = add nuw nsw i32 %119, 30
%196 = add nsw i32 %123, 123840
%197 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %195
%198 = load float, float addrspace(3)* %197, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%199 = getelementptr inbounds float, float addrspace(1)* %output, i32 %196
store float %198, float addrspace(1)* %199, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12
%200 = add i32 %115, 1
%201 = icmp slt i32 %200, %5
br i1 %201, label %pregion_for_cond, label %exit.barrier, !llvm.loop !13
exit.barrier: ; preds = %pregion_for_cond
call void @barrier(i32 0) #2
ret void
}
declare i32 @__core_num()
attributes #0 = { noduplicate "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { noinline nounwind "_kernel_local_size"="0" "_wi_alloca_size"="4" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind }
attributes #3 = { noduplicate nounwind }
!llvm.ident = !{!0}
!ocl.restrict = !{!1}
!opencl.kernels = !{!2}
!0 = !{!"clang version 3.6.0 (git://git.ti.com/opencl/clang.git 5b006f07bdc22b5ae6917eecdfe243908dd7b029) (git://git.ti.com/opencl/llvm.git 09780c6750b30da81e4a0a805aedf1699fbc37c7)"}
!1 = distinct !{!1}
!2 = !{void (float addrspace(1)*, float addrspace(1)*, float addrspace(3)*)* @matrix_transpose}
!3 = distinct !{!3}
!4 = !{!5, !5, i64 0}
!5 = !{!"int", !6, i64 0}
!6 = !{!"omnipotent char", !7, i64 0}
!7 = !{!"Simple C/C++ TBAA"}
!8 = !{!9}
!9 = distinct !{!9}
!10 = !{!11, !11, i64 0}
!11 = !{!"float", !6, i64 0}
!12 = !{!13}
!13 = distinct !{!13}
kernel void
matadd (__global const float *A,
__global const float *B,
__global float *C)
{
size_t X = get_global_id(0);
size_t Y = get_global_id(1);
size_t Idx = Y*get_global_size(0) + X;
C[Idx] = A[Idx] + B[Idx];
}
; ModuleID = 'matadd.bc'
source_filename = "matadd.bc"
target datalayout = "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "c6000-unknown-unknown-unknown"
@kernel_config_l2 = external addrspace(2) constant [32 x i32]
; Function Attrs: nounwind
define void @matadd(float addrspace(1)* nocapture readonly %A, float addrspace(1)* nocapture readonly %B, float addrspace(1)* nocapture %C) #0 {
.entry:
%0 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 4), align 4, !llvm.mem.parallel_loop_access !7
%1 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 10), align 4, !tbaa !10, !llvm.mem.parallel_loop_access !7
%2 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 11), align 4, !tbaa !10, !llvm.mem.parallel_loop_access !7
%3 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 1), align 4, !tbaa !10, !llvm.mem.parallel_loop_access !7
%4 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 5), align 4, !llvm.mem.parallel_loop_access !7
%5 = icmp sgt i32 %4, 0
br i1 %5, label %.bodyTop3.preheader, label %.exit2
.bodyTop3.preheader: ; preds = %.entry
br label %.bodyTop3
.bodyTop3: ; preds = %.bodyEnd4, %.bodyTop3.preheader
%6 = phi i32 [ %21, %.bodyEnd4 ], [ 0, %.bodyTop3.preheader ]
%7 = icmp sgt i32 %0, 0
br i1 %7, label %.bodyTop.preheader, label %.bodyEnd4
.bodyTop.preheader: ; preds = %.bodyTop3
br label %.bodyTop
.bodyTop: ; preds = %.bodyTop, %.bodyTop.preheader
%8 = phi i32 [ %19, %.bodyTop ], [ 0, %.bodyTop.preheader ]
%9 = add i32 %8, %1
%10 = add i32 %6, %2
%11 = mul i32 %3, %10
%12 = add i32 %11, %9
%13 = getelementptr inbounds float, float addrspace(1)* %A, i32 %12
%14 = load float, float addrspace(1)* %13, align 4, !tbaa !14, !llvm.mem.parallel_loop_access !7
%15 = getelementptr inbounds float, float addrspace(1)* %B, i32 %12
%16 = load float, float addrspace(1)* %15, align 4, !tbaa !14, !llvm.mem.parallel_loop_access !7
%17 = fadd float %14, %16
%18 = getelementptr inbounds float, float addrspace(1)* %C, i32 %12
store float %17, float addrspace(1)* %18, align 4, !tbaa !14, !llvm.mem.parallel_loop_access !7
%19 = add i32 %8, 1
%20 = icmp slt i32 %19, %0
br i1 %20, label %.bodyTop, label %.bodyEnd4.loopexit, !llvm.loop !8
.bodyEnd4.loopexit: ; preds = %.bodyTop
br label %.bodyEnd4
.bodyEnd4: ; preds = %.bodyEnd4.loopexit, %.bodyTop3
%21 = add i32 %6, 1
%22 = icmp slt i32 %21, %4
br i1 %22, label %.bodyTop3, label %.exit2.loopexit, !llvm.loop !9
.exit2.loopexit: ; preds = %.bodyEnd4
br label %.exit2
.exit2: ; preds = %.exit2.loopexit, %.entry
ret void
}
attributes #0 = { nounwind "_kernel_local_size"="0" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" }
!opencl.kernels = !{!0}
!llvm.ident = !{!6}
!0 = !{void (float addrspace(1)*, float addrspace(1)*, float addrspace(1)*)* @matadd, !1, !2, !3, !4, !5}
!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1}
!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"}
!3 = !{!"kernel_arg_type", !"float*", !"float*", !"float*"}
!4 = !{!"kernel_arg_base_type", !"float*", !"float*", !"float*"}
!5 = !{!"kernel_arg_type_qual", !"const", !"const", !""}
!6 = !{!"clang version 3.6.0 (git://git.ti.com/opencl/clang.git 5b006f07bdc22b5ae6917eecdfe243908dd7b029) (git://git.ti.com/opencl/llvm.git 09780c6750b30da81e4a0a805aedf1699fbc37c7)"}
!7 = !{!8, !9}
!8 = distinct !{!8}
!9 = distinct !{!9}
!10 = !{!11, !11, i64 0}
!11 = !{!"int", !12, i64 0}
!12 = !{!"omnipotent char", !13, i64 0}
!13 = !{!"Simple C/C++ TBAA"}
!14 = !{!15, !15, i64 0}
!15 = !{!"float", !12, i64 0}
/* Original code:
* The MIT License (MIT)
* Copyright (c) 2014 SURFsara
* https://github.com/CNugteren/myGEMM/blob/master/src/kernels.cl
*/
#define ITYPE uint
#define TS (100)
/* work per thread */
#define WPT (100 / 4)
// TS/WPT == RTS
#define RTS 4
// Tiled and coalesced version
__kernel void
myGEMM4 (const __global float *A, const __global float *B, __global float *C,
uint M, uint N, uint K)
{
// Thread identifiers
const ITYPE row = get_local_id (0); // Local row ID (max: TS)
const ITYPE col = get_local_id (1); // Local col ID (max: TS/WPT == RTS)
const ITYPE globalRow = TS * get_group_id (0) + row; // Row ID of C (0..M)
const ITYPE globalCol = TS * get_group_id (1) + col; // Col ID of C (0..N)
// Local memory to fit a tile of TS*TS elements of A and B
__local float Asub[TS][TS];
__local float Bsub[TS][TS];
// Initialise the accumulation registers
float acc[WPT];
for (ITYPE w = 0; w < WPT; w++)
{
acc[w] = 0.0f;
}
// Loop over all tiles
const ITYPE numTiles = K / TS;
for (ITYPE t = 0; t < numTiles; t++)
{
// Load one tile of A and B into local memory
for (ITYPE w = 0; w < WPT; w++)
{
const ITYPE tiledRow = TS * t + row;
const ITYPE tiledCol = TS * t + col;
Asub[col + w * RTS][row] = A[(tiledCol + w * RTS) * M + globalRow];
Bsub[col + w * RTS][row] = B[(globalCol + w * RTS) * K + tiledRow];
}
// Synchronise to make sure the tile is loaded
barrier (CLK_LOCAL_MEM_FENCE);
// Perform the computation for a single tile
for (ITYPE k = 0; k < TS; k++)
{
for (ITYPE w = 0; w < WPT; w++)
{
#ifdef USE_FMA
acc[w] = fma (Asub[k][row], Bsub[col + w * RTS][k], acc[w]);
#else
acc[w] += Asub[k][row] * Bsub[col + w * RTS][k];
#endif
}
}
// Synchronise before loading the next tile
barrier (CLK_LOCAL_MEM_FENCE);
}
// Store the final results in C
for (ITYPE w = 0; w < WPT; w++)
{
C[(globalCol + w * RTS) * M + globalRow] = acc[w];
}
}
#define TRANSPOSEX 8
#define TRANSPOSEY 8
// Simple transpose kernel for a P * Q matrix
__kernel void
transpose (const ITYPE P, const ITYPE Q, const __global float *input,
__global float *output)
{
// Thread identifiers
const ITYPE tx = get_local_id (0);
const ITYPE ty = get_local_id (1);
const ITYPE ID0 = get_group_id (0) * TRANSPOSEX + tx; // 0..P
const ITYPE ID1 = get_group_id (1) * TRANSPOSEY + ty; // 0..Q
// Set-up the local memory for shuffling
__local float buffer[TRANSPOSEX][TRANSPOSEY];
// Swap the x and y coordinates to perform the rotation (coalesced)
// if (ID0 < P && ID1 < Q) {
buffer[ty][tx] = input[ID1 * P + ID0];
// }
// Synchronise all threads
barrier (CLK_LOCAL_MEM_FENCE);
// We don't have to swap the x and y thread indices here,
// because that's already done in the local memory
const ITYPE newID0 = get_group_id (1) * TRANSPOSEY + tx;
const ITYPE newID1 = get_group_id (0) * TRANSPOSEX + ty;
// Store the transposed result (coalesced)
// if (newID0 < Q && newID1 < P) {
output[newID1 * Q + newID0] = buffer[tx][ty];
// }
}
; ModuleID = 'matrix1.bc'
source_filename = "matrix1.bc"
target datalayout = "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "c6000-unknown-unknown-unknown"
@myGEMM4.Asub = internal unnamed_addr addrspace(3) global [100 x [100 x float]] undef, align 4
@myGEMM4.Bsub = internal unnamed_addr addrspace(3) global [100 x [100 x float]] undef, align 4
@transpose.buffer = internal unnamed_addr addrspace(3) global [8 x [8 x float]] undef, align 4
@kernel_config_l2 = external addrspace(2) constant [32 x i32]
; Function Attrs: noduplicate
declare linkonce protected void @barrier(i32) #0
; Function Attrs: noinline nounwind
define void @myGEMM4(float addrspace(1)* nocapture readonly %A, float addrspace(1)* nocapture readonly %B, float addrspace(1)* nocapture %C, i32 %M, i32 %N, i32 %K) #1 {
.r_entry:
%0 = call i32 @__core_num()
%1 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 15), align 4
%2 = mul i32 %0, %1
%3 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 14), align 4
%4 = add i32 %3, %2
%5 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 4), align 4
%6 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 5), align 4
%7 = mul i32 %5, %6
%8 = mul i32 100, %7
%9 = add i32 %4, %8
%10 = add i32 %9, 7
%11 = and i32 %10, -8
%.acc.pocl_context = inttoptr i32 %4 to [25 x float]*, !ocl.restrict !4
%12 = mul i32 4, %7
%13 = add i32 %11, %12
%14 = add i32 %13, 7
%15 = and i32 %14, -8
%.t.010.ex_phi.pocl_context = inttoptr i32 %11 to i32*, !ocl.restrict !4
call void @barrier(i32 0) #5
%16 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 10), align 4, !tbaa !5
%17 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 7), align 4, !tbaa !5
%18 = sub i32 %16, %17
%19 = udiv i32 %18, %5
%20 = mul i32 %19, 100
%21 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 11), align 4, !tbaa !5
%22 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 8), align 4, !tbaa !5
%23 = sub i32 %21, %22
%24 = udiv i32 %23, %6
%25 = mul i32 %24, 100
%26 = udiv i32 %K, 100
%27 = icmp ugt i32 %K, 99
%28 = bitcast [25 x float]* %.acc.pocl_context to i8*
call void @llvm.lifetime.start.p0i8(i64 100, i8* %28) #5
call void @llvm.memset.p0i8.i32(i8* align 64 %28, i8 0, i32 100, i1 false)
br i1 %27, label %pregion_for_init38, label %.preheader.preheader
.preheader.preheader: ; preds = %.r_entry
br label %.preheader
pregion_for_init38: ; preds = %.r_entry
store i32 0, i32* %.t.010.ex_phi.pocl_context, align 64
br label %pregion_for_entry.pregion_for_init35
pregion_for_entry.pregion_for_init35: ; preds = %pregion_for_cond40, %pregion_for_init38
%29 = phi i32 [ 0, %pregion_for_init38 ], [ %39, %pregion_for_cond40 ]
%30 = phi i32 [ 1, %pregion_for_init38 ], [ 0, %pregion_for_cond40 ]
%31 = mul i32 %29, %5
br label %pregion_for_cond37
pregion_for_cond37: ; preds = %pregion_for_cond37, %pregion_for_entry.pregion_for_init35
%32 = phi i32 [ %30, %pregion_for_entry.pregion_for_init35 ], [ %37, %pregion_for_cond37 ]
%33 = add i32 %32, %31
%34 = getelementptr [25 x float], [25 x float]* %.acc.pocl_context, i32 %33
%35 = bitcast [25 x float]* %34 to i8*
call void @llvm.lifetime.start.p0i8(i64 100, i8* %35) #5, !llvm.mem.parallel_loop_access !9
call void @llvm.memset.p0i8.i32(i8* align 4 %35, i8 0, i32 100, i1 false)
%36 = getelementptr i32, i32* %.t.010.ex_phi.pocl_context, i32 %33
store i32 0, i32* %36, align 4, !llvm.mem.parallel_loop_access !9
%37 = add i32 %32, 1
%38 = icmp slt i32 %37, %5
br i1 %38, label %pregion_for_cond37, label %pregion_for_cond40, !llvm.loop !10
pregion_for_cond40: ; preds = %pregion_for_cond37
%39 = add i32 %29, 1
%40 = icmp slt i32 %39, %6
br i1 %40, label %pregion_for_entry.pregion_for_init35, label %.preheader5.preheader.loopbarrier, !llvm.loop !11
.preheader5.preheader.loopbarrier: ; preds = %pregion_for_cond40
call void @barrier(i32 0) #5
br label %pregion_for_entry.pregion_for_init29
pregion_for_entry.pregion_for_init29: ; preds = %pregion_for_entry.pregion_for_init29.backedge, %.preheader5.preheader.loopbarrier
%41 = phi i32 [ 0, %.preheader5.preheader.loopbarrier ], [ %.be, %pregion_for_entry.pregion_for_init29.backedge ]
%42 = mul i32 %41, %5
%43 = add i32 %25, %41
br label %pregion_for_entry..preheader5
pregion_for_entry..preheader5: ; preds = %pregion_for_cond31, %pregion_for_entry.pregion_for_init29
%44 = phi i32 [ 0, %pregion_for_entry.pregion_for_init29 ], [ %70, %pregion_for_cond31 ]
%45 = add i32 %44, %42
%46 = getelementptr i32, i32* %.t.010.ex_phi.pocl_context, i32 %45
%47 = load i32, i32* %46, align 4, !llvm.mem.parallel_loop_access !12
%48 = mul i32 %47, 100
%49 = add i32 %48, %44
%50 = add i32 %48, %41
%51 = add i32 %20, %44
br label %52
52: ; preds = %52, %pregion_for_entry..preheader5
%53 = phi i32 [ %68, %52 ], [ 0, %pregion_for_entry..preheader5 ]
%54 = shl i32 %53, 2
%55 = add i32 %50, %54
%56 = mul i32 %55, %M
%57 = add i32 %56, %51
%58 = getelementptr inbounds float, float addrspace(1)* %A, i32 %57
%59 = load float, float addrspace(1)* %58, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !12
%60 = add i32 %54, %41
%61 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]] addrspace(3)* @myGEMM4.Asub, i32 0, i32 %60, i32 %44
store float %59, float addrspace(3)* %61, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !12
%62 = add i32 %54, %43
%63 = mul i32 %62, %K
%64 = add i32 %49, %63
%65 = getelementptr inbounds float, float addrspace(1)* %B, i32 %64
%66 = load float, float addrspace(1)* %65, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !12
%67 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]] addrspace(3)* @myGEMM4.Bsub, i32 0, i32 %60, i32 %44
store float %66, float addrspace(3)* %67, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !12
%68 = add nuw nsw i32 %53, 1
%69 = icmp ult i32 %68, 25
br i1 %69, label %52, label %pregion_for_cond31
pregion_for_cond31: ; preds = %52
%70 = add i32 %44, 1
%71 = icmp slt i32 %70, %5
br i1 %71, label %pregion_for_entry..preheader5, label %pregion_for_cond34, !llvm.loop !13
pregion_for_cond34: ; preds = %pregion_for_cond31
%72 = add i32 %41, 1
%73 = icmp slt i32 %72, %6
br i1 %73, label %pregion_for_entry.pregion_for_init29.backedge, label %pregion_for_init26, !llvm.loop !14
pregion_for_entry.pregion_for_init29.backedge: ; preds = %.brexitbarrier.latchbarrier.postbarrier, %pregion_for_cond34
%.be = phi i32 [ %72, %pregion_for_cond34 ], [ 0, %.brexitbarrier.latchbarrier.postbarrier ]
br label %pregion_for_entry.pregion_for_init29
pregion_for_init26: ; preds = %pregion_for_cond34
tail call void @barrier(i32 1) #6
br label %pregion_for_entry.pregion_for_init23
pregion_for_entry.pregion_for_init23: ; preds = %pregion_for_cond28, %pregion_for_init26
%74 = phi i32 [ 0, %pregion_for_init26 ], [ %98, %pregion_for_cond28 ]
%75 = mul i32 %74, %5
br label %pregion_for_entry..postbarrier
pregion_for_entry..postbarrier: ; preds = %pregion_for_cond25, %pregion_for_entry.pregion_for_init23
%76 = phi i32 [ 0, %pregion_for_entry.pregion_for_init23 ], [ %96, %pregion_for_cond25 ]
%77 = add i32 %76, %75
br label %.preheader4
.preheader4: ; preds = %93, %pregion_for_entry..postbarrier
%78 = phi i32 [ %94, %93 ], [ 0, %pregion_for_entry..postbarrier ]
%79 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]] addrspace(3)* @myGEMM4.Asub, i32 0, i32 %78, i32 %76
%80 = load float, float addrspace(3)* %79, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !17
br label %81
81: ; preds = %81, %.preheader4
%82 = phi i32 [ %91, %81 ], [ 0, %.preheader4 ]
%83 = shl i32 %82, 2
%84 = add i32 %83, %74
%85 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]] addrspace(3)* @myGEMM4.Bsub, i32 0, i32 %84, i32 %78
%86 = load float, float addrspace(3)* %85, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !17
%87 = fmul float %80, %86
%88 = getelementptr inbounds [25 x float], [25 x float]* %.acc.pocl_context, i32 %77, i32 %82
%89 = load float, float* %88, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !17
%90 = fadd float %89, %87
store float %90, float* %88, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !17
%91 = add nuw nsw i32 %82, 1
%92 = icmp ult i32 %91, 25
br i1 %92, label %81, label %93
93: ; preds = %81
%94 = add nuw nsw i32 %78, 1
%95 = icmp ult i32 %94, 100
br i1 %95, label %.preheader4, label %pregion_for_cond25
pregion_for_cond25: ; preds = %93
%96 = add i32 %76, 1
%97 = icmp slt i32 %96, %5
br i1 %97, label %pregion_for_entry..postbarrier, label %pregion_for_cond28, !llvm.loop !18
pregion_for_cond28: ; preds = %pregion_for_cond25
%98 = add i32 %74, 1
%99 = icmp slt i32 %98, %6
br i1 %99, label %pregion_for_entry.pregion_for_init23, label %pregion_for_init20, !llvm.loop !19
pregion_for_init20: ; preds = %pregion_for_cond28
tail call void @barrier(i32 1) #6
%100 = add nuw nsw i32 %47, 1
br label %pregion_for_entry.pregion_for_init17
pregion_for_entry.pregion_for_init17: ; preds = %pregion_for_cond22, %pregion_for_init20
%101 = phi i32 [ 0, %pregion_for_init20 ], [ %108, %pregion_for_cond22 ]
%102 = mul i32 %101, %5
br label %pregion_for_cond19
pregion_for_cond19: ; preds = %pregion_for_cond19, %pregion_for_entry.pregion_for_init17
%103 = phi i32 [ 0, %pregion_for_entry.pregion_for_init17 ], [ %106, %pregion_for_cond19 ]
%104 = add i32 %103, %102
%105 = getelementptr i32, i32* %.t.010.ex_phi.pocl_context, i32 %104
store i32 %100, i32* %105, align 4, !llvm.mem.parallel_loop_access !20
%106 = add i32 %103, 1
%107 = icmp slt i32 %106, %5
br i1 %107, label %pregion_for_cond19, label %pregion_for_cond22, !llvm.loop !21
pregion_for_cond22: ; preds = %pregion_for_cond19
%108 = add i32 %101, 1
%109 = icmp slt i32 %108, %6
br i1 %109, label %pregion_for_entry.pregion_for_init17, label %.brexitbarrier.latchbarrier.postbarrier, !llvm.loop !22
.brexitbarrier.latchbarrier.postbarrier: ; preds = %pregion_for_cond22
%110 = icmp ult i32 %100, %26
call void @barrier(i32 0) #5
br i1 %110, label %pregion_for_entry.pregion_for_init29.backedge, label %pregion_for_entry.pregion_for_init.preheader
pregion_for_entry.pregion_for_init.preheader: ; preds = %.brexitbarrier.latchbarrier.postbarrier
br label %pregion_for_entry.pregion_for_init
pregion_for_entry.pregion_for_init: ; preds = %pregion_for_cond16, %pregion_for_entry.pregion_for_init.preheader
%111 = phi i32 [ %164, %pregion_for_cond16 ], [ 0, %pregion_for_entry.pregion_for_init.preheader ]
%112 = mul i32 %111, %5
%113 = add i32 %25, %111
br label %.preheader.preheader.btr
.preheader: ; preds = %.preheader, %.preheader.preheader
%114 = phi i32 [ %122, %.preheader ], [ 0, %.preheader.preheader ]
%115 = getelementptr inbounds [25 x float], [25 x float]* %.acc.pocl_context, i32 0, i32 %114
%116 = load float, float* %115, align 4, !tbaa !15
%117 = shl i32 %114, 2
%118 = add i32 %117, %25
%119 = mul i32 %118, %M
%120 = add i32 %119, %20
%121 = getelementptr inbounds float, float addrspace(1)* %C, i32 %120
store float %116, float addrspace(1)* %121, align 4, !tbaa !15
%122 = add nuw nsw i32 %114, 1
%123 = icmp ult i32 %122, 25
br i1 %123, label %.preheader, label %pregion_for_init45
.preheader.peeled_wi: ; preds = %.preheader.preheader.peeled_wi, %.preheader.peeled_wi
%124 = phi i32 [ 0, %.preheader.preheader.peeled_wi ], [ %132, %.preheader.peeled_wi ]
%125 = getelementptr inbounds [25 x float], [25 x float]* %.acc.pocl_context, i32 %135, i32 %124
%126 = load float, float* %125, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !23
%127 = shl i32 %124, 2
%128 = add i32 %127, %140
%129 = mul i32 %128, %M
%130 = add i32 %129, %136
%131 = getelementptr inbounds float, float addrspace(1)* %C, i32 %130
store float %126, float addrspace(1)* %131, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !23
%132 = add nuw nsw i32 %124, 1
%133 = icmp ult i32 %132, 25
br i1 %133, label %.preheader.peeled_wi, label %pregion_for_cond44
.preheader.preheader.peeled_wi: ; preds = %pregion_for_cond44, %pregion_for_entry.pregion_for_init42
%134 = phi i32 [ %138, %pregion_for_entry.pregion_for_init42 ], [ %143, %pregion_for_cond44 ]
call void @llvm.lifetime.start.p0i8(i64 100, i8* %28) #5, !llvm.mem.parallel_loop_access !23
call void @llvm.memset.p0i8.i32(i8* align 64 %28, i8 0, i32 100, i1 false)
%135 = add i32 %134, %139
%136 = add i32 %20, %134
br label %.preheader.peeled_wi
pregion_for_init45: ; preds = %.preheader
call void @llvm.lifetime.end.p0i8(i64 100, i8* %28) #5
br label %pregion_for_entry.pregion_for_init42
pregion_for_entry.pregion_for_init42: ; preds = %pregion_for_cond47, %pregion_for_init45
%137 = phi i32 [ 0, %pregion_for_init45 ], [ %145, %pregion_for_cond47 ]
%138 = phi i32 [ 1, %pregion_for_init45 ], [ 0, %pregion_for_cond47 ]
%139 = mul i32 %137, %5
%140 = add i32 %25, %137
br label %.preheader.preheader.peeled_wi
pregion_for_cond44: ; preds = %.preheader.peeled_wi
%141 = getelementptr [25 x float], [25 x float]* %.acc.pocl_context, i32 %135
%142 = bitcast [25 x float]* %141 to i8*
call void @llvm.lifetime.end.p0i8(i64 100, i8* %142) #5, !llvm.mem.parallel_loop_access !23
%143 = add i32 %134, 1
%144 = icmp slt i32 %143, %5
br i1 %144, label %.preheader.preheader.peeled_wi, label %pregion_for_cond47, !llvm.loop !24
pregion_for_cond47: ; preds = %pregion_for_cond44
%145 = add i32 %137, 1
%146 = icmp slt i32 %145, %6
br i1 %146, label %pregion_for_entry.pregion_for_init42, label %exit.barrier, !llvm.loop !25
exit.barrier: ; preds = %pregion_for_cond47
call void @barrier(i32 0) #5
br label %UnifiedReturnBlock
.preheader.preheader.btr: ; preds = %pregion_for_cond, %pregion_for_entry.pregion_for_init
%147 = phi i32 [ 0, %pregion_for_entry.pregion_for_init ], [ %162, %pregion_for_cond ]
%148 = add i32 %147, %112
%149 = add i32 %20, %147
br label %.preheader.btr
.preheader.btr: ; preds = %.preheader.btr, %.preheader.preheader.btr
%150 = phi i32 [ %158, %.preheader.btr ], [ 0, %.preheader.preheader.btr ]
%151 = getelementptr inbounds [25 x float], [25 x float]* %.acc.pocl_context, i32 %148, i32 %150
%152 = load float, float* %151, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !26
%153 = shl i32 %150, 2
%154 = add i32 %153, %113
%155 = mul i32 %154, %M
%156 = add i32 %155, %149
%157 = getelementptr inbounds float, float addrspace(1)* %C, i32 %156
store float %152, float addrspace(1)* %157, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !26
%158 = add nuw nsw i32 %150, 1
%159 = icmp ult i32 %158, 25
br i1 %159, label %.preheader.btr, label %pregion_for_cond
pregion_for_cond: ; preds = %.preheader.btr
%160 = getelementptr [25 x float], [25 x float]* %.acc.pocl_context, i32 %148
%161 = bitcast [25 x float]* %160 to i8*
call void @llvm.lifetime.end.p0i8(i64 100, i8* %161) #5, !llvm.mem.parallel_loop_access !26
%162 = add i32 %147, 1
%163 = icmp slt i32 %162, %5
br i1 %163, label %.preheader.preheader.btr, label %pregion_for_cond16, !llvm.loop !27
pregion_for_cond16: ; preds = %pregion_for_cond
%164 = add i32 %111, 1
%165 = icmp slt i32 %164, %6
br i1 %165, label %pregion_for_entry.pregion_for_init, label %exit.barrier14, !llvm.loop !28
exit.barrier14: ; preds = %pregion_for_cond16
call void @barrier(i32 0) #5
br label %UnifiedReturnBlock
UnifiedReturnBlock: ; preds = %exit.barrier14, %exit.barrier
ret void
}
; Function Attrs: noinline nounwind
define void @transpose(i32 %P, i32 %Q, float addrspace(1)* nocapture readonly %input, float addrspace(1)* nocapture %output) #2 {
pregion_for_init6:
%0 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 4), align 4
%1 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 5), align 4
call void @barrier(i32 0) #5
%2 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 10), align 4, !tbaa !5
%3 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 7), align 4, !tbaa !5
%4 = sub i32 %2, %3
%5 = udiv i32 %4, %0
%6 = shl i32 %5, 3
%7 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 11), align 4, !tbaa !5
%8 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 8), align 4, !tbaa !5
%9 = sub i32 %7, %8
%10 = udiv i32 %9, %1
%11 = shl i32 %10, 3
br label %pregion_for_entry.pregion_for_init3
pregion_for_entry.pregion_for_init3: ; preds = %pregion_for_cond8, %pregion_for_init6
%12 = phi i32 [ 0, %pregion_for_init6 ], [ %23, %pregion_for_cond8 ]
%13 = add i32 %11, %12
%14 = mul i32 %13, %P
br label %pregion_for_cond5
pregion_for_cond5: ; preds = %pregion_for_cond5, %pregion_for_entry.pregion_for_init3
%15 = phi i32 [ 0, %pregion_for_entry.pregion_for_init3 ], [ %21, %pregion_for_cond5 ]
%16 = add i32 %6, %15
%17 = add i32 %16, %14
%18 = getelementptr inbounds float, float addrspace(1)* %input, i32 %17
%19 = load float, float addrspace(1)* %18, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !29
%20 = getelementptr inbounds [8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 %12, i32 %15
store float %19, float addrspace(3)* %20, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !29
%21 = add i32 %15, 1
%22 = icmp slt i32 %21, %0
br i1 %22, label %pregion_for_cond5, label %pregion_for_cond8, !llvm.loop !30
pregion_for_cond8: ; preds = %pregion_for_cond5
%23 = add i32 %12, 1
%24 = icmp slt i32 %23, %1
br i1 %24, label %pregion_for_entry.pregion_for_init3, label %pregion_for_init, !llvm.loop !31
pregion_for_init: ; preds = %pregion_for_cond8
tail call void @barrier(i32 1) #6
br label %pregion_for_entry.pregion_for_init
pregion_for_entry.pregion_for_init: ; preds = %pregion_for_cond2, %pregion_for_init
%25 = phi i32 [ 0, %pregion_for_init ], [ %36, %pregion_for_cond2 ]
%26 = add i32 %6, %25
%27 = mul i32 %26, %Q
br label %pregion_for_cond
pregion_for_cond: ; preds = %pregion_for_cond, %pregion_for_entry.pregion_for_init
%28 = phi i32 [ 0, %pregion_for_entry.pregion_for_init ], [ %34, %pregion_for_cond ]
%29 = add i32 %11, %28
%30 = getelementptr inbounds [8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 %28, i32 %25
%31 = load float, float addrspace(3)* %30, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !32
%32 = add i32 %29, %27
%33 = getelementptr inbounds float, float addrspace(1)* %output, i32 %32
store float %31, float addrspace(1)* %33, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !32
%34 = add i32 %28, 1
%35 = icmp slt i32 %34, %0
br i1 %35, label %pregion_for_cond, label %pregion_for_cond2, !llvm.loop !33
pregion_for_cond2: ; preds = %pregion_for_cond
%36 = add i32 %25, 1
%37 = icmp slt i32 %36, %1
br i1 %37, label %pregion_for_entry.pregion_for_init, label %exit.barrier, !llvm.loop !34
exit.barrier: ; preds = %pregion_for_cond2
call void @barrier(i32 0) #5
ret void
}
declare i32 @__core_num()
; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3
; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #3
; Function Attrs: argmemonly nounwind willreturn writeonly
declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1 immarg) #4
attributes #0 = { noduplicate "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { noinline nounwind "_kernel_local_size"="80000" "_wi_alloca_size"="108" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { noinline nounwind "_kernel_local_size"="256" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { argmemonly nounwind willreturn }
attributes #4 = { argmemonly nounwind willreturn writeonly }
attributes #5 = { nounwind }
attributes #6 = { noduplicate nounwind }
!llvm.ident = !{!0}
!ocl.restrict = !{!1}
!opencl.kernels = !{!2, !3}
!0 = !{!"clang version 3.6.0 (git://git.ti.com/opencl/clang.git 5b006f07bdc22b5ae6917eecdfe243908dd7b029) (git://git.ti.com/opencl/llvm.git 09780c6750b30da81e4a0a805aedf1699fbc37c7)"}
!1 = distinct !{!1}
!2 = !{void (float addrspace(1)*, float addrspace(1)*, float addrspace(1)*, i32, i32, i32)* @myGEMM4}
!3 = !{void (i32, i32, float addrspace(1)*, float addrspace(1)*)* @transpose}
!4 = distinct !{!4}
!5 = !{!6, !6, i64 0}
!6 = !{!"int", !7, i64 0}
!7 = !{!"omnipotent char", !8, i64 0}
!8 = !{!"Simple C/C++ TBAA"}
!9 = !{!10, !11}
!10 = distinct !{!10}
!11 = distinct !{!11}
!12 = !{!13, !14}
!13 = distinct !{!13}
!14 = distinct !{!14}
!15 = !{!16, !16, i64 0}
!16 = !{!"float", !7, i64 0}
!17 = !{!18, !19}
!18 = distinct !{!18}
!19 = distinct !{!19}
!20 = !{!21, !22}
!21 = distinct !{!21}
!22 = distinct !{!22}
!23 = !{!24, !25}
!24 = distinct !{!24}
!25 = distinct !{!25}
!26 = !{!27, !28}
!27 = distinct !{!27}
!28 = distinct !{!28}
!29 = !{!30, !31}
!30 = distinct !{!30}
!31 = distinct !{!31}
!32 = !{!33, !34}
!33 = distinct !{!33}
!34 = distinct !{!34}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment