-
-
Save navyxliu/b72419edbea4e349cd54 to your computer and use it in GitHub Desktop.
#include <amp.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <vector> | |
using namespace Concurrency; | |
template <typename _type> | |
int test_debug() __GPU { | |
const int rank = _type::rank; | |
int data1[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; | |
int correct_size = 1; | |
extent<rank> e1(data1); | |
const _type g1(e1); | |
//bad | |
for (int i = 0; i < 2; i++) | |
correct_size = data1[i]; | |
//okay | |
// correct_size = data1[1]; | |
return correct_size; | |
} | |
int main() | |
{ | |
int gpu_result; | |
concurrency::array_view<int, 1> gpu_resultv(1, &gpu_result); | |
gpu_resultv.discard_data(); | |
concurrency::parallel_for_each(gpu_resultv.get_extent() | |
, [=](concurrency::index<1> idx) restrict(amp) { | |
gpu_resultv[idx] = []() restrict(amp,cpu)->int{ | |
return test_debug<extent<4>>(); | |
}(); | |
}); | |
gpu_resultv.synchronize(); | |
printf("%d", gpu_result); | |
return 0; | |
} | |
simple one.
llc -march=r600 -mcpu=tahiti -filetype=asm xxx.ll
; ====== CG2 GPU IR After optimizations ======
; ModuleID = 'simple_bug018.cc_gpu'
; Data layout: "e" = little-endian. "p:32:32" makes pointers in the default
; address space 32 bits wide, while "p1:64:64" makes addrspace(1) pointers
; 64 bits wide; addrspace(2) is likewise 64-bit ("p2:64:64").
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
target triple = "r600--"
; Packed struct holding a single i32; used as the pointee type of %idx below.
%struct.index = type <{ i32 }>
; Kernel generated for the lambda's operator() in main (per the mangled name).
; Parameters:
;   %this_struct.s0 - 48-byte byval blob in addrspace(2); read below at byte
;                     offsets 24 (i64) and 12 (i32).
;   %idx            - pointer to %struct.index in addrspace(1); not referenced
;                     anywhere in the body.
define void @"_ZZ4mainENK3$_0clEN11Concurrency5indexILi1EEE.amp"([48 x i8] addrspace(2)* byval %this_struct.s0, %struct.index addrspace(1)* %idx) #0 {
BB:
; data1 is materialised as a 10 x i32 stack slot (default address space) and
; filled with the constants 1..10, one store per element.
%_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr = alloca [10 x i32], align 16
%0 = getelementptr inbounds [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr, i32 0, i32 0
store i32 1, i32* %0, align 16
%1 = getelementptr [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr, i32 0, i32 1
store i32 2, i32* %1, align 4
%2 = getelementptr [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr, i32 0, i32 2
store i32 3, i32* %2, align 8
%3 = getelementptr [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr, i32 0, i32 3
store i32 4, i32* %3, align 4
%4 = getelementptr [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr, i32 0, i32 4
store i32 5, i32* %4, align 16
%5 = getelementptr [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr, i32 0, i32 5
store i32 6, i32* %5, align 4
%6 = getelementptr [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr, i32 0, i32 6
store i32 7, i32* %6, align 8
%7 = getelementptr [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr, i32 0, i32 7
store i32 8, i32* %7, align 4
%8 = getelementptr [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr, i32 0, i32 8
store i32 9, i32* %8, align 16
%9 = getelementptr [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr, i32 0, i32 9
store i32 10, i32* %9, align 4
; Integer forms of &data1[0] (%11) and &data1[1] (%14): each 32-bit address is
; zero-extended to i64.
%10 = ptrtoint [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr to i32
%11 = zext i32 %10 to i64
%12 = getelementptr [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr, i32 0, i32 1
%13 = ptrtoint i32* %12 to i32
%14 = zext i32 %13 to i64
br label %BB_label_2562
; Address-stepping loop: the phi starts at &data1[0] and advances by 4 bytes
; per trip. It exits once the *next* address (%15) is greater than &data1[1],
; so on exit %preg.61.addr.0 holds &data1[1].
BB_label_2562: ; preds = %BB_label_2562, %BB
%preg.61.addr.0 = phi i64 [ %11, %BB ], [ %15, %BB_label_2562 ]
%15 = add i64 %preg.61.addr.0, 4
%16 = icmp sgt i64 %15, %14
br i1 %16, label %BB2, label %BB_label_2562
BB2: ; preds = %BB_label_2562
; The integer derived from the stack slot is converted to an addrspace(1)
; pointer and the value is loaded through it.
; NOTE(review): the alloca above is in the default address space, so this
; load goes through a different address space than the stores that filled
; data1 -- this looks like the load the bug report refers to; confirm.
%17 = inttoptr i64 %preg.61.addr.0 to i32 addrspace(1)*
%18 = load i32 addrspace(1)* %17, align 4
; i64 read from byte offset 24 of the byval struct; %21 is not used afterwards.
%19 = getelementptr [48 x i8] addrspace(2)* %this_struct.s0, i64 0, i64 24
%20 = bitcast i8 addrspace(2)* %19 to i64 addrspace(2)*
%21 = load i64 addrspace(2)* %20, align 8
; i32 read from byte offset 12 of the byval struct, incremented by one and
; sign-extended to i64.
%22 = getelementptr [48 x i8] addrspace(2)* %this_struct.s0, i64 0, i64 12
%23 = bitcast i8 addrspace(2)* %22 to i32 addrspace(2)*
%24 = load i32 addrspace(2)* %23, align 4
%25 = add i32 %24, 1
%26 = sext i32 %25 to i64
; %27 (the value shifted left by 2) is computed but never used: the store
; address below is built from the unshifted %26 alone.
; NOTE(review): presumably the destination was meant to combine %21 and %27
; (base + scaled index) -- confirm against the front end's output.
%27 = shl nsw i64 %26, 2
%28 = inttoptr i64 %26 to i32 addrspace(1)*
store i32 %18, i32 addrspace(1)* %28, align 4
ret void
}
; Declarations of three i32-returning, zero-argument functions in the
; llvm.device.* namespace. None of them is called by the kernel body shown in
; this dump.
; Function Attrs: nounwind readnone
declare i32 @llvm.device.thread.id.x() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.device.block.id.x() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.device.block.size.x() #1
; Attribute group #0 is attached to the kernel definition; #1 to the three
; declarations above.
attributes #0 = { "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
attributes #1 = { nounwind readnone }
I see the same issue with both PathScale's branch and LLVM SVN trunk. I am targeting Southern Islands processors.
The problematic instruction is the buffer_load_dword in BB#2. You can reproduce it with 'llc -march=r600 -mcpu=tahiti --filetype=asm bug018.ll -o bug018.s'. The sbase s[8:11] is wrong because s[8:9] is set to 0; it should point to the scratch buffer, which is where the values of data1 are stored.
pathamp -device=tahiti xxx.cc
These two blocks should give the same answer, but they do not on my machine.
We get correct_size == 0 for the loop and 2 for the direct load.
//bad
for (int i = 0; i < 2; i++)
correct_size = data1[i];
//okay
// correct_size = data1[1];