Skip to content

Instantly share code, notes, and snippets.

@navyxliu
Created January 23, 2015 16:13
Show Gist options
  • Save navyxliu/b72419edbea4e349cd54 to your computer and use it in GitHub Desktop.
Save navyxliu/b72419edbea4e349cd54 to your computer and use it in GitHub Desktop.
bug018
#include <amp.h>
#include <stdio.h>
#include <stdlib.h>
#include <vector>
using namespace Concurrency;
// Minimal reproducer for a GPU miscompilation (gist "bug018").
//
// Fills a local array with 1..10, builds an extent from it, and then returns
// the last value assigned to correct_size by a two-iteration loop over
// data1[0] and data1[1]. The loop's final assignment is data1[1], so the
// expected return value is 2, the same as the commented-out direct load.
//
// _type must expose a static `rank` member and be constructible from
// extent<rank>; both are used below.
template <typename _type>
int test_debug() __GPU {
const int rank = _type::rank;
int data1[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
int correct_size = 1;
// e1 and g1 are constructed but never read afterwards; they are kept because
// they are part of the reproducer.
extent<rank> e1(data1);
const _type g1(e1);
// bad: the loop form; the author reports this returns 0 on the GPU target
// instead of 2.
for (int i = 0; i < 2; i++)
correct_size = data1[i];
// okay: the equivalent direct load, which the author reports returns 2.
// correct_size = data1[1];
return correct_size;
}
// Runs test_debug<extent<4>>() once inside an amp-restricted kernel and
// prints the value the kernel wrote back.
int main()
{
// Deliberately left uninitialized: the only write to it is the kernel's
// store through gpu_resultv below.
int gpu_result;
// One-element view over gpu_result.
concurrency::array_view<int, 1> gpu_resultv(1, &gpu_result);
// NOTE(review): per the C++ AMP docs, discard_data() marks the current
// contents as not needing to be copied to the accelerator - confirm.
gpu_resultv.discard_data();
// Launch over the view's extent. The inner lambda is invoked immediately
// and its return value is stored into the view element at idx.
concurrency::parallel_for_each(gpu_resultv.get_extent()
, [=](concurrency::index<1> idx) restrict(amp) {
gpu_resultv[idx] = []() restrict(amp,cpu)->int{
return test_debug<extent<4>>();
}();
});
// NOTE(review): synchronize() is expected to copy the result back into
// gpu_result before it is printed - confirm against the array_view docs.
gpu_resultv.synchronize();
printf("%d", gpu_result);
return 0;
}
@navyxliu
Copy link
Author

pathamp -device=tahiti xxx.cc

These two blocks should give the same answer, but they do not on my machine:
we get correct_size == 0 with the loop, and 2 with the direct load.

//bad
for (int i = 0; i < 2; i++)
correct_size = data1[i];
//okay
// correct_size = data1[1];

@navyxliu
Copy link
Author

A simpler reproducer:
llc -march=r600 -mcpu=tahiti -filetype=asm xxx.ll

; ====== CG2 GPU IR After optimizations ======
; ModuleID = 'simple_bug018.cc_gpu'
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
target triple = "r600--"

%struct.index = type <{ i32 }>

; Kernel entry point. The mangled name appears to be the operator() of the
; first lambda in main() taking Concurrency::index<1>, with a ".amp" suffix.
; test_debug has been inlined: the alloca below carries its data1 name.
;
; Arguments:
;   %this_struct.s0 - 48-byte byval blob in addrspace(2); BB2 reads an i64 at
;                     byte offset 24 and an i32 at byte offset 12 from it.
;   %idx            - pointer to the index struct in addrspace(1); not used
;                     in the body.
define void @"_ZZ4mainENK3$_0clEN11Concurrency5indexILi1EEE.amp"([48 x i8] addrspace(2)* byval %this_struct.s0, %struct.index addrspace(1)* %idx) #0 {
BB:
; data1 lives in an alloca (default address space); elements 0..9 are
; initialized to 1..10 by the ten stores below.
%_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr = alloca [10 x i32], align 16
%0 = getelementptr inbounds [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr, i32 0, i32 0
store i32 1, i32* %0, align 16
%1 = getelementptr [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr, i32 0, i32 1
store i32 2, i32* %1, align 4
%2 = getelementptr [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr, i32 0, i32 2
store i32 3, i32* %2, align 8
%3 = getelementptr [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr, i32 0, i32 3
store i32 4, i32* %3, align 4
%4 = getelementptr [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr, i32 0, i32 4
store i32 5, i32* %4, align 16
%5 = getelementptr [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr, i32 0, i32 5
store i32 6, i32* %5, align 4
%6 = getelementptr [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr, i32 0, i32 6
store i32 7, i32* %6, align 8
%7 = getelementptr [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr, i32 0, i32 7
store i32 8, i32* %7, align 4
%8 = getelementptr [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr, i32 0, i32 8
store i32 9, i32* %8, align 16
%9 = getelementptr [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr, i32 0, i32 9
store i32 10, i32* %9, align 4
; The source loop has been rewritten as a pointer walk over integer
; addresses: %11 = address of data1[0], %14 = address of data1[1], both
; zero-extended from i32 to i64.
%10 = ptrtoint [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr to i32
%11 = zext i32 %10 to i64
%12 = getelementptr [10 x i32]* %_ZZ10test_debugIN11Concurrency6extentILi4EEEEivE5data1.addr, i32 0, i32 1
%13 = ptrtoint i32* %12 to i32
%14 = zext i32 %13 to i64
br label %BB_label_2562

; Loop: the phi starts at &data1[0] and advances by 4 bytes per iteration.
; It exits once the advanced address %15 exceeds &data1[1] (signed compare),
; so on exit the phi value %preg.61.addr.0 equals &data1[1].
BB_label_2562: ; preds = %BB_label_2562, %BB
%preg.61.addr.0 = phi i64 [ %11, %BB ], [ %15, %BB_label_2562 ]
%15 = add i64 %preg.61.addr.0, 4
%16 = icmp sgt i64 %15, %14
br i1 %16, label %BB2, label %BB_label_2562

BB2: ; preds = %BB_label_2562
; The address of data1[1] came from the alloca above, but here it is cast to
; an addrspace(1) pointer and loaded through that address space.
; NOTE(review): presumably this address-space mismatch is what makes the
; backend emit the faulty buffer_load_dword - confirm.
%17 = inttoptr i64 %preg.61.addr.0 to i32 addrspace(1)*
%18 = load i32 addrspace(1)* %17, align 4
; NOTE(review): %21 (i64 loaded from offset 24) and %27 (%26 shifted left
; by 2) are computed but never used; the store address is built from %26
; alone. Possibly an artifact of how this IR was reduced - confirm.
%19 = getelementptr [48 x i8] addrspace(2)* %this_struct.s0, i64 0, i64 24
%20 = bitcast i8 addrspace(2)* %19 to i64 addrspace(2)*
%21 = load i64 addrspace(2)* %20, align 8
%22 = getelementptr [48 x i8] addrspace(2)* %this_struct.s0, i64 0, i64 12
%23 = bitcast i8 addrspace(2)* %22 to i32 addrspace(2)*
%24 = load i32 addrspace(2)* %23, align 4
%25 = add i32 %24, 1
%26 = sext i32 %25 to i64
%27 = shl nsw i64 %26, 2
%28 = inttoptr i64 %26 to i32 addrspace(1)*
; Write the value loaded from data1[1] out through an addrspace(1) pointer.
store i32 %18, i32 addrspace(1)* %28, align 4
ret void
}

; Device intrinsic declarations. None of the three is called by the kernel
; function defined in this module.
; Function Attrs: nounwind readnone
declare i32 @llvm.device.thread.id.x() #1

; Function Attrs: nounwind readnone
declare i32 @llvm.device.block.id.x() #1

; Function Attrs: nounwind readnone
declare i32 @llvm.device.block.size.x() #1

; Attribute group #0 is applied to the kernel definition; #1 to the intrinsic
; declarations above.
attributes #0 = { "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
attributes #1 = { nounwind readnone }

@navyxliu
Copy link
Author

navyxliu commented Feb 7, 2015

I see the same issue with both PathScale's branch and LLVM SVN trunk. I am targeting Southern Islands processors.

The problematic instruction is the buffer_load_dword in BB#2. You can reproduce it using 'llc -march=r600 -mcpu=tahiti --filetype=asm bug018.ll -o bug018.s'. The sbase s[8:11] is wrong because s[8:9] is set to 0; it should point to the scratch buffer, which holds the values of data1.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment