-
-
Save navyxliu/cc624857eb54f13db2fb to your computer and use it in GitHub Desktop.
// adapted from bug011
#include <cstdio>
#include <vector>

#include <amp.h>

using namespace Concurrency;
// Device-side check: builds a rank-4 extent from a local int array and
// verifies that each component reads back the value it was constructed with.
//
// Returns 0 when all four components match, or 12 on the first mismatch.
//
// NOTE: this function is the reproducer for the compiler issue discussed
// below the listing, so the statement shapes (local array initializer, extent
// constructed from it, indexed reads inside a loop) are intentionally left
// exactly as they were.
int test() restrict(amp)
{
    int data0[] = {1, 2, 3, 4};
    extent<4> e4(data0);
    for (int i = 0; i < 4; i++)
    {
        // Component i was initialized from data0[i], which is i + 1.
        if (e4[i] != i + 1)
        {
            return 12;
        }
    }
    return 0;
}
// Per-thread kernel body: runs test() and stores its status code into the
// element of `result` selected by `idx`.
//
// idx    - index of the element to write.
// result - output array receiving one status code per index.
void kernel(index<1>& idx, array<int, 1>& result) restrict(amp)
{
    result[idx] = test();
}
// Number of elements in the 1-D compute extent and in the result buffers
// used by test_device().
const int size = 4;
int test_device() | |
{ | |
accelerator device;// = require_device(Device::ALL_DEVICES); | |
accelerator_view av = device.get_default_view(); | |
extent<1> e(size); | |
array<int, 1> result(e, av); | |
std::vector<int> presult(size, 0); | |
parallel_for_each(e, [&](index<1> idx) restrict(amp) { | |
kernel(idx, result); | |
}); | |
presult = result; | |
for (int i = 0; i < size; i++) | |
{ | |
if (presult[i] != 0) | |
{ | |
printf("Test failed. Return code: %d\n", presult[i]); | |
return 1; | |
} | |
} | |
return 0; | |
} | |
int main(int argc, char **argv) | |
{ | |
int result = test_device(); | |
printf("Test %s on device\n", ((result == 0) ? "passed" : "failed")); | |
return result; | |
} |
Here is part of the LLVM IR for the function test. Please note that %9 uses addrspace(1), which is suspicious: it is inconsistent with the initializer stores just before it. Should e4 be in local memory or global memory here?
define void @"_ZZ11test_devicevENK3$_0clEN11Concurrency5indexILi1EEE.amp"([8 x i8] addrspace(2)* byval %this_struct.s0, %struct.index addrspace(1)* %idx) #0 {
BB:
%_ZZ4testvE2e4.addr = alloca %struct.extent, align 8
%0 = getelementptr inbounds %struct.extent* %_ZZ4testvE2e4.addr, i32 0, i32 0, i32 0
store i32 1, i32* %0, align 8
%1 = getelementptr %struct.extent* %_ZZ4testvE2e4.addr, i32 0, i32 0, i32 1
store i32 2, i32* %1, align 4
%2 = getelementptr %struct.extent* %_ZZ4testvE2e4.addr, i32 0, i32 0, i32 2
store i32 3, i32* %2, align 8
%3 = getelementptr %struct.extent* %_ZZ4testvE2e4.addr, i32 0, i32 0, i32 3
store i32 4, i32* %3, align 4
br label %BB_label_3842
BB_label_3842: ; preds = %BB_label_4098, %BB
%preg.50.addr.0 = phi i32 [ 1, %BB ], [ %13, %BB_label_4098 ]
%preg.49.addr.0 = phi i32 [ 0, %BB ], [ %12, %BB_label_4098 ]
%4 = ptrtoint %struct.extent* %_ZZ4testvE2e4.addr to i32
%5 = zext i32 %4 to i64
%6 = zext i32 %preg.49.addr.0 to i64
%7 = shl nuw nsw i64 %6, 2
%8 = add i64 %5, %7
%9 = inttoptr i64 %8 to i32 addrspace(1)*
%10 = load i32 addrspace(1)* %9, align 4
%11 = icmp eq i32 %10, %preg.50.addr.0
br i1 %11, label %BB_label_4098, label %BB2
vadimg: Yes, that's wrong. It shouldn't be converted to an address space 1 (global) pointer. The emitter does that because we have no information about address spaces in WHIRL, so by default it assumes a global pointer.
Running `pathamp bug011.cc -S` will dump the GPU assembly.
{1, 2, 3, 4} are initialized in LDS, with instructions like these:
V_MOV_B32_e32 v3, 1
DS_WRITE_B32 v1, v3, 0x0 [M0]
V_OR_B32_e32 v4, 4, v1
V_MOV_B32_e32 v5, 2
DS_WRITE_B32 v4, v5, 0x0 [M0]
On the other hand, does e4[i] load its data from global memory?
I observed "BUFFER_LOAD_DWORD v8, s[4:7] + v[8:9] + 0x0".