Last active
April 27, 2017 11:35
-
-
Save maleadt/67b86455a8b308ebe5b5caba2572077b to your computer and use it in GitHub Desktop.
CUDAnative.jl/#4 repro
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" | |
target triple = "nvptx64-nvidia-cuda" | |
@shmem = internal addrspace(3) global [4 x i32] zeroinitializer, align 4 | |
define void @kernel(i32*) { | |
top: | |
%1 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() | |
%2 = and i32 %1, 31 | |
%3 = icmp eq i32 %2, 0 | |
br i1 %3, label %lane0_boundscheck, label %sync_shfl | |
lane0_boundscheck: | |
%4 = icmp ugt i32 %1, 3 | |
br i1 %4, label %lane0_oob, label %lane0_shmem | |
sync_shfl: | |
tail call void @llvm.nvvm.barrier0() | |
%5 = tail call i32 @llvm.nvvm.shfl.down.i32(i32 32, i32 1, i32 7199) | |
br i1 %3, label %lane0_writeback, label %end | |
lane0_oob: | |
tail call void @llvm.trap() | |
unreachable | |
lane0_shmem: | |
%6 = getelementptr [4 x i32], [4 x i32] addrspace(3)* @shmem, i32 0, i32 %1 | |
store i32 0, i32 addrspace(3)* %6, align 8 | |
br label %sync_shfl | |
lane0_writeback: | |
store i32 %5, i32* %0, align 8 | |
br label %end | |
end: | |
ret void | |
} | |
declare void @llvm.trap() | |
declare i32 @llvm.nvvm.shfl.down.i32(i32, i32, i32) | |
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() | |
declare void @llvm.nvvm.barrier0() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.version 3.2 | |
.target sm_30 | |
.address_size 64 | |
.visible .entry kernel( | |
.param .u64 output // single int output | |
) | |
{ | |
.reg .pred %p<4>; | |
.reg .b32 %r<6>; | |
.reg .b64 %rd<6>; | |
.shared .align 4 .b8 shmem[16]; // 4 integers | |
ld.param.u64 %rd1, [output]; | |
// calculate lane, check if 0 | |
mov.u32 %r1, %tid.x; | |
and.b32 %r2, %r1, 31; | |
setp.ne.s32 %p1, %r2, 0; | |
@%p1 bra BB_SHFL; | |
// bounds check for shmem access | |
setp.gt.u32 %p2, %r1, 3; | |
@%p2 bra BB_OOB; | |
bra.uni BB_SHMEM; | |
BB_SHMEM: | |
mul.wide.s32 %rd2, %r1, 4; | |
mov.u64 %rd3, shmem; | |
add.s64 %rd4, %rd3, %rd2; | |
mov.u32 %r4, 0; | |
st.shared.u32 [%rd4], %r4; | |
BB_SHFL: | |
setp.eq.s32 %p3, %r2, 0; | |
bar.sync 0; | |
mov.u32 %r5, 32; | |
shfl.down.b32 %r3, %r5, 1, 7199; | |
@%p3 bra BB_WRITEBACK; | |
bra.uni BB_END; | |
BB_WRITEBACK: | |
cvta.to.global.u64 %rd5, %rd1; | |
st.global.u32 [%rd5], %r3; | |
BB_END: | |
ret; | |
BB_OOB: | |
trap; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
code for sm_35 | |
Function : kernel | |
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)" | |
/* 0x08b010a010a010bc */ | |
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */ | |
/*0010*/ S2R R0, SR_TID.X; /* 0x86400000109c0002 */ | |
/*0018*/ MOV R4, c[0x0][0x140]; /* 0x64c03c00281c0012 */ | |
/*0020*/ LOP.AND R2, R0, 0x1f; /* 0xc20000000f9c0009 */ | |
/*0028*/ SSY 0x70; /* 0x1480000020000000 */ | |
/*0030*/ ISETP.NE.AND P0, PT, R2, RZ, PT; /* 0xdb581c007f9c081e */ | |
/*0038*/ MOV R5, c[0x0][0x144]; /* 0x64c03c00289c0016 */ | |
/* 0x08c48000a000b088 */ | |
/*0048*/ @P0 NOP.S; /* 0x8580000000403c02 */ | |
/*0050*/ ISETP.GT.U32.AND P0, PT, R0, 0x3, PT; /* 0xb3401c00019c001d */ | |
/*0058*/ @P0 BRA 0xb0; /* 0x120000002800003c */ | |
/*0060*/ SHF.L.W R0, RZ, 0x2, R0; /* 0xb7e00000011ffc01 */ | |
/*0068*/ STS.S [R0], RZ; /* 0x7ae00000005c03fe */ | |
/*0070*/ ISETP.EQ.AND P0, PT, R2, RZ, PT; /* 0xdb281c007f9c081e */ | |
/*0078*/ BAR.SYNC 0x0; /* 0x8540dc00001c0002 */ | |
/* 0x088000b810a0b810 */ | |
/*0088*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */ | |
/*0090*/ @!P0 EXIT; /* 0x180000000020003c */ | |
/*0098*/ MOV32I R0, 0x20; /* 0x74000000101fc002 */ | |
/*00a0*/ ST.E [R4], R0; /* 0xe4800000001c1000 */ | |
/*00a8*/ BRA 0xb8; /* 0x12000000041c003c */ | |
/*00b0*/ BPT.TRAP 0x1; /* 0x0000000000800300 */ | |
/*00b8*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */ | |
/* 0x08000000000000b8 */ | |
/*00c8*/ EXIT; /* 0x18000000001c003c */ | |
/*00d0*/ BRA 0xd0; /* 0x12007ffffc1c003c */ | |
/*00d8*/ NOP; /* 0x85800000001c3c02 */ | |
/*00e0*/ NOP; /* 0x85800000001c3c02 */ | |
/*00e8*/ NOP; /* 0x85800000001c3c02 */ | |
/*00f0*/ NOP; /* 0x85800000001c3c02 */ | |
/*00f8*/ NOP; /* 0x85800000001c3c02 */ | |
....................... | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using CUDAdrv, CUDAnative, LLVM | |
dev = CuDevice(0) | |
ctx = CuContext(dev) | |
capability(dev) >= v"6.1" || warn("This bug has only been reproduced on sm_61 or higher.") | |
function test(name) | |
ir = readstring("$name.ll") | |
mod = parse(LLVM.Module, ir) | |
entry = get(functions(mod), "kernel") | |
ptx = CUDAnative.mcgen(mod, entry, v"3.0") | |
cuda_mod = CuModule(ptx) | |
cuda_fun = CuFunction(cuda_mod, "kernel") | |
ref = CuArray{Cint}(1) | |
cudacall(cuda_fun, 1, 4, Tuple{Ptr{Cint}}, pointer(ref)) | |
println("$name: ", Array(ref)[1]) | |
if !isfile("$name.ptx") | |
open("$name.ptx", "w") do io | |
write(io, ptx) | |
end | |
end | |
end | |
test("working") | |
test("broken") | |
destroy(ctx) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cuda.h> | |
#include <stdio.h> | |
extern "C" __global__ void kernel(int* ptr) { | |
static __shared__ int shared[4]; | |
int lane = threadIdx.x % warpSize; | |
if (lane==0) | |
shared[threadIdx.x] = 0; | |
__syncthreads(); | |
int val = __shfl_down(32, 1, 4); | |
if (lane == 0) | |
*ptr = val; | |
return; | |
} | |
int main() { | |
int *gpu_val; | |
cudaMalloc(&gpu_val, sizeof(int)); | |
kernel<<<1, 4>>>(gpu_val); | |
int val; | |
cudaMemcpy(&val, gpu_val, sizeof(int), cudaMemcpyDeviceToHost); | |
printf("%d\n", val); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using CUDAdrv, CUDAnative | |
import Base: pointerset | |
const warpsize = 32 | |
function kernel(ptr::Ptr{Cint}) | |
shared = @cuStaticSharedMem(Cint, 4) | |
# shared = Base.llvmcall( | |
# ("""@shmem1 = internal addrspace(3) global [4 x i32] zeroinitializer, align 4""", | |
# """%1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @shmem1, i64 0, i64 0 | |
# %2 = addrspacecast i32 addrspace(3)* %1 to i32 addrspace(0)* | |
# ret i32* %2"""), | |
# Ptr{Cint}, Tuple{}) | |
lane = (threadIdx().x-Cint(1)) % warpsize | |
if lane == 0 | |
@boundscheck Base.checkbounds(shared, threadIdx().x) | |
pointerset(shared.ptr, Cint(0), Int(threadIdx().x), 8) | |
end | |
sync_threads() | |
val = shfl_down(Cint(32), 1, 4) | |
if lane == 0 | |
pointerset(ptr, val, 1, 8) | |
end | |
return | |
end | |
dev = CuDevice(0) | |
ctx = CuContext(dev) | |
# println(code_typed(kernel, (Ptr{Cint},))) | |
println(CUDAnative.code_llvm(kernel, Tuple{Ptr{Cint}}; dump_module=true)) | |
gpu_val = CuArray{Cint}(1) | |
@cuda (1,4) kernel(pointer(gpu_val)) | |
val = Array(gpu_val)[1] | |
println(val) | |
destroy(ctx) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <cuda.h> | |
#define CHECK(err) __check(err, __FILE__, __LINE__) | |
static void __check(CUresult err, const char *file, const int line) { | |
if (CUDA_SUCCESS != err) { | |
const char *name, *descr; | |
cuGetErrorName(err, &name); | |
cuGetErrorString(err, &descr); | |
std::cerr << name << ": " << descr << ", at " << file << ":" << line << std::endl; | |
abort(); | |
} | |
} | |
void test(const std::string name) | |
{ | |
CUmodule mod; | |
std::string path = name + ".ptx"; | |
CHECK(cuModuleLoad(&mod, path.c_str())); | |
CUfunction fun; | |
CHECK(cuModuleGetFunction(&fun, mod, "kernel")); | |
int *gpu_ref; | |
CHECK(cuMemAlloc((CUdeviceptr*) &gpu_ref, sizeof(int))); | |
void *args[1] = {&gpu_ref}; | |
cuLaunchKernel(fun, 1, 1, 1, 4, 1, 1, 0, NULL, args, NULL); | |
int cpu_ref; | |
CHECK(cuMemcpyDtoH(&cpu_ref, (CUdeviceptr) gpu_ref, sizeof(int))); | |
std::cout << name << ": " << cpu_ref << std::endl; | |
CHECK(cuModuleUnload(mod)); | |
} | |
int main() { | |
CHECK(cuInit(0)); | |
CUdevice dev; | |
CHECK(cuDeviceGet(&dev, 0)); | |
CUcontext ctx; | |
CHECK(cuCtxCreate(&ctx, 0, dev)); | |
test("working"); | |
test("broken"); | |
CHECK(cuCtxDestroy(ctx)); | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using CUDAdrv | |
dev = CuDevice(0) | |
ctx = CuContext(dev) | |
capability(dev) >= v"6.1" || warn("This bug has only been reproduced on sm_61 or higher.") | |
function test(name) | |
ptx = readstring("$name.ptx") | |
cuda_mod = CuModule(ptx) | |
cuda_fun = CuFunction(cuda_mod, "kernel") | |
ref = CuArray{Cint}(1) | |
cudacall(cuda_fun, 1, 4, Tuple{Ptr{Cint}}, pointer(ref)) | |
println("$name: ", Array(ref)[1]) | |
if !isfile("$name.sass") | |
cap = capability(dev) | |
gpu = "sm_$(cap.major)$(cap.minor)" | |
run(`ptxas --gpu-name $gpu --output-file $name.cuobj --input-as-string $ptx`) | |
open("$name.sass", "w") do io | |
print(io, readstring(`cuobjdump --dump-sass $name.cuobj`)) | |
end | |
rm("$name.cuobj") | |
end | |
end | |
test("working") | |
test("broken") | |
destroy(ctx) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" | |
target triple = "nvptx64-nvidia-cuda" | |
@shmem = internal addrspace(3) global [4 x i32] zeroinitializer, align 4 | |
define void @kernel(i32*) { | |
top: | |
%1 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() | |
%2 = and i32 %1, 31 | |
%3 = icmp eq i32 %2, 0 | |
br i1 %3, label %lane0_boundscheck, label %sync_shfl | |
lane0_boundscheck: | |
%4 = icmp ugt i32 %1, 3 | |
br i1 %4, label %lane0_oob, label %lane0_shmem | |
lane0_oob: | |
tail call void @llvm.trap() | |
unreachable | |
sync_shfl: | |
tail call void @llvm.nvvm.barrier0() | |
%5 = tail call i32 @llvm.nvvm.shfl.down.i32(i32 32, i32 1, i32 7199) | |
br i1 %3, label %lane0_writeback, label %end | |
lane0_shmem: | |
%6 = getelementptr [4 x i32], [4 x i32] addrspace(3)* @shmem, i32 0, i32 %1 | |
store i32 0, i32 addrspace(3)* %6, align 8 | |
br label %sync_shfl | |
lane0_writeback: | |
store i32 %5, i32* %0, align 8 | |
br label %end | |
end: | |
ret void | |
} | |
declare void @llvm.trap() | |
declare i32 @llvm.nvvm.shfl.down.i32(i32, i32, i32) | |
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() | |
declare void @llvm.nvvm.barrier0() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.version 3.2 | |
.target sm_30 | |
.address_size 64 | |
.visible .entry kernel( | |
.param .u64 output // single int output | |
) | |
{ | |
.reg .pred %p<4>; | |
.reg .b32 %r<6>; | |
.reg .b64 %rd<6>; | |
.shared .align 4 .b8 shmem[16]; // 4 integers | |
ld.param.u64 %rd1, [output]; | |
// calculate lane, check if 0 | |
mov.u32 %r1, %tid.x; | |
and.b32 %r2, %r1, 31; | |
setp.ne.s32 %p1, %r2, 0; | |
@%p1 bra BB_SHFL; | |
// bounds check for shmem access | |
setp.lt.u32 %p2, %r1, 4; | |
@%p2 bra BB_SHMEM; | |
bra.uni BB_OOB; | |
BB_SHMEM: | |
mul.wide.s32 %rd2, %r1, 4; | |
mov.u64 %rd3, shmem; | |
add.s64 %rd4, %rd3, %rd2; | |
mov.u32 %r4, 0; | |
st.shared.u32 [%rd4], %r4; | |
BB_SHFL: | |
setp.eq.s32 %p3, %r2, 0; | |
bar.sync 0; | |
mov.u32 %r5, 32; | |
shfl.down.b32 %r3, %r5, 1, 7199; | |
@%p3 bra BB_WRITEBACK; | |
bra.uni BB_END; | |
BB_WRITEBACK: | |
cvta.to.global.u64 %rd5, %rd1; | |
st.global.u32 [%rd5], %r3; | |
BB_END: | |
ret; | |
BB_OOB: | |
trap; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
code for sm_35 | |
Function : kernel | |
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)" | |
/* 0x08b010a010a010bc */ | |
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */ | |
/*0010*/ S2R R0, SR_TID.X; /* 0x86400000109c0002 */ | |
/*0018*/ MOV R4, c[0x0][0x140]; /* 0x64c03c00281c0012 */ | |
/*0020*/ LOP.AND R2, R0, 0x1f; /* 0xc20000000f9c0009 */ | |
/*0028*/ SSY 0x70; /* 0x1480000020000000 */ | |
/*0030*/ ISETP.NE.AND P0, PT, R2, RZ, PT; /* 0xdb581c007f9c081e */ | |
/*0038*/ MOV R5, c[0x0][0x144]; /* 0x64c03c00289c0016 */ | |
/* 0x08c48000a000b088 */ | |
/*0048*/ @P0 NOP.S; /* 0x8580000000403c02 */ | |
/*0050*/ ISETP.LT.U32.AND P0, PT, R0, 0x4, PT; /* 0xb3101c00021c001d */ | |
/*0058*/ @!P0 BRA 0xb0; /* 0x120000002820003c */ | |
/*0060*/ SHF.L.W R0, RZ, 0x2, R0; /* 0xb7e00000011ffc01 */ | |
/*0068*/ STS.S [R0], RZ; /* 0x7ae00000005c03fe */ | |
/*0070*/ ISETP.EQ.AND P0, PT, R2, RZ, PT; /* 0xdb281c007f9c081e */ | |
/*0078*/ BAR.SYNC 0x0; /* 0x8540dc00001c0002 */ | |
/* 0x088000b810a0b810 */ | |
/*0088*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */ | |
/*0090*/ @!P0 EXIT; /* 0x180000000020003c */ | |
/*0098*/ MOV32I R0, 0x20; /* 0x74000000101fc002 */ | |
/*00a0*/ ST.E [R4], R0; /* 0xe4800000001c1000 */ | |
/*00a8*/ BRA 0xb8; /* 0x12000000041c003c */ | |
/*00b0*/ BPT.TRAP 0x1; /* 0x0000000000800300 */ | |
/*00b8*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */ | |
/* 0x08000000000000b8 */ | |
/*00c8*/ EXIT; /* 0x18000000001c003c */ | |
/*00d0*/ BRA 0xd0; /* 0x12007ffffc1c003c */ | |
/*00d8*/ NOP; /* 0x85800000001c3c02 */ | |
/*00e0*/ NOP; /* 0x85800000001c3c02 */ | |
/*00e8*/ NOP; /* 0x85800000001c3c02 */ | |
/*00f0*/ NOP; /* 0x85800000001c3c02 */ | |
/*00f8*/ NOP; /* 0x85800000001c3c02 */ | |
....................... | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment