Skip to content

Instantly share code, notes, and snippets.

@maleadt
Last active April 27, 2017 11:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save maleadt/67b86455a8b308ebe5b5caba2572077b to your computer and use it in GitHub Desktop.
CUDAnative.jl/#4 repro
; LLVM IR repro for CUDAnative.jl#4 -- "broken" basic-block layout.
; Semantically identical to the "working" module further down; only the
; textual order of the lane0_oob / sync_shfl / lane0_shmem blocks differs.
; NOTE(review): that ordering changes the bounds-check branch polarity the
; NVPTX backend emits (branch *to* the trap block vs. branch to the store
; block), which presumably is what triggers the miscompile on sm_61+ --
; confirm against the issue. Kept byte-for-byte; do not "clean up".
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
; 4-int scratch buffer in addrspace(3) (CUDA shared memory).
@shmem = internal addrspace(3) global [4 x i32] zeroinitializer, align 4
define void @kernel(i32*) {
top:
; lane = tid.x & 31; only lane 0 takes the boundscheck/store path.
%1 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%2 = and i32 %1, 31
%3 = icmp eq i32 %2, 0
br i1 %3, label %lane0_boundscheck, label %sync_shfl
lane0_boundscheck:
; trap if tid.x would index past the 4-element shared array
%4 = icmp ugt i32 %1, 3
br i1 %4, label %lane0_oob, label %lane0_shmem
sync_shfl:
; reconvergence point: all threads barrier, then shuffle the constant 32
; down by 1 lane (third operand 7199 = 0x1c1f, the legacy shfl control word
; for width 4: ((32-width)<<8)|0x1f).
tail call void @llvm.nvvm.barrier0()
%5 = tail call i32 @llvm.nvvm.shfl.down.i32(i32 32, i32 1, i32 7199)
br i1 %3, label %lane0_writeback, label %end
lane0_oob:
tail call void @llvm.trap()
unreachable
lane0_shmem:
; shmem[tid.x] = 0 (lane 0 only)
%6 = getelementptr [4 x i32], [4 x i32] addrspace(3)* @shmem, i32 0, i32 %1
store i32 0, i32 addrspace(3)* %6, align 8
br label %sync_shfl
lane0_writeback:
; lane 0 publishes the shuffled value through the output pointer
store i32 %5, i32* %0, align 8
br label %end
end:
ret void
}
declare void @llvm.trap()
declare i32 @llvm.nvvm.shfl.down.i32(i32, i32, i32)
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
declare void @llvm.nvvm.barrier0()
// PTX repro -- "broken" variant. Differs from the "working" PTX below only
// in the bounds-check branch polarity: here the *taken* branch jumps to the
// trap block (setp.gt + @%p2 bra BB_OOB), there the taken branch jumps to
// BB_SHMEM on the in-bounds predicate. NOTE(review): presumably this is
// what trips ptxas on sm_61+ -- confirm against CUDAnative.jl#4.
.version 3.2
.target sm_30
.address_size 64
.visible .entry kernel(
.param .u64 output // single int output
)
{
.reg .pred %p<4>;
.reg .b32 %r<6>;
.reg .b64 %rd<6>;
.shared .align 4 .b8 shmem[16]; // 4 integers
ld.param.u64 %rd1, [output];
// calculate lane, check if 0
mov.u32 %r1, %tid.x;
and.b32 %r2, %r1, 31;
setp.ne.s32 %p1, %r2, 0;
@%p1 bra BB_SHFL;
// bounds check for shmem access
// (taken branch goes to the OOB trap -- this is the "broken" polarity)
setp.gt.u32 %p2, %r1, 3;
@%p2 bra BB_OOB;
bra.uni BB_SHMEM;
BB_SHMEM:
// shmem[tid.x] = 0 (lane 0 only reaches this)
mul.wide.s32 %rd2, %r1, 4;
mov.u64 %rd3, shmem;
add.s64 %rd4, %rd3, %rd2;
mov.u32 %r4, 0;
st.shared.u32 [%rd4], %r4;
BB_SHFL:
// reconvergence point: barrier, then shuffle constant 32 down by 1 lane;
// 7199 = 0x1c1f = ((32-width)<<8)|0x1f, the control word for width 4.
setp.eq.s32 %p3, %r2, 0;
bar.sync 0;
mov.u32 %r5, 32;
shfl.down.b32 %r3, %r5, 1, 7199;
@%p3 bra BB_WRITEBACK;
bra.uni BB_END;
BB_WRITEBACK:
// lane 0 writes the shuffled value to the output parameter
cvta.to.global.u64 %rd5, %rd1;
st.global.u32 [%rd5], %r3;
BB_END:
ret;
BB_OOB:
trap;
}
code for sm_35
Function : kernel
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x08b010a010a010bc */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ S2R R0, SR_TID.X; /* 0x86400000109c0002 */
/*0018*/ MOV R4, c[0x0][0x140]; /* 0x64c03c00281c0012 */
/*0020*/ LOP.AND R2, R0, 0x1f; /* 0xc20000000f9c0009 */
/*0028*/ SSY 0x70; /* 0x1480000020000000 */
/*0030*/ ISETP.NE.AND P0, PT, R2, RZ, PT; /* 0xdb581c007f9c081e */
/*0038*/ MOV R5, c[0x0][0x144]; /* 0x64c03c00289c0016 */
/* 0x08c48000a000b088 */
/*0048*/ @P0 NOP.S; /* 0x8580000000403c02 */
/*0050*/ ISETP.GT.U32.AND P0, PT, R0, 0x3, PT; /* 0xb3401c00019c001d */
/*0058*/ @P0 BRA 0xb0; /* 0x120000002800003c */
/*0060*/ SHF.L.W R0, RZ, 0x2, R0; /* 0xb7e00000011ffc01 */
/*0068*/ STS.S [R0], RZ; /* 0x7ae00000005c03fe */
/*0070*/ ISETP.EQ.AND P0, PT, R2, RZ, PT; /* 0xdb281c007f9c081e */
/*0078*/ BAR.SYNC 0x0; /* 0x8540dc00001c0002 */
/* 0x088000b810a0b810 */
/*0088*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/*0090*/ @!P0 EXIT; /* 0x180000000020003c */
/*0098*/ MOV32I R0, 0x20; /* 0x74000000101fc002 */
/*00a0*/ ST.E [R4], R0; /* 0xe4800000001c1000 */
/*00a8*/ BRA 0xb8; /* 0x12000000041c003c */
/*00b0*/ BPT.TRAP 0x1; /* 0x0000000000800300 */
/*00b8*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/* 0x08000000000000b8 */
/*00c8*/ EXIT; /* 0x18000000001c003c */
/*00d0*/ BRA 0xd0; /* 0x12007ffffc1c003c */
/*00d8*/ NOP; /* 0x85800000001c3c02 */
/*00e0*/ NOP; /* 0x85800000001c3c02 */
/*00e8*/ NOP; /* 0x85800000001c3c02 */
/*00f0*/ NOP; /* 0x85800000001c3c02 */
/*00f8*/ NOP; /* 0x85800000001c3c02 */
.......................
# Julia harness for CUDAnative.jl#4: compile each saved LLVM IR module
# ("working.ll", "broken.ll") to PTX with CUDAnative.mcgen, run the kernel
# on the GPU and print the value lane 0 wrote back.
# NOTE(review): Julia 0.6-era APIs (readstring, warn, get(functions(...)))
# -- this script will not run on modern Julia without porting.
using CUDAdrv, CUDAnative, LLVM
dev = CuDevice(0)
ctx = CuContext(dev)
capability(dev) >= v"6.1" || warn("This bug has only been reproduced on sm_61 or higher.")
# Compile $name.ll to PTX, launch "kernel" with 1 block of 4 threads, print
# the result; on first run also dump the generated PTX to $name.ptx.
function test(name)
ir = readstring("$name.ll")
mod = parse(LLVM.Module, ir)
entry = get(functions(mod), "kernel")
# generate PTX targeting compute capability 3.0
ptx = CUDAnative.mcgen(mod, entry, v"3.0")
cuda_mod = CuModule(ptx)
cuda_fun = CuFunction(cuda_mod, "kernel")
ref = CuArray{Cint}(1)
# 1 block x 4 threads, matching the 4-element shared array in the IR
cudacall(cuda_fun, 1, 4, Tuple{Ptr{Cint}}, pointer(ref))
println("$name: ", Array(ref)[1])
if !isfile("$name.ptx")
open("$name.ptx", "w") do io
write(io, ptx)
end
end
end
test("working")
test("broken")
destroy(ctx)
#include <cuda.h>
#include <stdio.h>
// Repro kernel (CUDAnative.jl#4): lane 0 zeroes a shared-memory slot, all
// threads barrier, then every thread does a width-4 __shfl_down of the
// constant 32 and lane 0 writes the result to *ptr. Since every lane
// contributes the same constant, the expected output is 32.
// NOTE(review): legacy mask-less __shfl_down is removed on Volta+ (use
// __shfl_down_sync there); kept verbatim because this exact code is the
// compiler-bug reproducer.
extern "C" __global__ void kernel(int* ptr) {
static __shared__ int shared[4];
int lane = threadIdx.x % warpSize;
// lane 0 writes, indexed by threadIdx.x -- in bounds for the <<<1,4>>> launch
if (lane==0)
shared[threadIdx.x] = 0;
// barrier is outside the divergent branch, so all threads reach it
__syncthreads();
int val = __shfl_down(32, 1, 4);
if (lane == 0)
*ptr = val;
return;
}
// Host driver: allocate one int on the device, launch the repro kernel with
// a single block of 4 threads, and print the value lane 0 wrote back.
// Improvements over the original: every CUDA call is checked (a failed
// launch otherwise silently prints garbage), the device buffer is freed,
// and main returns an explicit status.
int main() {
    int *gpu_val = NULL;
    cudaError_t err = cudaMalloc(&gpu_val, sizeof(int));
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
        return 1;
    }
    kernel<<<1, 4>>>(gpu_val);
    // launch-configuration errors only surface via cudaGetLastError()
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch: %s\n", cudaGetErrorString(err));
        cudaFree(gpu_val);
        return 1;
    }
    int val;
    // blocking copy also synchronizes, surfacing any in-kernel fault
    err = cudaMemcpy(&val, gpu_val, sizeof(int), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy: %s\n", cudaGetErrorString(err));
        cudaFree(gpu_val);
        return 1;
    }
    printf("%d\n", val);
    cudaFree(gpu_val);
    return 0;
}
using CUDAdrv, CUDAnative
import Base: pointerset
const warpsize = 32
# Julia analogue of the CUDA C kernel above, written so CUDAnative emits the
# IR from the repro: lane 0 bounds-checks and zeroes shared memory, all
# threads sync and shuffle, lane 0 writes the shuffled value through ptr.
# Statement order is deliberate (it determines the emitted block layout);
# do not reorder.
function kernel(ptr::Ptr{Cint})
shared = @cuStaticSharedMem(Cint, 4)
# shared = Base.llvmcall(
# ("""@shmem1 = internal addrspace(3) global [4 x i32] zeroinitializer, align 4""",
# """%1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @shmem1, i64 0, i64 0
# %2 = addrspacecast i32 addrspace(3)* %1 to i32 addrspace(0)*
# ret i32* %2"""),
# Ptr{Cint}, Tuple{})
# 0-based lane id within the warp (threadIdx() is 1-based)
lane = (threadIdx().x-Cint(1)) % warpsize
if lane == 0
# explicit boundscheck so the emitted IR contains the trap block
@boundscheck Base.checkbounds(shared, threadIdx().x)
pointerset(shared.ptr, Cint(0), Int(threadIdx().x), 8)
end
sync_threads()
# width-4 shuffle of the constant 32 down by one lane
val = shfl_down(Cint(32), 1, 4)
if lane == 0
pointerset(ptr, val, 1, 8)
end
return
end
# Driver: dump the LLVM IR CUDAnative generates for inspection, then run the
# kernel on 1 block of 4 threads and print the value it wrote back.
dev = CuDevice(0)
ctx = CuContext(dev)
# println(code_typed(kernel, (Ptr{Cint},)))
println(CUDAnative.code_llvm(kernel, Tuple{Ptr{Cint}}; dump_module=true))
gpu_val = CuArray{Cint}(1)
@cuda (1,4) kernel(pointer(gpu_val))
val = Array(gpu_val)[1]
println(val)
destroy(ctx)
#include <iostream>
#include <cuda.h>
#define CHECK(err) __check(err, __FILE__, __LINE__)
// Abort with a readable "NAME: description, at file:line" message when a
// CUDA driver call fails; no-op on CUDA_SUCCESS. Invoked via CHECK().
static void __check(CUresult err, const char *file, const int line) {
    if (err == CUDA_SUCCESS)
        return;
    const char *name = NULL;
    const char *descr = NULL;
    cuGetErrorName(err, &name);
    cuGetErrorString(err, &descr);
    std::cerr << name << ": " << descr << ", at " << file << ":" << line << std::endl;
    abort();
}
void test(const std::string name)
{
CUmodule mod;
std::string path = name + ".ptx";
CHECK(cuModuleLoad(&mod, path.c_str()));
CUfunction fun;
CHECK(cuModuleGetFunction(&fun, mod, "kernel"));
int *gpu_ref;
CHECK(cuMemAlloc((CUdeviceptr*) &gpu_ref, sizeof(int)));
void *args[1] = {&gpu_ref};
cuLaunchKernel(fun, 1, 1, 1, 4, 1, 1, 0, NULL, args, NULL);
int cpu_ref;
CHECK(cuMemcpyDtoH(&cpu_ref, (CUdeviceptr) gpu_ref, sizeof(int)));
std::cout << name << ": " << cpu_ref << std::endl;
CHECK(cuModuleUnload(mod));
}
int main() {
CHECK(cuInit(0));
CUdevice dev;
CHECK(cuDeviceGet(&dev, 0));
CUcontext ctx;
CHECK(cuCtxCreate(&ctx, 0, dev));
test("working");
test("broken");
CHECK(cuCtxDestroy(ctx));
return 0;
}
# CUDAdrv-only harness: load the pre-generated working/broken PTX files, run
# each kernel and print its output; on first run also save the SASS
# disassembly for the local GPU. Julia 0.6-era APIs (readstring, warn).
using CUDAdrv
dev = CuDevice(0)
ctx = CuContext(dev)
capability(dev) >= v"6.1" || warn("This bug has only been reproduced on sm_61 or higher.")
# Run $name.ptx with 1 block of 4 threads and print the value written back;
# dump $name.sass via ptxas + cuobjdump if it does not exist yet.
function test(name)
ptx = readstring("$name.ptx")
cuda_mod = CuModule(ptx)
cuda_fun = CuFunction(cuda_mod, "kernel")
ref = CuArray{Cint}(1)
cudacall(cuda_fun, 1, 4, Tuple{Ptr{Cint}}, pointer(ref))
println("$name: ", Array(ref)[1])
if !isfile("$name.sass")
# assemble for the locally installed GPU, e.g. sm_61
cap = capability(dev)
gpu = "sm_$(cap.major)$(cap.minor)"
run(`ptxas --gpu-name $gpu --output-file $name.cuobj --input-as-string $ptx`)
open("$name.sass", "w") do io
print(io, readstring(`cuobjdump --dump-sass $name.cuobj`))
end
rm("$name.cuobj")
end
end
test("working")
test("broken")
destroy(ctx)
; LLVM IR repro for CUDAnative.jl#4 -- "working" basic-block layout.
; Identical to the "broken" module above except for block order: lane0_oob
; directly follows lane0_boundscheck here, which makes the backend emit the
; bounds-check branch with the opposite polarity (taken branch to BB_SHMEM
; instead of to BB_OOB). Kept byte-for-byte; do not "clean up".
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
; 4-int scratch buffer in addrspace(3) (CUDA shared memory).
@shmem = internal addrspace(3) global [4 x i32] zeroinitializer, align 4
define void @kernel(i32*) {
top:
; lane = tid.x & 31; only lane 0 takes the boundscheck/store path.
%1 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%2 = and i32 %1, 31
%3 = icmp eq i32 %2, 0
br i1 %3, label %lane0_boundscheck, label %sync_shfl
lane0_boundscheck:
; trap if tid.x would index past the 4-element shared array
%4 = icmp ugt i32 %1, 3
br i1 %4, label %lane0_oob, label %lane0_shmem
lane0_oob:
tail call void @llvm.trap()
unreachable
sync_shfl:
; reconvergence point: barrier, then shuffle the constant 32 down by 1 lane
; (7199 = 0x1c1f = ((32-width)<<8)|0x1f, the legacy control word for width 4)
tail call void @llvm.nvvm.barrier0()
%5 = tail call i32 @llvm.nvvm.shfl.down.i32(i32 32, i32 1, i32 7199)
br i1 %3, label %lane0_writeback, label %end
lane0_shmem:
; shmem[tid.x] = 0 (lane 0 only)
%6 = getelementptr [4 x i32], [4 x i32] addrspace(3)* @shmem, i32 0, i32 %1
store i32 0, i32 addrspace(3)* %6, align 8
br label %sync_shfl
lane0_writeback:
; lane 0 publishes the shuffled value through the output pointer
store i32 %5, i32* %0, align 8
br label %end
end:
ret void
}
declare void @llvm.trap()
declare i32 @llvm.nvvm.shfl.down.i32(i32, i32, i32)
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
declare void @llvm.nvvm.barrier0()
// PTX repro -- "working" variant. Identical to the "broken" PTX above except
// for the bounds-check branch polarity: here the *taken* branch goes to
// BB_SHMEM on the in-bounds predicate (setp.lt + @%p2 bra BB_SHMEM), there
// it goes to the trap block on the OOB predicate.
.version 3.2
.target sm_30
.address_size 64
.visible .entry kernel(
.param .u64 output // single int output
)
{
.reg .pred %p<4>;
.reg .b32 %r<6>;
.reg .b64 %rd<6>;
.shared .align 4 .b8 shmem[16]; // 4 integers
ld.param.u64 %rd1, [output];
// calculate lane, check if 0
mov.u32 %r1, %tid.x;
and.b32 %r2, %r1, 31;
setp.ne.s32 %p1, %r2, 0;
@%p1 bra BB_SHFL;
// bounds check for shmem access
// (taken branch goes to the store block -- the "working" polarity)
setp.lt.u32 %p2, %r1, 4;
@%p2 bra BB_SHMEM;
bra.uni BB_OOB;
BB_SHMEM:
// shmem[tid.x] = 0 (lane 0 only reaches this)
mul.wide.s32 %rd2, %r1, 4;
mov.u64 %rd3, shmem;
add.s64 %rd4, %rd3, %rd2;
mov.u32 %r4, 0;
st.shared.u32 [%rd4], %r4;
BB_SHFL:
// reconvergence point: barrier, then shuffle constant 32 down by 1 lane;
// 7199 = 0x1c1f = ((32-width)<<8)|0x1f, the control word for width 4.
setp.eq.s32 %p3, %r2, 0;
bar.sync 0;
mov.u32 %r5, 32;
shfl.down.b32 %r3, %r5, 1, 7199;
@%p3 bra BB_WRITEBACK;
bra.uni BB_END;
BB_WRITEBACK:
// lane 0 writes the shuffled value to the output parameter
cvta.to.global.u64 %rd5, %rd1;
st.global.u32 [%rd5], %r3;
BB_END:
ret;
BB_OOB:
trap;
}
code for sm_35
Function : kernel
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x08b010a010a010bc */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ S2R R0, SR_TID.X; /* 0x86400000109c0002 */
/*0018*/ MOV R4, c[0x0][0x140]; /* 0x64c03c00281c0012 */
/*0020*/ LOP.AND R2, R0, 0x1f; /* 0xc20000000f9c0009 */
/*0028*/ SSY 0x70; /* 0x1480000020000000 */
/*0030*/ ISETP.NE.AND P0, PT, R2, RZ, PT; /* 0xdb581c007f9c081e */
/*0038*/ MOV R5, c[0x0][0x144]; /* 0x64c03c00289c0016 */
/* 0x08c48000a000b088 */
/*0048*/ @P0 NOP.S; /* 0x8580000000403c02 */
/*0050*/ ISETP.LT.U32.AND P0, PT, R0, 0x4, PT; /* 0xb3101c00021c001d */
/*0058*/ @!P0 BRA 0xb0; /* 0x120000002820003c */
/*0060*/ SHF.L.W R0, RZ, 0x2, R0; /* 0xb7e00000011ffc01 */
/*0068*/ STS.S [R0], RZ; /* 0x7ae00000005c03fe */
/*0070*/ ISETP.EQ.AND P0, PT, R2, RZ, PT; /* 0xdb281c007f9c081e */
/*0078*/ BAR.SYNC 0x0; /* 0x8540dc00001c0002 */
/* 0x088000b810a0b810 */
/*0088*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/*0090*/ @!P0 EXIT; /* 0x180000000020003c */
/*0098*/ MOV32I R0, 0x20; /* 0x74000000101fc002 */
/*00a0*/ ST.E [R4], R0; /* 0xe4800000001c1000 */
/*00a8*/ BRA 0xb8; /* 0x12000000041c003c */
/*00b0*/ BPT.TRAP 0x1; /* 0x0000000000800300 */
/*00b8*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/* 0x08000000000000b8 */
/*00c8*/ EXIT; /* 0x18000000001c003c */
/*00d0*/ BRA 0xd0; /* 0x12007ffffc1c003c */
/*00d8*/ NOP; /* 0x85800000001c3c02 */
/*00e0*/ NOP; /* 0x85800000001c3c02 */
/*00e8*/ NOP; /* 0x85800000001c3c02 */
/*00f0*/ NOP; /* 0x85800000001c3c02 */
/*00f8*/ NOP; /* 0x85800000001c3c02 */
.......................
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment