@maleadt
Created April 27, 2017 11:39
CUDAnative.jl/#58
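
; 32-bit variant: LLVM IR for kernel(::Int32, ::Ptr{Int32}), as saved to 32bit.ll
; by the CUDAnative test script further below.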
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

define void @kernel(i32, i32* nocapture) {
top:
  %2 = shl i32 %0, 1
  %3 = add i32 %2, 524288
  store i32 %3, i32* %1, align 8
  ret void
}
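
// 32-bit variant: PTX compiled from the LLVM IR above (saved as 32bit.ptx).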
.version 3.2
.target sm_35
.address_size 64

.visible .entry kernel(
    .param .u32 param_0,
    .param .u64 param_1
)
{
    .reg .s32 %r<4>;
    .reg .s64 %rd<3>;

    ld.param.u32 %r1, [param_0];
    ld.param.u64 %rd1, [param_1];
    cvta.to.global.u64 %rd2, %rd1;
    shl.b32 %r2, %r1, 1;
    add.s32 %r3, %r2, 524288;
    st.global.u32 [%rd2], %r3;
    ret;
}
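
; 64-bit variant: LLVM IR for kernel(::Int64, ::Ptr{Int64}), saved to 64bit.ll;
; note the i64 arithmetic and 64-bit store.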
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

define void @kernel(i64, i64* nocapture) {
top:
  %2 = shl i64 %0, 1
  %3 = add i64 %2, 524288
  store i64 %3, i64* %1, align 8
  ret void
}
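
// 64-bit variant: PTX compiled from the LLVM IR above (64bit.ptx); all
// arithmetic happens in 64-bit registers.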
.version 3.2
.target sm_35
.address_size 64

.visible .entry kernel(
    .param .u64 param_0,
    .param .u64 param_1
)
{
    .reg .s64 %rd<6>;

    ld.param.u64 %rd1, [param_0];
    ld.param.u64 %rd2, [param_1];
    cvta.to.global.u64 %rd3, %rd2;
    shl.b64 %rd4, %rd1, 1;
    add.s64 %rd5, %rd4, 524288;
    st.global.u64 [%rd3], %rd5;
    ret;
}
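
# Test through CUDAnative: JIT-compiles kernel() for Int32 and Int64, runs it,
# and dumps the generated LLVM IR to 32bit.ll / 64bit.ll. Both variants should
# print 524290 (= 524288 + 2 * 1).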
using CUDAdrv, CUDAnative

function kernel{T}(one::T, ptr::Ptr{T})
    val = T(524288) + T(2) * one
    Base.pointerset(ptr, val, 1, 8)
    return nothing
end

dev = CuDevice(0)
ctx = CuContext(dev)

function test(name, T)
    ref = CuArray{T}(1)
    @cuda (1,1) kernel(T(1), pointer(ref))
    println("$name: ", Array(ref)[1])

    if !isfile("$name.ll")
        open("$name.ll", "w") do io
            CUDAnative.code_llvm(io, kernel, Tuple{T, Ptr{T}};
                                 dump_module=true, cap=capability(dev))
        end
    end
end

test("32bit", Int32)
test("64bit", Int64)

destroy(ctx)
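
# Test through LLVM.jl: re-parses the saved LLVM IR, lowers it to PTX with
# CUDAnative.mcgen, launches the result via CUDAdrv, and saves the PTX.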
using CUDAdrv, CUDAnative, LLVM

dev = CuDevice(0)
ctx = CuContext(dev)

function test(name, T)
    ir = readstring("$name.ll")
    mod = parse(LLVM.Module, ir)
    entry = get(functions(mod), "kernel")
    ptx = CUDAnative.mcgen(mod, entry, capability(dev))

    cuda_mod = CuModule(ptx)
    cuda_fun = CuFunction(cuda_mod, "kernel")

    ref = CuArray{T}(1)
    cudacall(cuda_fun, 1, 1, Tuple{T, Ptr{T}}, T(1), pointer(ref))
    println("$name: ", Array(ref)[1])

    if !isfile("$name.ptx")
        open("$name.ptx", "w") do io
            write(io, ptx)
        end
    end
end

test("32bit", Int32)
test("64bit", Int64)

destroy(ctx)
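
# Makefile rule for the standalone C++ test (ptx.cpp) further below; presumably
# built and run as `make ptx && ./ptx`, with the CUDA include path adjusted as needed.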
ptx: ptx.cpp
	clang++ -o $@ $< -I/opt/cuda/include -lcuda
#!/usr/bin/env julia
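
# Out-of-bounds reproducer: copies a strided window of a 2049×2049 CuArray into
# a 16×16 static shared-memory buffer. The magic constants are all tied to the
# leading dimension of the array: 4130784 == 2016 * 2049, 32784 == 16 * 2049,
# and 2051 == 2049 + 2.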
using CUDAdrv, CUDAnative

function oob(reference)
    index = 4130784 - 32784 * blockIdx().x + 16 * blockIdx().x + threadIdx().x + 2051
    ref = @cuStaticSharedMem(Int32, (16, 16))
    for ty = 0:15
        i = index + 2049 * ty + 1
        @inbounds ref[threadIdx().x, ty + 1] = reference[i]
    end
    return nothing
end

function main(args)
    array = CuArray{Int32}(2049, 2049)
    @cuda (1,1) oob(array)
end

dev = CuDevice(0)
ctx = CuContext(dev, CUDAdrv.SCHED_BLOCKING_SYNC)
main(ARGS)
destroy(ctx)
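
// ptx.cpp: standalone driver-API check that loads the saved 32bit.ptx / 64bit.ptx
// and launches the kernel with the same arguments, taking Julia out of the loop.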
#include <iostream>
#include <cuda.h>

#define CHECK(err) __check(err, __FILE__, __LINE__)
static void __check(CUresult err, const char *file, const int line) {
    if (CUDA_SUCCESS != err) {
        const char *name, *descr;
        cuGetErrorName(err, &name);
        cuGetErrorString(err, &descr);
        std::cerr << name << ": " << descr << ", at " << file << ":" << line << std::endl;
        abort();
    }
}

template <typename T>
void test(const std::string name)
{
    CUmodule mod;
    std::string path = name + ".ptx";
    CHECK(cuModuleLoad(&mod, path.c_str()));

    CUfunction fun;
    CHECK(cuModuleGetFunction(&fun, mod, "kernel"));

    T *gpu_ref;
    CHECK(cuMemAlloc((CUdeviceptr*) &gpu_ref, sizeof(T)));

    T one = 1;
    void *args[2] = {&one, &gpu_ref};
    CHECK(cuLaunchKernel(fun, 1, 1, 1, 1, 1, 1, 0, NULL, args, NULL));

    T cpu_ref;
    CHECK(cuMemcpyDtoH(&cpu_ref, (CUdeviceptr) gpu_ref, sizeof(T)));
    std::cout << name << ": " << cpu_ref << std::endl;

    CHECK(cuMemFree((CUdeviceptr) gpu_ref));
    CHECK(cuModuleUnload(mod));
}

int main() {
    CHECK(cuInit(0));

    CUdevice dev;
    CHECK(cuDeviceGet(&dev, 0));
    CUcontext ctx;
    CHECK(cuCtxCreate(&ctx, 0, dev));

    test<int>("32bit");
    test<long long int>("64bit");

    CHECK(cuCtxDestroy(ctx));
    return 0;
}
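
# Test through CUDAdrv only: loads the saved PTX, runs it, and additionally
# disassembles the generated machine code to 32bit.sass / 64bit.sass using
# ptxas and cuobjdump.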
using CUDAdrv

dev = CuDevice(0)
ctx = CuContext(dev)

function test(name, T)
    ptx = readstring("$name.ptx")

    cuda_mod = CuModule(ptx)
    cuda_fun = CuFunction(cuda_mod, "kernel")

    ref = CuArray{T}(1)
    cudacall(cuda_fun, 1, 1, Tuple{T, Ptr{T}}, T(1), pointer(ref))
    println("$name: ", Array(ref)[1])

    if !isfile("$name.sass")
        cap = capability(dev)
        gpu = "sm_$(cap.major)$(cap.minor)"
        run(`ptxas --gpu-name $gpu --output-file $name.cuobj --input-as-string $ptx`)
        open("$name.sass", "w") do io
            print(io, readstring(`cuobjdump --dump-sass $name.cuobj`))
        end
        rm("$name.cuobj")
    end
end

test("32bit", Int32)
test("64bit", Int64)

destroy(ctx)
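
# Not part of the original gist: a minimal sketch for eyeballing where the two
# SASS dumps diverge, assuming 32bit.sass and 64bit.sass were written by the
# script above.
for (a, b) in zip(eachline("32bit.sass"), eachline("64bit.sass"))
    a == b || println(rpad(a, 50), " | ", b)
end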