@barche · Last active November 30, 2017
Julia implementation of https://devblogs.nvidia.com/parallelforall/unified-memory-cuda-beginners/: first a baseline using `CuArray`, then unified memory through raw pointers, and finally through a custom `UnifiedArray` type.
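For orientation, here is a minimal CPU-only sketch of the computation all three versions below perform (an illustration, not part of the original gist): fill A with ones and B with twos, add them elementwise into C, and report the largest deviation from the expected value 3.

# CPU reference for the same benchmark (illustrative only, not part of the gist).
function cpu_reference(n = 1_000_000)
    A = fill(1.0f0, n)
    B = fill(2.0f0, n)
    C = A .+ B
    return maximum(abs.(C .- 3.0f0))   # should be 0.0f0
end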
# Version using only CUDAdrv/CUDAnative: explicit device arrays (CuArray) and an explicit copy back to the host
using CUDAdrv, CUDAnative
using BenchmarkTools
const N = 1000000
# Initialize A to 1 and B to 2, one element per thread.
function kernel_init(A, B, numElements)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    if i <= numElements
        A[i] = 1.0f0
        B[i] = 2.0f0
    end
    return
end

# Elementwise addition C = A + B, one element per thread.
function kernel_add(A, B, C, numElements)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    if i <= numElements
        C[i] = A[i] + B[i]
    end
    return
end
# Allocate three uninitialized device arrays of length N.
function allocate(T=Float32)
    A = CuArray{T}(N)
    B = CuArray{T}(N)
    C = CuArray{T}(N)
    return A, B, C
end

# Maximum absolute deviation from the expected value 3.
function check(C)
    result = 0.0f0
    for c in C
        t = abs(c-3.0f0)
        if t > result
            result = t
        end
    end
    return result
end
blockSize = 1024;
numBlocks = (N + blockSize - 1) ÷ blockSize;
init!(A, B) = @cuda (numBlocks,blockSize) kernel_init(A, B, N)
add!(C,A,B) = @cuda (numBlocks,blockSize) kernel_add(A, B, C, N)
# Kernel launches are asynchronous; the later copy to the host forces completion.
function run_kernels(A,B,C)
    init!(A,B)
    add!(C,A,B)
end
println("allocation time:")
A,B,C = @btime allocate() samples=1 evals=1
println("kernels total time:")
@btime run_kernels($A, $B, $C) samples=10 evals=1
println("copy from device time:")
C_cpu = @btime Array($C) samples=1 evals=1
println("check time:")
testresult = @btime check($C_cpu) samples=1 evals=1
@show testresult
# Version using unified (managed) memory through raw pointers, allocated with cudaMallocManaged via CUDArt
using CUDAdrv, CUDAnative
import CUDArt
using BenchmarkTools

const N = 1000000
ctx = CuContext(first(devices()))
# Initialize a to 1 and b to 2 through raw pointers (pointerset with 8-byte alignment).
function kernel_init(a, b, numElements)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    if i <= numElements
        Base.pointerset(a, 1.0f0, i, 8)
        Base.pointerset(b, 2.0f0, i, 8)
    end
    return
end

# Elementwise addition c = a + b through raw pointers.
function kernel_add(a, b, c, numElements)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    if i <= numElements
        a_val = Base.pointerref(a, i, 8)
        b_val = Base.pointerref(b, i, 8)
        Base.pointerset(c, a_val + b_val, i, 8)
    end
    return
end
# Prefetch a managed allocation to the given device (runtime API).
function cudaMemPrefetchAsync(devPtr, size, dev=device(CuCurrentContext()), stream=CUDArt.null_stream)
    CUDArt.rt.checkerror(ccall((:cudaMemPrefetchAsync, CUDArt.libcudart), CUDArt.rt.cudaError_t,
                               (Ptr{Void}, Csize_t, Cint, CUDArt.rt.cudaStream_t),
                               devPtr.ptr, size, dev.handle, stream))
end

# Allocate n_elems elements of unified (managed) memory with cudaMallocManaged.
function managed_alloc(::Type{T}, n_elems) where {T}
    p = Ref{Ptr{Void}}(C_NULL)
    CUDArt.rt.cudaMallocManaged(p, n_elems*sizeof(T), CUDArt.rt.cudaMemAttachGlobal)
    return DevicePtr{T}(Base.unsafe_convert(Ptr{T}, p[]), CuCurrentContext())
end
# Allocate the three managed arrays and prefetch them to the device.
function allocate(T=Float32)
    A = managed_alloc(T, N)
    B = managed_alloc(T, N)
    C = managed_alloc(T, N)
    cudaMemPrefetchAsync(A, N*sizeof(T))
    cudaMemPrefetchAsync(B, N*sizeof(T))
    cudaMemPrefetchAsync(C, N*sizeof(T))
    return A, B, C
end

# Maximum absolute deviation from the expected value 3, read directly from managed memory.
function check(C)
    result = 0.0f0
    c_ptr = Base.unsafe_convert(Ptr{Float32}, C)
    for i in 1:N
        c = Base.pointerref(c_ptr, i, 8)
        t = abs(c-3.0f0)
        if t > result
            result = t
        end
    end
    return result
end
blockSize = 1024;
numBlocks = (N + blockSize - 1) ÷ blockSize;
function init!(A, B)
    @cuda (numBlocks,blockSize) kernel_init(A, B, N)
end

function add!(C,A,B)
    @cuda (numBlocks,blockSize) kernel_add(A, B, C, N)
end

# Synchronize so the timing covers the kernels themselves, not just the launches.
function run_kernels(A,B,C)
    init!(A,B)
    add!(C,A,B)
    CUDArt.device_synchronize()
end
println("allocation time:")
A,B,C = @btime allocate() samples=1 evals=1
println("kernels total time:")
@btime run_kernels($A, $B, $C) samples=10 evals=1
println("check time:")
testresult = @btime check($C) samples=1 evals=1
@show testresult
# Version using unified (managed) memory wrapped in a custom UnifiedArray type, allocated with cuMemAllocManaged via the CUDA driver API
using CUDAdrv, CUDAnative
using BenchmarkTools
# Prefetch a managed allocation to the given device (driver API).
function cuMemPrefetchAsync(devPtr, size, dev=device(CuCurrentContext()), stream=CUDAdrv.CuDefaultStream())
    CUDAdrv.@apicall(:cuMemPrefetchAsync,
                     (Ptr{Void}, Csize_t, CUDAdrv.CuDevice_t, CUDAdrv.CuStream_t),
                     devPtr, size, dev.handle, stream)
end

# Allocate n_elems elements of unified (managed) memory with cuMemAllocManaged.
function managed_alloc(::Type{T}, n_elems) where {T}
    p = Ref{Ptr{Void}}(C_NULL)
    CU_MEM_ATTACH_GLOBAL = Cuint(0x1)
    CUDAdrv.@apicall(:cuMemAllocManaged, (Ptr{Ptr{Void}}, Csize_t, Cuint),
                     p, n_elems*sizeof(T), CU_MEM_ATTACH_GLOBAL)
    return p[]
end
# Minimal AbstractArray wrapper around a managed pointer, usable on both host and device.
struct UnifiedArray{T,N} <: AbstractArray{T,N}
    ptr::Ptr{T}
    size::NTuple{N,Int}
end

UnifiedArray{T,N}(d::NTuple{N,Int}) where {T,N} = UnifiedArray{T,N}(managed_alloc(T, prod(d)), d)
UnifiedArray{T}(m::Int) where {T} = UnifiedArray{T,1}((m,))

Base.IndexStyle(::Type{<:UnifiedArray}) = IndexLinear()
Base.size(arr::UnifiedArray) = arr.size
# Bounds checking is disabled; the kernels guard the index themselves.
Base.checkbounds(::UnifiedArray, I...) = nothing

@inline function Base.getindex(arr::UnifiedArray{T}, index::Int) where {T}
    @boundscheck checkbounds(arr, index)
    return Base.pointerref(arr.ptr, index, 8)::T
end

@inline function Base.setindex!(arr::UnifiedArray{T}, val, index::Int) where {T}
    @boundscheck checkbounds(arr, index)
    Base.pointerset(arr.ptr, val, index, 8)
end

# Prefetch the array's managed memory to the current device.
function prefetch(arr::UnifiedArray{T}) where T
    cuMemPrefetchAsync(arr.ptr, prod(size(arr))*sizeof(T))
end
const N = 1000000
ctx = CuContext(first(devices()))
function kernel_init(A, B)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    if i <= length(A)
        A[i] = 1.0f0
        B[i] = 2.0f0
    end
    return
end

function kernel_add(A, B, C)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    if i <= length(A)
        C[i] = A[i] + B[i]
    end
    return
end
function allocate(T=Float32)
    A = UnifiedArray{T}(N)
    B = UnifiedArray{T}(N)
    C = UnifiedArray{T}(N)
    prefetch(A)
    prefetch(B)
    prefetch(C)
    return A, B, C
end

function check(C)
    result = 0.0f0
    for c in C
        t = abs(c-3.0f0)
        if t > result
            result = t
        end
    end
    return result
end
blockSize = 1024;
numBlocks = (N + blockSize - 1) ÷ blockSize;
function init!(A, B)
    @cuda (numBlocks,blockSize) kernel_init(A, B)
end

function add!(C,A,B)
    @cuda (numBlocks,blockSize) kernel_add(A, B, C)
end

function run_kernels(A,B,C)
    init!(A,B)
    add!(C,A,B)
    synchronize()
end
println("allocation time:")
A,B,C = @btime allocate() samples=1 evals=1
println("kernels total time:")
@btime run_kernels($A, $B, $C) samples=10 evals=1
println("check time:")
testresult = @btime check($C) samples=1 evals=1
@show testresult
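The gist never releases the managed allocations. As a follow-up, here is a minimal cleanup sketch, assuming a hypothetical free helper built on the same CUDAdrv.@apicall pattern used above (cuMemFree is the driver-API counterpart of the allocation in managed_alloc; none of this is part of the original code):

# Hypothetical cleanup helper, mirroring the @apicall pattern used above.
function free(arr::UnifiedArray)
    CUDAdrv.@apicall(:cuMemFree, (Ptr{Void},), arr.ptr)
end

free(A); free(B); free(C)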