Skip to content

Instantly share code, notes, and snippets.

@simonbyrne
Created January 22, 2020 23:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save simonbyrne/0ade43d68c79992a3b0cfddc1a6165f9 to your computer and use it in GitHub Desktop.
Save simonbyrne/0ade43d68c79992a3b0cfddc1a6165f9 to your computer and use it in GitHub Desktop.
using CUDAdrv
using CUDAnative
using CuArrays
using CUDAapi
using GPUifyLoops
using BenchmarkTools
using StaticArrays
using Test
#-----------------------------------------------
function cpu_calc!(vin, phir, phis, phit, vout)
vout_l = 0
i = 1; j = 1; k = 1
for ik in 1:qm1
v_ij_l = 0
for ij in 1:qm1
v_ii_l = 0
for ii in 1:qm1
@inbounds v_ii_l += vin[ii + (ij-1)*qm1 + (ik-1)*qm1*qm1] * phir[i,ii]
end # ii loop
@inbounds v_ij_l += v_ii_l * phis[j,ij]
end # ij loop
@inbounds vout_l += v_ij_l * phit[k,ik]
end # ik loop
vout[i,j,k] = vout_l
return nothing
end
#-----------------------------------------------
#-----------------------------------------------
function gpu_calc1!(d_vin, d_phir, d_phis, d_phit, d_vout, ::Val{qm1}) where {qm1}
FT = eltype(d_vin)
ii = threadIdx().x; ij = threadIdx().y; ik = threadIdx().z
sx = blockDim().x; sy = blockDim().y; sz = blockDim().z
bx = blockIdx().x; by = blockIdx().y; bz = blockIdx().z
vout_ijk = @cuStaticSharedMem(FT, qm1*qm1*qm1)
loc = ii + (ij-1)*sx + (ik-1)*sx*sy
vout_ijk[loc] = d_vin[ii,ij,ik] * d_phir[1,ii] * d_phis[1,ij] * d_phit[1,ik]
sync_threads()
# Reduction
if ii == 1 && ij==1 && ik==1
temp = 0.0
for i in 1:sx*sy*sz
@inbounds temp += vout_ijk[i]
end
@inbounds d_vout[1,1,1] = temp
end
return nothing
end
#-----------------------------------------------
#-----------------------------------------------
function gpu_calc2!(d_vin, d_phir, d_phis, d_phit, d_vout, ::Val{qm1}) where {qm1}
FT = eltype(d_vin)
ij = threadIdx().x; ik = threadIdx().y;
vout_jk = @cuStaticSharedMem(FT, (qm1,qm1))
vout_jk[ij,ik] = FT(0)
for i in 1:qm1
@inbounds vout_jk[ij,ik] += d_vin[i,ij,ik]*d_phir[1,i]
end
@inbounds vout_jk[ij,ik] *= d_phis[1,ij]
@synchronize
if ij==1 # Reduction and multiplication with phit
for i = 2:qm1
@inbounds vout_jk[1,ik] += vout_jk[i,ik]
end
@inbounds vout_jk[1,ik] *= d_phit[1,ik]
end
@synchronize
if ij==1 & ik==1
for k in 2:qm1
@inbounds vout_jk[1,1] += vout_jk[1,k]
end
d_vout[1,1,1] = vout_jk[1,1]
end
return nothing
end
#-----------------------------------------------
#-----------------------------------------------
function gpu_calc3!(d_vin, d_phir, d_phis, d_phit, d_vout)
FT = eltype(d_vin)
qm1 = length(d_phir)
ii = threadIdx().x; ij = threadIdx().y; ik = threadIdx().z
sx = blockDim().x; sy = blockDim().y; sz = blockDim().z
bx = blockIdx().x; by = blockIdx().y; bz = blockIdx().z
vout_ijk = @cuDynamicSharedMem(FT, qm1*qm1*qm1)
loc = ii + (ij-1)*sx + (ik-1)*sx*sy
vout_ijk[loc] = d_vin[ii,ij,ik] * d_phir[1,ii] * d_phis[1,ij] * d_phit[1,ik]
# vout_ijk[ii,ij,ik] = d_vin[ii,ij,ik] * d_phir[1,ii] * d_phis[1,ij] * d_phit[1,ik]
sync_threads()
# Reduction
if ii == 1 && ij==1 && ik==1
temp = 0.0
for i in 1:sx*sy*sz
@inbounds temp += vout_ijk[i]
end
@inbounds d_vout[1,1,1] = temp
end
return nothing
end
#-----------------------------------------------
#-----------------------------------------------
function gpu_calc4!(d_vin, d_phir, d_phis, d_phit, d_vout)
FT = eltype(d_vin)
ij = threadIdx().x; ik = threadIdx().y;
qm1 = length(d_phir)
vout_jk = @cuDynamicSharedMem(FT, (qm1,qm1))
vout_jk[ij,ik] = FT(0)
for i in 1:qm1
@inbounds vout_jk[ij,ik] += d_vin[i,ij,ik]*d_phir[1,i]
end
@inbounds vout_jk[ij,ik] *= d_phis[1,ij]
@synchronize
if ij==1 # Reduction and multiplication with phit
for i = 2:qm1
@inbounds vout_jk[1,ik] += vout_jk[i,ik]
end
@inbounds vout_jk[1,ik] *= d_phit[1,ik]
end
@synchronize
if ij==1 & ik==1
for k in 2:qm1
@inbounds vout_jk[1,1] += vout_jk[1,k]
end
d_vout[1,1,1] = vout_jk[1,1]
end
return nothing
end
#-----------------------------------------------
#-----------------------------------------------
function GPUify_Loops_mixed_calc!(d_vin, d_phir, d_phis, d_phit, d_vout, ::Val{qm1}) where {qm1}
FT = eltype(d_vin)
ii = threadIdx().x; ij = threadIdx().y; ik = threadIdx().z
# @cuprintln("sx = $sx; sy = $sy; sz = $sz;")
vout_ijk = @shmem FT (qm1*qm1*qm1,)
loc = ii + (ij-1)*qm1 + (ik-1)*qm1*qm1
vout_ijk[loc] = d_vin[ii,ij,ik] * d_phir[1,ii] * d_phis[1,ij] * d_phit[1,ik]
# Reduction
if ii == 1 && ij==1 && ik==1
temp = 0.0
for i in 1:qm1*qm1*qm1
@inbounds temp += vout_ijk[i]
end
@inbounds d_vout[1,1,1] = temp
end
return nothing
end
#-----------------------------------------------
#-----------------------------------------------
function GPUify_Loops_mixed_calc2!(d_vin, d_phir, d_phis, d_phit, d_vout, ::Val{qm1}) where {qm1}
FT = eltype(d_vin)
ij = threadIdx().x; ik = threadIdx().y;
# @cuprintln("sx = $sx; sy = $sy; sz = $sz;")
vout_jk = @shmem FT (qm1,qm1)
vout_jk[ij,ik] = FT(0)
for i in 1:qm1
@inbounds vout_jk[ij,ik] += d_vin[i,ij,ik]*d_phir[1,i]
end
@inbounds vout_jk[ij,ik] *= d_phis[1,ij]
@synchronize
if ij==1 # Reduction and multiplication with phit
for i = 2:qm1
@inbounds vout_jk[1,ik] += vout_jk[i,ik]
end
@inbounds vout_jk[1,ik] *= d_phit[1,ik]
end
@synchronize
if ij==1 & ik==1
for k in 2:qm1
@inbounds vout_jk[1,1] += vout_jk[1,k]
end
d_vout[1,1,1] = vout_jk[1,1]
end
return nothing
end
#-----------------------------------------------
FT = Float64
poly_order = 5
qm1 = poly_order + 1
sr = 1; ss = 1; st = 1
vin = rand(FT, qm1, qm1, qm1)
vout = zeros(FT, sr, ss, st)
phir = rand(FT, sr, qm1)
phis = rand(FT, ss, qm1)
phit = rand(FT, st, qm1)
println("qm1 = ",qm1)
println("size(phir) = ",size(phir))
println("size(phis) = ",size(phis))
println("size(phit) = ",size(phit))
#--------------------------------------------
println("CPU launch + time consumed")
println("===========================")
@btime cpu_calc!($vin, $phir, $phis, $phit, $vout)
@show vout
#--------------------------------------------
d_vout = CuArray{FT}(undef, sr, ss, st)
d_vin = CuArray(vin)
d_phir = CuArray(phir)
d_phis = CuArray(phis)
d_phit = CuArray(phit)
#--------------------------------------------
println()
println()
println("GPU launch version 1 (static shared memory + val) + time consumed")
println("===========================")
#----------------------
fill!(d_vout, FT(0.0))
@btime CuArrays.@sync @cuda threads=($qm1,$qm1,$qm1) shmem=$(qm1*qm1*qm1*sizeof(eltype(d_vin))) gpu_calc1!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout, $(Val(qm1)))
@show d_vout
#--------------------------------------------
println()
println()
println("GPU launch version 2 + (static shared memory + val) + time consumed")
println("===========================")
#----------------------
fill!(d_vout, FT(0.0))
println("timing using btime")
@btime CuArrays.@sync @cuda threads=($qm1,$qm1) shmem=$(qm1*qm1*sizeof(eltype(d_vin))) gpu_calc2!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout, $(Val(qm1)))
@show d_vout
#----------------------
#--------------------------------------------
println()
println()
println("GPU launch version 1 with dynamic shared memory with size specified at launch + time consumed")
println("===========================")
#----------------------
fill!(d_vout, FT(0.0))
@btime CuArrays.@sync @cuda threads=($qm1,$qm1,$qm1) shmem=$(qm1*qm1*qm1*sizeof(eltype(d_vin))) gpu_calc3!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout)
@show d_vout
#--------------------------------------------
#--------------------------------------------
println()
println()
println("GPU launch version 2 with dynamic shared memory with size specified at launch + time consumed")
println("===========================")
#----------------------
fill!(d_vout, FT(0.0))
@btime CuArrays.@sync @cuda threads=($qm1,$qm1) shmem=$(qm1*qm1*sizeof(eltype(d_vin))) gpu_calc4!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout)
@show d_vout
#--------------------------------------------
println()
println()
println("GPUifyLoops launch version 1 + time consumed")
println("===========================")
device = typeof(d_vin) <: Array ? CPU() : CUDA()
println("device = $device")
#----------------------
fill!(d_vout, FT(0.0))
@btime CuArrays.@sync @launch($device, threads=($qm1,$qm1,$qm1), blocks=1, GPUify_Loops_mixed_calc!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout, $(Val(qm1))))
@show d_vout
#----------------------
println()
println()
println("GPUifyLoops launch version 2 + time consumed")
println("===========================")
device = typeof(d_vin) <: Array ? CPU() : CUDA()
println("device = $device")
#----------------------
fill!(d_vout, FT(0.0))
@btime CuArrays.@sync @launch($device, threads=($qm1,$qm1), blocks=1, GPUify_Loops_mixed_calc2!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout, $(Val(qm1))))
@show d_vout
#----------------------
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment