Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
using CUDAdrv
using CUDAnative
using CuArrays
using CUDAapi
using GPUifyLoops
using BenchmarkTools
using StaticArrays
using Test
#-----------------------------------------------
function cpu_calc!(vin, phir, phis, phit, vout)
vout_l = 0
i = 1; j = 1; k = 1
for ik in 1:qm1
v_ij_l = 0
for ij in 1:qm1
v_ii_l = 0
for ii in 1:qm1
@inbounds v_ii_l += vin[ii + (ij-1)*qm1 + (ik-1)*qm1*qm1] * phir[i,ii]
end # ii loop
@inbounds v_ij_l += v_ii_l * phis[j,ij]
end # ij loop
@inbounds vout_l += v_ij_l * phit[k,ik]
end # ik loop
vout[i,j,k] = vout_l
return nothing
end
#-----------------------------------------------
#-----------------------------------------------
function gpu_calc1!(d_vin, d_phir, d_phis, d_phit, d_vout, ::Val{qm1}) where {qm1}
FT = eltype(d_vin)
ii = threadIdx().x; ij = threadIdx().y; ik = threadIdx().z
sx = blockDim().x; sy = blockDim().y; sz = blockDim().z
bx = blockIdx().x; by = blockIdx().y; bz = blockIdx().z
vout_ijk = @cuStaticSharedMem(FT, qm1*qm1*qm1)
loc = ii + (ij-1)*sx + (ik-1)*sx*sy
vout_ijk[loc] = d_vin[ii,ij,ik] * d_phir[1,ii] * d_phis[1,ij] * d_phit[1,ik]
sync_threads()
# Reduction
if ii == 1 && ij==1 && ik==1
temp = 0.0
for i in 1:sx*sy*sz
@inbounds temp += vout_ijk[i]
end
@inbounds d_vout[1,1,1] = temp
end
return nothing
end
#-----------------------------------------------
#-----------------------------------------------
function gpu_calc2!(d_vin, d_phir, d_phis, d_phit, d_vout, ::Val{qm1}) where {qm1}
FT = eltype(d_vin)
ij = threadIdx().x; ik = threadIdx().y;
vout_jk = @cuStaticSharedMem(FT, (qm1,qm1))
vout_jk[ij,ik] = FT(0)
for i in 1:qm1
@inbounds vout_jk[ij,ik] += d_vin[i,ij,ik]*d_phir[1,i]
end
@inbounds vout_jk[ij,ik] *= d_phis[1,ij]
@synchronize
if ij==1 # Reduction and multiplication with phit
for i = 2:qm1
@inbounds vout_jk[1,ik] += vout_jk[i,ik]
end
@inbounds vout_jk[1,ik] *= d_phit[1,ik]
end
@synchronize
if ij==1 & ik==1
for k in 2:qm1
@inbounds vout_jk[1,1] += vout_jk[1,k]
end
d_vout[1,1,1] = vout_jk[1,1]
end
return nothing
end
#-----------------------------------------------
#-----------------------------------------------
function gpu_calc3!(d_vin, d_phir, d_phis, d_phit, d_vout)
FT = eltype(d_vin)
qm1 = length(d_phir)
ii = threadIdx().x; ij = threadIdx().y; ik = threadIdx().z
sx = blockDim().x; sy = blockDim().y; sz = blockDim().z
bx = blockIdx().x; by = blockIdx().y; bz = blockIdx().z
vout_ijk = @cuDynamicSharedMem(FT, qm1*qm1*qm1)
loc = ii + (ij-1)*sx + (ik-1)*sx*sy
vout_ijk[loc] = d_vin[ii,ij,ik] * d_phir[1,ii] * d_phis[1,ij] * d_phit[1,ik]
# vout_ijk[ii,ij,ik] = d_vin[ii,ij,ik] * d_phir[1,ii] * d_phis[1,ij] * d_phit[1,ik]
sync_threads()
# Reduction
if ii == 1 && ij==1 && ik==1
temp = 0.0
for i in 1:sx*sy*sz
@inbounds temp += vout_ijk[i]
end
@inbounds d_vout[1,1,1] = temp
end
return nothing
end
#-----------------------------------------------
#-----------------------------------------------
function gpu_calc4!(d_vin, d_phir, d_phis, d_phit, d_vout)
FT = eltype(d_vin)
ij = threadIdx().x; ik = threadIdx().y;
qm1 = length(d_phir)
vout_jk = @cuDynamicSharedMem(FT, (qm1,qm1))
vout_jk[ij,ik] = FT(0)
for i in 1:qm1
@inbounds vout_jk[ij,ik] += d_vin[i,ij,ik]*d_phir[1,i]
end
@inbounds vout_jk[ij,ik] *= d_phis[1,ij]
@synchronize
if ij==1 # Reduction and multiplication with phit
for i = 2:qm1
@inbounds vout_jk[1,ik] += vout_jk[i,ik]
end
@inbounds vout_jk[1,ik] *= d_phit[1,ik]
end
@synchronize
if ij==1 & ik==1
for k in 2:qm1
@inbounds vout_jk[1,1] += vout_jk[1,k]
end
d_vout[1,1,1] = vout_jk[1,1]
end
return nothing
end
#-----------------------------------------------
#-----------------------------------------------
function GPUify_Loops_mixed_calc!(d_vin, d_phir, d_phis, d_phit, d_vout, ::Val{qm1}) where {qm1}
FT = eltype(d_vin)
ii = threadIdx().x; ij = threadIdx().y; ik = threadIdx().z
# @cuprintln("sx = $sx; sy = $sy; sz = $sz;")
vout_ijk = @shmem FT (qm1*qm1*qm1,)
loc = ii + (ij-1)*qm1 + (ik-1)*qm1*qm1
vout_ijk[loc] = d_vin[ii,ij,ik] * d_phir[1,ii] * d_phis[1,ij] * d_phit[1,ik]
# Reduction
if ii == 1 && ij==1 && ik==1
temp = 0.0
for i in 1:qm1*qm1*qm1
@inbounds temp += vout_ijk[i]
end
@inbounds d_vout[1,1,1] = temp
end
return nothing
end
#-----------------------------------------------
#-----------------------------------------------
function GPUify_Loops_mixed_calc2!(d_vin, d_phir, d_phis, d_phit, d_vout, ::Val{qm1}) where {qm1}
FT = eltype(d_vin)
ij = threadIdx().x; ik = threadIdx().y;
# @cuprintln("sx = $sx; sy = $sy; sz = $sz;")
vout_jk = @shmem FT (qm1,qm1)
vout_jk[ij,ik] = FT(0)
for i in 1:qm1
@inbounds vout_jk[ij,ik] += d_vin[i,ij,ik]*d_phir[1,i]
end
@inbounds vout_jk[ij,ik] *= d_phis[1,ij]
@synchronize
if ij==1 # Reduction and multiplication with phit
for i = 2:qm1
@inbounds vout_jk[1,ik] += vout_jk[i,ik]
end
@inbounds vout_jk[1,ik] *= d_phit[1,ik]
end
@synchronize
if ij==1 & ik==1
for k in 2:qm1
@inbounds vout_jk[1,1] += vout_jk[1,k]
end
d_vout[1,1,1] = vout_jk[1,1]
end
return nothing
end
#-----------------------------------------------
FT = Float64
poly_order = 5
qm1 = poly_order + 1
sr = 1; ss = 1; st = 1
vin = rand(FT, qm1, qm1, qm1)
vout = zeros(FT, sr, ss, st)
phir = rand(FT, sr, qm1)
phis = rand(FT, ss, qm1)
phit = rand(FT, st, qm1)
println("qm1 = ",qm1)
println("size(phir) = ",size(phir))
println("size(phis) = ",size(phis))
println("size(phit) = ",size(phit))
#--------------------------------------------
println("CPU launch + time consumed")
println("===========================")
@btime cpu_calc!($vin, $phir, $phis, $phit, $vout)
@show vout
#--------------------------------------------
d_vout = CuArray{FT}(undef, sr, ss, st)
d_vin = CuArray(vin)
d_phir = CuArray(phir)
d_phis = CuArray(phis)
d_phit = CuArray(phit)
#--------------------------------------------
println()
println()
println("GPU launch version 1 (static shared memory + val) + time consumed")
println("===========================")
#----------------------
fill!(d_vout, FT(0.0))
@btime CuArrays.@sync @cuda threads=($qm1,$qm1,$qm1) shmem=$(qm1*qm1*qm1*sizeof(eltype(d_vin))) gpu_calc1!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout, $(Val(qm1)))
@show d_vout
#--------------------------------------------
println()
println()
println("GPU launch version 2 + (static shared memory + val) + time consumed")
println("===========================")
#----------------------
fill!(d_vout, FT(0.0))
println("timing using btime")
@btime CuArrays.@sync @cuda threads=($qm1,$qm1) shmem=$(qm1*qm1*sizeof(eltype(d_vin))) gpu_calc2!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout, $(Val(qm1)))
@show d_vout
#----------------------
#--------------------------------------------
println()
println()
println("GPU launch version 1 with dynamic shared memory with size specified at launch + time consumed")
println("===========================")
#----------------------
fill!(d_vout, FT(0.0))
@btime CuArrays.@sync @cuda threads=($qm1,$qm1,$qm1) shmem=$(qm1*qm1*qm1*sizeof(eltype(d_vin))) gpu_calc3!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout)
@show d_vout
#--------------------------------------------
#--------------------------------------------
println()
println()
println("GPU launch version 2 with dynamic shared memory with size specified at launch + time consumed")
println("===========================")
#----------------------
fill!(d_vout, FT(0.0))
@btime CuArrays.@sync @cuda threads=($qm1,$qm1) shmem=$(qm1*qm1*sizeof(eltype(d_vin))) gpu_calc4!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout)
@show d_vout
#--------------------------------------------
println()
println()
println("GPUifyLoops launch version 1 + time consumed")
println("===========================")
device = typeof(d_vin) <: Array ? CPU() : CUDA()
println("device = $device")
#----------------------
fill!(d_vout, FT(0.0))
@btime CuArrays.@sync @launch($device, threads=($qm1,$qm1,$qm1), blocks=1, GPUify_Loops_mixed_calc!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout, $(Val(qm1))))
@show d_vout
#----------------------
println()
println()
println("GPUifyLoops launch version 2 + time consumed")
println("===========================")
device = typeof(d_vin) <: Array ? CPU() : CUDA()
println("device = $device")
#----------------------
fill!(d_vout, FT(0.0))
@btime CuArrays.@sync @launch($device, threads=($qm1,$qm1), blocks=1, GPUify_Loops_mixed_calc2!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout, $(Val(qm1))))
@show d_vout
#----------------------
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment