Created
January 22, 2020 23:41
-
-
Save simonbyrne/0ade43d68c79992a3b0cfddc1a6165f9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using CUDAdrv | |
using CUDAnative | |
using CuArrays | |
using CUDAapi | |
using GPUifyLoops | |
using BenchmarkTools | |
using StaticArrays | |
using Test | |
#----------------------------------------------- | |
function cpu_calc!(vin, phir, phis, phit, vout) | |
vout_l = 0 | |
i = 1; j = 1; k = 1 | |
for ik in 1:qm1 | |
v_ij_l = 0 | |
for ij in 1:qm1 | |
v_ii_l = 0 | |
for ii in 1:qm1 | |
@inbounds v_ii_l += vin[ii + (ij-1)*qm1 + (ik-1)*qm1*qm1] * phir[i,ii] | |
end # ii loop | |
@inbounds v_ij_l += v_ii_l * phis[j,ij] | |
end # ij loop | |
@inbounds vout_l += v_ij_l * phit[k,ik] | |
end # ik loop | |
vout[i,j,k] = vout_l | |
return nothing | |
end | |
#----------------------------------------------- | |
#----------------------------------------------- | |
function gpu_calc1!(d_vin, d_phir, d_phis, d_phit, d_vout, ::Val{qm1}) where {qm1} | |
FT = eltype(d_vin) | |
ii = threadIdx().x; ij = threadIdx().y; ik = threadIdx().z | |
sx = blockDim().x; sy = blockDim().y; sz = blockDim().z | |
bx = blockIdx().x; by = blockIdx().y; bz = blockIdx().z | |
vout_ijk = @cuStaticSharedMem(FT, qm1*qm1*qm1) | |
loc = ii + (ij-1)*sx + (ik-1)*sx*sy | |
vout_ijk[loc] = d_vin[ii,ij,ik] * d_phir[1,ii] * d_phis[1,ij] * d_phit[1,ik] | |
sync_threads() | |
# Reduction | |
if ii == 1 && ij==1 && ik==1 | |
temp = 0.0 | |
for i in 1:sx*sy*sz | |
@inbounds temp += vout_ijk[i] | |
end | |
@inbounds d_vout[1,1,1] = temp | |
end | |
return nothing | |
end | |
#----------------------------------------------- | |
#----------------------------------------------- | |
function gpu_calc2!(d_vin, d_phir, d_phis, d_phit, d_vout, ::Val{qm1}) where {qm1} | |
FT = eltype(d_vin) | |
ij = threadIdx().x; ik = threadIdx().y; | |
vout_jk = @cuStaticSharedMem(FT, (qm1,qm1)) | |
vout_jk[ij,ik] = FT(0) | |
for i in 1:qm1 | |
@inbounds vout_jk[ij,ik] += d_vin[i,ij,ik]*d_phir[1,i] | |
end | |
@inbounds vout_jk[ij,ik] *= d_phis[1,ij] | |
@synchronize | |
if ij==1 # Reduction and multiplication with phit | |
for i = 2:qm1 | |
@inbounds vout_jk[1,ik] += vout_jk[i,ik] | |
end | |
@inbounds vout_jk[1,ik] *= d_phit[1,ik] | |
end | |
@synchronize | |
if ij==1 & ik==1 | |
for k in 2:qm1 | |
@inbounds vout_jk[1,1] += vout_jk[1,k] | |
end | |
d_vout[1,1,1] = vout_jk[1,1] | |
end | |
return nothing | |
end | |
#----------------------------------------------- | |
#----------------------------------------------- | |
function gpu_calc3!(d_vin, d_phir, d_phis, d_phit, d_vout) | |
FT = eltype(d_vin) | |
qm1 = length(d_phir) | |
ii = threadIdx().x; ij = threadIdx().y; ik = threadIdx().z | |
sx = blockDim().x; sy = blockDim().y; sz = blockDim().z | |
bx = blockIdx().x; by = blockIdx().y; bz = blockIdx().z | |
vout_ijk = @cuDynamicSharedMem(FT, qm1*qm1*qm1) | |
loc = ii + (ij-1)*sx + (ik-1)*sx*sy | |
vout_ijk[loc] = d_vin[ii,ij,ik] * d_phir[1,ii] * d_phis[1,ij] * d_phit[1,ik] | |
# vout_ijk[ii,ij,ik] = d_vin[ii,ij,ik] * d_phir[1,ii] * d_phis[1,ij] * d_phit[1,ik] | |
sync_threads() | |
# Reduction | |
if ii == 1 && ij==1 && ik==1 | |
temp = 0.0 | |
for i in 1:sx*sy*sz | |
@inbounds temp += vout_ijk[i] | |
end | |
@inbounds d_vout[1,1,1] = temp | |
end | |
return nothing | |
end | |
#----------------------------------------------- | |
#----------------------------------------------- | |
function gpu_calc4!(d_vin, d_phir, d_phis, d_phit, d_vout) | |
FT = eltype(d_vin) | |
ij = threadIdx().x; ik = threadIdx().y; | |
qm1 = length(d_phir) | |
vout_jk = @cuDynamicSharedMem(FT, (qm1,qm1)) | |
vout_jk[ij,ik] = FT(0) | |
for i in 1:qm1 | |
@inbounds vout_jk[ij,ik] += d_vin[i,ij,ik]*d_phir[1,i] | |
end | |
@inbounds vout_jk[ij,ik] *= d_phis[1,ij] | |
@synchronize | |
if ij==1 # Reduction and multiplication with phit | |
for i = 2:qm1 | |
@inbounds vout_jk[1,ik] += vout_jk[i,ik] | |
end | |
@inbounds vout_jk[1,ik] *= d_phit[1,ik] | |
end | |
@synchronize | |
if ij==1 & ik==1 | |
for k in 2:qm1 | |
@inbounds vout_jk[1,1] += vout_jk[1,k] | |
end | |
d_vout[1,1,1] = vout_jk[1,1] | |
end | |
return nothing | |
end | |
#----------------------------------------------- | |
#----------------------------------------------- | |
function GPUify_Loops_mixed_calc!(d_vin, d_phir, d_phis, d_phit, d_vout, ::Val{qm1}) where {qm1} | |
FT = eltype(d_vin) | |
ii = threadIdx().x; ij = threadIdx().y; ik = threadIdx().z | |
# @cuprintln("sx = $sx; sy = $sy; sz = $sz;") | |
vout_ijk = @shmem FT (qm1*qm1*qm1,) | |
loc = ii + (ij-1)*qm1 + (ik-1)*qm1*qm1 | |
vout_ijk[loc] = d_vin[ii,ij,ik] * d_phir[1,ii] * d_phis[1,ij] * d_phit[1,ik] | |
# Reduction | |
if ii == 1 && ij==1 && ik==1 | |
temp = 0.0 | |
for i in 1:qm1*qm1*qm1 | |
@inbounds temp += vout_ijk[i] | |
end | |
@inbounds d_vout[1,1,1] = temp | |
end | |
return nothing | |
end | |
#----------------------------------------------- | |
#----------------------------------------------- | |
function GPUify_Loops_mixed_calc2!(d_vin, d_phir, d_phis, d_phit, d_vout, ::Val{qm1}) where {qm1} | |
FT = eltype(d_vin) | |
ij = threadIdx().x; ik = threadIdx().y; | |
# @cuprintln("sx = $sx; sy = $sy; sz = $sz;") | |
vout_jk = @shmem FT (qm1,qm1) | |
vout_jk[ij,ik] = FT(0) | |
for i in 1:qm1 | |
@inbounds vout_jk[ij,ik] += d_vin[i,ij,ik]*d_phir[1,i] | |
end | |
@inbounds vout_jk[ij,ik] *= d_phis[1,ij] | |
@synchronize | |
if ij==1 # Reduction and multiplication with phit | |
for i = 2:qm1 | |
@inbounds vout_jk[1,ik] += vout_jk[i,ik] | |
end | |
@inbounds vout_jk[1,ik] *= d_phit[1,ik] | |
end | |
@synchronize | |
if ij==1 & ik==1 | |
for k in 2:qm1 | |
@inbounds vout_jk[1,1] += vout_jk[1,k] | |
end | |
d_vout[1,1,1] = vout_jk[1,1] | |
end | |
return nothing | |
end | |
#----------------------------------------------- | |
FT = Float64 | |
poly_order = 5 | |
qm1 = poly_order + 1 | |
sr = 1; ss = 1; st = 1 | |
vin = rand(FT, qm1, qm1, qm1) | |
vout = zeros(FT, sr, ss, st) | |
phir = rand(FT, sr, qm1) | |
phis = rand(FT, ss, qm1) | |
phit = rand(FT, st, qm1) | |
println("qm1 = ",qm1) | |
println("size(phir) = ",size(phir)) | |
println("size(phis) = ",size(phis)) | |
println("size(phit) = ",size(phit)) | |
#-------------------------------------------- | |
println("CPU launch + time consumed") | |
println("===========================") | |
@btime cpu_calc!($vin, $phir, $phis, $phit, $vout) | |
@show vout | |
#-------------------------------------------- | |
d_vout = CuArray{FT}(undef, sr, ss, st) | |
d_vin = CuArray(vin) | |
d_phir = CuArray(phir) | |
d_phis = CuArray(phis) | |
d_phit = CuArray(phit) | |
#-------------------------------------------- | |
println() | |
println() | |
println("GPU launch version 1 (static shared memory + val) + time consumed") | |
println("===========================") | |
#---------------------- | |
fill!(d_vout, FT(0.0)) | |
@btime CuArrays.@sync @cuda threads=($qm1,$qm1,$qm1) shmem=$(qm1*qm1*qm1*sizeof(eltype(d_vin))) gpu_calc1!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout, $(Val(qm1))) | |
@show d_vout | |
#-------------------------------------------- | |
println() | |
println() | |
println("GPU launch version 2 + (static shared memory + val) + time consumed") | |
println("===========================") | |
#---------------------- | |
fill!(d_vout, FT(0.0)) | |
println("timing using btime") | |
@btime CuArrays.@sync @cuda threads=($qm1,$qm1) shmem=$(qm1*qm1*sizeof(eltype(d_vin))) gpu_calc2!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout, $(Val(qm1))) | |
@show d_vout | |
#---------------------- | |
#-------------------------------------------- | |
println() | |
println() | |
println("GPU launch version 1 with dynamic shared memory with size specified at launch + time consumed") | |
println("===========================") | |
#---------------------- | |
fill!(d_vout, FT(0.0)) | |
@btime CuArrays.@sync @cuda threads=($qm1,$qm1,$qm1) shmem=$(qm1*qm1*qm1*sizeof(eltype(d_vin))) gpu_calc3!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout) | |
@show d_vout | |
#-------------------------------------------- | |
#-------------------------------------------- | |
println() | |
println() | |
println("GPU launch version 2 with dynamic shared memory with size specified at launch + time consumed") | |
println("===========================") | |
#---------------------- | |
fill!(d_vout, FT(0.0)) | |
@btime CuArrays.@sync @cuda threads=($qm1,$qm1) shmem=$(qm1*qm1*sizeof(eltype(d_vin))) gpu_calc4!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout) | |
@show d_vout | |
#-------------------------------------------- | |
println() | |
println() | |
println("GPUifyLoops launch version 1 + time consumed") | |
println("===========================") | |
device = typeof(d_vin) <: Array ? CPU() : CUDA() | |
println("device = $device") | |
#---------------------- | |
fill!(d_vout, FT(0.0)) | |
@btime CuArrays.@sync @launch($device, threads=($qm1,$qm1,$qm1), blocks=1, GPUify_Loops_mixed_calc!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout, $(Val(qm1)))) | |
@show d_vout | |
#---------------------- | |
println() | |
println() | |
println("GPUifyLoops launch version 2 + time consumed") | |
println("===========================") | |
device = typeof(d_vin) <: Array ? CPU() : CUDA() | |
println("device = $device") | |
#---------------------- | |
fill!(d_vout, FT(0.0)) | |
@btime CuArrays.@sync @launch($device, threads=($qm1,$qm1), blocks=1, GPUify_Loops_mixed_calc2!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout, $(Val(qm1)))) | |
@show d_vout | |
#---------------------- | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment