simonbyrne/testgpu.jl

## testgpu.jl
using CUDAdrv
using CUDAnative
using CuArrays
using CUDAapi
using GPUifyLoops
using BenchmarkTools
using StaticArrays
using Test


#-----------------------------------------------
function cpu_calc!(vin, phir, phis, phit, vout)
    vout_l = 0
    i = 1; j = 1; k = 1

    for ik in 1:qm1
	v_ij_l = 0
	for ij in 1:qm1
	    v_ii_l = 0
	    for ii in 1:qm1
		@inbounds v_ii_l += vin[ii + (ij-1)*qm1 + (ik-1)*qm1*qm1] * phir[i,ii]
	    end # ii loop
	    @inbounds v_ij_l += v_ii_l * phis[j,ij]
	end # ij loop
	@inbounds vout_l += v_ij_l * phit[k,ik]
    end # ik loop
    vout[i,j,k] = vout_l

    return nothing
end
#-----------------------------------------------
#-----------------------------------------------
function gpu_calc1!(d_vin, d_phir, d_phis, d_phit, d_vout, ::Val{qm1}) where {qm1}
    FT = eltype(d_vin)
    ii = threadIdx().x; ij = threadIdx().y;	ik = threadIdx().z
    sx = blockDim().x; 	sy = blockDim().y;	sz = blockDim().z
    bx = blockIdx().x; 	by = blockIdx().y;	bz = blockIdx().z
    vout_ijk = @cuStaticSharedMem(FT, qm1*qm1*qm1)

    loc = ii + (ij-1)*sx + (ik-1)*sx*sy

    vout_ijk[loc] = d_vin[ii,ij,ik] * d_phir[1,ii] * d_phis[1,ij] * d_phit[1,ik]
    sync_threads()

    # Reduction
    if ii == 1 && ij==1 && ik==1
    	temp = 0.0
        for i in 1:sx*sy*sz
	    @inbounds temp += vout_ijk[i]
        end
	@inbounds d_vout[1,1,1] = temp
    end

    return nothing
end
#-----------------------------------------------
#-----------------------------------------------
function gpu_calc2!(d_vin, d_phir, d_phis, d_phit, d_vout, ::Val{qm1}) where {qm1}
    FT = eltype(d_vin)
    ij = threadIdx().x; ik = threadIdx().y;

    vout_jk = @cuStaticSharedMem(FT, (qm1,qm1))
    vout_jk[ij,ik] = FT(0)

    for i in 1:qm1
        @inbounds vout_jk[ij,ik] += d_vin[i,ij,ik]*d_phir[1,i]
    end

    @inbounds vout_jk[ij,ik] *= d_phis[1,ij]
    @synchronize

    if ij==1 # Reduction and multiplication with phit
        for i = 2:qm1
            @inbounds vout_jk[1,ik] += vout_jk[i,ik]
        end
        @inbounds vout_jk[1,ik] *= d_phit[1,ik]
    end

    @synchronize
    if ij==1 & ik==1
        for k in 2:qm1
            @inbounds vout_jk[1,1] += vout_jk[1,k]
        end
        d_vout[1,1,1] = vout_jk[1,1]
    end

    return nothing
end

#-----------------------------------------------
#-----------------------------------------------
function gpu_calc3!(d_vin, d_phir, d_phis, d_phit, d_vout)
    FT = eltype(d_vin)
    qm1 = length(d_phir)
    ii = threadIdx().x; ij = threadIdx().y;	ik = threadIdx().z
    sx = blockDim().x; 	sy = blockDim().y;	sz = blockDim().z
    bx = blockIdx().x; 	by = blockIdx().y;	bz = blockIdx().z

    vout_ijk = @cuDynamicSharedMem(FT, qm1*qm1*qm1)

    loc = ii + (ij-1)*sx + (ik-1)*sx*sy

    vout_ijk[loc] = d_vin[ii,ij,ik] * d_phir[1,ii] * d_phis[1,ij] * d_phit[1,ik]
    #    vout_ijk[ii,ij,ik] = d_vin[ii,ij,ik] * d_phir[1,ii] * d_phis[1,ij] * d_phit[1,ik]
    sync_threads()

    # Reduction
    if ii == 1 && ij==1 && ik==1
    	temp = 0.0
        for i in 1:sx*sy*sz
	    @inbounds temp += vout_ijk[i]
        end
	@inbounds d_vout[1,1,1] = temp
    end

    return nothing
end
#-----------------------------------------------
#-----------------------------------------------
function gpu_calc4!(d_vin, d_phir, d_phis, d_phit, d_vout)
    FT = eltype(d_vin)
    ij = threadIdx().x; ik = threadIdx().y;

    qm1 = length(d_phir)

    vout_jk = @cuDynamicSharedMem(FT, (qm1,qm1))
    vout_jk[ij,ik] = FT(0)

    for i in 1:qm1
        @inbounds vout_jk[ij,ik] += d_vin[i,ij,ik]*d_phir[1,i]
    end

    @inbounds vout_jk[ij,ik] *= d_phis[1,ij]
    @synchronize

    if ij==1 # Reduction and multiplication with phit
        for i = 2:qm1
            @inbounds vout_jk[1,ik] += vout_jk[i,ik]
        end
        @inbounds vout_jk[1,ik] *= d_phit[1,ik]
    end

    @synchronize
    if ij==1 & ik==1
        for k in 2:qm1
            @inbounds vout_jk[1,1] += vout_jk[1,k]
        end
        d_vout[1,1,1] = vout_jk[1,1]
    end

    return nothing
end

#-----------------------------------------------
#-----------------------------------------------
function GPUify_Loops_mixed_calc!(d_vin, d_phir, d_phis, d_phit, d_vout, ::Val{qm1}) where {qm1}
    FT = eltype(d_vin)
    ii = threadIdx().x; ij = threadIdx().y;	ik = threadIdx().z
    #    @cuprintln("sx = $sx; sy = $sy; sz = $sz;")
    vout_ijk = @shmem FT (qm1*qm1*qm1,)
    loc = ii + (ij-1)*qm1 + (ik-1)*qm1*qm1
    vout_ijk[loc] = d_vin[ii,ij,ik] * d_phir[1,ii] * d_phis[1,ij] * d_phit[1,ik]

    # Reduction
    if ii == 1 && ij==1 && ik==1
        temp = 0.0
        for i in 1:qm1*qm1*qm1
            @inbounds temp += vout_ijk[i]
        end
        @inbounds d_vout[1,1,1] = temp
    end
    return nothing
end
#-----------------------------------------------
#-----------------------------------------------
function GPUify_Loops_mixed_calc2!(d_vin, d_phir, d_phis, d_phit, d_vout, ::Val{qm1}) where {qm1}
    FT = eltype(d_vin)
    ij = threadIdx().x; ik = threadIdx().y;
    #   @cuprintln("sx = $sx; sy = $sy; sz = $sz;")
    vout_jk = @shmem FT (qm1,qm1)
    vout_jk[ij,ik] = FT(0)

    for i in 1:qm1
        @inbounds vout_jk[ij,ik] += d_vin[i,ij,ik]*d_phir[1,i]
    end

    @inbounds vout_jk[ij,ik] *= d_phis[1,ij]
    @synchronize

    if ij==1 # Reduction and multiplication with phit
        for i = 2:qm1
       	    @inbounds vout_jk[1,ik] += vout_jk[i,ik]
	end
        @inbounds vout_jk[1,ik] *= d_phit[1,ik]
    end

    @synchronize
    if ij==1 & ik==1
        for k in 2:qm1
            @inbounds vout_jk[1,1] += vout_jk[1,k]
        end
        d_vout[1,1,1] = vout_jk[1,1]
    end

    return nothing
end
#-----------------------------------------------


FT = Float64
poly_order = 5
qm1 = poly_order + 1

sr = 1; ss = 1; st = 1

vin  = rand(FT, qm1, qm1, qm1)
vout = zeros(FT, sr, ss, st)
phir = rand(FT, sr, qm1)
phis = rand(FT, ss, qm1)
phit = rand(FT, st, qm1)

println("qm1 = ",qm1)
println("size(phir) = ",size(phir))
println("size(phis) = ",size(phis))
println("size(phit) = ",size(phit))


#--------------------------------------------
println("CPU launch + time consumed")
println("===========================")
@btime cpu_calc!($vin, $phir, $phis, $phit, $vout)
@show vout
#--------------------------------------------

d_vout = CuArray{FT}(undef, sr, ss, st)

d_vin  = CuArray(vin)
d_phir = CuArray(phir)
d_phis = CuArray(phis)
d_phit = CuArray(phit)


#--------------------------------------------
println()
println()
println("GPU launch version 1 (static shared memory + val) + time consumed")
println("===========================")
#----------------------
fill!(d_vout, FT(0.0))
@btime CuArrays.@sync @cuda threads=($qm1,$qm1,$qm1) shmem=$(qm1*qm1*qm1*sizeof(eltype(d_vin))) gpu_calc1!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout, $(Val(qm1)))
@show d_vout
#--------------------------------------------

println()
println()
println("GPU launch version 2 + (static shared memory + val) + time consumed")
println("===========================")
#----------------------
fill!(d_vout, FT(0.0))
println("timing using btime")
@btime CuArrays.@sync @cuda threads=($qm1,$qm1) shmem=$(qm1*qm1*sizeof(eltype(d_vin))) gpu_calc2!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout, $(Val(qm1)))
@show d_vout
#----------------------

#--------------------------------------------
println()
println()
println("GPU launch version 1 with dynamic shared memory with size specified at launch + time consumed")
println("===========================")
#----------------------
fill!(d_vout, FT(0.0))
@btime CuArrays.@sync @cuda threads=($qm1,$qm1,$qm1) shmem=$(qm1*qm1*qm1*sizeof(eltype(d_vin))) gpu_calc3!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout)
@show d_vout
#--------------------------------------------

#--------------------------------------------
println()
println()
println("GPU launch version 2 with dynamic shared memory with size specified at launch + time consumed")
println("===========================")
#----------------------
fill!(d_vout, FT(0.0))
@btime CuArrays.@sync @cuda threads=($qm1,$qm1) shmem=$(qm1*qm1*sizeof(eltype(d_vin))) gpu_calc4!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout)
@show d_vout
#--------------------------------------------


println()
println()
println("GPUifyLoops launch version 1 + time consumed")
println("===========================")

device = typeof(d_vin) <: Array ? CPU() : CUDA()
println("device  = $device")
#----------------------
fill!(d_vout, FT(0.0))
@btime CuArrays.@sync @launch($device, threads=($qm1,$qm1,$qm1), blocks=1, GPUify_Loops_mixed_calc!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout, $(Val(qm1))))
@show d_vout
#----------------------


println()
println()
println("GPUifyLoops launch version 2 + time consumed")
println("===========================")

device = typeof(d_vin) <: Array ? CPU() : CUDA()
println("device  = $device")
#----------------------
fill!(d_vout, FT(0.0))
@btime CuArrays.@sync @launch($device, threads=($qm1,$qm1), blocks=1, GPUify_Loops_mixed_calc2!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout, $(Val(qm1))))
@show d_vout
#----------------------
	using CUDAdrv
	using CUDAnative
	using CuArrays
	using CUDAapi
	using GPUifyLoops
	using BenchmarkTools
	using StaticArrays
	using Test


	#-----------------------------------------------
	function cpu_calc!(vin, phir, phis, phit, vout)
	vout_l = 0
	i = 1; j = 1; k = 1

	for ik in 1:qm1
	v_ij_l = 0
	for ij in 1:qm1
	v_ii_l = 0
	for ii in 1:qm1
	@inbounds v_ii_l += vin[ii + (ij-1)qm1 + (ik-1)qm1qm1] phir[i,ii]
	end # ii loop
	@inbounds v_ij_l += v_ii_l * phis[j,ij]
	end # ij loop
	@inbounds vout_l += v_ij_l * phit[k,ik]
	end # ik loop
	vout[i,j,k] = vout_l

	return nothing
	end
	#-----------------------------------------------
	#-----------------------------------------------
	function gpu_calc1!(d_vin, d_phir, d_phis, d_phit, d_vout, ::Val{qm1}) where {qm1}
	FT = eltype(d_vin)
	ii = threadIdx().x; ij = threadIdx().y; ik = threadIdx().z
	sx = blockDim().x; sy = blockDim().y; sz = blockDim().z
	bx = blockIdx().x; by = blockIdx().y; bz = blockIdx().z
	vout_ijk = @cuStaticSharedMem(FT, qm1qm1qm1)

	loc = ii + (ij-1)sx + (ik-1)sx*sy

	vout_ijk[loc] = d_vin[ii,ij,ik] * d_phir[1,ii] * d_phis[1,ij] * d_phit[1,ik]
	sync_threads()

	# Reduction
	if ii == 1 && ij==1 && ik==1
	temp = 0.0
	for i in 1:sxsysz
	@inbounds temp += vout_ijk[i]
	end
	@inbounds d_vout[1,1,1] = temp
	end

	return nothing
	end
	#-----------------------------------------------
	#-----------------------------------------------
	function gpu_calc2!(d_vin, d_phir, d_phis, d_phit, d_vout, ::Val{qm1}) where {qm1}
	FT = eltype(d_vin)
	ij = threadIdx().x; ik = threadIdx().y;

	vout_jk = @cuStaticSharedMem(FT, (qm1,qm1))
	vout_jk[ij,ik] = FT(0)

	for i in 1:qm1
	@inbounds vout_jk[ij,ik] += d_vin[i,ij,ik]*d_phir[1,i]
	end

	@inbounds vout_jk[ij,ik] *= d_phis[1,ij]
	@synchronize

	if ij==1 # Reduction and multiplication with phit
	for i = 2:qm1
	@inbounds vout_jk[1,ik] += vout_jk[i,ik]
	end
	@inbounds vout_jk[1,ik] *= d_phit[1,ik]
	end

	@synchronize
	if ij==1 & ik==1
	for k in 2:qm1
	@inbounds vout_jk[1,1] += vout_jk[1,k]
	end
	d_vout[1,1,1] = vout_jk[1,1]
	end

	return nothing
	end

	#-----------------------------------------------
	#-----------------------------------------------
	function gpu_calc3!(d_vin, d_phir, d_phis, d_phit, d_vout)
	FT = eltype(d_vin)
	qm1 = length(d_phir)
	ii = threadIdx().x; ij = threadIdx().y; ik = threadIdx().z
	sx = blockDim().x; sy = blockDim().y; sz = blockDim().z
	bx = blockIdx().x; by = blockIdx().y; bz = blockIdx().z

	vout_ijk = @cuDynamicSharedMem(FT, qm1qm1qm1)

	loc = ii + (ij-1)sx + (ik-1)sx*sy

	vout_ijk[loc] = d_vin[ii,ij,ik] * d_phir[1,ii] * d_phis[1,ij] * d_phit[1,ik]
	# vout_ijk[ii,ij,ik] = d_vin[ii,ij,ik] * d_phir[1,ii] * d_phis[1,ij] * d_phit[1,ik]
	sync_threads()

	# Reduction
	if ii == 1 && ij==1 && ik==1
	temp = 0.0
	for i in 1:sxsysz
	@inbounds temp += vout_ijk[i]
	end
	@inbounds d_vout[1,1,1] = temp
	end

	return nothing
	end
	#-----------------------------------------------
	#-----------------------------------------------
	function gpu_calc4!(d_vin, d_phir, d_phis, d_phit, d_vout)
	FT = eltype(d_vin)
	ij = threadIdx().x; ik = threadIdx().y;

	qm1 = length(d_phir)

	vout_jk = @cuDynamicSharedMem(FT, (qm1,qm1))
	vout_jk[ij,ik] = FT(0)

	for i in 1:qm1
	@inbounds vout_jk[ij,ik] += d_vin[i,ij,ik]*d_phir[1,i]
	end

	@inbounds vout_jk[ij,ik] *= d_phis[1,ij]
	@synchronize

	if ij==1 # Reduction and multiplication with phit
	for i = 2:qm1
	@inbounds vout_jk[1,ik] += vout_jk[i,ik]
	end
	@inbounds vout_jk[1,ik] *= d_phit[1,ik]
	end

	@synchronize
	if ij==1 & ik==1
	for k in 2:qm1
	@inbounds vout_jk[1,1] += vout_jk[1,k]
	end
	d_vout[1,1,1] = vout_jk[1,1]
	end

	return nothing
	end

	#-----------------------------------------------
	#-----------------------------------------------
	function GPUify_Loops_mixed_calc!(d_vin, d_phir, d_phis, d_phit, d_vout, ::Val{qm1}) where {qm1}
	FT = eltype(d_vin)
	ii = threadIdx().x; ij = threadIdx().y; ik = threadIdx().z
	# @cuprintln("sx = $sx; sy = $sy; sz = $sz;")
	vout_ijk = @shmem FT (qm1qm1qm1,)
	loc = ii + (ij-1)qm1 + (ik-1)qm1*qm1
	vout_ijk[loc] = d_vin[ii,ij,ik] * d_phir[1,ii] * d_phis[1,ij] * d_phit[1,ik]

	# Reduction
	if ii == 1 && ij==1 && ik==1
	temp = 0.0
	for i in 1:qm1qm1qm1
	@inbounds temp += vout_ijk[i]
	end
	@inbounds d_vout[1,1,1] = temp
	end
	return nothing
	end
	#-----------------------------------------------
	#-----------------------------------------------
	function GPUify_Loops_mixed_calc2!(d_vin, d_phir, d_phis, d_phit, d_vout, ::Val{qm1}) where {qm1}
	FT = eltype(d_vin)
	ij = threadIdx().x; ik = threadIdx().y;
	# @cuprintln("sx = $sx; sy = $sy; sz = $sz;")
	vout_jk = @shmem FT (qm1,qm1)
	vout_jk[ij,ik] = FT(0)

	for i in 1:qm1
	@inbounds vout_jk[ij,ik] += d_vin[i,ij,ik]*d_phir[1,i]
	end

	@inbounds vout_jk[ij,ik] *= d_phis[1,ij]
	@synchronize

	if ij==1 # Reduction and multiplication with phit
	for i = 2:qm1
	@inbounds vout_jk[1,ik] += vout_jk[i,ik]
	end
	@inbounds vout_jk[1,ik] *= d_phit[1,ik]
	end

	@synchronize
	if ij==1 & ik==1
	for k in 2:qm1
	@inbounds vout_jk[1,1] += vout_jk[1,k]
	end
	d_vout[1,1,1] = vout_jk[1,1]
	end

	return nothing
	end
	#-----------------------------------------------




	FT = Float64
	poly_order = 5
	qm1 = poly_order + 1

	sr = 1; ss = 1; st = 1

	vin = rand(FT, qm1, qm1, qm1)
	vout = zeros(FT, sr, ss, st)
	phir = rand(FT, sr, qm1)
	phis = rand(FT, ss, qm1)
	phit = rand(FT, st, qm1)

	println("qm1 = ",qm1)
	println("size(phir) = ",size(phir))
	println("size(phis) = ",size(phis))
	println("size(phit) = ",size(phit))




	#--------------------------------------------
	println("CPU launch + time consumed")
	println("===========================")
	@btime cpu_calc!($vin, $phir, $phis, $phit, $vout)
	@show vout
	#--------------------------------------------

	d_vout = CuArray{FT}(undef, sr, ss, st)

	d_vin = CuArray(vin)
	d_phir = CuArray(phir)
	d_phis = CuArray(phis)
	d_phit = CuArray(phit)




	#--------------------------------------------
	println()
	println()
	println("GPU launch version 1 (static shared memory + val) + time consumed")
	println("===========================")
	#----------------------
	fill!(d_vout, FT(0.0))
	@btime CuArrays.@sync @cuda threads=($qm1,$qm1,$qm1) shmem=$(qm1qm1qm1*sizeof(eltype(d_vin))) gpu_calc1!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout, $(Val(qm1)))
	@show d_vout
	#--------------------------------------------

	println()
	println()
	println("GPU launch version 2 + (static shared memory + val) + time consumed")
	println("===========================")
	#----------------------
	fill!(d_vout, FT(0.0))
	println("timing using btime")
	@btime CuArrays.@sync @cuda threads=($qm1,$qm1) shmem=$(qm1qm1sizeof(eltype(d_vin))) gpu_calc2!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout, $(Val(qm1)))
	@show d_vout
	#----------------------

	#--------------------------------------------
	println()
	println()
	println("GPU launch version 1 with dynamic shared memory with size specified at launch + time consumed")
	println("===========================")
	#----------------------
	fill!(d_vout, FT(0.0))
	@btime CuArrays.@sync @cuda threads=($qm1,$qm1,$qm1) shmem=$(qm1qm1qm1*sizeof(eltype(d_vin))) gpu_calc3!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout)
	@show d_vout
	#--------------------------------------------

	#--------------------------------------------
	println()
	println()
	println("GPU launch version 2 with dynamic shared memory with size specified at launch + time consumed")
	println("===========================")
	#----------------------
	fill!(d_vout, FT(0.0))
	@btime CuArrays.@sync @cuda threads=($qm1,$qm1) shmem=$(qm1qm1sizeof(eltype(d_vin))) gpu_calc4!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout)
	@show d_vout
	#--------------------------------------------


	println()
	println()
	println("GPUifyLoops launch version 1 + time consumed")
	println("===========================")

	device = typeof(d_vin) <: Array ? CPU() : CUDA()
	println("device = $device")
	#----------------------
	fill!(d_vout, FT(0.0))
	@btime CuArrays.@sync @launch($device, threads=($qm1,$qm1,$qm1), blocks=1, GPUify_Loops_mixed_calc!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout, $(Val(qm1))))
	@show d_vout
	#----------------------


	println()
	println()
	println("GPUifyLoops launch version 2 + time consumed")
	println("===========================")

	device = typeof(d_vin) <: Array ? CPU() : CUDA()
	println("device = $device")
	#----------------------
	fill!(d_vout, FT(0.0))
	@btime CuArrays.@sync @launch($device, threads=($qm1,$qm1), blocks=1, GPUify_Loops_mixed_calc2!($d_vin, $d_phir, $d_phis, $d_phit, $d_vout, $(Val(qm1))))
	@show d_vout
	#----------------------