Skip to content

Instantly share code, notes, and snippets.

@luraess
Created February 16, 2023 17:28
Show Gist options
  • Save luraess/5e697f857a7aa4d1d00e99ca02cbbb3d to your computer and use it in GitHub Desktop.
Save luraess/5e697f857a7aa4d1d00e99ca02cbbb3d to your computer and use it in GitHub Desktop.
2D Laplacian (diffusion step) kernel to test bounds-checking performance on AMDGPU
using BenchmarkTools, AMDGPU
"""
    diff2D_step_inbounds!(T2, T, Ci, lam, dt, _dx, _dy)

Kernel body that writes one updated value of `T2` from a 5-point stencil on `T`,
with every array access wrapped in `@inbounds`.

The calling work-item derives a 1-based index `(ix, iy)` from its workgroup and
work-item ids. If that index is strictly inside `T2` (not on its first or last
row/column), it stores

    T2[ix,iy] = T[ix,iy] + dt * Ci[ix,iy] * (-(qx_hi - qx_lo)*_dx - (qy_hi - qy_lo)*_dy)

where each `q` is `-lam * (difference of neighbouring T values) * _dx` (or `_dy`).
Work-items whose index falls outside that interior do nothing.

# Arguments
- `T2`: output array; its size defines the interior guard.
- `T`: input array read at `(ix, iy)` and its four neighbours.
- `Ci`: per-point coefficient array read at `(ix, iy)`.
- `lam`, `dt`: scalar factors.
- `_dx`, `_dy`: scalar multipliers — presumably inverse grid spacings; confirm
  against the caller.

NOTE(review): `T` and `Ci` are indexed under `@inbounds` using a guard computed
from `T2` only, so this assumes all three arrays have the same size.
"""
function diff2D_step_inbounds!(T2, T, Ci, lam, dt, _dx, _dy)
    # 1-based global index of this work-item along each dimension.
    ix = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
    iy = (workgroupIdx().y - 1) * workgroupDim().y + workitemIdx().y

    # Only strictly interior points are updated; everything else exits early.
    if !(1 < ix < size(T2, 1) && 1 < iy < size(T2, 2))
        return
    end

    @inbounds begin
        # Flux-like terms on the two faces in each direction.
        qx_hi = -lam * (T[ix + 1, iy] - T[ix, iy]) * _dx
        qx_lo = -lam * (T[ix, iy] - T[ix - 1, iy]) * _dx
        qy_hi = -lam * (T[ix, iy + 1] - T[ix, iy]) * _dy
        qy_lo = -lam * (T[ix, iy] - T[ix, iy - 1]) * _dy

        # Negated difference of the x terms; the y part is subtracted below.
        div_x = (-(qx_hi - qx_lo)) * _dx

        T2[ix, iy] = T[ix, iy] + dt * (Ci[ix, iy] * (div_x - (qy_hi - qy_lo) * _dy))
    end
    return
end
"""
    diff2D_step!(T2, T, Ci, lam, dt, _dx, _dy)

Kernel body that writes one updated value of `T2` from a 5-point stencil on `T`.
Unlike the `@inbounds` variant, array accesses here carry no `@inbounds`
annotation, so whether they are bounds-checked is left to the launch settings.

The calling work-item derives a 1-based index `(ix, iy)` from its workgroup and
work-item ids. If that index is strictly inside `T2` (not on its first or last
row/column), it stores

    T2[ix,iy] = T[ix,iy] + dt * Ci[ix,iy] * (-(qx_hi - qx_lo)*_dx - (qy_hi - qy_lo)*_dy)

where each `q` is `-lam * (difference of neighbouring T values) * _dx` (or `_dy`).
Work-items whose index falls outside that interior do nothing.

# Arguments
- `T2`: output array; its size defines the interior guard.
- `T`: input array read at `(ix, iy)` and its four neighbours.
- `Ci`: per-point coefficient array read at `(ix, iy)`.
- `lam`, `dt`: scalar factors.
- `_dx`, `_dy`: scalar multipliers — presumably inverse grid spacings; confirm
  against the caller.
"""
function diff2D_step!(T2, T, Ci, lam, dt, _dx, _dy)
    # 1-based global index of this work-item along each dimension.
    ix = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
    iy = (workgroupIdx().y - 1) * workgroupDim().y + workitemIdx().y

    # Only strictly interior points are updated; everything else exits early.
    if !(1 < ix < size(T2, 1) && 1 < iy < size(T2, 2))
        return
    end

    # Flux-like terms on the two faces in each direction.
    qx_hi = -lam * (T[ix + 1, iy] - T[ix, iy]) * _dx
    qx_lo = -lam * (T[ix, iy] - T[ix - 1, iy]) * _dx
    qy_hi = -lam * (T[ix, iy + 1] - T[ix, iy]) * _dy
    qy_lo = -lam * (T[ix, iy] - T[ix, iy - 1]) * _dy

    # Negated difference of the x terms; the y part is subtracted below.
    div_x = (-(qx_hi - qx_lo)) * _dx

    T2[ix, iy] = T[ix, iy] + dt * (Ci[ix, iy] * (div_x - (qy_hi - qy_lo) * _dy))
    return
end
"""
    run_bench(; DAT=Float64)

Time three launches of the 2D diffusion-step kernel on an AMD GPU and print a
throughput figure for each:

1. `diff2D_step_inbounds!` (accesses wrapped in `@inbounds`),
2. `diff2D_step!` with default launch settings,
3. `diff2D_step!` launched with `boundscheck=false`.

Each timing is the `@belapsed` result of resetting a signal, launching the
kernel asynchronously on a dedicated high-priority queue, and waiting on the
signal. The printed value is `3 * nx * ny * sizeof(DAT) / 1e9 / t`, labelled
GB/s — the factor 3 presumably counts the three arrays touched per point
(one written, two read); confirm.

# Keywords
- `DAT`: element type of the arrays and scalars. For any type other than
  `Float64` the first array dimension is doubled.

Returns `nothing`.
"""
function run_bench(; DAT=Float64)
    # Non-Float64 element types get a first dimension twice as large.
    if DAT == Float64
        sc = 1
    else
        sc = 2
    end
    fact = 24

    # Problem size; nz is only reported, never used for allocation.
    nx = sc * fact * 1024 - 1
    ny = fact * 1024 - 1
    nz = 1

    # Launch configuration.
    # NOTE(review): whether `gridsize` counts work-items or workgroups depends on
    # the AMDGPU.jl version in use — confirm.
    threads = (128, 2, 1)
    grid = (nx + 1, ny + 1)

    # Device arrays: output, input field, per-point coefficient.
    T_out = ROCArray(zeros(DAT, nx, ny))
    T_in = ROCArray(rand(DAT, nx, ny))
    coef = ROCArray(ones(DAT, nx, ny))

    # Scalar kernel parameters.
    lam = rand(DAT)
    _dx = DAT(1.0)
    _dy = DAT(1.0)
    dt = DAT(1.0/10.0/4.1)

    println("Process selecting device $(AMDGPU.default_device_id())")
    println("Problem size: nx=$nx, ny=$ny, nz=$nz, $(DAT)")
    println("ROCm grid=$(grid), threads=$(threads)")

    # Signal and queue shared by all timed launches.
    sig = ROCSignal()
    rocqueue = ROCQueue(AMDGPU.default_device(); priority=:high)

    # Numerator of the throughput figure; identical for all three runs.
    gb_per_pass = 3*1/1e9*nx*ny*sizeof(DAT)

    # 1) Kernel with @inbounds accesses.
    t_inbounds = @belapsed begin
        AMDGPU.HSA.signal_store_screlease($(sig.signal),1)
        @roc wait=false mark=false signal=$sig queue=$rocqueue groupsize=$threads gridsize=$grid diff2D_step_inbounds!($T_out, $T_in, $coef, $lam, $dt, $_dx, $_dy)
        wait($sig)
    end
    T_tot = gb_per_pass / t_inbounds
    println("T_tot Lap2D inbounds = $(round(T_tot,sigdigits=7)) GB/s")

    # 2) Kernel without @inbounds, default launch settings.
    t_default = @belapsed begin
        AMDGPU.HSA.signal_store_screlease($(sig.signal),1)
        @roc wait=false mark=false signal=$sig queue=$rocqueue groupsize=$threads gridsize=$grid diff2D_step!($T_out, $T_in, $coef, $lam, $dt, $_dx, $_dy)
        wait($sig)
    end
    T_tot = gb_per_pass / t_default
    println("T_tot Lap2D no inbounds = $(round(T_tot,sigdigits=7)) GB/s")

    # 3) Kernel without @inbounds, launched with boundscheck=false.
    t_nocheck = @belapsed begin
        AMDGPU.HSA.signal_store_screlease($(sig.signal),1)
        @roc wait=false mark=false boundscheck=false signal=$sig queue=$rocqueue groupsize=$threads gridsize=$grid diff2D_step!($T_out, $T_in, $coef, $lam, $dt, $_dx, $_dy)
        wait($sig)
    end
    T_tot = gb_per_pass / t_nocheck
    println("T_tot Lap2D boundscheck=false = $(round(T_tot,sigdigits=7)) GB/s")

    return
end
# Script entry point: run the benchmark with the default element type (Float64).
run_bench()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment