Created
April 13, 2021 23:28
-
-
Save mwarusz/98443299b8b11627b971e28eb43e8380 to your computer and use it in GitHub Desktop.
volumerhs indexing vs performance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Info: volumerhs_orig! details: | |
│ - 171 registers, max 256 threads | |
│ - 0 bytes local memory, | |
│ 1.113 KiB shared memory, | |
└ 0 bytes constant memory | |
Trial(7.074 ms) | |
┌ Info: volumerhs_ijk! details: | |
│ - 136 registers, max 384 threads | |
│ - 0 bytes local memory, | |
│ 1.113 KiB shared memory, | |
└ 0 bytes constant memory | |
Trial(5.675 ms) | |
┌ Info: volumerhs_linear! details: | |
│ - 86 registers, max 640 threads | |
│ - 0 bytes local memory, | |
│ 1.113 KiB shared memory, | |
└ 0 bytes constant memory | |
Trial(4.661 ms) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Info: volumerhs_orig! details: | |
│ - 250 registers, max 256 threads | |
│ - 0 bytes local memory, | |
│ 2.195 KiB shared memory, | |
└ 0 bytes constant memory | |
Trial(9.162 ms) | |
┌ Info: volumerhs_ijk! details: | |
│ - 214 registers, max 256 threads | |
│ - 0 bytes local memory, | |
│ 2.195 KiB shared memory, | |
└ 0 bytes constant memory | |
Trial(9.129 ms) | |
┌ Info: volumerhs_linear! details: | |
│ - 134 registers, max 384 threads | |
│ - 0 bytes local memory, | |
│ 2.195 KiB shared memory, | |
└ 0 bytes constant memory | |
Trial(8.625 ms) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module VolumeRHS | |
using BenchmarkTools | |
using CUDA | |
using StableRNGs | |
using StaticArrays | |
function loopinfo(name, expr, nodes...) | |
if expr.head != :for | |
error("Syntax error: pragma $name needs a for loop") | |
end | |
push!(expr.args[2].args, Expr(:loopinfo, nodes...)) | |
return expr | |
end | |
macro unroll(expr) | |
expr = loopinfo("@unroll", expr, (Symbol("llvm.loop.unroll.full"),)) | |
return esc(expr) | |
end | |
# HACK: module-local versions of core arithmetic; needed to get FMA | |
for (jlf, f) in zip((:+, :*, :-), (:add, :mul, :sub)) | |
for (T, llvmT) in ((Float32, "float"), (Float64, "double")) | |
ir = """ | |
%x = f$f contract nsz $llvmT %0, %1 | |
ret $llvmT %x | |
""" | |
@eval begin | |
# the @pure is necessary so that we can constant propagate. | |
Base.@pure function $jlf(a::$T, b::$T) | |
Base.@_inline_meta | |
Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b) | |
end | |
end | |
end | |
@eval function $jlf(args...) | |
Base.$jlf(args...) | |
end | |
end | |
let (jlf, f) = (:div_arcp, :div) | |
for (T, llvmT) in ((Float32, "float"), (Float64, "double")) | |
ir = """ | |
%x = f$f fast $llvmT %0, %1 | |
ret $llvmT %x | |
""" | |
@eval begin | |
# the @pure is necessary so that we can constant propagate. | |
Base.@pure function $jlf(a::$T, b::$T) | |
@Base._inline_meta | |
Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b) | |
end | |
end | |
end | |
@eval function $jlf(args...) | |
Base.$jlf(args...) | |
end | |
end | |
rcp(x) = div_arcp(one(x), x) # still leads to rcp.rn which is also a function call | |
# div_fast(x::Float32, y::Float32) = ccall("extern __nv_fast_fdividef", llvmcall, Cfloat, (Cfloat, Cfloat), x, y) | |
# rcp(x) = div_fast(one(x), x) | |
# note the order of the fields below is also assumed in the code. | |
const _nstate = 5 | |
const _ρ, _U, _V, _W, _E = 1:_nstate | |
const stateid = (ρ = _ρ, U = _U, V = _V, W = _W, E = _E) | |
const _nvgeo = 14 | |
const _ξx, _ηx, _ζx, _ξy, _ηy, _ζy, _ξz, _ηz, _ζz, _MJ, _MJI, | |
_x, _y, _z = 1:_nvgeo | |
const vgeoid = (ξx = _ξx, ηx = _ηx, ζx = _ζx, | |
ξy = _ξy, ηy = _ηy, ζy = _ζy, | |
ξz = _ξz, ηz = _ηz, ζz = _ζz, | |
MJ = _MJ, MJI = _MJI, | |
x = _x, y = _y, z = _z) | |
const N = 4 | |
const nmoist = 0 | |
const ntrace = 0 | |
Base.@irrational grav 9.81 BigFloat(9.81) | |
Base.@irrational gdm1 0.4 BigFloat(0.4) | |
function volumerhs!(rhs, Q, vgeo, gravity, D, nelem) | |
Q = Base.Experimental.Const(Q) | |
vgeo = Base.Experimental.Const(vgeo) | |
D = Base.Experimental.Const(D) | |
nvar = _nstate + nmoist + ntrace | |
Nq = N + 1 | |
s_D = @cuStaticSharedMem eltype(D) (Nq, Nq) | |
s_F = @cuStaticSharedMem eltype(Q) (Nq, Nq, _nstate) | |
s_G = @cuStaticSharedMem eltype(Q) (Nq, Nq, _nstate) | |
r_rhsρ = MArray{Tuple{Nq}, eltype(rhs)}(undef) | |
r_rhsU = MArray{Tuple{Nq}, eltype(rhs)}(undef) | |
r_rhsV = MArray{Tuple{Nq}, eltype(rhs)}(undef) | |
r_rhsW = MArray{Tuple{Nq}, eltype(rhs)}(undef) | |
r_rhsE = MArray{Tuple{Nq}, eltype(rhs)}(undef) | |
e = blockIdx().x | |
j = threadIdx().y | |
i = threadIdx().x | |
@inbounds begin | |
for k in 1:Nq | |
r_rhsρ[k] = zero(eltype(rhs)) | |
r_rhsU[k] = zero(eltype(rhs)) | |
r_rhsV[k] = zero(eltype(rhs)) | |
r_rhsW[k] = zero(eltype(rhs)) | |
r_rhsE[k] = zero(eltype(rhs)) | |
end | |
# fetch D into shared | |
s_D[i, j] = D[i, j] | |
@unroll for k in 1:Nq | |
sync_threads() | |
# Load values will need into registers | |
MJ = vgeo[i, j, k, _MJ, e] | |
ξx, ξy, ξz = vgeo[i,j,k,_ξx,e], vgeo[i,j,k,_ξy,e], vgeo[i,j,k,_ξz,e] | |
ηx, ηy, ηz = vgeo[i,j,k,_ηx,e], vgeo[i,j,k,_ηy,e], vgeo[i,j,k,_ηz,e] | |
ζx, ζy, ζz = vgeo[i,j,k,_ζx,e], vgeo[i,j,k,_ζy,e], vgeo[i,j,k,_ζz,e] | |
z = vgeo[i,j,k,_z,e] | |
U, V, W = Q[i, j, k, _U, e], Q[i, j, k, _V, e], Q[i, j, k, _W, e] | |
ρ, E = Q[i, j, k, _ρ, e], Q[i, j, k, _E, e] | |
# GPU performance trick | |
# Allow optimizations to use the reciprocal of an argument rather than perform division. | |
# IEEE floating-point division is implemented as a function call | |
ρinv = rcp(ρ) | |
ρ2inv = rcp(2ρ) | |
# ρ2inv = 0.5f0 * pinv | |
P = gdm1*(E - (U^2 + V^2 + W^2)*ρ2inv - ρ*gravity*z) | |
fluxρ_x = U | |
fluxU_x = ρinv * U * U + P | |
fluxV_x = ρinv * U * V | |
fluxW_x = ρinv * U * W | |
fluxE_x = ρinv * U * (E + P) | |
fluxρ_y = V | |
fluxU_y = ρinv * V * U | |
fluxV_y = ρinv * V * V + P | |
fluxW_y = ρinv * V * W | |
fluxE_y = ρinv * V * (E + P) | |
fluxρ_z = W | |
fluxU_z = ρinv * W * U | |
fluxV_z = ρinv * W * V | |
fluxW_z = ρinv * W * W + P | |
fluxE_z = ρinv * W * (E + P) | |
s_F[i, j, _ρ] = MJ * (ξx * fluxρ_x + ξy * fluxρ_y + ξz * fluxρ_z) | |
s_F[i, j, _U] = MJ * (ξx * fluxU_x + ξy * fluxU_y + ξz * fluxU_z) | |
s_F[i, j, _V] = MJ * (ξx * fluxV_x + ξy * fluxV_y + ξz * fluxV_z) | |
s_F[i, j, _W] = MJ * (ξx * fluxW_x + ξy * fluxW_y + ξz * fluxW_z) | |
s_F[i, j, _E] = MJ * (ξx * fluxE_x + ξy * fluxE_y + ξz * fluxE_z) | |
s_G[i, j, _ρ] = MJ * (ηx * fluxρ_x + ηy * fluxρ_y + ηz * fluxρ_z) | |
s_G[i, j, _U] = MJ * (ηx * fluxU_x + ηy * fluxU_y + ηz * fluxU_z) | |
s_G[i, j, _V] = MJ * (ηx * fluxV_x + ηy * fluxV_y + ηz * fluxV_z) | |
s_G[i, j, _W] = MJ * (ηx * fluxW_x + ηy * fluxW_y + ηz * fluxW_z) | |
s_G[i, j, _E] = MJ * (ηx * fluxE_x + ηy * fluxE_y + ηz * fluxE_z) | |
r_Hρ = MJ * (ζx * fluxρ_x + ζy * fluxρ_y + ζz * fluxρ_z) | |
r_HU = MJ * (ζx * fluxU_x + ζy * fluxU_y + ζz * fluxU_z) | |
r_HV = MJ * (ζx * fluxV_x + ζy * fluxV_y + ζz * fluxV_z) | |
r_HW = MJ * (ζx * fluxW_x + ζy * fluxW_y + ζz * fluxW_z) | |
r_HE = MJ * (ζx * fluxE_x + ζy * fluxE_y + ζz * fluxE_z) | |
# one shared access per 10 flops | |
for n = 1:Nq | |
Dkn = s_D[k, n] | |
r_rhsρ[n] += Dkn * r_Hρ | |
r_rhsU[n] += Dkn * r_HU | |
r_rhsV[n] += Dkn * r_HV | |
r_rhsW[n] += Dkn * r_HW | |
r_rhsE[n] += Dkn * r_HE | |
end | |
r_rhsW[k] -= MJ * ρ * gravity | |
sync_threads() | |
# loop of ξ-grid lines | |
@unroll for n = 1:Nq | |
Dni = s_D[n, i] | |
Dnj = s_D[n, j] | |
r_rhsρ[k] += Dni * s_F[n, j, _ρ] | |
r_rhsρ[k] += Dnj * s_G[i, n, _ρ] | |
r_rhsU[k] += Dni * s_F[n, j, _U] | |
r_rhsU[k] += Dnj * s_G[i, n, _U] | |
r_rhsV[k] += Dni * s_F[n, j, _V] | |
r_rhsV[k] += Dnj * s_G[i, n, _V] | |
r_rhsW[k] += Dni * s_F[n, j, _W] | |
r_rhsW[k] += Dnj * s_G[i, n, _W] | |
r_rhsE[k] += Dni * s_F[n, j, _E] | |
r_rhsE[k] += Dnj * s_G[i, n, _E] | |
end | |
end # k | |
@unroll for k in 1:Nq | |
MJI = vgeo[i, j, k, _MJI, e] | |
# Updates are a performance bottleneck | |
# primary source of stall_long_sb | |
rhs[i, j, k, _U, e] += MJI*r_rhsU[k] | |
rhs[i, j, k, _V, e] += MJI*r_rhsV[k] | |
rhs[i, j, k, _W, e] += MJI*r_rhsW[k] | |
rhs[i, j, k, _ρ, e] += MJI*r_rhsρ[k] | |
rhs[i, j, k, _E, e] += MJI*r_rhsE[k] | |
end | |
end | |
return | |
end | |
function volumerhs_ijk!(rhs, Q, vgeo, gravity, D, nelem) | |
Q = Base.Experimental.Const(Q) | |
vgeo = Base.Experimental.Const(vgeo) | |
D = Base.Experimental.Const(D) | |
nvar = _nstate + nmoist + ntrace | |
Nq = N + 1 | |
s_D = @cuStaticSharedMem eltype(D) (Nq, Nq) | |
s_F = @cuStaticSharedMem eltype(Q) (Nq, Nq, _nstate) | |
s_G = @cuStaticSharedMem eltype(Q) (Nq, Nq, _nstate) | |
r_rhsρ = MArray{Tuple{Nq}, eltype(rhs)}(undef) | |
r_rhsU = MArray{Tuple{Nq}, eltype(rhs)}(undef) | |
r_rhsV = MArray{Tuple{Nq}, eltype(rhs)}(undef) | |
r_rhsW = MArray{Tuple{Nq}, eltype(rhs)}(undef) | |
r_rhsE = MArray{Tuple{Nq}, eltype(rhs)}(undef) | |
e = blockIdx().x | |
j = threadIdx().y | |
i = threadIdx().x | |
@inbounds begin | |
for k in 1:Nq | |
r_rhsρ[k] = zero(eltype(rhs)) | |
r_rhsU[k] = zero(eltype(rhs)) | |
r_rhsV[k] = zero(eltype(rhs)) | |
r_rhsW[k] = zero(eltype(rhs)) | |
r_rhsE[k] = zero(eltype(rhs)) | |
end | |
# fetch D into shared | |
s_D[i, j] = D[i, j] | |
@unroll for k in 1:Nq | |
sync_threads() | |
ijk = i + Nq * (j - 1 + Nq * (k - 1)) | |
# Load values will need into registers | |
MJ = vgeo[ijk, _MJ, e] | |
ξx, ξy, ξz = vgeo[ijk,_ξx,e], vgeo[ijk,_ξy,e], vgeo[ijk,_ξz,e] | |
ηx, ηy, ηz = vgeo[ijk,_ηx,e], vgeo[ijk,_ηy,e], vgeo[ijk,_ηz,e] | |
ζx, ζy, ζz = vgeo[ijk,_ζx,e], vgeo[ijk,_ζy,e], vgeo[ijk,_ζz,e] | |
z = vgeo[ijk,_z,e] | |
U, V, W = Q[ijk, _U, e], Q[ijk, _V, e], Q[ijk, _W, e] | |
ρ, E = Q[ijk, _ρ, e], Q[ijk, _E, e] | |
# GPU performance trick | |
# Allow optimizations to use the reciprocal of an argument rather than perform division. | |
# IEEE floating-point division is implemented as a function call | |
ρinv = rcp(ρ) | |
ρ2inv = rcp(2ρ) | |
# ρ2inv = 0.5f0 * pinv | |
P = gdm1*(E - (U^2 + V^2 + W^2)*ρ2inv - ρ*gravity*z) | |
fluxρ_x = U | |
fluxU_x = ρinv * U * U + P | |
fluxV_x = ρinv * U * V | |
fluxW_x = ρinv * U * W | |
fluxE_x = ρinv * U * (E + P) | |
fluxρ_y = V | |
fluxU_y = ρinv * V * U | |
fluxV_y = ρinv * V * V + P | |
fluxW_y = ρinv * V * W | |
fluxE_y = ρinv * V * (E + P) | |
fluxρ_z = W | |
fluxU_z = ρinv * W * U | |
fluxV_z = ρinv * W * V | |
fluxW_z = ρinv * W * W + P | |
fluxE_z = ρinv * W * (E + P) | |
s_F[i, j, _ρ] = MJ * (ξx * fluxρ_x + ξy * fluxρ_y + ξz * fluxρ_z) | |
s_F[i, j, _U] = MJ * (ξx * fluxU_x + ξy * fluxU_y + ξz * fluxU_z) | |
s_F[i, j, _V] = MJ * (ξx * fluxV_x + ξy * fluxV_y + ξz * fluxV_z) | |
s_F[i, j, _W] = MJ * (ξx * fluxW_x + ξy * fluxW_y + ξz * fluxW_z) | |
s_F[i, j, _E] = MJ * (ξx * fluxE_x + ξy * fluxE_y + ξz * fluxE_z) | |
s_G[i, j, _ρ] = MJ * (ηx * fluxρ_x + ηy * fluxρ_y + ηz * fluxρ_z) | |
s_G[i, j, _U] = MJ * (ηx * fluxU_x + ηy * fluxU_y + ηz * fluxU_z) | |
s_G[i, j, _V] = MJ * (ηx * fluxV_x + ηy * fluxV_y + ηz * fluxV_z) | |
s_G[i, j, _W] = MJ * (ηx * fluxW_x + ηy * fluxW_y + ηz * fluxW_z) | |
s_G[i, j, _E] = MJ * (ηx * fluxE_x + ηy * fluxE_y + ηz * fluxE_z) | |
r_Hρ = MJ * (ζx * fluxρ_x + ζy * fluxρ_y + ζz * fluxρ_z) | |
r_HU = MJ * (ζx * fluxU_x + ζy * fluxU_y + ζz * fluxU_z) | |
r_HV = MJ * (ζx * fluxV_x + ζy * fluxV_y + ζz * fluxV_z) | |
r_HW = MJ * (ζx * fluxW_x + ζy * fluxW_y + ζz * fluxW_z) | |
r_HE = MJ * (ζx * fluxE_x + ζy * fluxE_y + ζz * fluxE_z) | |
# one shared access per 10 flops | |
for n = 1:Nq | |
Dkn = s_D[k, n] | |
r_rhsρ[n] += Dkn * r_Hρ | |
r_rhsU[n] += Dkn * r_HU | |
r_rhsV[n] += Dkn * r_HV | |
r_rhsW[n] += Dkn * r_HW | |
r_rhsE[n] += Dkn * r_HE | |
end | |
r_rhsW[k] -= MJ * ρ * gravity | |
sync_threads() | |
# loop of ξ-grid lines | |
@unroll for n = 1:Nq | |
Dni = s_D[n, i] | |
Dnj = s_D[n, j] | |
r_rhsρ[k] += Dni * s_F[n, j, _ρ] | |
r_rhsρ[k] += Dnj * s_G[i, n, _ρ] | |
r_rhsU[k] += Dni * s_F[n, j, _U] | |
r_rhsU[k] += Dnj * s_G[i, n, _U] | |
r_rhsV[k] += Dni * s_F[n, j, _V] | |
r_rhsV[k] += Dnj * s_G[i, n, _V] | |
r_rhsW[k] += Dni * s_F[n, j, _W] | |
r_rhsW[k] += Dnj * s_G[i, n, _W] | |
r_rhsE[k] += Dni * s_F[n, j, _E] | |
r_rhsE[k] += Dnj * s_G[i, n, _E] | |
end | |
end # k | |
@unroll for k in 1:Nq | |
ijk = i + Nq * (j - 1 + Nq * (k - 1)) | |
MJI = vgeo[ijk, _MJI, e] | |
# Updates are a performance bottleneck | |
# primary source of stall_long_sb | |
rhs[ijk, _U, e] += MJI*r_rhsU[k] | |
rhs[ijk, _V, e] += MJI*r_rhsV[k] | |
rhs[ijk, _W, e] += MJI*r_rhsW[k] | |
rhs[ijk, _ρ, e] += MJI*r_rhsρ[k] | |
rhs[ijk, _E, e] += MJI*r_rhsE[k] | |
end | |
end | |
return | |
end | |
function volumerhs_linear!(rhs, Q, vgeo, gravity, D, nelem) | |
Q = Base.Experimental.Const(Q) | |
vgeo = Base.Experimental.Const(vgeo) | |
D = Base.Experimental.Const(D) | |
nvar = _nstate + nmoist + ntrace | |
Nq = N + 1 | |
Nq2 = Nq ^ 2 | |
Nq3 = Nq ^ 3 | |
s_D = @cuStaticSharedMem eltype(D) (Nq, Nq) | |
s_F = @cuStaticSharedMem eltype(Q) (Nq, Nq, _nstate) | |
s_G = @cuStaticSharedMem eltype(Q) (Nq, Nq, _nstate) | |
r_rhsρ = MArray{Tuple{Nq}, eltype(rhs)}(undef) | |
r_rhsU = MArray{Tuple{Nq}, eltype(rhs)}(undef) | |
r_rhsV = MArray{Tuple{Nq}, eltype(rhs)}(undef) | |
r_rhsW = MArray{Tuple{Nq}, eltype(rhs)}(undef) | |
r_rhsE = MArray{Tuple{Nq}, eltype(rhs)}(undef) | |
e = blockIdx().x | |
j = threadIdx().y | |
i = threadIdx().x | |
@inbounds begin | |
for k in 1:Nq | |
r_rhsρ[k] = zero(eltype(rhs)) | |
r_rhsU[k] = zero(eltype(rhs)) | |
r_rhsV[k] = zero(eltype(rhs)) | |
r_rhsW[k] = zero(eltype(rhs)) | |
r_rhsE[k] = zero(eltype(rhs)) | |
end | |
vgeo_ije = i + Nq * (j - 1) + Nq3 * _nvgeo * (e - 1) | |
Q_ije = i + Nq * (j - 1) + Nq3 * _nstate * (e - 1) | |
# fetch D into shared | |
s_D[i, j] = D[i, j] | |
@unroll for k in 1:Nq | |
sync_threads() | |
vgeo_ijke = vgeo_ije + (k - 1) * Nq2 | |
Q_ijke = Q_ije + (k - 1) * Nq2 | |
# Load values will need into registers | |
MJ = vgeo[vgeo_ijke + (_MJ - 1) * Nq3] | |
ξx, ξy, ξz = | |
vgeo[vgeo_ijke + (_ξx - 1) * Nq3], | |
vgeo[vgeo_ijke + (_ξy - 1) * Nq3], | |
vgeo[vgeo_ijke + (_ξz - 1) * Nq3] | |
ηx, ηy, ηz = | |
vgeo[vgeo_ijke + (_ηx - 1) * Nq3], | |
vgeo[vgeo_ijke + (_ηy - 1) * Nq3], | |
vgeo[vgeo_ijke + (_ηz - 1) * Nq3] | |
ζx, ζy, ζz = | |
vgeo[vgeo_ijke + (_ζx - 1) * Nq3], | |
vgeo[vgeo_ijke + (_ζy - 1) * Nq3], | |
vgeo[vgeo_ijke + (_ζz - 1) * Nq3] | |
z = vgeo[vgeo_ijke + (_z - 1) * Nq3] | |
U, V, W = Q[Q_ijke + (_U - 1) * Nq3], Q[Q_ijke + (_V - 1) * Nq3], Q[Q_ijke + (_W - 1) * Nq3] | |
ρ, E = Q[Q_ijke + (_ρ - 1) * Nq3], Q[Q_ijke + (_E - 1) * Nq3] | |
# GPU performance trick | |
# Allow optimizations to use the reciprocal of an argument rather than perform division. | |
# IEEE floating-point division is implemented as a function call | |
ρinv = rcp(ρ) | |
ρ2inv = rcp(2ρ) | |
# ρ2inv = 0.5f0 * pinv | |
P = gdm1*(E - (U^2 + V^2 + W^2)*ρ2inv - ρ*gravity*z) | |
fluxρ_x = U | |
fluxU_x = ρinv * U * U + P | |
fluxV_x = ρinv * U * V | |
fluxW_x = ρinv * U * W | |
fluxE_x = ρinv * U * (E + P) | |
fluxρ_y = V | |
fluxU_y = ρinv * V * U | |
fluxV_y = ρinv * V * V + P | |
fluxW_y = ρinv * V * W | |
fluxE_y = ρinv * V * (E + P) | |
fluxρ_z = W | |
fluxU_z = ρinv * W * U | |
fluxV_z = ρinv * W * V | |
fluxW_z = ρinv * W * W + P | |
fluxE_z = ρinv * W * (E + P) | |
s_F[i, j, _ρ] = MJ * (ξx * fluxρ_x + ξy * fluxρ_y + ξz * fluxρ_z) | |
s_F[i, j, _U] = MJ * (ξx * fluxU_x + ξy * fluxU_y + ξz * fluxU_z) | |
s_F[i, j, _V] = MJ * (ξx * fluxV_x + ξy * fluxV_y + ξz * fluxV_z) | |
s_F[i, j, _W] = MJ * (ξx * fluxW_x + ξy * fluxW_y + ξz * fluxW_z) | |
s_F[i, j, _E] = MJ * (ξx * fluxE_x + ξy * fluxE_y + ξz * fluxE_z) | |
s_G[i, j, _ρ] = MJ * (ηx * fluxρ_x + ηy * fluxρ_y + ηz * fluxρ_z) | |
s_G[i, j, _U] = MJ * (ηx * fluxU_x + ηy * fluxU_y + ηz * fluxU_z) | |
s_G[i, j, _V] = MJ * (ηx * fluxV_x + ηy * fluxV_y + ηz * fluxV_z) | |
s_G[i, j, _W] = MJ * (ηx * fluxW_x + ηy * fluxW_y + ηz * fluxW_z) | |
s_G[i, j, _E] = MJ * (ηx * fluxE_x + ηy * fluxE_y + ηz * fluxE_z) | |
r_Hρ = MJ * (ζx * fluxρ_x + ζy * fluxρ_y + ζz * fluxρ_z) | |
r_HU = MJ * (ζx * fluxU_x + ζy * fluxU_y + ζz * fluxU_z) | |
r_HV = MJ * (ζx * fluxV_x + ζy * fluxV_y + ζz * fluxV_z) | |
r_HW = MJ * (ζx * fluxW_x + ζy * fluxW_y + ζz * fluxW_z) | |
r_HE = MJ * (ζx * fluxE_x + ζy * fluxE_y + ζz * fluxE_z) | |
# one shared access per 10 flops | |
for n = 1:Nq | |
Dkn = s_D[k, n] | |
r_rhsρ[n] += Dkn * r_Hρ | |
r_rhsU[n] += Dkn * r_HU | |
r_rhsV[n] += Dkn * r_HV | |
r_rhsW[n] += Dkn * r_HW | |
r_rhsE[n] += Dkn * r_HE | |
end | |
r_rhsW[k] -= MJ * ρ * gravity | |
sync_threads() | |
# loop of ξ-grid lines | |
@unroll for n = 1:Nq | |
Dni = s_D[n, i] | |
Dnj = s_D[n, j] | |
r_rhsρ[k] += Dni * s_F[n, j, _ρ] | |
r_rhsρ[k] += Dnj * s_G[i, n, _ρ] | |
r_rhsU[k] += Dni * s_F[n, j, _U] | |
r_rhsU[k] += Dnj * s_G[i, n, _U] | |
r_rhsV[k] += Dni * s_F[n, j, _V] | |
r_rhsV[k] += Dnj * s_G[i, n, _V] | |
r_rhsW[k] += Dni * s_F[n, j, _W] | |
r_rhsW[k] += Dnj * s_G[i, n, _W] | |
r_rhsE[k] += Dni * s_F[n, j, _E] | |
r_rhsE[k] += Dnj * s_G[i, n, _E] | |
end | |
end # k | |
@unroll for k in 1:Nq | |
vgeo_ijke = vgeo_ije + (k - 1) * Nq2 | |
Q_ijke = Q_ije + (k - 1) * Nq2 | |
MJI = vgeo[vgeo_ijke + (_MJI - 1) * Nq3] | |
# Updates are a performance bottleneck | |
# primary source of stall_long_sb | |
rhs[Q_ijke + (_U - 1) * Nq3] += MJI*r_rhsU[k] | |
rhs[Q_ijke + (_V - 1) * Nq3] += MJI*r_rhsV[k] | |
rhs[Q_ijke + (_W - 1) * Nq3] += MJI*r_rhsW[k] | |
rhs[Q_ijke + (_ρ - 1) * Nq3] += MJI*r_rhsρ[k] | |
rhs[Q_ijke + (_E - 1) * Nq3] += MJI*r_rhsE[k] | |
end | |
end | |
return | |
end | |
function benchmark_orig(rng, DFloat, nelem) | |
Nq = N + 1 | |
nvar = _nstate + nmoist + ntrace | |
Q = 1 .+ CuArray(rand(rng, DFloat, Nq, Nq, Nq, nvar, nelem)) | |
Q[:, :, :, _E, :] .+= 20 | |
vgeo = CuArray(rand(rng, DFloat, Nq, Nq, Nq, _nvgeo, nelem)) | |
# make sure the entries of the mass matrix satisfy the inverse relation | |
vgeo[:, :, :, _MJ, :] .+= 3 | |
vgeo[:, :, :, _MJI, :] .= 1 ./ vgeo[:, :, :, _MJ, :] | |
D = CuArray(rand(rng, DFloat, Nq, Nq)) | |
rhs = CuArray(zeros(DFloat, Nq, Nq, Nq, nvar, nelem)) | |
threads=(N+1, N+1) | |
kernel = @cuda launch=false volumerhs!(rhs, Q, vgeo, DFloat(grav), D, nelem) | |
# XXX: should we print these for all kernels? maybe upload them to Codespeed? | |
@info """volumerhs_orig! details: | |
- $(CUDA.registers(kernel)) registers, max $(CUDA.maxthreads(kernel)) threads | |
- $(Base.format_bytes(CUDA.memory(kernel).local)) local memory, | |
$(Base.format_bytes(CUDA.memory(kernel).shared)) shared memory, | |
$(Base.format_bytes(CUDA.memory(kernel).constant)) constant memory""" | |
results = @benchmark begin | |
CUDA.@sync $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem; | |
threads=$threads, blocks=$nelem) | |
end | |
# BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them | |
CUDA.unsafe_free!(rhs) | |
CUDA.unsafe_free!(Q) | |
CUDA.unsafe_free!(vgeo) | |
CUDA.unsafe_free!(D) | |
println(results) | |
results | |
end | |
function benchmark_ijk(rng, DFloat, nelem) | |
Nq = N + 1 | |
nvar = _nstate + nmoist + ntrace | |
Q = 1 .+ CuArray(rand(rng, DFloat, Nq ^ 3, nvar, nelem)) | |
Q[:, _E, :] .+= 20 | |
vgeo = CuArray(rand(rng, DFloat, Nq ^ 3, _nvgeo, nelem)) | |
# make sure the entries of the mass matrix satisfy the inverse relation | |
vgeo[:, _MJ, :] .+= 3 | |
vgeo[:, _MJI, :] .= 1 ./ vgeo[:, _MJ, :] | |
D = CuArray(rand(rng, DFloat, Nq, Nq)) | |
rhs = CuArray(zeros(DFloat, Nq ^ 3, nvar, nelem)) | |
threads=(N+1, N+1) | |
kernel = @cuda launch=false volumerhs_ijk!(rhs, Q, vgeo, DFloat(grav), D, nelem) | |
# XXX: should we print these for all kernels? maybe upload them to Codespeed? | |
@info """volumerhs_ijk! details: | |
- $(CUDA.registers(kernel)) registers, max $(CUDA.maxthreads(kernel)) threads | |
- $(Base.format_bytes(CUDA.memory(kernel).local)) local memory, | |
$(Base.format_bytes(CUDA.memory(kernel).shared)) shared memory, | |
$(Base.format_bytes(CUDA.memory(kernel).constant)) constant memory""" | |
results = @benchmark begin | |
CUDA.@sync $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem; | |
threads=$threads, blocks=$nelem) | |
end | |
# BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them | |
CUDA.unsafe_free!(rhs) | |
CUDA.unsafe_free!(Q) | |
CUDA.unsafe_free!(vgeo) | |
CUDA.unsafe_free!(D) | |
println(results) | |
results | |
end | |
function benchmark_linear(rng, DFloat, nelem) | |
Nq = N + 1 | |
nvar = _nstate + nmoist + ntrace | |
Q = 1 .+ CuArray(rand(rng, DFloat, Nq ^ 3, nvar, nelem)) | |
Q[:, _E, :] .+= 20 | |
vgeo = CuArray(rand(rng, DFloat, Nq ^ 3, _nvgeo, nelem)) | |
# make sure the entries of the mass matrix satisfy the inverse relation | |
vgeo[:, _MJ, :] .+= 3 | |
vgeo[:, _MJI, :] .= 1 ./ vgeo[:, _MJ, :] | |
D = CuArray(rand(rng, DFloat, Nq, Nq)) | |
rhs = CuArray(zeros(DFloat, Nq ^ 3, nvar, nelem)) | |
threads=(N+1, N+1) | |
kernel = @cuda launch=false volumerhs_linear!(rhs, Q, vgeo, DFloat(grav), D, nelem) | |
# XXX: should we print these for all kernels? maybe upload them to Codespeed? | |
@info """volumerhs_linear! details: | |
- $(CUDA.registers(kernel)) registers, max $(CUDA.maxthreads(kernel)) threads | |
- $(Base.format_bytes(CUDA.memory(kernel).local)) local memory, | |
$(Base.format_bytes(CUDA.memory(kernel).shared)) shared memory, | |
$(Base.format_bytes(CUDA.memory(kernel).constant)) constant memory""" | |
results = @benchmark begin | |
CUDA.@sync $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem; | |
threads=$threads, blocks=$nelem) | |
end | |
# BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them | |
CUDA.unsafe_free!(rhs) | |
CUDA.unsafe_free!(Q) | |
CUDA.unsafe_free!(vgeo) | |
CUDA.unsafe_free!(D) | |
println(results) | |
results | |
end | |
function check(rng, DFloat, nelem) | |
Nq = N + 1 | |
nvar = _nstate + nmoist + ntrace | |
Q = 1 .+ CuArray(rand(rng, DFloat, Nq, Nq, Nq, nvar, nelem)) | |
Q[:, :, :, _E, :] .+= 20 | |
vgeo = CuArray(rand(rng, DFloat, Nq, Nq, Nq, _nvgeo, nelem)) | |
# make sure the entries of the mass matrix satisfy the inverse relation | |
vgeo[:, :, :, _MJ, :] .+= 3 | |
vgeo[:, :, :, _MJI, :] .= 1 ./ vgeo[:, :, :, _MJ, :] | |
D = CuArray(rand(rng, DFloat, Nq, Nq)) | |
rhs1 = CuArray(zeros(DFloat, Nq, Nq, Nq, nvar, nelem)) | |
rhs2 = CuArray(zeros(DFloat, Nq, Nq, Nq, nvar, nelem)) | |
rhs3 = CuArray(zeros(DFloat, Nq, Nq, Nq, nvar, nelem)) | |
threads=(N+1, N+1) | |
@cuda volumerhs!(rhs1, Q, vgeo, DFloat(grav), D, nelem) | |
@cuda volumerhs_linear!(rhs3, Q, vgeo, DFloat(grav), D, nelem) | |
rhs2 = reshape(rhs2, (Nq ^3, nvar, nelem)) | |
Q = reshape(Q, (Nq ^3, nvar, nelem)) | |
vgeo = reshape(vgeo, (Nq ^3, _nvgeo, nelem)) | |
@cuda volumerhs_ijk!(rhs2, Q, vgeo, DFloat(grav), D, nelem) | |
rhs2 = reshape(rhs2, (Nq, Nq, Nq, nvar, nelem)) | |
rhs1 = Array(rhs1) | |
rhs2 = Array(rhs2) | |
rhs3 = Array(rhs3) | |
@show extrema(rhs1 .- rhs2) | |
@show extrema(rhs2 .- rhs3) | |
@show extrema(rhs1 .- rhs3) | |
end | |
function main() | |
DFloat = Float32 | |
nelem = 240_000 | |
rng = StableRNG(123) | |
#check(rng, DFloat, nelem) | |
benchmark_orig(rng, DFloat, nelem) | |
benchmark_ijk(rng, DFloat, nelem) | |
benchmark_linear(rng, DFloat, nelem) | |
end | |
end | |
VolumeRHS.main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment