Last active
February 8, 2022 20:53
-
-
Save carstenbauer/c482cb14e44246c4186193c50a1f9ac7 to your computer and use it in GitHub Desktop.
Nvidia NVLink Bandwidth Measurement with Julia
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using CUDA | |
using BenchmarkTools | |
using Statistics | |
using UnicodePlots | |
# bandwidth test
"""
    BytesSIPrefixed

Abstract supertype of the memory-size wrappers `B`, `KB`, `MB` and `GB`.

Each subtype stores an integer count in its own unit. Despite the name, `_memsize2N`
converts the units using binary multiples (1 KB = 1024 B, and so on).
"""
abstract type BytesSIPrefixed end

"""
    B(value)

Memory size expressed as `value` bytes.
"""
struct B <: BytesSIPrefixed
    value::Int64
end

"""
    KB(value)

Memory size expressed as `value` units of 1024 bytes.
"""
struct KB <: BytesSIPrefixed
    value::Int64
end

"""
    MB(value)

Memory size expressed as `value` units of 1024^2 bytes.
"""
struct MB <: BytesSIPrefixed
    value::Int64
end

"""
    GB(value)

Memory size expressed as `value` units of 1024^3 bytes.
"""
struct GB <: BytesSIPrefixed
    value::Int64
end
"""
    _memsize2N(memsize; dtype = Float32)

Return how many elements of type `dtype` fit into `memsize`.

The unit wrappers are converted to bytes with binary multiples (1 KB = 1024 B,
1 MB = 1024^2 B, 1 GB = 1024^3 B). The division is an integer division, so a trailing
partial element is dropped.
"""
_memsize2N(memsize::B; dtype = Float32) = memsize.value ÷ sizeof(dtype)
_memsize2N(memsize::KB; dtype = Float32) = (memsize.value * 1024) ÷ sizeof(dtype)
_memsize2N(memsize::MB; dtype = Float32) = (memsize.value * 1024^2) ÷ sizeof(dtype)
_memsize2N(memsize::GB; dtype = Float32) = (memsize.value * 1024^3) ÷ sizeof(dtype)
"""
    init_mem(memsize; src = 0, dst = 1, dtype = Float32)

Allocate two device vectors with element type `dtype` whose length is the number of
`dtype` elements that fit into `memsize`: a randomly filled one on device `src` and a
zero-filled one on device `dst`.

Return the tuple `(mem_src, mem_dst)`. `src` and `dst` are not required to differ. The
active device is `dst` when the function returns.
"""
function init_mem(memsize::BytesSIPrefixed; src = 0, dst = 1, dtype = Float32)
    N = _memsize2N(memsize; dtype)
    # `N` counts `dtype` elements, so the element type must be passed to the allocators
    # as well; otherwise any `dtype` other than the Float32 default would allocate a
    # buffer whose byte size no longer matches `memsize`.
    device!(src)
    mem_src = CUDA.rand(dtype, N)
    device!(dst)
    mem_dst = CUDA.zeros(dtype, N)
    return mem_src, mem_dst
end
# Convenience method: a plain number is interpreted as a size in bytes.
init_mem(memsize::Real; kwargs...) = init_mem(B(memsize); kwargs...)
"""
    bench_p2p_copyto!(memsize; nbench = 10, verbose = true, hist = true,
                      dtype = Float32, src = 0, dst = 1, kwargs...)

Time `nbench` copies of a buffer of size `memsize` from device `src` to device `dst`
using `copyto!`, and report the timings and the resulting bandwidth.

# Keywords
- `nbench`: number of timed copies; must be positive.
- `verbose`: print the timing and bandwidth summary.
- `hist`: additionally display a histogram of the timings (only when `verbose` is set).
- `dtype`: element type of the buffers.
- `src`, `dst`: source and destination devices, forwarded to `init_mem`.
- `kwargs...`: forwarded to `init_mem`.

Returns `nothing`. The active device is `src` when the function returns.

# Throws
- `ArgumentError` if `nbench` is not positive.
"""
function bench_p2p_copyto!(
    memsize::BytesSIPrefixed;
    nbench = 10,
    verbose = true,
    hist = true,
    dtype = Float32,
    src = 0,
    dst = 1,
    kwargs...,
)
    # Validate before any device memory is allocated.
    nbench > 0 || throw(ArgumentError("nbench must be positive, got $nbench"))

    mem_src, mem_dst = init_mem(memsize; dtype, src, dst, kwargs...)
    actual_memsize = sizeof(mem_src)

    # `init_mem` leaves `dst` active. Run the measurement with `src` active instead,
    # i.e. in the context of `mem_src`; the discussion accompanying this code reports
    # that doing so fixes the measurement issues.
    device!(src)

    ts = zeros(nbench)
    for i in eachindex(ts)
        ts[i] = CUDA.@elapsed begin
            copyto!(mem_dst, mem_src)
        end
    end

    t_min = minimum(ts)
    t_max = maximum(ts)
    t_avg = mean(ts)

    if verbose
        println("Memsize: $actual_memsize (requested: $memsize)")
        println("t_min: $t_min")
        println("t_max: $t_max")
        println("t_avg: $t_avg")
        # "GB" follows this file's convention of binary multiples (1024^3 bytes).
        actual_memsize_GB = actual_memsize / 1024^3
        println("Bandwidth (GB/s):")
        # The shortest copy yields the highest bandwidth and vice versa.
        println(" ├ max: ", actual_memsize_GB / t_min)
        println(" ├ min: ", actual_memsize_GB / t_max)
        println(" └ avg: ", actual_memsize_GB / t_avg)
        if hist
            display(histogram(ts))
        end
    end
    return nothing
end
# Convenience method: a plain number is interpreted as a size in bytes.
bench_p2p_copyto!(memsize::Real; kwargs...) = bench_p2p_copyto!(B(memsize); kwargs...)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks @maleadt, that makes a ton of sense! Using CUDA.jl#master and performing the time measurement in the context of `mem_src`, as suggested, seems to fix all the issues!
Code:
For our A100s in the DGX (i.e. with NVSwitch) I now get:
which agrees well with the C code by @lukas-mazur. I guess the numbers are also reasonable given a theoretical maximum of `25 GB/s * 12 = 300 GB/s`.
FWIW, this is the profile: