Last active
February 8, 2022 20:53
-
-
Save carstenbauer/c482cb14e44246c4186193c50a1f9ac7 to your computer and use it in GitHub Desktop.
Nvidia NVLink Bandwidth Measurement with Julia
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using CUDA | |
using BenchmarkTools | |
using Statistics | |
using UnicodePlots | |
# bandwidth test | |
# Thin wrapper types for byte counts with binary SI prefixes (B, KB, MB, GB).
abstract type BytesSIPrefixed end

struct B <: BytesSIPrefixed
    value::Int64
end

struct KB <: BytesSIPrefixed
    value::Int64
end

struct MB <: BytesSIPrefixed
    value::Int64
end

struct GB <: BytesSIPrefixed
    value::Int64
end

# Translate a memory size into the number of `dtype` elements that fit in it
# (integer division; a remainder smaller than one element is dropped).
# Binary prefixes: 1 KB = 1024 B, 1 MB = 1024^2 B, 1 GB = 1024^3 B.
_memsize2N(memsize::B; dtype = Float32) = memsize.value ÷ sizeof(dtype)
_memsize2N(memsize::KB; dtype = Float32) = (memsize.value * 1024) ÷ sizeof(dtype)
_memsize2N(memsize::MB; dtype = Float32) = (memsize.value * 1024^2) ÷ sizeof(dtype)
_memsize2N(memsize::GB; dtype = Float32) = (memsize.value * 1024^3) ÷ sizeof(dtype)
""" | |
init_mem(memsize; src = 0, dst = 1, dtype=Float32) | |
Init memory of size `memsize` on device `src` and `dst`. | |
""" | |
function init_mem(memsize::BytesSIPrefixed; src = 0, dst = 1, dtype = Float32) | |
# src != dst || throw(ArgumentError("src == dst...")) | |
N = _memsize2N(memsize; dtype) | |
device!(src) | |
mem_src = CUDA.rand(N) | |
device!(dst) | |
mem_dst = CUDA.zeros(N) | |
return mem_src, mem_dst | |
end | |
init_mem(memsize::Real; kwargs...) = init_mem(B(memsize); kwargs...) | |
"""
    bench_p2p_copyto!(memsize; nbench = 10, verbose = true, hist = true, dtype = Float32, kwargs...)

Benchmark a peer-to-peer `copyto!` between two GPUs `nbench` times and, when
`verbose`, print timing and bandwidth statistics (GB/s) and optionally a
histogram of the measured times. Extra `kwargs` (e.g. `src`, `dst`) are
forwarded to [`init_mem`](@ref). Returns `nothing`.
"""
function bench_p2p_copyto!(memsize::BytesSIPrefixed; nbench = 10, verbose = true, hist = true, dtype = Float32, kwargs...)
    mem_src, mem_dst = init_mem(memsize; dtype, kwargs...)
    actual_memsize = sizeof(mem_src)
    ts = zeros(nbench)
    @inbounds for i in 1:nbench
        # Fix: take the measurement in the context that owns `mem_src`.
        # Otherwise the API calls behind `CUDA.@elapsed` may not synchronize
        # with the copy and the measured time is meaningless (see the
        # discussion of this exact issue further down in this gist).
        ts[i] = CUDA.context!(context(mem_src)) do
            CUDA.@elapsed copyto!(mem_dst, mem_src)
        end
    end
    t_min = minimum(ts)
    t_max = maximum(ts)
    t_avg = mean(ts)
    if verbose
        println("Memsize: $actual_memsize (requested: $memsize)")
        println("t_min: $t_min")
        println("t_max: $t_max")
        println("t_avg: $t_avg")
        actual_memsize_GB = (actual_memsize) / (1024^3)
        println("Bandwidth (GB/s):")
        # max bandwidth corresponds to the fastest (minimum) time.
        println(" ├ max: ", actual_memsize_GB / t_min)
        println(" ├ min: ", actual_memsize_GB / t_max)
        println(" └ avg: ", actual_memsize_GB / t_avg)
        if hist
            display(histogram(ts))
        end
    end
    # return t_min, t_max, t_avg
    # return ts
    return nothing
end

# Convenience method: interpret a plain (integral) number as a byte count.
bench_p2p_copyto!(memsize::Real; kwargs...) = bench_p2p_copyto!(B(memsize); kwargs...)
Thanks @maleadt, that makes a ton of sense! With CUDA.jl#master and performing the time measurement in the context of mem_src
as suggested seems to fix all the issues!
Code:
using CUDA
using BenchmarkTools
using Statistics
using UnicodePlots
using Humanize
# Thin wrapper types for byte counts with binary SI prefixes (B, KB, MB, GB).
abstract type BytesSIPrefixed end

struct B <: BytesSIPrefixed
    value::Int64
end

struct KB <: BytesSIPrefixed
    value::Int64
end

struct MB <: BytesSIPrefixed
    value::Int64
end

struct GB <: BytesSIPrefixed
    value::Int64
end

# Translate a memory size into the number of `dtype` elements that fit in it
# (integer division; a remainder smaller than one element is dropped).
# Binary prefixes: 1 KB = 2^10 B, 1 MB = 2^20 B, 1 GB = 2^30 B.
_memsize2N(memsize::B; dtype=Float32) = memsize.value ÷ sizeof(dtype)
_memsize2N(memsize::KB; dtype=Float32) = (memsize.value << 10) ÷ sizeof(dtype)
_memsize2N(memsize::MB; dtype=Float32) = (memsize.value << 20) ÷ sizeof(dtype)
_memsize2N(memsize::GB; dtype=Float32) = (memsize.value << 30) ÷ sizeof(dtype)
"""
init_mem(memsize::BytesSIPrefixed; src = 0, dst = 1, dtype=Float32)
Init memory on devices `src` and `dst`.
**Examples:**
```julia
init_mem(MB(1024))
init_mem(B(40_000_000))
```
"""
function init_mem(memsize::BytesSIPrefixed; src=0, dst=1, dtype=Float32)
# src != dst || throw(ArgumentError("src == dst..."))
N = _memsize2N(memsize; dtype)
device!(src)
mem_src = CUDA.rand(N)
device!(dst)
mem_dst = CUDA.zeros(N)
return mem_src, mem_dst
end
# Run `kernel(mem_dst, mem_src)` once and return the elapsed GPU time in
# seconds. The `CUDA.@elapsed` measurement is evaluated while the context
# owning `mem_src` is active, so that the event-based timer synchronizes
# correctly with the copy. The call is wrapped in an NVTX range so it shows
# up clearly in profiler traces.
# NOTE(review): relies on `NVTX` being in scope; no `using NVTX` is visible
# in this snippet — presumably provided elsewhere, confirm.
@inline function _time_cuda_elapsed(kernel::F, mem_dst, mem_src) where {F}
    return CUDA.context!(context(mem_src)) do
        CUDA.@elapsed NVTX.@range "p2p: kernel call" begin
            kernel(mem_dst, mem_src)
        end
    end
end
"""
bench_p2p_memcpy([memsize::BytesSIPrefixed]; kwargs...)
Performs a peer-to-peer memory copy benchmark (time measurement) and
returns an inter-gpu memory bandwidth estimate (in GB/s) derived from it.
**Keyword arguments:**
* `src` (default: `0`): source device
* `dst` (default: `1`): destination device
* `nbench` (default: `5`): number of time measurements (i.e. p2p memcopies)
* `verbose` (default: `true`): set to false to turn off any printing.
* `hist` (default: `false`): when `true`, a UnicodePlots-based histogram is printed.
* `times` (default: `false`): toggle printing of measured times.
* `alternate` (default: `false`): alternate `src` and `dst`, i.e. copy data back and forth.
* `dtype` (default: `Float32`): see [`init_mem`](@ref).
**Examples:**
```julia
bench_p2p_memcpy()
bench_p2p_memcpy(MB(1024))
bench_p2p_memcpy(KB(20_000); dtype=Int32)
```
"""
function bench_p2p_memcpy(
memsize::BytesSIPrefixed=B(40_000_000);
nbench=5,
verbose=true,
hist=false,
times=false,
alternate=false,
dtype=Float32,
kwargs...,
)
mem_src, mem_dst = init_mem(memsize; dtype, kwargs...)
actual_memsize = sizeof(mem_src)
ts = zeros(nbench)
NVTX.@range "p2p: nbench loop" begin
@inbounds for i in 1:nbench
if mod(i, alternate ? 2 : 1) == 0
ts[i] = _time_cuda_elapsed(copyto!, mem_dst, mem_src)
else
ts[i] = _time_cuda_elapsed(copyto!, mem_src, mem_dst)
end
end
end
t_min = minimum(ts)
t_max = maximum(ts)
t_avg = mean(ts)
actual_memsize_GB = (actual_memsize) / (1024^3)
bws = actual_memsize_GB ./ ts
bw_min = minimum(bws)
bw_max = maximum(bws)
bw_avg = mean(bws)
if verbose
println("Memsize: $(Humanize.datasize(actual_memsize; style=:gnu))\n")
if times
println("t_min: $t_min")
println("t_max: $t_max")
println("t_avg: $t_avg")
end
printstyled("Bandwidth (GB/s):\n"; bold=true)
print(" ├ max: ")
printstyled(round(bw_max; digits=2), "\n"; color=:green, bold=true)
println(" ├ min: ", round(bw_min; digits=2))
println(" ├ avg: ", round(bw_avg; digits=2))
print(" └ std_dev: ")
printstyled(round(std(bws); digits=2), "\n"; color=:yellow, bold=true)
if hist
display(histogram(bws; title="Bandwidths (GB/s)", nbins=5))
end
end
return bw_max
end
"""
bench_p2p_memcpy_all(args...; kwargs...)
Run [`bench_p2p_memcpy`](@ref) for all combinations of devices.
Returns a matrix with the p2p memory bandwidth estimates.
"""
function bench_p2p_memcpy_all(args...; kwargs...)
ngpus = length(CUDA.devices())
return [src == dst ? nothing : bench_p2p_memcpy(args...; src=src, dst=dst, verbose=false, kwargs...) for src in 0:ngpus-1, dst in 0:ngpus-1]
end
For our A100s in the DGX (i.e. with NVSwitch) I now get:
julia> bench_p2p_memcpy();
Memsize: 38.1M
Bandwidth (GB/s):
├ max: 247.32
├ min: 173.5
├ avg: 229.63
└ std_dev: 31.67
julia> bench_p2p_memcpy_all()
8×8 Matrix{Union{Nothing, Float64}}:
nothing 245.706 241.075 244.467 246.434 242.229 245.085 245.033
239.046 nothing 241.776 243.853 241.626 245.136 244.467 240.379
246.957 242.633 nothing 242.937 245.291 248.114 239.193 242.684
244.724 241.375 244.211 nothing 245.861 238.117 245.085 242.28
241.576 246.329 242.582 245.602 nothing 246.59 240.677 243.343
247.114 240.18 245.965 244.006 236.616 nothing 242.28 244.673
243.802 242.028 248.326 239.933 244.365 245.033 nothing 245.498
245.136 246.904 239.488 243.343 244.057 240.627 243.445 nothing
which agrees well with the C Code by @lukas-mazur. I guess the numbers are also reasonable given a theoretical maximum of 25GB/s * 12 = 300GB/s
.
FWIW, this is the profile:
That's great!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
You could spot this in the profile: the API calls part of
CUDA.@elapsed
ought to wait for the GPU to finish, or obviously the measurement wouldn't make any sense. This didn't happen. Executing
CUDA.@elapsed
in the correct context results in: (it's possible this didn't happen before my fixes of the stream-ordered allocator, as the invocation of cuMemcpyPeer might have behaved synchronously then. but it seems to fix the alternative mode here.)