Nvidia NVLink Bandwidth Measurement with Julia
carstenbauer commented Feb 8, 2022

Thanks @maleadt, that makes a ton of sense! With CUDA.jl#master, and with the time measurement performed in the context of `mem_src` as suggested, all the issues seem to be fixed!
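In isolation, the suggested change looks like this (a minimal sketch of what `_time_cuda_elapsed` in the full code below does):

```julia
# Switch to the context associated with `mem_src`, then time the copy with
# CUDA events inside that context.
t = CUDA.context!(context(mem_src)) do
    CUDA.@elapsed copyto!(mem_dst, mem_src)
end
```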

Code:

```julia
using CUDA
using BenchmarkTools
using Statistics
using UnicodePlots
using Humanize

abstract type BytesSIPrefixed end
struct B <: BytesSIPrefixed
    value::Int64
end
struct KB <: BytesSIPrefixed
    value::Int64
end
struct MB <: BytesSIPrefixed
    value::Int64
end
struct GB <: BytesSIPrefixed
    value::Int64
end

_memsize2N(memsize::B; dtype=Float32) = memsize.value ÷ sizeof(dtype)
_memsize2N(memsize::KB; dtype=Float32) = (memsize.value * 2^10) ÷ sizeof(dtype)
_memsize2N(memsize::MB; dtype=Float32) = (memsize.value * 2^20) ÷ sizeof(dtype)
_memsize2N(memsize::GB; dtype=Float32) = (memsize.value * 2^30) ÷ sizeof(dtype)
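# Note: despite the `SIPrefixed` name, the multipliers are binary (2^10, 2^20, 2^30).
# For example, with the default dtype (sizeof(Float32) == 4):
#   _memsize2N(MB(1)) == 2^20 ÷ 4 == 262_144 elements
#   _memsize2N(GB(1)) == 2^30 ÷ 4 == 268_435_456 elements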

"""
    init_mem(memsize::BytesSIPrefixed; src = 0, dst = 1, dtype=Float32)
Init memory on devices `src` and `dst`.

**Examples:**
```julia
init_mem(MB(1024))
init_mem(B(40_000_000))
```
"""
function init_mem(memsize::BytesSIPrefixed; src=0, dst=1, dtype=Float32)
    # src != dst || throw(ArgumentError("src == dst..."))
    N = _memsize2N(memsize; dtype)

    device!(src)
    mem_src = CUDA.rand(dtype, N)   # allocate on the source device
    device!(dst)
    mem_dst = CUDA.zeros(dtype, N)  # allocate on the destination device
    return mem_src, mem_dst
end
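# E.g., `mem_src, mem_dst = init_mem(MB(1024))` puts a 1 GiB buffer on each of
# devices 0 and 1; `_time_cuda_elapsed` below switches to `context(mem_src)`
# before timing the copy between them.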

@inline function _time_cuda_elapsed(kernel::F, mem_dst, mem_src) where {F}
    # run the timing in the context that owns `mem_src`
    t = CUDA.context!(context(mem_src)) do
        CUDA.@elapsed begin
            NVTX.@range "p2p: kernel call" begin
                kernel(mem_dst, mem_src)
            end
        end
    end
    return t
end

"""
    bench_p2p_memcpy([memsize::BytesSIPrefixed]; kwargs...)

Performs a peer-to-peer memory copy benchmark (time measurement) and
returns an inter-gpu memory bandwidth estimate (in GB/s) derived from it.

**Keyword arguments:**
* `src` (default: `0`): source device
* `dst` (default: `1`): destination device
* `nbench` (default: `5`): number of time measurements (i.e. p2p memcopies)
* `verbose` (default: `true`): set to `false` to turn off all printing.
* `hist` (default: `false`): when `true`, a UnicodePlots-based histogram is printed.
* `times` (default: `false`): toggle printing of measured times.
* `alternate` (default: `false`): alternate `src` and `dst`, i.e. copy data back and forth.
* `dtype` (default: `Float32`): see [`init_mem`](@ref).

**Examples:**
```julia
bench_p2p_memcpy()
bench_p2p_memcpy(MB(1024))
bench_p2p_memcpy(KB(20_000); dtype=Int32)
```
"""
function bench_p2p_memcpy(
    memsize::BytesSIPrefixed=B(40_000_000);
    nbench=5,
    verbose=true,
    hist=false,
    times=false,
    alternate=false,
    dtype=Float32,
    kwargs...,
)
    mem_src, mem_dst = init_mem(memsize; dtype, kwargs...)
    actual_memsize = sizeof(mem_src)
    ts = zeros(nbench)

    NVTX.@range "p2p: nbench loop" begin
        @inbounds for i in 1:nbench
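            # with `alternate = true`, even iterations copy src -> dst and odd
            # iterations copy back dst -> src; otherwise every copy goes src -> dst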
            if mod(i, alternate ? 2 : 1) == 0
                ts[i] = _time_cuda_elapsed(copyto!, mem_dst, mem_src)
            else
                ts[i] = _time_cuda_elapsed(copyto!, mem_src, mem_dst)
            end
        end
    end

    t_min = minimum(ts)
    t_max = maximum(ts)
    t_avg = mean(ts)

    actual_memsize_GB = (actual_memsize) / (1024^3)
    bws = actual_memsize_GB ./ ts
    bw_min = minimum(bws)
    bw_max = maximum(bws)
    bw_avg = mean(bws)

    if verbose
        println("Memsize: $(Humanize.datasize(actual_memsize; style=:gnu))\n")
        if times
            println("t_min: $t_min")
            println("t_max: $t_max")
            println("t_avg: $t_avg")
        end
        printstyled("Bandwidth (GB/s):\n"; bold=true)
        print(" ├ max: ")
        printstyled(round(bw_max; digits=2), "\n"; color=:green, bold=true)
        println(" ├ min: ", round(bw_min; digits=2))
        println(" ├ avg: ", round(bw_avg; digits=2))
        print(" └ std_dev: ")
        printstyled(round(std(bws); digits=2), "\n"; color=:yellow, bold=true)
        if hist
            display(histogram(bws; title="Bandwidths (GB/s)", nbins=5))
        end
    end

    return bw_max
end

"""
    bench_p2p_memcpy_all(args...; kwargs...)

Run [`bench_p2p_memcpy`](@ref) for all combinations of devices.
Returns a matrix with the p2p memory bandwidth estimates.
"""
function bench_p2p_memcpy_all(args...; kwargs...)
    ngpus = length(CUDA.devices())
    return [src == dst ? nothing : bench_p2p_memcpy(args...; src=src, dst=dst, verbose=false, kwargs...) for src in 0:ngpus-1, dst in 0:ngpus-1]
end
```

For our A100s in the DGX (i.e. with NVSwitch) I now get:

```julia
julia> bench_p2p_memcpy();
Memsize: 38.1M

Bandwidth (GB/s):
 ├ max: 247.32
 ├ min: 173.5
 ├ avg: 229.63
 └ std_dev: 31.67

julia> bench_p2p_memcpy_all()
8×8 Matrix{Union{Nothing, Float64}}:
    nothing  245.706     241.075     244.467     246.434     242.229     245.085     245.033
 239.046        nothing  241.776     243.853     241.626     245.136     244.467     240.379
 246.957     242.633        nothing  242.937     245.291     248.114     239.193     242.684
 244.724     241.375     244.211        nothing  245.861     238.117     245.085     242.28
 241.576     246.329     242.582     245.602        nothing  246.59      240.677     243.343
 247.114     240.18      245.965     244.006     236.616        nothing  242.28      244.673
 243.802     242.028     248.326     239.933     244.365     245.033        nothing  245.498
 245.136     246.904     239.488     243.343     244.057     240.627     243.445        nothing
```

which agrees well with the C code by @lukas-mazur. I guess the numbers are also reasonable given a theoretical maximum of 25 GB/s × 12 = 300 GB/s.
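As a quick sanity check (assuming the quoted figure of 12 links at 25 GB/s each), the measured maximum comes in at roughly 82% of that theoretical peak:

```julia
julia> peak = 25 * 12   # GB/s, theoretical per-direction peak
300

julia> round(247.32 / peak * 100; digits=1)   # measured max as % of peak
82.4
```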

FWIW, this is the profile:

[Profile screenshot: p2p_memcpy_dgx_profile]


maleadt commented Feb 8, 2022

That's great!
