@carstenbauer
Last active February 8, 2022 20:53
Nvidia NVLink Bandwidth Measurement with Julia
@maleadt commented Feb 7, 2022

> > Are there no "Memcpy P2P" blocks visible in the "CUDA HW" categories?
>
> Not that I could see. @maleadt, can I somehow get this (in particular the stream information that @lukas-mazur gets for the CUDA C benchmark)?

I'm not sure how to get those, and lacking the hardware I can't experiment. @lukas-mazur, which version of NSight Systems are you using?

It seems like CUDA.jl-based benchmarks just don't use NVLink. I'm not sure why; AFAIK I'm using the correct API calls (check and enable P2P support, then use cuMemcpyPeer). Maybe we're setting up the stream or context incorrectly, or in a way that doesn't support NVLink? We generally don't do anything exotic though, so I don't have any immediate idea what the culprit might be.

@lukas-mazur commented Feb 7, 2022

@maleadt The version of NSight Systems I was using is 2021.3.3.2-b99c4d6.
However, I don't think it's a problem with NSight Systems, since @carstenbauer and I were profiling on the same machine.
I also think that Julia is somehow not using NVLink.

@carstenbauer (Author)

@maleadt That CUDA.jl isn't using NVLink is one issue (and I also don't currently see why it doesn't). However, there seems to be at least one other issue: even if PCIe is used, I wouldn't expect the problems listed above (the strange dependence of the results on nbench and alternate, as well as @oschulz's unreasonable benchmark results for a pure PCIe 4 system without NVLink).

@maleadt commented Feb 8, 2022

It's related to the stream-ordered memory allocator. Using the old allocator 'fixes' the issue (try setting JULIA_CUDA_MEMORY_POOL=none before importing CUDA.jl). I'm looking into fixing this properly; we probably need to configure (or use a custom) memory pool.
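
The workaround looks like this (a minimal sketch; the environment variable has to be set before CUDA.jl is first imported in the session):

# Fall back to the old, non-stream-ordered allocator as a workaround.
ENV["JULIA_CUDA_MEMORY_POOL"] = "none"

using CUDA  # allocations now go through cuMemAlloc instead of cuMemAllocAsync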

@maleadt commented Feb 8, 2022

Driver-API MWE:

#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>

void _check(CUresult status, char const *const func, const char *const file,
            int const line) {
    if (status != CUDA_SUCCESS) {
        const char *perrstr = 0;
        CUresult ok = cuGetErrorString(status, &perrstr);
        if (ok == CUDA_SUCCESS && perrstr) {
            fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
                    status, perrstr, func);
        } else {
            fprintf(stderr, "CUDA error at %s:%d code=%d\n", file, line,
                    status);
        }
        exit(EXIT_FAILURE);
    }
}
#define check(val) _check((val), #val, __FILE__, __LINE__)

//#define ASYNC
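// (define ASYNC to allocate the buffers via the stream-ordered allocator,
//  i.e. cuMemAllocAsync, instead of plain cuMemAlloc)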

int main(int argc, char **argv)
{
    check(cuInit(0));

    CUdevice gpu0, gpu1;
    check(cuDeviceGet(&gpu0, 0));
    check(cuDeviceGet(&gpu1, 1));

    CUcontext ctx0, ctx1;
    check(cuCtxCreate(&ctx0, 0, gpu0));
    check(cuCtxCreate(&ctx1, 0, gpu1));

    check(cuCtxSetCurrent(ctx0));
    check(cuCtxEnablePeerAccess(ctx1, 0));
    check(cuCtxSetCurrent(ctx1));
    check(cuCtxEnablePeerAccess(ctx0, 0));

#ifdef ASYNC
    CUstream stream0, stream1;
    check(cuCtxSetCurrent(ctx0));
    check(cuStreamCreate(&stream0, 0));
    check(cuCtxSetCurrent(ctx1));
    check(cuStreamCreate(&stream1, 0));
#endif

    const size_t buf_size = 40000000;
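    // 40 MB; matches the B(40_000_000) default of the Julia benchmarks in this thread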

    check(cuCtxSetCurrent(ctx0));
    CUdeviceptr buf0;
#ifdef ASYNC
    check(cuMemAllocAsync(&buf0, buf_size, stream0));
    check(cuStreamSynchronize(stream0));
#else
    check(cuMemAlloc(&buf0, buf_size));
#endif
    check(cuCtxSetCurrent(ctx1));
    CUdeviceptr buf1;
#ifdef ASYNC
    check(cuMemAllocAsync(&buf1, buf_size, stream1));
    check(cuStreamSynchronize(stream1));
#else
    check(cuMemAlloc(&buf1, buf_size));
#endif

    check(cuCtxSetCurrent(ctx0));
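    // with peer access enabled, this copy should show up as [CUDA memcpy PtoP]
    // and go over NVLink where available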
    check(cuMemcpyPeer(buf1, ctx1, buf0, ctx0, buf_size));
    check(cuCtxSynchronize());
}
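
The MWE only needs the CUDA driver API, so something like the following should build it (the exact compiler invocation may vary by system):

$ nvcc driver_test.c -o driver_test -lcuda          # default path (cuMemAlloc)
$ nvcc -DASYNC driver_test.c -o driver_test -lcuda  # stream-ordered path
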
$ nsys profile -o driver_test.nsys-rep ./driver_test
$ nsys stats -r gpumemtimesum driver_test.nsys-rep
 Time (%)  Total Time (ns)  Count  Avg (ns)   Med (ns)   Min (ns)  Max (ns)  StdDev (ns)      Operation     
 --------  ---------------  -----  ---------  ---------  --------  --------  -----------  ------------------
    100.0          480,348      1  480,348.0  480,348.0   480,348   480,348          0.0  [CUDA memcpy PtoP]

That PtoP entry disappears when ASYNC is defined in the MWE.

@maleadt commented Feb 8, 2022

Proper fix in JuliaGPU/CUDA.jl#1357.

EDIT: removed numbers that didn't make sense.

@carstenbauer (Author)

Well, the numbers are higher, so that's great 😄 Unfortunately, they don't seem to make any sense. I'll profile this later to hopefully understand what's going on.

BTW, this is what I get on our DGX with JuliaGPU/CUDA.jl#1357 checked out:

julia> bench_p2p_copyto!(; src=0, dst=1, nbench=100, alternate=false, kernel=copyto!)
Memsize: 38.1M

Bandwidth (GB/s):
├ max: 9094.95
├ min: 3307.25
├ avg: 6478.52
└ std_dev: 847.61

@maleadt commented Feb 8, 2022

> Well, the numbers are higher, so that's great 😄 Unfortunately, they don't seem to make any sense.

Maybe not, but I confirmed that the resulting bandwidth in NSight is as expected: around 80GB/s, the same as I measured with the CUDA C benchmark. (I just focused on replicating the behavior of the C example on this given system.)

@oschulz commented Feb 8, 2022

Yay!

@lukas-mazur commented Feb 8, 2022

> Maybe not, but I confirmed that the resulting bandwidth in NSight is as expected: around 80GB/s.

I would expect the bandwidth in NSight to be similar to the one that @carstenbauer's code prints out. Could it be that there is something wrong with the synchronization call in CUDA.jl, @maleadt?

@maleadt commented Feb 8, 2022

> Could it be that there is something wrong with the synchronization call in CUDA.jl, @maleadt?

The problem is that copying between devices involves multiple contexts and streams. Normally you only work with a single device, i.e. a single context and stream, so calls like CUDA.@elapsed simply operate on the current stream. That isn't necessarily correct when copying between devices, but it also doesn't make sense to have these calls synchronize all devices instead.
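
To illustrate the problem, one conceivable alternative (a sketch under assumptions, not the fix adopted below) is to avoid event-based timing altogether: synchronize both devices explicitly and measure with the wall clock.

# Hedged sketch: time_walltime is a hypothetical helper, not CUDA.jl API.
using CUDA

function time_walltime(kernel::F, mem_dst, mem_src) where {F}
    # drain pending work on both devices before starting the clock
    device!(device(mem_src)); synchronize()
    device!(device(mem_dst)); synchronize()
    t0 = time_ns()
    kernel(mem_dst, mem_src)
    # the copy is issued from the source array's context, so wait there
    device!(device(mem_src)); synchronize()
    return (time_ns() - t0) / 1e9  # seconds
end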

@maleadt commented Feb 8, 2022

So what I'm saying is that the benchmark code needs fixing:

@inline function time_cuda_elapsed(kernel::F, mem_dst, mem_src) where {F}
    CUDA.context!(context(mem_src)) do
        CUDA.@elapsed begin
            kernel(mem_dst, mem_src)
        end
    end
end

That consistently reports 80GB/s as measured in NSight Systems. Maybe we can do better with the APIs exposed by CUDA.jl, but as I explained above that isn't easy when involving multiple devices.
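
Hypothetical usage of that helper (the device indices and buffer size are assumptions, mirroring the 40 MB benchmarks in this thread):

device!(0); mem_src = CUDA.rand(Float32, 10_000_000)   # ~40 MB on GPU 0
device!(1); mem_dst = CUDA.zeros(Float32, 10_000_000)  # ~40 MB on GPU 1
t = time_cuda_elapsed(copyto!, mem_dst, mem_src)
bw = sizeof(mem_src) / 1024^3 / t                      # bandwidth in GB/s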

@maleadt commented Feb 8, 2022

You could spot this in the profile: the API calls that are part of CUDA.@elapsed ought to wait for the GPU to finish, or the measurement obviously wouldn't make any sense. This didn't happen:

[screenshot: profile in which the API calls do not wait for the copy to finish]

Executing CUDA.@elapsed in the correct context results in:

[screenshot: profile with CUDA.@elapsed executed in the correct context]

(It's possible this didn't happen before my fixes to the stream-ordered allocator, as the invocation of cuMemcpyPeer might have behaved synchronously then. But it seems to fix the alternate mode here.)

@carstenbauer (Author) commented Feb 8, 2022

Thanks @maleadt, that makes a ton of sense! With CUDA.jl#master, performing the time measurement in the context of mem_src as suggested seems to fix all the issues!

Code:

using CUDA
using BenchmarkTools
using Statistics
using UnicodePlots
using Humanize

abstract type BytesSIPrefixed end
struct B <: BytesSIPrefixed
    value::Int64
end
struct KB <: BytesSIPrefixed
    value::Int64
end
struct MB <: BytesSIPrefixed
    value::Int64
end
struct GB <: BytesSIPrefixed
    value::Int64
end

_memsize2N(memsize::B; dtype=Float32) = memsize.value ÷ sizeof(dtype)
_memsize2N(memsize::KB; dtype=Float32) = (memsize.value * 2^10) ÷ sizeof(dtype)
_memsize2N(memsize::MB; dtype=Float32) = (memsize.value * 2^20) ÷ sizeof(dtype)
_memsize2N(memsize::GB; dtype=Float32) = (memsize.value * 2^30) ÷ sizeof(dtype)

"""
    init_mem(memsize::BytesSIPrefixed; src = 0, dst = 1, dtype=Float32)
Init memory on devices `src` and `dst`.

**Examples:**
```julia
init_mem(MB(1024))
init_mem(B(40_000_000))
```
"""
function init_mem(memsize::BytesSIPrefixed; src=0, dst=1, dtype=Float32)
    # src != dst || throw(ArgumentError("src == dst..."))
    N = _memsize2N(memsize; dtype)

    device!(src)
    mem_src = CUDA.rand(dtype, N)
    device!(dst)
    mem_dst = CUDA.zeros(dtype, N)
    return mem_src, mem_dst
end

@inline function _time_cuda_elapsed(kernel::F, mem_dst, mem_src) where {F}
    t = CUDA.context!(context(mem_src)) do
        CUDA.@elapsed begin
            NVTX.@range "p2p: kernel call" begin
                kernel(mem_dst, mem_src)
            end
        end
    end
    return t
end

"""
    bench_p2p_memcpy([memsize::BytesSIPrefixed]; kwargs...)

Performs a peer-to-peer memory copy benchmark (time measurement) and
returns an inter-GPU memory bandwidth estimate (in GB/s) derived from it.

**Keyword arguments:**
* `src` (default: `0`): source device
* `dst` (default: `1`): destination device
* `nbench` (default: `5`): number of time measurements (i.e. p2p memcopies)
* `verbose` (default: `true`): set to false to turn off any printing.
* `hist` (default: `false`): when `true`, a UnicodePlots-based histogram is printed.
* `times` (default: `false`): toggle printing of measured times.
* `alternate` (default: `false`): alternate `src` and `dst`, i.e. copy data back and forth.
* `dtype` (default: `Float32`): see [`init_mem`](@ref).

**Examples:**
```julia
bench_p2p_memcpy()
bench_p2p_memcpy(MB(1024))
bench_p2p_memcpy(KB(20_000); dtype=Int32)
```
"""
function bench_p2p_memcpy(
    memsize::BytesSIPrefixed=B(40_000_000);
    nbench=5,
    verbose=true,
    hist=false,
    times=false,
    alternate=false,
    dtype=Float32,
    kwargs...,
)
    mem_src, mem_dst = init_mem(memsize; dtype, kwargs...)
    actual_memsize = sizeof(mem_src)
    ts = zeros(nbench)

    NVTX.@range "p2p: nbench loop" begin
        @inbounds for i in 1:nbench
            if mod(i, alternate ? 2 : 1) == 0
                ts[i] = _time_cuda_elapsed(copyto!, mem_dst, mem_src)
            else
                ts[i] = _time_cuda_elapsed(copyto!, mem_src, mem_dst)
            end
        end
    end

    t_min = minimum(ts)
    t_max = maximum(ts)
    t_avg = mean(ts)

    actual_memsize_GB = (actual_memsize) / (1024^3)
    bws = actual_memsize_GB ./ ts
    bw_min = minimum(bws)
    bw_max = maximum(bws)
    bw_avg = mean(bws)

    if verbose
        println("Memsize: $(Humanize.datasize(actual_memsize; style=:gnu))\n")
        if times
            println("t_min: $t_min")
            println("t_max: $t_max")
            println("t_avg: $t_avg")
        end
        printstyled("Bandwidth (GB/s):\n"; bold=true)
        print(" ├ max: ")
        printstyled(round(bw_max; digits=2), "\n"; color=:green, bold=true)
        println(" ├ min: ", round(bw_min; digits=2))
        println(" ├ avg: ", round(bw_avg; digits=2))
        print(" └ std_dev: ")
        printstyled(round(std(bws); digits=2), "\n"; color=:yellow, bold=true)
        if hist
            display(histogram(bws; title="Bandwidths (GB/s)", nbins=5))
        end
    end

    return bw_max
end

"""
    bench_p2p_memcpy_all(args...; kwargs...)

Run [`bench_p2p_memcpy`](@ref) for all combinations of devices.
Returns a matrix with the p2p memory bandwidth estimates.
"""
function bench_p2p_memcpy_all(args...; kwargs...)
    ngpus = length(CUDA.devices())
    return [src == dst ? nothing :
                bench_p2p_memcpy(args...; src=src, dst=dst, verbose=false, kwargs...)
            for src in 0:ngpus-1, dst in 0:ngpus-1]
end

For our A100s in the DGX (i.e. with NVSwitch) I now get:

julia> bench_p2p_memcpy();
Memsize: 38.1M

Bandwidth (GB/s):
 ├ max: 247.32
 ├ min: 173.5
 ├ avg: 229.63
 └ std_dev: 31.67

julia> bench_p2p_memcpy_all()
8×8 Matrix{Union{Nothing, Float64}}:
    nothing  245.706     241.075     244.467     246.434     242.229     245.085     245.033
 239.046        nothing  241.776     243.853     241.626     245.136     244.467     240.379
 246.957     242.633        nothing  242.937     245.291     248.114     239.193     242.684
 244.724     241.375     244.211        nothing  245.861     238.117     245.085     242.28
 241.576     246.329     242.582     245.602        nothing  246.59      240.677     243.343
 247.114     240.18      245.965     244.006     236.616        nothing  242.28      244.673
 243.802     242.028     248.326     239.933     244.365     245.033        nothing  245.498
 245.136     246.904     239.488     243.343     244.057     240.627     243.445        nothing

which agrees well with the C code by @lukas-mazur. I guess the numbers are also reasonable given a theoretical maximum of 25 GB/s * 12 = 300 GB/s.

FWIW, this is the profile:

[screenshot: p2p_memcpy_dgx_profile]

@maleadt commented Feb 8, 2022

That's great!
