Last active
July 26, 2021 07:59
-
-
Save carstenbauer/5c21266c9119f28d21549da08b884142 to your computer and use it in GitHub Desktop.
CUDA broadcasting vs kernel
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ==============================================================================
# Example program for CUDA Julia tutorial
# Written by: Carsten Bauer
# ==============================================================================
using BenchmarkTools
using CUDA
using Printf
"""
    saxpy_gpu_kernel!(z, a, x, y)

CUDA kernel computing `z[i] = a * x[i] + y[i]`, one element per thread.
Threads whose global index falls past `length(z)` return without writing.
"""
function saxpy_gpu_kernel!(z, a, x, y)
    # Global thread index across the 1D grid (1-based).
    idx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    # Guard clause: the last block may extend past the array end.
    idx > length(z) && return nothing
    @inbounds z[idx] = a * x[idx] + y[idx]
    return nothing
end
"""
    saxpy_gpu!(z, a, x, y; threads_per_block, num_blocks)

Launch `saxpy_gpu_kernel!` on the GPU with the given launch configuration
and block until the kernel has completed.
"""
function saxpy_gpu!(z, a, x, y; threads_per_block, num_blocks)
    # Kernel launches are asynchronous; CUDA.@sync waits for completion
    # so callers (and benchmarks) measure the full kernel runtime.
    CUDA.@sync @cuda threads=threads_per_block blocks=num_blocks saxpy_gpu_kernel!(z, a, x, y)
end
t_start = time_ns()

# Dimension (number of elements) of the vectors used in the SAXPY.
dim = 100_000_000

# Scalar coefficient `a` in z = a*x + y. A Float32 literal is used so the
# GPU computes in single precision throughout — a Float64 scalar would
# promote the whole (Float32) broadcast to double precision on the device.
a = 3.1415f0

# Query how many threads per block the device supports.
threads_per_block = CUDA.attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)

# Blocks needed to cover `dim` elements: ceiling division in exact integer
# arithmetic (`cld`) instead of rounding a Float64 quotient.
num_blocks = cld(dim, threads_per_block)

# Allocate GPU memory for the SAXPY
# (Float32 is the default element type for CuArrays).
x = CUDA.ones(dim)
y = CUDA.ones(dim)
z = CUDA.zeros(dim)

# Benchmark both SAXPY variants. Globals are interpolated with `$` so that
# BenchmarkTools measures the operation itself rather than untyped
# global-variable access (see the BenchmarkTools manual on interpolation).
t_saxpy_kernel = @belapsed saxpy_gpu!($z, $a, $x, $y; num_blocks=$num_blocks,
                                      threads_per_block=$threads_per_block)
t_saxpy_broadc = @belapsed CUDA.@sync $z .= $a .* $x .+ $y

# Free the used GPU memory (optional).
# Hint: CUDA.memory_status() can be used to check the memory usage of the GPU.
x, y, z = nothing, nothing, nothing
GC.gc(true)

t_end = time_ns()
t_total = (t_end - t_start) * 1e-9  # ns -> s

println("Computed SAXPY of dimension: ", dim)
println("Total program took: ", round(t_total, digits=9), "s")
@printf("SAXPY (gpu, broadcasting) average took: %.9fs \n", t_saxpy_broadc)
# Trailing newline added so the final line of output is properly terminated.
@printf("SAXPY (gpu, kernel, parallel) average took: %.9fs\n", t_saxpy_kernel)
UPDATE: In the original version, I had mixed up the printing. The broadcasting version was always the faster one but it was printed with the kernel timings.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
On an NVIDIA GeForce GTX 1650, I get:
Why is the broadcasting version faster?