Last active
July 26, 2021 07:59
-
-
Save carstenbauer/5c21266c9119f28d21549da08b884142 to your computer and use it in GitHub Desktop.
CUDA broadcasting vs kernel
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ==============================================================================
# Example program for CUDA Julia tutorial
# Written by: Carsten Bauer
# ==============================================================================
using BenchmarkTools
using CUDA
using Printf
"""
    saxpy_gpu_kernel!(z, a, x, y)

CUDA kernel computing `z[i] = a * x[i] + y[i]`, one element per thread.
Threads whose global index falls past `length(z)` return without writing.
"""
function saxpy_gpu_kernel!(z, a, x, y)
    # Global thread index across the 1D grid (1-based).
    idx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    # Guard clause: the last block may extend past the array end.
    idx > length(z) && return nothing
    @inbounds z[idx] = a * x[idx] + y[idx]
    return nothing
end
"""
    saxpy_gpu!(z, a, x, y; threads_per_block, num_blocks)

Launch `saxpy_gpu_kernel!` on the GPU with the given launch configuration
and block until the kernel has completed.
"""
function saxpy_gpu!(z, a, x, y; threads_per_block, num_blocks)
    # Kernel launches are asynchronous; CUDA.@sync waits for completion
    # so callers (and benchmarks) measure the full kernel runtime.
    CUDA.@sync @cuda threads=threads_per_block blocks=num_blocks saxpy_gpu_kernel!(z, a, x, y)
end
t_start = time_ns()

# Dimension (number of elements) of the vectors used in the SAXPY.
dim = 100_000_000

# Scalar coefficient `a` in z = a*x + y. A Float32 literal is used so the
# GPU computes in single precision throughout — a Float64 scalar would
# promote the whole (Float32) broadcast to double precision on the device.
a = 3.1415f0

# Query how many threads per block the device supports.
threads_per_block = CUDA.attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)

# Blocks needed to cover `dim` elements: ceiling division in exact integer
# arithmetic (`cld`) instead of rounding a Float64 quotient.
num_blocks = cld(dim, threads_per_block)

# Allocate GPU memory for the SAXPY
# (Float32 is the default element type for CuArrays).
x = CUDA.ones(dim)
y = CUDA.ones(dim)
z = CUDA.zeros(dim)

# Benchmark both SAXPY variants. Globals are interpolated with `$` so that
# BenchmarkTools measures the operation itself rather than untyped
# global-variable access (see the BenchmarkTools manual on interpolation).
t_saxpy_kernel = @belapsed saxpy_gpu!($z, $a, $x, $y; num_blocks=$num_blocks,
                                      threads_per_block=$threads_per_block)
t_saxpy_broadc = @belapsed CUDA.@sync $z .= $a .* $x .+ $y

# Free the used GPU memory (optional).
# Hint: CUDA.memory_status() can be used to check the memory usage of the GPU.
x, y, z = nothing, nothing, nothing
GC.gc(true)

t_end = time_ns()
t_total = (t_end - t_start) * 1e-9  # ns -> s

println("Computed SAXPY of dimension: ", dim)
println("Total program took: ", round(t_total, digits=9), "s")
@printf("SAXPY (gpu, broadcasting) average took: %.9fs \n", t_saxpy_broadc)
# Trailing newline added so the final line of output is properly terminated.
@printf("SAXPY (gpu, kernel, parallel) average took: %.9fs\n", t_saxpy_kernel)
UPDATE: In the original version, I had mixed up the printing. The broadcasting version was always the faster one but it was printed with the kernel timings.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
On an NVIDIA GeForce GTX 1650, I get:
Why is the broadcasting version faster?