naive transpose performance
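
Benchmarks several out-of-place GPU matrix transpose kernels: naive one-element-per-thread versions written with KernelAbstractions and raw CUDAnative, and shared-memory tiled versions modeled on the classic CUDA transpose sample, including a GPUifyLoops variant. Each kernel is validated against Base's transpose and then profiled over repeated launches.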
using KernelAbstractions
using GPUifyLoops
using CUDAnative, CuArrays, CUDAdrv
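
# Naive transpose: one work item per output element, addressed with a
# Cartesian global index. The write to `b` is contiguous in memory while the
# read of `a` is strided, so the reads cannot coalesce the way the writes do.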
@kernel function transpose_kernel_naive!(b, a)
    I = @index(Global, Cartesian)
    i, j = Tuple(I)
    @inbounds b[i, j] = a[j, i]
end
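
# Naive transpose with linear indexing and the input marked @Const, which
# lets the CUDA backend read `a` through the read-only (ldg) cache.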
@kernel function transpose_kernel_naive_ldg!(b, @Const(a))
    I = @index(Global, Cartesian)
    i, j = Tuple(I)
    @inbounds b[i + size(b, 1) * (j - 1)] = a[j + size(a, 1) * (i - 1)]
end
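
# Hand-written CUDAnative equivalent of the naive kernel, for comparison.
# Assumes the launch configuration exactly tiles the matrix (no bounds check).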
function transpose_cuda!(b, a)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    @inbounds b[i, j] = a[j, i]
    nothing
end
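
# CUDAnative naive kernel with an explicit `ldg` read-only load of `a`.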
function transpose_cuda_ldg!(b, a)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    @inbounds b[i + size(b, 1) * (j - 1)] = ldg(a, j + size(a, 1) * (i - 1))
    nothing
end
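
# KernelAbstractions kernel launched over a linear ndrange; 2D thread and
# block coordinates are reconstructed by hand from the global and local
# indices so the access pattern mimics the 2D CUDAnative launch geometry.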
@kernel function transpose_kernel!(b, a)
    block_dim_x, block_dim_y = 32, 32
    # Warning: these grid dimensions are hardcoded for n = 8 * 1024
    # (8192 / 32 = 256) and must be updated if the matrix size changes.
    grid_dim_x, grid_dim_y = 256, 256
    wgsize = groupsize()
    I = @index(Global)
    L = @index(Local)
    G = div(I - 1, wgsize) + 1
    thread_idx_x = (L - 1) % block_dim_x + 1
    thread_idx_y = div(L - 1, block_dim_x) + 1
    block_idx_x = (G - 1) % grid_dim_x + 1
    block_idx_y = div(G - 1, grid_dim_x) + 1
    i = (block_idx_x - 1) * block_dim_x + thread_idx_x
    j = (block_idx_y - 1) * block_dim_y + thread_idx_y
    @inbounds b[i + size(b, 1) * (j - 1)] = a[j + size(a, 1) * (i - 1)]
end
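
# Same manually indexed kernel, with the input marked @Const.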
@kernel function transpose_kernel_ldg!(b, @Const(a))
    block_dim_x, block_dim_y = 32, 32
    # Warning: these grid dimensions are hardcoded for n = 8 * 1024
    # (8192 / 32 = 256) and must be updated if the matrix size changes.
    grid_dim_x, grid_dim_y = 256, 256
    wgsize = groupsize()
    I = @index(Global)
    L = @index(Local)
    G = div(I - 1, wgsize) + 1
    thread_idx_x = (L - 1) % block_dim_x + 1
    thread_idx_y = div(L - 1, block_dim_x) + 1
    block_idx_x = (G - 1) % grid_dim_x + 1
    block_idx_y = div(G - 1, grid_dim_x) + 1
    i = (block_idx_x - 1) * block_dim_x + thread_idx_x
    j = (block_idx_y - 1) * block_dim_y + thread_idx_y
    @inbounds b[i + size(b, 1) * (j - 1)] = a[j + size(a, 1) * (i - 1)]
end
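
# Tiled shared-memory transpose in the style of NVIDIA's classic transpose
# sample: each 32x32 tile is staged through shared memory so that both the
# global read and the global write are coalesced. The tile's leading
# dimension is padded to TDIM + 1 to avoid shared-memory bank conflicts.
# Blocks have TDIM x BLOCK_ROWS = 32x8 threads, so each thread moves
# TDIM / BLOCK_ROWS = 4 elements.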
const TDIM = 32
const BLOCK_ROWS = 8

function transpose_cuda_shared!(out, in)
    T = eltype(in)
    tile = @cuStaticSharedMem(T, (TDIM + 1, TDIM))
    bx = blockIdx().x
    by = blockIdx().y
    tx = threadIdx().x
    ty = threadIdx().y
    i = (bx - 1) * TDIM + tx
    j = (by - 1) * TDIM + ty
    @unroll for k = 0:BLOCK_ROWS:TDIM-1
        @inbounds tile[ty + k, tx] = in[i, j + k]
    end
    sync_threads()
    i = (by - 1) * TDIM + tx
    j = (bx - 1) * TDIM + ty
    @unroll for k = 0:BLOCK_ROWS:TDIM-1
        @inbounds out[i, j + k] = tile[tx, ty + k]
    end
    nothing
end
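
# GPUifyLoops version of the tiled shared-memory transpose; @loop binds each
# loop level to the corresponding block or thread index when run on the GPU.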
function transpose_gpuify_shared!(out, input)
    T = eltype(input)
    tile = @shmem T (TDIM + 1, TDIM)
    ny = size(input, 2) ÷ TDIM
    nx = size(input, 1) ÷ TDIM
    @loop for by in (1:ny; blockIdx().y)
        @loop for bx in (1:nx; blockIdx().x)
            @loop for ty in (1:BLOCK_ROWS; threadIdx().y)
                @loop for tx in (1:TDIM; threadIdx().x)
                    i = (bx - 1) * TDIM + tx
                    j = (by - 1) * TDIM + ty
                    @unroll for k = 0:BLOCK_ROWS:TDIM-1
                        @inbounds tile[ty + k, tx] = input[i, j + k]
                    end
                end
            end
            @GPUifyLoops.synchronize
            @loop for ty in (1:BLOCK_ROWS; threadIdx().y)
                @loop for tx in (1:TDIM; threadIdx().x)
                    i = (by - 1) * TDIM + tx
                    j = (bx - 1) * TDIM + ty
                    @unroll for k = 0:BLOCK_ROWS:TDIM-1
                        @inbounds out[i, j + k] = tile[tx, ty + k]
                    end
                end
            end
        end
    end
end
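
# Benchmark driver: an 8192x8192 Float32 transpose, checked against Base's
# transpose once and then profiled over nreps launches for each kernel.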
const T = Float32
const n = 8 * 1024
const shape = n, n
const nreps = 10
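
# KernelAbstractions naive kernel (Cartesian indexing).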
let
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    W = 1024
    kernel! = transpose_kernel_naive!(KernelAbstractions.CUDA(), W, size(b))
    event = kernel!(b, a)
    wait(event)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        event = kernel!(b, a)
        wait(event)
    end
end
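
# KernelAbstractions naive kernel with @Const input.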
let
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    W = 1024
    kernel! = transpose_kernel_naive_ldg!(KernelAbstractions.CUDA(), W, size(b))
    event = kernel!(b, a)
    wait(event)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        event = kernel!(b, a)
        wait(event)
    end
end
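
# KernelAbstractions kernel with manual 2D index reconstruction.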
let
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    W = 1024
    kernel! = transpose_kernel!(KernelAbstractions.CUDA(), W, length(b))
    event = kernel!(b, a)
    wait(event)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        event = kernel!(b, a)
        wait(event)
    end
end
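
# Manual-index kernel with @Const input.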
let
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    W = 1024
    kernel! = transpose_kernel_ldg!(KernelAbstractions.CUDA(), W, length(b))
    event = kernel!(b, a)
    wait(event)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        event = kernel!(b, a)
        wait(event)
    end
end
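
# Raw CUDAnative kernel.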
let
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    threads = 32, 32
    blocks = cld.(size(b), threads)
    @cuda threads=threads blocks=blocks transpose_cuda!(b, a)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        @cuda threads=threads blocks=blocks transpose_cuda!(b, a)
    end
end
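
# Raw CUDAnative kernel with explicit ldg.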
let
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    threads = 32, 32
    blocks = cld.(size(b), threads)
    @cuda threads=threads blocks=blocks transpose_cuda_ldg!(b, a)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        @cuda threads=threads blocks=blocks transpose_cuda_ldg!(b, a)
    end
end
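
# CUDAnative shared-memory tiled kernel.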
let
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    threads = TDIM, BLOCK_ROWS
    blocks = cld.(size(a), TDIM)
    @cuda threads=threads blocks=blocks transpose_cuda_shared!(b, a)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        @cuda threads=threads blocks=blocks transpose_cuda_shared!(b, a)
    end
end
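
# GPUifyLoops shared-memory tiled kernel.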
let
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    threads = TDIM, BLOCK_ROWS
    blocks = cld.(size(a), TDIM)
    @launch GPUifyLoops.CUDA() threads=threads blocks=blocks transpose_gpuify_shared!(b, a)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        @launch GPUifyLoops.CUDA() threads=threads blocks=blocks transpose_gpuify_shared!(b, a)
    end
end