# naive transpose performance (gist by mwarusz, 2020-02-18)
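# Benchmarks several GPU matrix-transpose implementations on 8192x8192
# Float32 matrices: naive and read-only-cache (`ldg`) variants written with
# KernelAbstractions and with raw CUDAnative, plus shared-memory tiled
# versions in CUDAnative and GPUifyLoops. Each variant is verified against
# `Array(a)'` and then profiled for `nreps` repetitions.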
using KernelAbstractions
using GPUifyLoops
using CUDAnative, CuArrays, CUDAdrv
# Naive transpose: one work-item per output element, Cartesian indexing.
@kernel function transpose_kernel_naive!(b, a)
    I = @index(Global, Cartesian)
    i, j = Tuple(I)
    @inbounds b[i, j] = a[j, i]
end
# Same, but with linear indexing and `@Const` so the strided reads of `a`
# can go through the read-only (ldg) cache path.
@kernel function transpose_kernel_naive_ldg!(b, @Const(a))
    I = @index(Global, Cartesian)
    i, j = Tuple(I)
    @inbounds b[i + size(b, 1) * (j - 1)] = a[j + size(a, 1) * (i - 1)]
end
# Hand-written CUDAnative kernel. Writes to `b` are coalesced (`i` varies
# fastest down a column), while the reads from `a` are strided.
function transpose_cuda!(b, a)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    @inbounds b[i, j] = a[j, i]
    nothing
end
# Same, but the strided reads go through `ldg` (the read-only cache).
function transpose_cuda_ldg!(b, a)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    @inbounds b[i + size(b, 1) * (j - 1)] = ldg(a, j + size(a, 1) * (i - 1))
    nothing
end
# Launched over a 1D range of length(b); the 2D thread/block structure of a
# grid of 32x32 tiles is reconstructed by hand from the flat global/local
# indices.
@kernel function transpose_kernel!(b, a)
    block_dim_x, block_dim_y = 32, 32
    # NOTE: hard-coded for an 8192x8192 matrix (8192 / 32 = 256); the grid
    # dimensions have to be changed if the matrix size changes.
    grid_dim_x, grid_dim_y = 256, 256
    wgsize = groupsize()
    I = @index(Global)
    L = @index(Local)
    G = div(I - 1, wgsize) + 1
    thread_idx_x = (L - 1) % block_dim_x + 1
    thread_idx_y = div(L - 1, block_dim_x) + 1
    block_idx_x = (G - 1) % grid_dim_x + 1
    block_idx_y = div(G - 1, grid_dim_x) + 1
    i = (block_idx_x - 1) * block_dim_x + thread_idx_x
    j = (block_idx_y - 1) * block_dim_y + thread_idx_y
    @inbounds b[i + size(b, 1) * (j - 1)] = a[j + size(a, 1) * (i - 1)]
end
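# Worked example of the index reconstruction above (assuming the launch
# below, with W = 1024, i.e. one 32x32 tile per workgroup): for the flat
# local index L = 40,
#   thread_idx_x = (40 - 1) % 32 + 1   = 8
#   thread_idx_y = div(40 - 1, 32) + 1 = 2
# which is lane (8, 2) of the tile, matching CUDA's (threadIdx().x,
# threadIdx().y) for a 32x32 block.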
# Identical index math, but with `@Const` so the strided reads of `a` can
# use the read-only cache.
@kernel function transpose_kernel_ldg!(b, @Const(a))
    block_dim_x, block_dim_y = 32, 32
    # NOTE: hard-coded for an 8192x8192 matrix (8192 / 32 = 256); the grid
    # dimensions have to be changed if the matrix size changes.
    grid_dim_x, grid_dim_y = 256, 256
    wgsize = groupsize()
    I = @index(Global)
    L = @index(Local)
    G = div(I - 1, wgsize) + 1
    thread_idx_x = (L - 1) % block_dim_x + 1
    thread_idx_y = div(L - 1, block_dim_x) + 1
    block_idx_x = (G - 1) % grid_dim_x + 1
    block_idx_y = div(G - 1, grid_dim_x) + 1
    i = (block_idx_x - 1) * block_dim_x + thread_idx_x
    j = (block_idx_y - 1) * block_dim_y + thread_idx_y
    @inbounds b[i + size(b, 1) * (j - 1)] = a[j + size(a, 1) * (i - 1)]
end
# Shared-memory tiled transpose (the classic CUDA pattern): each 32x8 thread
# block moves one 32x32 tile through shared memory so that both the global
# reads and the global writes are coalesced.
const TDIM = 32
const BLOCK_ROWS = 8
function transpose_cuda_shared!(out, in)
    T = eltype(in)
    # The TDIM + 1 padding of the first dimension avoids shared-memory bank
    # conflicts when the tile is read back column-wise.
    tile = @cuStaticSharedMem(T, (TDIM + 1, TDIM))
    bx = blockIdx().x
    by = blockIdx().y
    tx = threadIdx().x
    ty = threadIdx().y
    i = (bx - 1) * TDIM + tx
    j = (by - 1) * TDIM + ty
    # Each thread copies TDIM / BLOCK_ROWS = 4 elements of the tile.
    @unroll for k = 0:BLOCK_ROWS:TDIM-1
        @inbounds tile[ty + k, tx] = in[i, j + k]
    end
    sync_threads()
    # Swap the block coordinates and write the transposed tile back out.
    i = (by - 1) * TDIM + tx
    j = (bx - 1) * TDIM + ty
    @unroll for k = 0:BLOCK_ROWS:TDIM-1
        @inbounds out[i, j + k] = tile[tx, ty + k]
    end
    nothing
end
# GPUifyLoops version of the shared-memory transpose: on the GPU, each
# `@loop` collapses to the indicated block/thread index.
function transpose_gpuify_shared!(out, input)
    T = eltype(input)
    tile = @shmem T (TDIM + 1, TDIM)
    ny = size(input, 2) ÷ TDIM
    nx = size(input, 1) ÷ TDIM
    @loop for by in (1:ny; blockIdx().y)
        @loop for bx in (1:nx; blockIdx().x)
            @loop for ty in (1:BLOCK_ROWS; threadIdx().y)
                @loop for tx in (1:TDIM; threadIdx().x)
                    i = (bx - 1) * TDIM + tx
                    j = (by - 1) * TDIM + ty
                    @unroll for k = 0:BLOCK_ROWS:TDIM-1
                        @inbounds tile[ty + k, tx] = input[i, j + k]
                    end
                end
            end
            @GPUifyLoops.synchronize
            @loop for ty in (1:BLOCK_ROWS; threadIdx().y)
                @loop for tx in (1:TDIM; threadIdx().x)
                    i = (by - 1) * TDIM + tx
                    j = (bx - 1) * TDIM + ty
                    @unroll for k = 0:BLOCK_ROWS:TDIM-1
                        @inbounds out[i, j + k] = tile[tx, ty + k]
                    end
                end
            end
        end
    end
end
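# For completeness, a sketch of the same tiled transpose written directly in
# KernelAbstractions. This variant is not part of the original benchmark and
# assumes a KernelAbstractions version that provides `@localmem`,
# `@synchronize`, and Cartesian `Local`/`Group` indices; here every thread
# moves a single element, so the workgroup is a full 32x32 tile.
@kernel function transpose_kernel_shared!(out, @Const(in))
    tile = @localmem eltype(in) (TDIM + 1, TDIM)
    tx, ty = Tuple(@index(Local, Cartesian))
    bx, by = Tuple(@index(Group, Cartesian))
    # Coalesced load of the tile into shared memory.
    i = (bx - 1) * TDIM + tx
    j = (by - 1) * TDIM + ty
    @inbounds tile[ty, tx] = in[i, j]
    @synchronize
    # Coalesced store of the transposed tile.
    i = (by - 1) * TDIM + tx
    j = (bx - 1) * TDIM + ty
    @inbounds out[i, j] = tile[tx, ty]
end
# Hypothetical launch, mirroring the ones below:
#   kernel! = transpose_kernel_shared!(KernelAbstractions.CUDA(), (TDIM, TDIM), size(b))
#   wait(kernel!(b, a))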
# Benchmark setup: 8192x8192 Float32 matrices, 10 timed repetitions.
const T = Float32
const n = 8 * 1024
const shape = n, n
const nreps = 10
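# A small helper (not in the original gist) to turn a measured kernel time
# into effective bandwidth: a transpose reads and writes n^2 elements once
# each, so the ideal traffic is 2 * n^2 * sizeof(T) bytes.
effective_bandwidth_GBps(time_s) = 2 * n^2 * sizeof(T) / time_s / 1e9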
let # KernelAbstractions, naive Cartesian kernel
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    W = 1024
    kernel! = transpose_kernel_naive!(KernelAbstractions.CUDA(), W, size(b))
    event = kernel!(b, a)
    wait(event)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        event = kernel!(b, a)
        wait(event)
    end
end
let # KernelAbstractions, naive kernel with @Const / ldg reads
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    W = 1024
    kernel! = transpose_kernel_naive_ldg!(KernelAbstractions.CUDA(), W, size(b))
    event = kernel!(b, a)
    wait(event)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        event = kernel!(b, a)
        wait(event)
    end
end
let # KernelAbstractions, 1D launch with hand-rolled 2D index math
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    W = 1024
    kernel! = transpose_kernel!(KernelAbstractions.CUDA(), W, length(b))
    event = kernel!(b, a)
    wait(event)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        event = kernel!(b, a)
        wait(event)
    end
end
let # KernelAbstractions, 1D launch + @Const / ldg reads
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    W = 1024
    kernel! = transpose_kernel_ldg!(KernelAbstractions.CUDA(), W, length(b))
    event = kernel!(b, a)
    wait(event)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        event = kernel!(b, a)
        wait(event)
    end
end
let # raw CUDAnative, naive kernel
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    threads = 32, 32
    blocks = cld.(size(b), threads)
    @cuda threads=threads blocks=blocks transpose_cuda!(b, a)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        @cuda threads=threads blocks=blocks transpose_cuda!(b, a)
    end
end
let # raw CUDAnative, naive kernel with ldg reads
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    threads = 32, 32
    blocks = cld.(size(b), threads)
    @cuda threads=threads blocks=blocks transpose_cuda_ldg!(b, a)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        @cuda threads=threads blocks=blocks transpose_cuda_ldg!(b, a)
    end
end
let # raw CUDAnative, shared-memory tiled kernel
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    threads = TDIM, BLOCK_ROWS
    blocks = cld.(size(a), TDIM)
    @cuda threads=threads blocks=blocks transpose_cuda_shared!(b, a)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        @cuda threads=threads blocks=blocks transpose_cuda_shared!(b, a)
    end
end
let # GPUifyLoops, shared-memory tiled kernel
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    threads = TDIM, BLOCK_ROWS
    blocks = cld.(size(a), TDIM)
    @launch GPUifyLoops.CUDA() threads=threads blocks=blocks transpose_gpuify_shared!(b, a)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        @launch GPUifyLoops.CUDA() threads=threads blocks=blocks transpose_gpuify_shared!(b, a)
    end
end
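# The @CUDAdrv.profile blocks only mark the regions to be profiled; to
# collect the timings, run the script under the external profiler, e.g.
#   nvprof --profile-from-start off julia transpose.jl
# (the script filename here is illustrative).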