# naive transpose performance (gist by mwarusz, 2020-02-18)
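# Benchmarks several GPU matrix-transpose implementations on 8192x8192
# Float32 matrices: naive and read-only-cache (`ldg`) variants written with
# KernelAbstractions and with raw CUDAnative, plus shared-memory tiled
# versions in CUDAnative and GPUifyLoops. Each variant is verified against
# `Array(a)'` and then profiled for `nreps` repetitions.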
using KernelAbstractions
using GPUifyLoops
using CUDAnative, CuArrays, CUDAdrv
# Naive transpose: one work-item per output element, Cartesian indexing.
@kernel function transpose_kernel_naive!(b, a)
    I = @index(Global, Cartesian)
    i, j = Tuple(I)
    @inbounds b[i, j] = a[j, i]
end
# Same, but with linear indexing and `@Const` so the strided reads of `a`
# can go through the read-only (ldg) cache path.
@kernel function transpose_kernel_naive_ldg!(b, @Const(a))
    I = @index(Global, Cartesian)
    i, j = Tuple(I)
    @inbounds b[i + size(b, 1) * (j - 1)] = a[j + size(a, 1) * (i - 1)]
end
# Hand-written CUDAnative kernel. Writes to `b` are coalesced (`i` varies
# fastest down a column), while the reads from `a` are strided.
function transpose_cuda!(b, a)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    @inbounds b[i, j] = a[j, i]
    nothing
end
# Same, but the strided reads go through `ldg` (the read-only cache).
function transpose_cuda_ldg!(b, a)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    @inbounds b[i + size(b, 1) * (j - 1)] = ldg(a, j + size(a, 1) * (i - 1))
    nothing
end
# Launched over a 1D range of length(b); the 2D thread/block structure of a
# grid of 32x32 tiles is reconstructed by hand from the flat global/local
# indices.
@kernel function transpose_kernel!(b, a)
    block_dim_x, block_dim_y = 32, 32
    # NOTE: hard-coded for an 8192x8192 matrix (8192 / 32 = 256); the grid
    # dimensions have to be changed if the matrix size changes.
    grid_dim_x, grid_dim_y = 256, 256
    wgsize = groupsize()
    I = @index(Global)
    L = @index(Local)
    G = div(I - 1, wgsize) + 1
    thread_idx_x = (L - 1) % block_dim_x + 1
    thread_idx_y = div(L - 1, block_dim_x) + 1
    block_idx_x = (G - 1) % grid_dim_x + 1
    block_idx_y = div(G - 1, grid_dim_x) + 1
    i = (block_idx_x - 1) * block_dim_x + thread_idx_x
    j = (block_idx_y - 1) * block_dim_y + thread_idx_y
    @inbounds b[i + size(b, 1) * (j - 1)] = a[j + size(a, 1) * (i - 1)]
end
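# Worked example of the index reconstruction above (assuming the launch
# below, with W = 1024, i.e. one 32x32 tile per workgroup): for the flat
# local index L = 40,
#   thread_idx_x = (40 - 1) % 32 + 1   = 8
#   thread_idx_y = div(40 - 1, 32) + 1 = 2
# which is lane (8, 2) of the tile, matching CUDA's (threadIdx().x,
# threadIdx().y) for a 32x32 block.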
# Identical index math, but with `@Const` so the strided reads of `a` can
# use the read-only cache.
@kernel function transpose_kernel_ldg!(b, @Const(a))
    block_dim_x, block_dim_y = 32, 32
    # NOTE: hard-coded for an 8192x8192 matrix (8192 / 32 = 256); the grid
    # dimensions have to be changed if the matrix size changes.
    grid_dim_x, grid_dim_y = 256, 256
    wgsize = groupsize()
    I = @index(Global)
    L = @index(Local)
    G = div(I - 1, wgsize) + 1
    thread_idx_x = (L - 1) % block_dim_x + 1
    thread_idx_y = div(L - 1, block_dim_x) + 1
    block_idx_x = (G - 1) % grid_dim_x + 1
    block_idx_y = div(G - 1, grid_dim_x) + 1
    i = (block_idx_x - 1) * block_dim_x + thread_idx_x
    j = (block_idx_y - 1) * block_dim_y + thread_idx_y
    @inbounds b[i + size(b, 1) * (j - 1)] = a[j + size(a, 1) * (i - 1)]
end
# Shared-memory tiled transpose (the classic CUDA pattern): each 32x8 thread
# block moves one 32x32 tile through shared memory so that both the global
# reads and the global writes are coalesced.
const TDIM = 32
const BLOCK_ROWS = 8
function transpose_cuda_shared!(out, in)
    T = eltype(in)
    # The TDIM + 1 padding of the first dimension avoids shared-memory bank
    # conflicts when the tile is read back column-wise.
    tile = @cuStaticSharedMem(T, (TDIM + 1, TDIM))
    bx = blockIdx().x
    by = blockIdx().y
    tx = threadIdx().x
    ty = threadIdx().y
    i = (bx - 1) * TDIM + tx
    j = (by - 1) * TDIM + ty
    # Each thread copies TDIM / BLOCK_ROWS = 4 elements of the tile.
    @unroll for k = 0:BLOCK_ROWS:TDIM-1
        @inbounds tile[ty + k, tx] = in[i, j + k]
    end
    sync_threads()
    # Swap the block coordinates and write the transposed tile back out.
    i = (by - 1) * TDIM + tx
    j = (bx - 1) * TDIM + ty
    @unroll for k = 0:BLOCK_ROWS:TDIM-1
        @inbounds out[i, j + k] = tile[tx, ty + k]
    end
    nothing
end
# GPUifyLoops version of the shared-memory transpose: on the GPU, each
# `@loop` collapses to the indicated block/thread index.
function transpose_gpuify_shared!(out, input)
    T = eltype(input)
    tile = @shmem T (TDIM + 1, TDIM)
    ny = size(input, 2) ÷ TDIM
    nx = size(input, 1) ÷ TDIM
    @loop for by in (1:ny; blockIdx().y)
        @loop for bx in (1:nx; blockIdx().x)
            @loop for ty in (1:BLOCK_ROWS; threadIdx().y)
                @loop for tx in (1:TDIM; threadIdx().x)
                    i = (bx - 1) * TDIM + tx
                    j = (by - 1) * TDIM + ty
                    @unroll for k = 0:BLOCK_ROWS:TDIM-1
                        @inbounds tile[ty + k, tx] = input[i, j + k]
                    end
                end
            end
            @GPUifyLoops.synchronize
            @loop for ty in (1:BLOCK_ROWS; threadIdx().y)
                @loop for tx in (1:TDIM; threadIdx().x)
                    i = (by - 1) * TDIM + tx
                    j = (bx - 1) * TDIM + ty
                    @unroll for k = 0:BLOCK_ROWS:TDIM-1
                        @inbounds out[i, j + k] = tile[tx, ty + k]
                    end
                end
            end
        end
    end
end
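# For completeness, a sketch of the same tiled transpose written directly in
# KernelAbstractions. This variant is not part of the original benchmark and
# assumes a KernelAbstractions version that provides `@localmem`,
# `@synchronize`, and Cartesian `Local`/`Group` indices; here every thread
# moves a single element, so the workgroup is a full 32x32 tile.
@kernel function transpose_kernel_shared!(out, @Const(in))
    tile = @localmem eltype(in) (TDIM + 1, TDIM)
    tx, ty = Tuple(@index(Local, Cartesian))
    bx, by = Tuple(@index(Group, Cartesian))
    # Coalesced load of the tile into shared memory.
    i = (bx - 1) * TDIM + tx
    j = (by - 1) * TDIM + ty
    @inbounds tile[ty, tx] = in[i, j]
    @synchronize
    # Coalesced store of the transposed tile.
    i = (by - 1) * TDIM + tx
    j = (bx - 1) * TDIM + ty
    @inbounds out[i, j] = tile[tx, ty]
end
# Hypothetical launch, mirroring the ones below:
#   kernel! = transpose_kernel_shared!(KernelAbstractions.CUDA(), (TDIM, TDIM), size(b))
#   wait(kernel!(b, a))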
# Benchmark setup: 8192x8192 Float32 matrices, 10 timed repetitions.
const T = Float32
const n = 8 * 1024
const shape = n, n
const nreps = 10
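# A small helper (not in the original gist) to turn a measured kernel time
# into effective bandwidth: a transpose reads and writes n^2 elements once
# each, so the ideal traffic is 2 * n^2 * sizeof(T) bytes.
effective_bandwidth_GBps(time_s) = 2 * n^2 * sizeof(T) / time_s / 1e9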
let # KernelAbstractions, naive Cartesian kernel
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    W = 1024
    kernel! = transpose_kernel_naive!(KernelAbstractions.CUDA(), W, size(b))
    event = kernel!(b, a)
    wait(event)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        event = kernel!(b, a)
        wait(event)
    end
end
let # KernelAbstractions, naive kernel with @Const / ldg reads
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    W = 1024
    kernel! = transpose_kernel_naive_ldg!(KernelAbstractions.CUDA(), W, size(b))
    event = kernel!(b, a)
    wait(event)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        event = kernel!(b, a)
        wait(event)
    end
end
let # KernelAbstractions, 1D launch with hand-rolled 2D index math
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    W = 1024
    kernel! = transpose_kernel!(KernelAbstractions.CUDA(), W, length(b))
    event = kernel!(b, a)
    wait(event)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        event = kernel!(b, a)
        wait(event)
    end
end
let # KernelAbstractions, 1D launch + @Const / ldg reads
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    W = 1024
    kernel! = transpose_kernel_ldg!(KernelAbstractions.CUDA(), W, length(b))
    event = kernel!(b, a)
    wait(event)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        event = kernel!(b, a)
        wait(event)
    end
end
let # raw CUDAnative, naive kernel
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    threads = 32, 32
    blocks = cld.(size(b), threads)
    @cuda threads=threads blocks=blocks transpose_cuda!(b, a)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        @cuda threads=threads blocks=blocks transpose_cuda!(b, a)
    end
end
let # raw CUDAnative, naive kernel with ldg reads
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    threads = 32, 32
    blocks = cld.(size(b), threads)
    @cuda threads=threads blocks=blocks transpose_cuda_ldg!(b, a)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        @cuda threads=threads blocks=blocks transpose_cuda_ldg!(b, a)
    end
end
let # raw CUDAnative, shared-memory tiled kernel
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    threads = TDIM, BLOCK_ROWS
    blocks = cld.(size(a), TDIM)
    @cuda threads=threads blocks=blocks transpose_cuda_shared!(b, a)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        @cuda threads=threads blocks=blocks transpose_cuda_shared!(b, a)
    end
end
let # GPUifyLoops, shared-memory tiled kernel
    a = CuArray(rand(T, shape))
    b = similar(a, shape[2], shape[1])
    threads = TDIM, BLOCK_ROWS
    blocks = cld.(size(a), TDIM)
    @launch GPUifyLoops.CUDA() threads=threads blocks=blocks transpose_gpuify_shared!(b, a)
    @assert Array(b) == Array(a)'
    @CUDAdrv.profile for rep in 1:nreps
        @launch GPUifyLoops.CUDA() threads=threads blocks=blocks transpose_gpuify_shared!(b, a)
    end
end
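# The @CUDAdrv.profile blocks only mark the regions to be profiled; to
# collect the timings, run the script under the external profiler, e.g.
#   nvprof --profile-from-start off julia transpose.jl
# (the script filename here is illustrative).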