# An implementation of GPU-based bilinear upsampling, including its gradient
# WARNING: Untested code ahead!
# The code is a translation of the following files:
# https://github.com/pytorch/pytorch/blob/master/caffe2/operators/upsample_op.cu
# https://github.com/pytorch/pytorch/blob/master/caffe2/core/common_gpu.h
# Arrays are assumed to be WHCN (width, height, channels, batch), so that the
# row-major NCHW indexing of the CUDA original maps onto Julia's column-major layout.
# Open issues:
# 1) type stability?
# 2) licensing?
# 3) the array indexing (0-based CUDA vs 1-based Julia, NCHW vs WHCN) still needs verification
using CUDAnative                   # @cuda, blockIdx, blockDim, threadIdx, gridDim
using CUDAnative: atomic_add!
using CuArrays

const CUDA_NUM_THREADS = 128
const MAXIMUM_NUM_BLOCKS = 4096
@inline function GET_BLOCKS(N::Integer)
    # Use at least 1 block, since CUDA does not allow launching an empty grid
    return max(min((N + CUDA_NUM_THREADS - 1) ÷ CUDA_NUM_THREADS, MAXIMUM_NUM_BLOCKS), 1)
end
# Linear index into a WHCN (column-major) array from 0-based n, c, y, x;
# the trailing +1 converts to Julia's 1-based indexing.
@inline function idx(
    n::Integer,
    num_channels::Integer,
    c::Integer,
    height::Integer,
    width::Integer,
    y::Integer,
    x::Integer)
    return ((n * num_channels + c) * height + y) * width + x + 1
end
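
# A quick CPU-side sanity check (my addition, not from the original gist): a
# minimal sketch, assuming the WHCN layout described above, that idx() agrees
# with Julia's own linear indexing for an array of size (width, height, channels, batch).
let W = 5, H = 4, C = 3, N = 2
    A = reshape(1:W*H*C*N, W, H, C, N)
    li = LinearIndices(A)
    n, c, y, x = 1, 2, 3, 4   # 0-based coordinates, as used inside the kernels
    @assert idx(n, C, c, H, W, y, x) == li[x + 1, y + 1, c + 1, n + 1]
end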
# Forward kernel: each thread handles one or more output (x, y) locations and
# loops over batch and channel.
function UpsampleBilinearKernel(
    num_batch,
    num_channels,
    input_height,
    input_width,
    output_height,
    output_width,
    height_scale,
    width_scale,
    X,
    Y)

    out_size = output_height * output_width
    # CUDA 1D grid-stride loop over the output spatial locations (1-based)
    for index in ((blockIdx().x - 1) * blockDim().x + threadIdx().x):(blockDim().x * gridDim().x):out_size
        indexTemp = index - 1  # switch to 0-based for the coordinate decomposition
        out_x = indexTemp % output_width
        indexTemp = indexTemp ÷ output_width
        out_y = indexTemp % output_height
        indexTemp = indexTemp ÷ output_height
        indexTemp = indexTemp ÷ num_channels  # unused from here on (kept from the original kernel)

        rheight = output_height > 1 ? (input_height - 1f0) / (output_height - 1f0) : 0f0
        rwidth = output_width > 1 ? (input_width - 1f0) / (output_width - 1f0) : 0f0

        # Compute Y axis lambdas
        h1r = rheight * out_y
        h1 = floor(Int, h1r)   # truncate as in the C code; round() would shift the weights
        h1p = (h1 < input_height - 1) ? 1 : 0
        h1lambda = h1r - h1
        h0lambda = 1f0 - h1lambda
        # Compute X axis lambdas
        w1r = rwidth * out_x
        w1 = floor(Int, w1r)
        w1p = (w1 < input_width - 1) ? 1 : 0
        w1lambda = w1r - w1
        w0lambda = 1f0 - w1lambda

        for n in 0:num_batch-1
            for c in 0:num_channels-1
                X0 = X[idx(n, num_channels, c, input_height, input_width, h1, w1)]
                X1 = X[idx(n, num_channels, c, input_height, input_width, h1, w1 + w1p)]
                X2 = X[idx(n, num_channels, c, input_height, input_width, h1 + h1p, w1)]
                X3 = X[idx(n, num_channels, c, input_height, input_width, h1 + h1p, w1 + w1p)]
                Y[idx(n, num_channels, c, output_height, output_width, out_y, out_x)] =
                    h0lambda * (w0lambda * X0 + w1lambda * X1) +
                    h1lambda * (w0lambda * X2 + w1lambda * X3)
            end # channels
        end # batch
    end # 1D kernel loop
    return nothing
end
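
# Worked example of the interpolation weights (my addition, not from the gist):
# upsampling height 2 -> 4 gives rheight = (2 - 1) / (4 - 1) = 1/3. For output
# row out_y = 2: h1r = 2/3, h1 = 0, h1lambda = 2/3, h0lambda = 1/3, so that row
# is 1/3 * input row 0 + 2/3 * input row 1; the x axis works the same way.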
function upsample_bilinear(x, height_scale, width_scale)
    # x is assumed to be WHCN: (width, height, channels, batch)
    w, h, c, n = Int32.(size(x))
    out_h = Int32(round(height_scale * h))
    out_w = Int32(round(width_scale * w))
    out_size = out_h * out_w
    nblocks = GET_BLOCKS(out_size)
    out = CuArray{Float32}(undef, out_w, out_h, c, n)
    CuArrays.@sync @cuda blocks=nblocks threads=CUDA_NUM_THREADS UpsampleBilinearKernel(n, c, h, w, out_h, out_w, height_scale, width_scale, x, out)
    return out
end
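
# Hypothetical usage sketch (my addition, untested, needs a working GPU):
# doubling the spatial size of a random WHCN input; names and sizes are illustrative only.
# x = CuArrays.rand(Float32, 16, 16, 3, 1)   # width, height, channels, batch
# y = upsample_bilinear(x, 2f0, 2f0)         # size(y) == (32, 32, 3, 1)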
# Backward kernel: input is dY (gradient w.r.t. the upsampled output), output is
# dX (gradient w.r.t. the original input). Each thread handles one or more
# elements of dY and scatters its contribution into dX with atomic adds.
function UpsampleBilinearGradientKernel(
    input_size,
    num_channels,
    input_height,
    input_width,
    output_height,
    output_width,
    height_scale,
    width_scale,
    dY,
    dX)

    # CUDA 1D grid-stride loop over all elements of dY (1-based)
    for index in ((blockIdx().x - 1) * blockDim().x + threadIdx().x):(blockDim().x * gridDim().x):input_size
        indexTemp = index - 1  # switch to 0-based for the coordinate decomposition
        in_x = indexTemp % input_width
        indexTemp ÷= input_width
        in_y = indexTemp % input_height
        indexTemp ÷= input_height
        c = indexTemp % num_channels
        indexTemp ÷= num_channels
        n = indexTemp

        # computed but unused, as in the original CUDA kernel
        out_y = min(in_y / height_scale, output_height - 1)
        out_x = min(in_x / width_scale, output_width - 1)

        rheight = output_height > 1 ? (output_height - 1f0) / (input_height - 1f0) : 0f0
        rwidth = output_width > 1 ? (output_width - 1f0) / (input_width - 1f0) : 0f0

        # Compute Y axis lambdas
        h1r = rheight * in_y
        h1 = floor(Int, h1r)   # truncate as in the C code
        h1p = (h1 < output_height - 1) ? 1 : 0
        h1lambda = h1r - h1
        h0lambda = 1f0 - h1lambda
        # Compute X axis lambdas
        w1r = rwidth * in_x
        w1 = floor(Int, w1r)
        w1p = (w1 < output_width - 1) ? 1 : 0
        w1lambda = w1r - w1
        w0lambda = 1f0 - w1lambda

        # The CUDA original reads dY through __ldg on sm_35+; a plain read is used here for simplicity.
        dYi = dY[index]

        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1, w1)), h0lambda * w0lambda * dYi)
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1, w1 + w1p)), h0lambda * w1lambda * dYi)
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1 + h1p, w1)), h1lambda * w0lambda * dYi)
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1 + h1p, w1 + w1p)), h1lambda * w1lambda * dYi)
    end
    return nothing
end
function upsample_bilinear_gradient(dy, x)
    # dy: gradient w.r.t. the upsampled output, x: the original (smaller) input;
    # both assumed WHCN: (width, height, channels, batch)
    w, h, c, n = Int32.(size(dy))
    input_size = length(dy)
    out_w = Int32(size(x, 1))
    out_h = Int32(size(x, 2))
    # same scale factors as in the forward pass (output size / input size)
    height_scale = Float32(h / out_h)
    width_scale = Float32(w / out_w)
    nblocks = GET_BLOCKS(input_size)  # the kernel strides over every element of dy
    dx = CuArrays.zeros(Float32, out_w, out_h, c, n)  # zero-initialized: the kernel accumulates with atomic_add!
    CuArrays.@sync @cuda blocks=nblocks threads=CUDA_NUM_THREADS UpsampleBilinearGradientKernel(input_size, c, h, w, out_h, out_w, height_scale, width_scale, dy, dx)
    return dx
end
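
# A rough sketch (my addition, not part of the original gist, untested) of how the
# pair could be wired into Zygote as a custom adjoint; it assumes x and dy follow
# the WHCN layout used above and that the scale factors need no gradient.
using Zygote

Zygote.@adjoint function upsample_bilinear(x, height_scale, width_scale)
    y = upsample_bilinear(x, height_scale, width_scale)
    return y, dy -> (upsample_bilinear_gradient(dy, x), nothing, nothing)
end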