# An implementation of GPU-based bilinear upsampling, including its gradient
# WARNING: Untested code ahead!
# The code is a translation of the following files:
# https://github.com/pytorch/pytorch/blob/master/caffe2/operators/upsample_op.cu
# https://github.com/pytorch/pytorch/blob/master/caffe2/core/common_gpu.h
# Arrays are assumed to be WHCN (width, height, channels, batch), so that the
# row-major NCHW indexing of the CUDA original maps onto Julia's column-major layout.
# Open issues:
# 1) type stability?
# 2) licensing?
# 3) the array indexing (0-based CUDA vs 1-based Julia, NCHW vs WHCN) still needs verification
using CUDAnative                   # @cuda, blockIdx, blockDim, threadIdx, gridDim
using CUDAnative: atomic_add!
using CuArrays

const CUDA_NUM_THREADS = 128
const MAXIMUM_NUM_BLOCKS = 4096
@inline function GET_BLOCKS(N::Integer)
    # Use at least 1 block, since CUDA does not allow launching an empty grid
    return max(min((N + CUDA_NUM_THREADS - 1) ÷ CUDA_NUM_THREADS, MAXIMUM_NUM_BLOCKS), 1)
end
# Linear index into a WHCN (column-major) array from 0-based n, c, y, x;
# the trailing +1 converts to Julia's 1-based indexing.
@inline function idx(
    n::Integer,
    num_channels::Integer,
    c::Integer,
    height::Integer,
    width::Integer,
    y::Integer,
    x::Integer)
    return ((n * num_channels + c) * height + y) * width + x + 1
end
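
# A quick CPU-side sanity check (my addition, not from the original gist): a
# minimal sketch, assuming the WHCN layout described above, that idx() agrees
# with Julia's own linear indexing for an array of size (width, height, channels, batch).
let W = 5, H = 4, C = 3, N = 2
    A = reshape(1:W*H*C*N, W, H, C, N)
    li = LinearIndices(A)
    n, c, y, x = 1, 2, 3, 4   # 0-based coordinates, as used inside the kernels
    @assert idx(n, C, c, H, W, y, x) == li[x + 1, y + 1, c + 1, n + 1]
end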
# Forward kernel: each thread handles one or more output (x, y) locations and
# loops over batch and channel.
function UpsampleBilinearKernel(
    num_batch,
    num_channels,
    input_height,
    input_width,
    output_height,
    output_width,
    height_scale,
    width_scale,
    X,
    Y)

    out_size = output_height * output_width
    # CUDA 1D grid-stride loop over the output spatial locations (1-based)
    for index in ((blockIdx().x - 1) * blockDim().x + threadIdx().x):(blockDim().x * gridDim().x):out_size
        indexTemp = index - 1  # switch to 0-based for the coordinate decomposition
        out_x = indexTemp % output_width
        indexTemp = indexTemp ÷ output_width
        out_y = indexTemp % output_height
        indexTemp = indexTemp ÷ output_height
        indexTemp = indexTemp ÷ num_channels  # unused from here on (kept from the original kernel)

        rheight = output_height > 1 ? (input_height - 1f0) / (output_height - 1f0) : 0f0
        rwidth = output_width > 1 ? (input_width - 1f0) / (output_width - 1f0) : 0f0

        # Compute Y axis lambdas
        h1r = rheight * out_y
        h1 = floor(Int, h1r)   # truncate as in the C code; round() would shift the weights
        h1p = (h1 < input_height - 1) ? 1 : 0
        h1lambda = h1r - h1
        h0lambda = 1f0 - h1lambda
        # Compute X axis lambdas
        w1r = rwidth * out_x
        w1 = floor(Int, w1r)
        w1p = (w1 < input_width - 1) ? 1 : 0
        w1lambda = w1r - w1
        w0lambda = 1f0 - w1lambda

        for n in 0:num_batch-1
            for c in 0:num_channels-1
                X0 = X[idx(n, num_channels, c, input_height, input_width, h1, w1)]
                X1 = X[idx(n, num_channels, c, input_height, input_width, h1, w1 + w1p)]
                X2 = X[idx(n, num_channels, c, input_height, input_width, h1 + h1p, w1)]
                X3 = X[idx(n, num_channels, c, input_height, input_width, h1 + h1p, w1 + w1p)]
                Y[idx(n, num_channels, c, output_height, output_width, out_y, out_x)] =
                    h0lambda * (w0lambda * X0 + w1lambda * X1) +
                    h1lambda * (w0lambda * X2 + w1lambda * X3)
            end # channels
        end # batch
    end # 1D kernel loop
    return nothing
end
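
# Worked example of the interpolation weights (my addition, not from the gist):
# upsampling height 2 -> 4 gives rheight = (2 - 1) / (4 - 1) = 1/3. For output
# row out_y = 2: h1r = 2/3, h1 = 0, h1lambda = 2/3, h0lambda = 1/3, so that row
# is 1/3 * input row 0 + 2/3 * input row 1; the x axis works the same way.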
function upsample_bilinear(x, height_scale, width_scale)
    # x is assumed to be WHCN: (width, height, channels, batch)
    w, h, c, n = Int32.(size(x))
    out_h = Int32(round(height_scale * h))
    out_w = Int32(round(width_scale * w))
    out_size = out_h * out_w
    nblocks = GET_BLOCKS(out_size)
    out = CuArray{Float32}(undef, out_w, out_h, c, n)
    CuArrays.@sync @cuda blocks=nblocks threads=CUDA_NUM_THREADS UpsampleBilinearKernel(n, c, h, w, out_h, out_w, height_scale, width_scale, x, out)
    return out
end
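
# Hypothetical usage sketch (my addition, untested, needs a working GPU):
# doubling the spatial size of a random WHCN input; names and sizes are illustrative only.
# x = CuArrays.rand(Float32, 16, 16, 3, 1)   # width, height, channels, batch
# y = upsample_bilinear(x, 2f0, 2f0)         # size(y) == (32, 32, 3, 1)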
# Backward kernel: input is dY (gradient w.r.t. the upsampled output), output is
# dX (gradient w.r.t. the original input). Each thread handles one or more
# elements of dY and scatters its contribution into dX with atomic adds.
function UpsampleBilinearGradientKernel(
    input_size,
    num_channels,
    input_height,
    input_width,
    output_height,
    output_width,
    height_scale,
    width_scale,
    dY,
    dX)

    # CUDA 1D grid-stride loop over all elements of dY (1-based)
    for index in ((blockIdx().x - 1) * blockDim().x + threadIdx().x):(blockDim().x * gridDim().x):input_size
        indexTemp = index - 1  # switch to 0-based for the coordinate decomposition
        in_x = indexTemp % input_width
        indexTemp ÷= input_width
        in_y = indexTemp % input_height
        indexTemp ÷= input_height
        c = indexTemp % num_channels
        indexTemp ÷= num_channels
        n = indexTemp

        # computed but unused, as in the original CUDA kernel
        out_y = min(in_y / height_scale, output_height - 1)
        out_x = min(in_x / width_scale, output_width - 1)

        rheight = output_height > 1 ? (output_height - 1f0) / (input_height - 1f0) : 0f0
        rwidth = output_width > 1 ? (output_width - 1f0) / (input_width - 1f0) : 0f0

        # Compute Y axis lambdas
        h1r = rheight * in_y
        h1 = floor(Int, h1r)   # truncate as in the C code
        h1p = (h1 < output_height - 1) ? 1 : 0
        h1lambda = h1r - h1
        h0lambda = 1f0 - h1lambda
        # Compute X axis lambdas
        w1r = rwidth * in_x
        w1 = floor(Int, w1r)
        w1p = (w1 < output_width - 1) ? 1 : 0
        w1lambda = w1r - w1
        w0lambda = 1f0 - w1lambda

        # The CUDA original reads dY through __ldg on sm_35+; a plain read is used here for simplicity.
        dYi = dY[index]

        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1, w1)), h0lambda * w0lambda * dYi)
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1, w1 + w1p)), h0lambda * w1lambda * dYi)
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1 + h1p, w1)), h1lambda * w0lambda * dYi)
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1 + h1p, w1 + w1p)), h1lambda * w1lambda * dYi)
    end
    return nothing
end
function upsample_bilinear_gradient(dy, x)
    # dy: gradient w.r.t. the upsampled output, x: the original (smaller) input;
    # both assumed WHCN: (width, height, channels, batch)
    w, h, c, n = Int32.(size(dy))
    input_size = length(dy)
    out_w = Int32(size(x, 1))
    out_h = Int32(size(x, 2))
    # same scale factors as in the forward pass (output size / input size)
    height_scale = Float32(h / out_h)
    width_scale = Float32(w / out_w)
    nblocks = GET_BLOCKS(input_size)  # the kernel strides over every element of dy
    dx = CuArrays.zeros(Float32, out_w, out_h, c, n)  # zero-initialized: the kernel accumulates with atomic_add!
    CuArrays.@sync @cuda blocks=nblocks threads=CUDA_NUM_THREADS UpsampleBilinearGradientKernel(input_size, c, h, w, out_h, out_w, height_scale, width_scale, dy, dx)
    return dx
end
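
# A rough sketch (my addition, not part of the original gist, untested) of how the
# pair could be wired into Zygote as a custom adjoint; it assumes x and dy follow
# the WHCN layout used above and that the scale factors need no gradient.
using Zygote

Zygote.@adjoint function upsample_bilinear(x, height_scale, width_scale)
    y = upsample_bilinear(x, height_scale, width_scale)
    return y, dy -> (upsample_bilinear_gradient(dy, x), nothing, nothing)
end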