# An implementation for GPU based bilinear upsampling including its gradient
# The code is a translation from the following files:
# https://github.com/pytorch/pytorch/blob/master/caffe2/operators/upsample_op.cu
# https://github.com/pytorch/pytorch/blob/master/caffe2/core/common_gpu.h
# Forward and backward passes have been tested to produce the same output
# as pytorch - results match up to floating point noise.
# Open issues:
# 1) licensing?
using CUDA
using CUDA: atomic_add!
const CUDA_NUM_THREADS = 128
const MAXIMUM_NUM_BLOCKS = 4096
@inline function GET_BLOCKS(N::Integer)
    # Use at least 1 block, since CUDA does not allow launching zero blocks
    return max(min((N + CUDA_NUM_THREADS - 1) ÷ CUDA_NUM_THREADS, MAXIMUM_NUM_BLOCKS), 1)
end
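# Illustrative sanity check (not part of the original gist): with 128 threads
# per block, 1000 elements need cld(1000, 128) = 8 blocks, while very large
# element counts are capped at MAXIMUM_NUM_BLOCKS; the grid-stride loops in
# the kernels below compensate for that cap.
@assert GET_BLOCKS(1000) == 8
@assert GET_BLOCKS(10^7) == MAXIMUM_NUM_BLOCKS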
# pytorch: NCHW with row major storage
# flux: WHCN with column major storage -> same data layout in memory -> this
# function can stay as it is, except for the +1 (Julia arrays are 1-based)
@inline function idx(
    n::Integer,
    num_channels::Integer,
    c::Integer,
    height::Integer,
    width::Integer,
    y::Integer,
    x::Integer)
    return ((n * num_channels + c) * height + y) * width + x + 1
end
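# Illustrative example (not part of the original gist): for num_channels = 3,
# height = 4, width = 5, the element (n = 0, c = 1, y = 2, x = 3) has row-major
# NCHW offset ((0*3 + 1)*4 + 2)*5 + 3 = 33, i.e. 1-based linear index 34.
@assert idx(0, 3, 1, 4, 5, 2, 3) == 34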
#---- begin forward ----
function upsample_bilinear_kernel!(
    num_batch,
    num_channels,
    input_height,
    input_width,
    output_height,
    output_width,
    height_scale,
    width_scale,
    X, # input __restrict__
    Y) # output __restrict__
    out_size = output_height * output_width
    # CUDA 1D grid-stride loop over the output pixels
    @inbounds for index in ((blockIdx().x-1) * blockDim().x + threadIdx().x-1) : (blockDim().x * gridDim().x) : out_size-1
        # mind the order!
        indexTemp = index
        out_x = indexTemp % output_width
        indexTemp ÷= output_width
        out_y = indexTemp % output_height
        indexTemp ÷= output_height
        indexTemp ÷= num_channels
        rheight = output_height > 1 ? (input_height - 1f0) / (output_height - 1f0) : 0f0
        rwidth = output_width > 1 ? (input_width - 1f0) / (output_width - 1f0) : 0f0
        # Compute Y axis lambdas
        h1r = rheight * out_y
        h1 = round(Int, h1r, RoundDown) # the C++ code uses an (int) cast here
        h1p = (h1 < input_height - 1) ? 1 : 0
        h1lambda = h1r - h1
        h0lambda = 1f0 - h1lambda
        # Compute X axis lambdas
        w1r = rwidth * out_x
        w1 = round(Int, w1r, RoundDown)
        w1p = (w1 < input_width - 1) ? 1 : 0
        w1lambda = w1r - w1
        w0lambda = 1f0 - w1lambda
        for n in 0:num_batch-1 # shift to original C indexing
            for c in 0:num_channels-1
                # gather the four neighbouring input pixels and blend them
                X0 = X[idx(n, num_channels, c, input_height, input_width, h1, w1)]
                X1 = X[idx(n, num_channels, c, input_height, input_width, h1, w1 + w1p)]
                X2 = X[idx(n, num_channels, c, input_height, input_width, h1 + h1p, w1)]
                X3 = X[idx(n, num_channels, c, input_height, input_width, h1 + h1p, w1 + w1p)]
                Y[idx(n, num_channels, c, output_height, output_width, out_y, out_x)] =
                    h0lambda * (w0lambda * X0 + w1lambda * X1) +
                    h1lambda * (w0lambda * X2 + w1lambda * X3)
            end # channels
        end # batch
    end # 1D kernel loop
    return nothing
end
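# Worked example of the interpolation weights used above (illustrative only):
# upsampling height 2 -> 4 gives rheight = (2-1)/(4-1) = 1/3, so output row
# out_y = 1 maps to h1r = 1/3, hence h1 = 0, h1lambda = 1/3, h0lambda = 2/3,
# i.e. that output row blends 2/3 of input row 0 with 1/3 of input row 1.
let rheight = (2 - 1f0) / (4 - 1f0), h1r = rheight * 1
    @assert round(Int, h1r, RoundDown) == 0
    @assert h1r - 0 ≈ 1f0 / 3
end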
function upsample_bilinear(x::CuArray{T,4}, height_scale, width_scale) where T
    w, h, c, n = size(x)
    out_h = round(Int, height_scale * h)
    out_w = round(Int, width_scale * w)
    out_size = out_h * out_w
    nblocks = GET_BLOCKS(out_size)
    out = CuArray{T}(undef, out_w, out_h, c, n)
    CUDA.@sync @cuda blocks=nblocks threads=CUDA_NUM_THREADS upsample_bilinear_kernel!(n, c, h, w, out_h, out_w, height_scale, width_scale, x, out)
    return out
end
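# Minimal usage sketch (not part of the original gist; assumes a CUDA-capable
# device): upsample a 16x16 RGB image by a factor of 2 in both dimensions.
if CUDA.functional()
    x_demo = CUDA.rand(Float32, 16, 16, 3, 1)   # WHCN layout, as in Flux
    y_demo = upsample_bilinear(x_demo, 2, 2)
    @assert size(y_demo) == (32, 32, 3, 1)
end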
#---- end forward ----
#---- begin backward ----
# input is dY, output is dX
function upsample_bilinear_gradient_kernel(
    input_size,
    num_channels,
    input_height,
    input_width,
    output_height,
    output_width,
    height_scale,
    width_scale,
    dY, # const
    dX)
    # CUDA 1D grid-stride loop over all elements of dY
    @inbounds for index in ((blockIdx().x - 1) * blockDim().x + threadIdx().x-1) : (blockDim().x * gridDim().x) : input_size-1
        # mind the order!
        indexTemp = index
        in_x = indexTemp % input_width
        indexTemp ÷= input_width
        in_y = indexTemp % input_height
        indexTemp ÷= input_height
        c = indexTemp % num_channels
        indexTemp ÷= num_channels
        n = indexTemp
        # out_y/out_x are computed in the original C++ code but not used below
        out_y = min(in_y / height_scale, output_height - 1)
        out_x = min(in_x / width_scale, output_width - 1)
        rheight = output_height > 1 ? (output_height - 1f0) / (input_height - 1f0) : 0f0
        rwidth = output_width > 1 ? (output_width - 1f0) / (input_width - 1f0) : 0f0
        # Compute Y axis lambdas
        h1r = rheight * in_y
        h1 = round(Int, h1r, RoundDown)
        h1p = (h1 < output_height - 1) ? 1 : 0
        h1lambda = h1r - h1
        h0lambda = 1f0 - h1lambda
        # Compute X axis lambdas
        w1r = rwidth * in_x
        w1 = round(Int, w1r, RoundDown)
        w1p = (w1 < output_width - 1) ? 1 : 0
        w1lambda = w1r - w1
        w0lambda = 1f0 - w1lambda
        #if __CUDA_ARCH__ >= 350 # true for everything from 9xx on
        dYi = ldg(dY, index+1) # ldg(pointer(dY[index])) ?
        #else
        # dYi = dY[index + 1];
        #endif
        # scatter the gradient to the four source pixels with atomic adds
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1, w1)), h0lambda * w0lambda * dYi)
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1, w1 + w1p)), h0lambda * w1lambda * dYi)
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1 + h1p, w1)), h1lambda * w0lambda * dYi)
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1 + h1p, w1 + w1p)), h1lambda * w1lambda * dYi)
    end
    return nothing
end
function upsample_bilinear_gradient(dy::CuArray{T,4}, x::CuArray{T,4}) where T
    w, h, c, n = size(dy)
    input_size = length(dy)
    out_w, out_h = size(x)[1:2]
    height_scale = Float32(out_h / h)
    width_scale = Float32(out_w / w)
    # the kernel parallelizes over every element of dY, so size the launch
    # from input_size (as in the caffe2 original), not just the spatial dims
    nblocks = GET_BLOCKS(input_size)
    dx = CUDA.zeros(T, out_w, out_h, c, n)
    CUDA.@sync @cuda blocks=nblocks threads=CUDA_NUM_THREADS upsample_bilinear_gradient_kernel(input_size, c, h, w, out_h, out_w, height_scale, width_scale, dy, dx)
    return dx
end
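# Minimal gradient usage sketch (not part of the original gist; assumes a
# CUDA-capable device). Because each element of dY is distributed over dX with
# weights that sum to one, sum(dx) should match sum(dy) up to float noise.
if CUDA.functional()
    x_demo = CUDA.rand(Float32, 16, 16, 3, 1)
    y_demo = upsample_bilinear(x_demo, 2, 2)
    dy_demo = CUDA.ones(Float32, size(y_demo)...)
    dx_demo = upsample_bilinear_gradient(dy_demo, x_demo)
    @assert size(dx_demo) == size(x_demo)
    @assert sum(dx_demo) ≈ sum(dy_demo)
end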
#---- end backward ----