# An implementation of GPU-based bilinear upsampling, including its gradient.
# The code is a translation of the following files:
# https://github.com/pytorch/pytorch/blob/master/caffe2/operators/upsample_op.cu
# https://github.com/pytorch/pytorch/blob/master/caffe2/core/common_gpu.h
# The forward and backward passes have been tested to produce the same output
# as PyTorch, up to floating-point noise.
# Open issues:
# 1) licensing?
using CUDA
using CUDA: atomic_add!, ldg
const CUDA_NUM_THREADS = 128
const MAXIMUM_NUM_BLOCKS = 4096
@inline function GET_BLOCKS(N::Integer)
    # Use at least 1 block, since CUDA does not allow launching an empty grid
    return max(min((N + CUDA_NUM_THREADS - 1) ÷ CUDA_NUM_THREADS, MAXIMUM_NUM_BLOCKS), 1)
end
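# A few worked values for the launch geometry above (quick sanity check):
# ceil-division by the block size, capped at MAXIMUM_NUM_BLOCKS; the
# grid-stride loops below pick up whatever the cap cuts off.
@assert GET_BLOCKS(1)    == 1     # never launch an empty grid
@assert GET_BLOCKS(128)  == 1     # exactly one full block
@assert GET_BLOCKS(129)  == 2     # one element spills into a second block
@assert GET_BLOCKS(10^7) == 4096  # capped at MAXIMUM_NUM_BLOCKS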
# pytorch: nchw with row-major storage
# flux: whcn with column-major storage -> identical data layout in memory, so
# this indexing function carries over unchanged, except for the +1 that
# converts the 0-based offset to Julia's 1-based linear index
@inline function idx(
    n::Integer,
    num_channels::Integer,
    c::Integer,
    height::Integer,
    width::Integer,
    y::Integer,
    x::Integer)
    return ((n * num_channels + c) * height + y) * width + x + 1
end
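# Sanity check for `idx` (illustrative, CPU-only, sizes are made up): for a
# whcn column-major array, the 0-based (n, c, y, x) offset computed above must
# agree with Julia's own 1-based linear indexing.
let W = 5, H = 4, C = 3, N = 2
    L = LinearIndices((W, H, C, N))
    @assert all(idx(n, C, c, H, W, y, x) == L[x+1, y+1, c+1, n+1]
                for n in 0:N-1, c in 0:C-1, y in 0:H-1, x in 0:W-1)
end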
#---- begin forward ----
function upsample_bilinear_kernel!(
    num_batch,
    num_channels,
    input_height,
    input_width,
    output_height,
    output_width,
    height_scale, # unused: the ratios are recomputed from the sizes below
    width_scale,  # unused, kept to mirror the caffe2 signature
    X, # input, __restrict__ in the original
    Y) # output, __restrict__ in the original
    out_size = output_height * output_width
    # CUDA grid-stride loop over the output pixels (0-based)
    @inbounds for index in ((blockIdx().x-1) * blockDim().x + threadIdx().x-1) : (blockDim().x * gridDim().x) : out_size-1
        # mind the order!
        indexTemp = index
        out_x = indexTemp % output_width
        indexTemp ÷= output_width
        out_y = indexTemp % output_height
        # batch and channel are handled by the explicit loops below, so no
        # further decomposition of indexTemp is needed
        rheight = output_height > 1 ? (input_height - 1f0) / (output_height - 1f0) : 0f0
        rwidth = output_width > 1 ? (input_width - 1f0) / (output_width - 1f0) : 0f0
        # Compute Y axis lambdas
        h1r = rheight * out_y
        h1 = floor(Int, h1r) # the (int) typecast in the original
        h1p = (h1 < input_height - 1) ? 1 : 0
        h1lambda = h1r - h1
        h0lambda = 1f0 - h1lambda
        # Compute X axis lambdas
        w1r = rwidth * out_x
        w1 = floor(Int, w1r)
        w1p = (w1 < input_width - 1) ? 1 : 0
        w1lambda = w1r - w1
        w0lambda = 1f0 - w1lambda
        for n in 0:num_batch-1 # shift to original C indexing
            for c in 0:num_channels-1
                X0 = X[idx(n, num_channels, c, input_height, input_width, h1, w1)]
                X1 = X[idx(n, num_channels, c, input_height, input_width, h1, w1 + w1p)]
                X2 = X[idx(n, num_channels, c, input_height, input_width, h1 + h1p, w1)]
                X3 = X[idx(n, num_channels, c, input_height, input_width, h1 + h1p, w1 + w1p)]
                Y[idx(n, num_channels, c, output_height, output_width, out_y, out_x)] =
                    h0lambda * (w0lambda * X0 + w1lambda * X1) +
                    h1lambda * (w0lambda * X2 + w1lambda * X3)
            end # channels
        end # batch
    end # grid-stride loop
    return nothing
end
function upsample_bilinear(x::CuArray{T,4}, height_scale, width_scale) where T
    w, h, c, n = size(x)
    out_h = round(Int, height_scale * h)
    out_w = round(Int, width_scale * w)
    out_size = out_h * out_w
    nblocks = GET_BLOCKS(out_size)
    out = CuArray{T}(undef, out_w, out_h, c, n)
    CUDA.@sync @cuda blocks=nblocks threads=CUDA_NUM_THREADS upsample_bilinear_kernel!(n, c, h, w, out_h, out_w, height_scale, width_scale, x, out)
    return out
end
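# Usage sketch for the forward pass (sizes are made up; needs a CUDA device):
if CUDA.functional()
    x = CUDA.rand(Float32, 8, 8, 3, 1)  # whcn input
    y = upsample_bilinear(x, 2, 2)      # 2x upsampling -> 16×16×3×1
    @assert size(y) == (16, 16, 3, 1)
end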
#---- end forward ----
#---- begin backward ----
# input is dY, output is dX
function upsample_bilinear_gradient_kernel!(
    input_size,
    num_channels,
    input_height,
    input_width,
    output_height,
    output_width,
    height_scale, # unused: the ratios are recomputed from the sizes below
    width_scale,  # unused, kept to mirror the caffe2 signature
    dY, # const in the original
    dX)
    # CUDA grid-stride loop over the elements of dY (0-based)
    @inbounds for index in ((blockIdx().x - 1) * blockDim().x + threadIdx().x - 1) : blockDim().x * gridDim().x : input_size-1
        # mind the order!
        indexTemp = index
        in_x = indexTemp % input_width
        indexTemp ÷= input_width
        in_y = indexTemp % input_height
        indexTemp ÷= input_height
        c = indexTemp % num_channels
        indexTemp ÷= num_channels
        n = indexTemp
        rheight = output_height > 1 ? (output_height - 1f0) / (input_height - 1f0) : 0f0
        rwidth = output_width > 1 ? (output_width - 1f0) / (input_width - 1f0) : 0f0
        # Compute Y axis lambdas
        h1r = rheight * in_y
        h1 = floor(Int, h1r)
        h1p = (h1 < output_height - 1) ? 1 : 0
        h1lambda = h1r - h1
        h0lambda = 1f0 - h1lambda
        # Compute X axis lambdas
        w1r = rwidth * in_x
        w1 = floor(Int, w1r)
        w1p = (w1 < output_width - 1) ? 1 : 0
        w1lambda = w1r - w1
        w0lambda = 1f0 - w1lambda
        # read dY through the read-only cache; this replaces the original's
        # `#if __CUDA_ARCH__ >= 350` ldg branch (true for everything from the
        # 9xx generation on), whose fallback was a plain dY[index + 1] load
        dYi = ldg(dY, index + 1)
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1, w1)), h0lambda * w0lambda * dYi)
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1, w1 + w1p)), h0lambda * w1lambda * dYi)
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1 + h1p, w1)), h1lambda * w0lambda * dYi)
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1 + h1p, w1 + w1p)), h1lambda * w1lambda * dYi)
    end
    return nothing
end
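# Design note: the backward pass scatters instead of gathering. Each dY element
# spreads its value over up to four dX cells, and neighbouring dY elements can
# target the same cell, so the four writes above go through atomic_add! to
# avoid lost updates.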
function upsample_bilinear_gradient(dy::CuArray{T,4}, x::CuArray{T,4}) where T
    w, h, c, n = size(dy)
    input_size = length(dy)
    out_w, out_h = size(x)[1:2]
    height_scale = Float32(out_h / h)
    width_scale = Float32(out_w / w)
    # the kernel's grid-stride loop runs over the elements of dY, so the launch
    # geometry is derived from input_size, not from the size of dX
    nblocks = GET_BLOCKS(input_size)
    dx = CUDA.zeros(T, out_w, out_h, c, n)
    CUDA.@sync @cuda blocks=nblocks threads=CUDA_NUM_THREADS upsample_bilinear_gradient_kernel!(input_size, c, h, w, out_h, out_w, height_scale, width_scale, dy, dx)
    return dx
end
#---- end backward ----
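# End-to-end sketch tying the forward and backward passes together
# (sizes are made up; needs a CUDA device):
if CUDA.functional()
    x  = CUDA.rand(Float32, 4, 4, 2, 1)
    y  = upsample_bilinear(x, 2, 2)         # forward: 8×8×2×1
    dy = CUDA.ones(Float32, size(y)...)     # upstream gradient
    dx = upsample_bilinear_gradient(dy, x)  # backward: same size as x
    @assert size(dx) == size(x)
end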