@zou3519
Last active June 12, 2018 12:21
[pytorch] GridSampler CUDNN vs THCUNN performance comparison script
import time
import torch
import torch.backends.cudnn as cudnn
import torch.nn.functional as F
from torch.autograd import Variable


def benchmark_shape(N, C, IH, IW, H, W, nrand, nrep):
    """
    Performs nrand*nrep trials for each backend.
    """
    # Data is generated as (C, N, ...) / (H, N, ...) and transposed to the
    # (N, C, ...) / (N, H, ...) layout inside the workload functions.
    input_datas = [torch.randn(C, N, IH, IW) for i in range(0, nrand)]
    grid_datas = [torch.randn(H, N, W, 2) for i in range(0, nrand)]
    datas = zip(input_datas, grid_datas)

    # print "Running CPU benchmark"
    # cpu_results = benchmark_helper(workload_cpu, datas, nrep)

    print "Running CUDNN benchmark"
    cudnn_results = benchmark_helper(workload_cudnn, datas, nrep)

    assert cudnn.enabled
    cudnn.enabled = False
    print "Running THCUNN benchmark"
    cuda_results = benchmark_helper(workload_cuda, datas, nrep)
    cudnn.enabled = True


def check_shapes(N, C, IH, IW, H, W):
    """
    Checks that the CPU, THCUNN, and CUDNN grid_sample implementations
    agree on outputs and gradients for the given shape.
    """
    input_cpu = Variable(torch.randn(C, N, IH, IW).transpose(0, 1),
                         requires_grad=True)
    grid_cpu = Variable(torch.randn(H, N, W, 2).transpose(0, 1),
                        requires_grad=True)
    out_cpu = F.grid_sample(input_cpu, grid_cpu)
    assert out_cpu.size() == torch.Size([N, C, H, W])

    input_cuda = Variable(input_cpu.data.transpose(0, 1).cuda().transpose(0, 1),
                          requires_grad=True)
    grid_cuda = Variable(grid_cpu.data.transpose(0, 1).cuda().transpose(0, 1),
                         requires_grad=True)
    cudnn.enabled = False
    out_cuda = F.grid_sample(input_cuda, grid_cuda)
    cudnn.enabled = True
    assertTensorsEqual(out_cpu, out_cuda)

    input_cudnn = Variable(input_cpu.data.transpose(0, 1).cuda().transpose(0, 1),
                           requires_grad=True)
    grid_cudnn = Variable(grid_cpu.data.transpose(0, 1).cuda().transpose(0, 1),
                          requires_grad=True)
    out_cudnn = F.grid_sample(input_cudnn, grid_cudnn)
    assertTensorsEqual(out_cpu, out_cudnn)

    gradients = out_cpu.data.new(out_cpu.size()).normal_()
    out_cpu.backward(gradients)

    gradients_cuda = gradients.cuda()
    cudnn.enabled = False
    out_cuda.backward(gradients_cuda)
    cudnn.enabled = True
    out_cudnn.backward(gradients_cuda)

    assertTensorsEqual(input_cpu.grad, input_cuda.grad, msg="A")
    assertTensorsEqual(input_cpu.grad, input_cudnn.grad, msg="B")
    assertTensorsEqual(input_cudnn.grad, input_cuda.grad, msg="C")
    assertTensorsEqual(grid_cpu.grad, grid_cuda.grad, msg="D")
    assertTensorsEqual(grid_cpu.grad, grid_cudnn.grad, msg="E")
    assertTensorsEqual(grid_cuda.grad, grid_cudnn.grad, msg="F")


def benchmark_helper(workload_fn, datas, nrep):
    start = time.time()
    result = []
    for (input_data, grid_data) in datas:
        for i in range(0, nrep):
            out = workload_fn(input_data, grid_data)
            result.append(out)
    end = time.time()
    print (end - start)  # elapsed seconds
    return result


def assertTensorsEqual(a, b, prec=1e-5, msg=''):
    assert a.size() == b.size()
    a = a.cuda()
    b = b.cuda()
    diff = a - b
    if diff.is_signed():
        diff = diff.abs()
    max_err = diff.max().data[0]
    if max_err > prec:
        print msg
        print "Error was " + str(max_err)


def workload_cpu(input_data, grid_data):
    input = Variable(input_data.transpose(0, 1), requires_grad=True)
    grid = Variable(grid_data.transpose(0, 1), requires_grad=True)
    out = F.grid_sample(input, grid)
    grads = out.data.new(out.size()).normal_()
    out.backward(grads)
    del input
    del grid
    del out


def workload_cudnn(input_data, grid_data):
    assert cudnn.enabled
    workload_cuda_helper(input_data, grid_data)


def workload_cuda(input_data, grid_data):
    assert not cudnn.enabled
    workload_cuda_helper(input_data, grid_data)


def workload_cuda_helper(input_data, grid_data):
    input = Variable(input_data.transpose(0, 1).cuda(), requires_grad=True)
    grid = Variable(grid_data.transpose(0, 1).cuda(), requires_grad=True)
    out = F.grid_sample(input, grid)
    grads = out.data.new(out.size()).normal_()
    out.backward(grads)
    del input
    del grid
    del out


if __name__ == "__main__":
    # benchmark_shape(N, C, IH, IW, H, W, nrand, nrep)
    print "Testing small sizes"
    benchmark_shape(10, 5, 20, 20, 15, 15, 5, 5)
    print ""

    print "Testing small sizes, big N"
    benchmark_shape(500, 5, 20, 20, 15, 15, 5, 5)
    print ""

    print "Testing large sizes"
    benchmark_shape(50, 10, 100, 100, 100, 100, 5, 5)
    print ""

    print "Testing large sizes, small C"
    benchmark_shape(50, 5, 100, 100, 100, 100, 5, 5)
    print ""

    print "Testing large N"
    benchmark_shape(500, 10, 50, 50, 50, 50, 5, 5)
    print ""

    print "Testing large C"
    benchmark_shape(50, 100, 50, 50, 50, 50, 5, 5)
    print ""

    print "Testing large input"
    benchmark_shape(50, 10, 500, 500, 80, 80, 5, 5)
    print ""

    print "Testing large output"
    benchmark_shape(50, 10, 80, 80, 500, 500, 5, 5)
    print ""

    # check_shapes(100, 8, 100, 100, 60, 60)

@ClementPinard

Hello, thanks for your gist.
I have been working on your implementation of SpatialGridSamplerBilinear lately, and I think you may have made a mistake when computing the indices n, h, w from the thread id here: https://github.com/pytorch/pytorch/blob/master/aten/src/THCUNN/SpatialGridSamplerBilinear.cu#L45

I think it would make memory accesses more coalesced to do the following instead of decomposing the index in the order n, h, w:

const int w = index % W;
const int h = (index / W) % H;
const int n = (index / (W * H)) % N;
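
To illustrate what I mean, here is a plain-Python sketch of the indexing arithmetic (just an illustration, not the actual kernel code; the toy sizes and the "n-first" decomposition below only mirror my description above). For a contiguous N x H x W buffer, consecutive linear indices only touch consecutive memory locations when w varies fastest:

# Hypothetical illustration of the two index decompositions (not the kernel code).
N, H, W = 2, 3, 4

def offset(n, h, w):
    # memory offset of element [n, h, w] in a contiguous N x H x W buffer
    return n * H * W + h * W + w

def decompose_n_first(index):
    # n varies fastest between consecutive linear indices
    n = index % N
    h = (index // N) % H
    w = (index // (N * H)) % W
    return offset(n, h, w)

def decompose_w_first(index):
    # w varies fastest between consecutive linear indices (the order proposed above)
    w = index % W
    h = (index // W) % H
    n = (index // (W * H)) % N
    return offset(n, h, w)

print([decompose_n_first(i) for i in range(8)])  # [0, 12, 4, 16, 8, 20, 1, 13] -- strided
print([decompose_w_first(i) for i in range(8)])  # [0, 1, 2, 3, 4, 5, 6, 7] -- contiguous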

To back up my claim and maybe submit a PR, I think I can use your script (found in your initial PR for this very function) as a basis to measure speed and compare against cuDNN and the former implementation. It may not be up to date, but I feel it only needs a few changes.

However, I am intrigued by the fact that you initially construct a C,N,H,W tensor only to transpose it right after. Is there a particular reason behind this? Does it change the stride of the first two dimensions? (If so, that's important to know for optimization.)
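
For what it's worth, here is a quick sketch of what that transpose does (using one of the shapes from the script as an example): the result is a non-contiguous view whose first two strides are swapped.

import torch

N, C, IH, IW = 10, 5, 20, 20
x = torch.randn(C, N, IH, IW)   # contiguous: strides (4000, 400, 20, 1)
y = x.transpose(0, 1)           # shape (N, C, IH, IW), but a non-contiguous view
print(x.stride())               # (4000, 400, 20, 1)
print(y.stride())               # (400, 4000, 20, 1) -- first two strides swapped
print(y.is_contiguous())        # False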

Also, when doing the consistency check with check_shapes on the untouched source code, I get bad results for the grad checks, for all of the tests D, E, and F. Was that already the case back then? Maybe it's normal? (It's only an error of at most 1e-4, but still above 1e-5.)

Maybe there's a more up-to-date test script somewhere?

Thanks,

Clément
