culurciello/gist:5189137

## gistfile1.lua
#!/usr/bin/env torch
require 'nn'
require 'image'
require 'xlua'
require 'pl'


-- Micro-benchmark: times one forward pass of an nn.SpatialConvolution layer
-- on a single-plane image and prints the achieved Gops/s.
-- Command-line options are parsed by Penlight's lapp from the usage text.
opt = lapp[[
   -t,--threads            (default 8)           number of threads
   -p,--type               (default float)       float or cuda
   -i,--devid              (default 1)           device ID (if using CUDA)
]]

-- Profiler that records the duration of the forward pass.
p = xlua.Profiler()
torch.setnumthreads(opt.threads)
torch.manualSeed(1)
torch.setdefaulttensortype('torch.FloatTensor')

-- Benchmark geometry. Kept in one place so that the layer, the input tensor
-- and the operation count printed at the end always describe the same work.
local IN_PLANES, OUT_PLANES = 1, 16
local KERNEL_W, KERNEL_H = 10, 10
local IMAGE_W, IMAGE_H = 512, 512

if opt.type == 'cuda' then
   print('==> switching to CUDA')
   require 'cunn'
   cutorch.setDevice(opt.devid)
   print('==> using GPU #' .. cutorch.getDevice())

   -- NOTE(review): rebinds the MM variant to the plain convolution in CUDA
   -- mode; nothing in this script references SpatialConvolutionMM, so
   -- confirm whether this alias is still needed.
   nn.SpatialConvolutionMM = nn.SpatialConvolution
end

-- input:
lena1 = torch.Tensor(IN_PLANES, IMAGE_H, IMAGE_W)
-- lena1 = image.lena()[{1}]:reshape(1,512,512)

-- model to test:
--model = nn.SpatialConvolution(1, 8, 9, 9)
-- 16 filters of 10x10 on a 512x512 image
model = nn.SpatialConvolution(IN_PLANES, OUT_PLANES, KERNEL_W, KERNEL_H)

-- copy to GPU if desired:
if opt.type == 'cuda' then
   model:cuda()
   lena1 = torch.CudaTensor(IN_PLANES, IMAGE_H, IMAGE_W)
end

-- test speed:
p:start('spatialconv')
lena2 = model:forward(lena1)
-- Wait for the device to finish before stopping the timer.
if opt.type == 'cuda' then cutorch.synchronize() end
p:lap('spatialconv')

p:printAll{}

-- A convolution without padding at stride 1 yields (size - kernel + 1)
-- outputs per dimension. The previous formula hard-coded (512-9)+1, which
-- belongs to the commented-out 9x9 model, not to the 10x10 layer timed here.
local out_w = IMAGE_W - KERNEL_W + 1
local out_h = IMAGE_H - KERNEL_H + 1

-- 2 operations (MUL, ACC) per kernel tap, per input/output plane pair,
-- per output pixel.
local total_ops = 2 * IN_PLANES * OUT_PLANES * KERNEL_W * KERNEL_H * out_w * out_h

-- NOTE(review): p:cpu reports CPU time; for multi-threaded or CUDA runs the
-- wall-clock time may be the more meaningful divisor -- confirm intent.
print('Gops/s:', total_ops / p:cpu('spatialconv') / 1e9)
	#!/usr/bin/env torch
	require 'nn'
	require 'image'
	require 'xlua'
	require 'pl'


	-- Micro-benchmark: runs one forward pass of an nn.SpatialConvolution layer
	-- under a profiler and reports the resulting Gops/s.
	-- Options come from Penlight's lapp, driven by the usage text.
	opt = lapp[[
	-t,--threads (default 8) number of threads
	-p,--type (default float) float or cuda
	-i,--devid (default 1) device ID (if using CUDA)
	]]

	-- Profiler used to time the forward pass.
	p = xlua.Profiler()
	torch.setnumthreads(opt.threads)
	torch.manualSeed(1)
	torch.setdefaulttensortype('torch.FloatTensor')

	-- Single source of truth for the benchmark dimensions, shared by the
	-- layer constructor, the input tensor and the operation count.
	local N_INPUT, N_OUTPUT = 1, 16
	local K_WIDTH, K_HEIGHT = 10, 10
	local IMG_WIDTH, IMG_HEIGHT = 512, 512

	if opt.type == 'cuda' then
		print('==> switching to CUDA')
		require 'cunn'
		cutorch.setDevice(opt.devid)
		print('==> using GPU #' .. cutorch.getDevice())

		-- NOTE(review): points nn.SpatialConvolutionMM at the plain
		-- convolution when running on CUDA; unused below -- confirm it
		-- is still required.
		nn.SpatialConvolutionMM = nn.SpatialConvolution
	end

	-- input:
	lena1 = torch.Tensor(N_INPUT, IMG_HEIGHT, IMG_WIDTH)
	-- lena1 = image.lena()[{1}]:reshape(1,512,512)

	-- model to test:
	--model = nn.SpatialConvolution(1, 8, 9, 9)
	-- 16 filters of 10x10 on a 512x512 image
	model = nn.SpatialConvolution(N_INPUT, N_OUTPUT, K_WIDTH, K_HEIGHT)

	-- copy to GPU if desired:
	if opt.type == 'cuda' then
		model:cuda()
		lena1 = torch.CudaTensor(N_INPUT, IMG_HEIGHT, IMG_WIDTH)
	end

	-- test speed:
	p:start('spatialconv')
	lena2 = model:forward(lena1)
	-- Block until the device is idle so the lap covers the whole pass.
	if opt.type == 'cuda' then cutorch.synchronize() end
	p:lap('spatialconv')

	p:printAll{}

	-- The operation count had lost its multiplication operators
	-- ("161010((512-9)+1)..."), which is not valid Lua, and it assumed a 9x9
	-- kernel. Rebuild it from the actual geometry: an unpadded stride-1
	-- convolution produces (size - kernel + 1) outputs per dimension.
	local out_width = IMG_WIDTH - K_WIDTH + 1
	local out_height = IMG_HEIGHT - K_HEIGHT + 1

	-- 2 operations MUL, ACC for every kernel tap of every output pixel.
	local op_count = 2 * N_INPUT * N_OUTPUT * K_WIDTH * K_HEIGHT * out_width * out_height

	print('Gops/s:', op_count / p:cpu('spatialconv') / 1e9)