import torch
import time

N = 1  # batch size
x = torch.randn(N, 64, 160, 120).cuda()
baseline_layer = torch.nn.Conv2d(64, 64, kernel_size=(3, 3), bias=False).cuda()
distilled_layer = torch.nn.Sequential(
    torch.nn.Conv2d(64, 14, kernel_size=(1, 1), bias=False),
    torch.nn.Conv2d(14, 15, kernel_size=(3, 3), bias=False),
    torch.nn.Conv2d(15, 64, kernel_size=(1, 1), bias=False)).cuda()
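# Added reference note (not part of the original timing logic): parameter counts.
#   baseline:  64*64*3*3                              = 36,864 weights
#   distilled: 64*14 + 14*15*3*3 + 15*64 = 896 + 1,890 + 960 = 3,746 weights
# So the factored stack has roughly 10x fewer parameters than the single 3x3 conv.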
#x = torch.randn(N, 256, 160, 120).cuda()
#baseline_layer = torch.nn.Conv2d(256, 128, kernel_size=(1, 1), bias=False).cuda()
#distilled_layer = torch.nn.Sequential(
#    torch.nn.Conv2d(256, 1, kernel_size=(1, 1), bias=False),
#    torch.nn.Conv2d(1, 128, kernel_size=(1, 1), bias=False)).cuda()

# Forward-only (may reduce some bookkeeping overhead)
torch.set_grad_enabled(False)

# Without these lines cuDNN sometimes chooses suboptimal algos
# (need to investigate further)
baseline_layer(x)
distilled_layer(x)

# With torch.backends.cudnn.benchmark the first call
# may be much slower due to benchmarking all cuDNN algos
torch.backends.cudnn.benchmark = True
baseline_layer(x)
distilled_layer(x)
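# Added note: benchmark mode caches the chosen algorithm per unique input
# shape, so keeping N/H/W fixed means every timed call below reuses the
# algorithm selected during these warm-up passes.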
# time 1000 forward passes
def benchmark(layer, x):
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(1000):
        layer(x)
    torch.cuda.synchronize()
    end = time.time()
    return end - start
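# A sketch of an alternative (not from the original gist): timing with CUDA
# events instead of wall-clock time. Events are recorded on the GPU stream,
# so this measures device time directly rather than the host-side
# synchronize/time.time() window.
def benchmark_events(layer, x):
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    torch.cuda.synchronize()
    start.record()
    for _ in range(1000):
        layer(x)
    end.record()
    torch.cuda.synchronize()
    # elapsed_time returns milliseconds; divide to match benchmark()'s seconds
    return start.elapsed_time(end) / 1000.0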
for _ in range(4):
    print('baseline', benchmark(baseline_layer, x))
for _ in range(4):
    print('distilled', benchmark(distilled_layer, x))
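# Added note: each printed number is seconds for 1000 forward passes;
# conveniently, that value is numerically the average per-pass latency in ms.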
# Run with cuProfiler enabled
def loop_with_profiler(layer, x):
    cudart = torch.cuda._load_cudart()
    torch.cuda.synchronize()
    cudart.cuProfilerStart()
    for _ in range(1000):
        layer(x)
    torch.cuda.synchronize()
    cudart.cuProfilerStop()

# Uncomment a line below and run with `nvprof --profile-from-start off -- python script.py`
# (only do one at a time)
#loop_with_profiler(baseline_layer, x)
#loop_with_profiler(distilled_layer, x)