import torch

def loop(x):
    # Queue the op 1000 times, then synchronize once, so per-launch
    # overhead is amortized across iterations.
    for _ in range(1000):
        x()
    torch.cuda.synchronize()
print('Add contiguous')
x = torch.randn(1024, 1024, device='cuda')
y = torch.randn(1024, 1024, device='cuda')
%timeit x + y; torch.cuda.synchronize()

print('Add broadcast outer')
x = torch.randn(1024, 1024, device='cuda')
y = torch.randn(1024, device='cuda')
%timeit x + y; torch.cuda.synchronize()

print('Add broadcast inner')
x = torch.randn(1024, 1024, device='cuda')
y = torch.randn(1024, 1, device='cuda')
%timeit x + y; torch.cuda.synchronize()
# Launch overhead is significant on ROCm, so it's worth also running each of
# these 1000 times before synchronizing (see loop() above).
print('Add contiguous')
x = torch.randn(1024, 1024, device='cuda')
y = torch.randn(1024, 1024, device='cuda')
%timeit loop(lambda: x + y)

print('Add broadcast outer')
x = torch.randn(1024, 1024, device='cuda')
y = torch.randn(1024, device='cuda')
%timeit loop(lambda: x + y)

print('Add broadcast inner')
x = torch.randn(1024, 1024, device='cuda')
y = torch.randn(1024, 1, device='cuda')
%timeit loop(lambda: x + y)
print('CUDA copy broadcasting')
x = torch.randn(1024, 1024, device='cuda')
y = torch.randn(1024, device='cuda')
%timeit x.copy_(y); torch.cuda.synchronize()

print('CUDA GPU1 -> GPU0')
x = torch.randn(1024, 1024, device='cuda:0')
y = torch.randn(1024, 1024, device='cuda:1')
%timeit x.copy_(y); torch.cuda.synchronize()

print('CUDA GPU1 -> GPU0 broadcasting')
x = torch.randn(1024, 1024, device='cuda:0')
y = torch.randn(1024, device='cuda:1')
%timeit x.copy_(y); torch.cuda.synchronize()

print('CUDA float -> int64')
x = torch.empty(1024, 1024, device='cuda', dtype=torch.int64)
y = torch.randn(1024, 1024, device='cuda')
%timeit x.copy_(y); torch.cuda.synchronize()
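
# A minimal alternative timing sketch using CUDA events instead of
# %timeit + synchronize. This measures elapsed GPU time directly and keeps
# the host-side synchronize out of the per-iteration number. cuda_time and
# its iteration count are illustrative additions, not part of the benchmark
# above.
def cuda_time(fn, iters=1000):
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    fn()  # warm up so one-time initialization isn't timed
    torch.cuda.synchronize()
    start.record()
    for _ in range(iters):
        fn()
    end.record()
    torch.cuda.synchronize()
    # elapsed_time returns milliseconds; convert to microseconds per call
    return start.elapsed_time(end) * 1000.0 / iters

# Example: cuda_time(lambda: x + y) gives mean microseconds per add.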
# PR
# Add contiguous
# 10000 loops, best of 3: 69.6 µs per loop
# Add broadcast outer
# 10000 loops, best of 3: 71.7 µs per loop
# Add broadcast inner
# 10000 loops, best of 3: 70.8 µs per loop
# Add contiguous
# 10 loops, best of 3: 29.9 ms per loop
# Add broadcast outer
# 10 loops, best of 3: 31.5 ms per loop
# Add broadcast inner
# 10 loops, best of 3: 31.8 ms per loop
# CUDA copy broadcasting
# 10000 loops, best of 3: 53.7 µs per loop
# CUDA GPU1 -> GPU0
# 1000 loops, best of 3: 340 µs per loop
# CUDA GPU1 -> GPU0 broadcasting
# 1000 loops, best of 3: 413 µs per loop
# CUDA float -> int64
# 10000 loops, best of 3: 52.2 µs per loop

# master
# Add contiguous
# 10000 loops, best of 3: 68.8 µs per loop
# Add broadcast outer
# 1000 loops, best of 3: 261 µs per loop
# Add broadcast inner
# 1000 loops, best of 3: 306 µs per loop
# Add contiguous
# 10 loops, best of 3: 25.9 ms per loop
# Add broadcast outer
# 10 loops, best of 3: 157 ms per loop
# Add broadcast inner
# 1 loop, best of 3: 252 ms per loop
# CUDA copy broadcasting
# 1000 loops, best of 3: 299 µs per loop
# CUDA GPU1 -> GPU0
# 1000 loops, best of 3: 381 µs per loop
# CUDA GPU1 -> GPU0 broadcasting
# 1000 loops, best of 3: 597 µs per loop
# CUDA float -> int64
# 10000 loops, best of 3: 46.9 µs per loop
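
# Summary of the numbers above: relative to master, the PR speeds up
# broadcast adds roughly 3.6-4.3x (261 µs -> 71.7 µs outer, 306 µs ->
# 70.8 µs inner) and broadcast copies roughly 5.6x (299 µs -> 53.7 µs),
# while contiguous adds are essentially unchanged and the float -> int64
# copy regresses slightly (46.9 µs -> 52.2 µs).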