import torch

def loop(x):
    # Call x() 1000 times before synchronizing so per-launch overhead is
    # amortized across many kernel launches.
    for _ in range(1000):
        x()
    torch.cuda.synchronize()

print('Add contiguous')
x = torch.randn(1024, 1024, device='cuda')
y = torch.randn(1024, 1024, device='cuda')
%timeit x + y; torch.cuda.synchronize()

print('Add broadcast outer')
x = torch.randn(1024, 1024, device='cuda')
y = torch.randn(1024, device='cuda')
%timeit x + y; torch.cuda.synchronize()

print('Add broadcast inner')
x = torch.randn(1024, 1024, device='cuda')
y = torch.randn(1024, 1, device='cuda')
%timeit x + y; torch.cuda.synchronize()

# Launch overhead is significant on ROCm, so it's also worth running each of
# these 1000 times (via loop) before synchronizing.
print('Add contiguous')
x = torch.randn(1024, 1024, device='cuda')
y = torch.randn(1024, 1024, device='cuda')
%timeit loop(lambda: x + y)

print('Add broadcast outer')
x = torch.randn(1024, 1024, device='cuda')
y = torch.randn(1024, device='cuda')
%timeit loop(lambda: x + y)

print('Add broadcast inner')
x = torch.randn(1024, 1024, device='cuda')
y = torch.randn(1024, 1, device='cuda')
%timeit loop(lambda: x + y)
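
# Hedged sketch, not part of the original benchmark: if the loop timings above
# need to run outside IPython (no %timeit), the same amortized measurement can
# be approximated with time.perf_counter(). The helper name `time_loop` and the
# 1000-iteration count are illustrative assumptions.
import time

def time_loop(fn, iters=1000):
    torch.cuda.synchronize()                 # drain any pending GPU work first
    start = time.perf_counter()
    for _ in range(iters):
        fn()                                 # queue one kernel launch per call
    torch.cuda.synchronize()                 # wait for all launches to finish
    return (time.perf_counter() - start) / iters

# Hypothetical usage: per-iteration time in seconds for the current x + y.
# print(time_loop(lambda: x + y))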

print('CUDA copy broadcasting')
x = torch.randn(1024, 1024, device='cuda')
y = torch.randn(1024, device='cuda')
%timeit x.copy_(y); torch.cuda.synchronize()

print('CUDA GPU1 -> GPU0')
x = torch.randn(1024, 1024, device='cuda:0')
y = torch.randn(1024, 1024, device='cuda:1')
%timeit x.copy_(y); torch.cuda.synchronize()

print('CUDA GPU1 -> GPU0 broadcasting')
x = torch.randn(1024, 1024, device='cuda:0')
y = torch.randn(1024, device='cuda:1')
%timeit x.copy_(y); torch.cuda.synchronize()

print('CUDA float -> int64')
x = torch.empty(1024, 1024, device='cuda', dtype=torch.int64)
y = torch.randn(1024, 1024, device='cuda')
%timeit x.copy_(y); torch.cuda.synchronize()

# PR
# Add contiguous
# 10000 loops, best of 3: 69.6 µs per loop
# Add broadcast outer
# 10000 loops, best of 3: 71.7 µs per loop
# Add broadcast inner
# 10000 loops, best of 3: 70.8 µs per loop
# Add contiguous
# 10 loops, best of 3: 29.9 ms per loop
# Add broadcast outer
# 10 loops, best of 3: 31.5 ms per loop
# Add broadcast inner
# 10 loops, best of 3: 31.8 ms per loop
# CUDA copy broadcasting
# 10000 loops, best of 3: 53.7 µs per loop
# CUDA GPU1 -> GPU0
# 1000 loops, best of 3: 340 µs per loop
# CUDA GPU1 -> GPU0 broadcasting
# 1000 loops, best of 3: 413 µs per loop
# CUDA float -> int64
# 10000 loops, best of 3: 52.2 µs per loop

# master
# Add contiguous
# 10000 loops, best of 3: 68.8 µs per loop
# Add broadcast outer
# 1000 loops, best of 3: 261 µs per loop
# Add broadcast inner
# 1000 loops, best of 3: 306 µs per loop
# Add contiguous
# 10 loops, best of 3: 25.9 ms per loop
# Add broadcast outer
# 10 loops, best of 3: 157 ms per loop
# Add broadcast inner
# 1 loop, best of 3: 252 ms per loop
# CUDA copy broadcasting
# 1000 loops, best of 3: 299 µs per loop
# CUDA GPU1 -> GPU0
# 1000 loops, best of 3: 381 µs per loop
# CUDA GPU1 -> GPU0 broadcasting
# 1000 loops, best of 3: 597 µs per loop
# CUDA float -> int64
# 10000 loops, best of 3: 46.9 µs per loop
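
# For reference, rough speedups implied by the numbers above (master time / PR time):
#   Add broadcast outer:            261 µs / 71.7 µs  ~ 3.6x
#   Add broadcast inner:            306 µs / 70.8 µs  ~ 4.3x
#   Add broadcast outer (x1000):    157 ms / 31.5 ms  ~ 5.0x
#   Add broadcast inner (x1000):    252 ms / 31.8 ms  ~ 7.9x
#   CUDA copy broadcasting:         299 µs / 53.7 µs  ~ 5.6x
#   CUDA GPU1 -> GPU0 broadcasting: 597 µs / 413 µs   ~ 1.4x
#   (the non-broadcasting cases change comparatively little)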