import torch

def loop(x):
    # Queue the op 1000 times, then synchronize once, so per-launch
    # overhead is amortized across iterations.
    for _ in range(1000):
        x()
    torch.cuda.synchronize()
print('Add contiguous')
x = torch.randn(1024, 1024, device='cuda')
y = torch.randn(1024, 1024, device='cuda')
%timeit x + y; torch.cuda.synchronize()

print('Add broadcast outer')
x = torch.randn(1024, 1024, device='cuda')
y = torch.randn(1024, device='cuda')
%timeit x + y; torch.cuda.synchronize()

print('Add broadcast inner')
x = torch.randn(1024, 1024, device='cuda')
y = torch.randn(1024, 1, device='cuda')
%timeit x + y; torch.cuda.synchronize()
# Launch overhead is significant on ROCm, so it's worth also running each of
# these 1000 times before synchronizing (see loop() above).
print('Add contiguous')
x = torch.randn(1024, 1024, device='cuda')
y = torch.randn(1024, 1024, device='cuda')
%timeit loop(lambda: x + y)

print('Add broadcast outer')
x = torch.randn(1024, 1024, device='cuda')
y = torch.randn(1024, device='cuda')
%timeit loop(lambda: x + y)

print('Add broadcast inner')
x = torch.randn(1024, 1024, device='cuda')
y = torch.randn(1024, 1, device='cuda')
%timeit loop(lambda: x + y)
print('CUDA copy broadcasting')
x = torch.randn(1024, 1024, device='cuda')
y = torch.randn(1024, device='cuda')
%timeit x.copy_(y); torch.cuda.synchronize()

print('CUDA GPU1 -> GPU0')
x = torch.randn(1024, 1024, device='cuda:0')
y = torch.randn(1024, 1024, device='cuda:1')
%timeit x.copy_(y); torch.cuda.synchronize()

print('CUDA GPU1 -> GPU0 broadcasting')
x = torch.randn(1024, 1024, device='cuda:0')
y = torch.randn(1024, device='cuda:1')
%timeit x.copy_(y); torch.cuda.synchronize()

print('CUDA float -> int64')
x = torch.empty(1024, 1024, device='cuda', dtype=torch.int64)
y = torch.randn(1024, 1024, device='cuda')
%timeit x.copy_(y); torch.cuda.synchronize()
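
# A minimal alternative timing sketch using CUDA events instead of
# %timeit + synchronize. This measures elapsed GPU time directly and keeps
# the host-side synchronize out of the per-iteration number. cuda_time and
# its iteration count are illustrative additions, not part of the benchmark
# above.
def cuda_time(fn, iters=1000):
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    fn()  # warm up so one-time initialization isn't timed
    torch.cuda.synchronize()
    start.record()
    for _ in range(iters):
        fn()
    end.record()
    torch.cuda.synchronize()
    # elapsed_time returns milliseconds; convert to microseconds per call
    return start.elapsed_time(end) * 1000.0 / iters

# Example: cuda_time(lambda: x + y) gives mean microseconds per add.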
# PR
# Add contiguous
# 10000 loops, best of 3: 69.6 µs per loop
# Add broadcast outer
# 10000 loops, best of 3: 71.7 µs per loop
# Add broadcast inner
# 10000 loops, best of 3: 70.8 µs per loop
# Add contiguous
# 10 loops, best of 3: 29.9 ms per loop
# Add broadcast outer
# 10 loops, best of 3: 31.5 ms per loop
# Add broadcast inner
# 10 loops, best of 3: 31.8 ms per loop
# CUDA copy broadcasting
# 10000 loops, best of 3: 53.7 µs per loop
# CUDA GPU1 -> GPU0
# 1000 loops, best of 3: 340 µs per loop
# CUDA GPU1 -> GPU0 broadcasting
# 1000 loops, best of 3: 413 µs per loop
# CUDA float -> int64
# 10000 loops, best of 3: 52.2 µs per loop

# master
# Add contiguous
# 10000 loops, best of 3: 68.8 µs per loop
# Add broadcast outer
# 1000 loops, best of 3: 261 µs per loop
# Add broadcast inner
# 1000 loops, best of 3: 306 µs per loop
# Add contiguous
# 10 loops, best of 3: 25.9 ms per loop
# Add broadcast outer
# 10 loops, best of 3: 157 ms per loop
# Add broadcast inner
# 1 loop, best of 3: 252 ms per loop
# CUDA copy broadcasting
# 1000 loops, best of 3: 299 µs per loop
# CUDA GPU1 -> GPU0
# 1000 loops, best of 3: 381 µs per loop
# CUDA GPU1 -> GPU0 broadcasting
# 1000 loops, best of 3: 597 µs per loop
# CUDA float -> int64
# 10000 loops, best of 3: 46.9 µs per loop
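
# Summary of the numbers above: relative to master, the PR speeds up
# broadcast adds roughly 3.6-4.3x (261 µs -> 71.7 µs outer, 306 µs ->
# 70.8 µs inner) and broadcast copies roughly 5.6x (299 µs -> 53.7 µs),
# while contiguous adds are essentially unchanged and the float -> int64
# copy regresses slightly (46.9 µs -> 52.2 µs).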