Skip to content

Instantly share code, notes, and snippets.

@Jokeren
Last active June 7, 2024 15:01
Show Gist options
  • Save Jokeren/34debd44b248c28100d06f774215b18e to your computer and use it in GitHub Desktop.
Save Jokeren/34debd44b248c28100d06f774215b18e to your computer and use it in GitHub Desktop.
Proton overhead
import torch
import time
import sys
def run(nelems, iters):
    """Time a repeated elementwise add-and-copy and print the elapsed host time.

    Two random float32 vectors of ``nelems`` elements are created on the CUDA
    device when one is available, otherwise on the CPU. After 10 untimed
    warmup iterations, ``result.copy_(a + b)`` is issued ``iters`` times and
    the elapsed time in seconds is printed with the label ``cpu time``.

    The timer is stopped *before* any device synchronization, so on CUDA the
    printed figure is the time the host spent issuing the work, not the time
    the device needed to finish it (see PyTorch's notes on asynchronous CUDA
    execution).

    Args:
        nelems: Number of elements in each input vector.
        iters: Number of timed iterations.
    """
    # Check if CUDA is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tensor_a = torch.randn(nelems, dtype=torch.float32, device=device)
    tensor_b = torch.randn(nelems, dtype=torch.float32, device=device)
    result_gpu = torch.empty_like(tensor_a)

    # warmup
    for _ in range(10):
        result_gpu.copy_(tensor_a + tensor_b, non_blocking=True)

    # perf_counter() is monotonic and high resolution; time.time() can jump
    # if the system clock is adjusted while the benchmark is running.
    start_time = time.perf_counter()
    # measure
    for _ in range(iters):
        result_gpu.copy_(tensor_a + tensor_b, non_blocking=True)
    end_time = time.perf_counter()
    print("cpu time", end_time - start_time)

    # Drain outstanding device work before returning. Only valid on CUDA:
    # torch.cuda.synchronize() raises on hosts without a usable CUDA runtime,
    # which would break the CPU fallback selected above.
    if device.type == "cuda":
        torch.cuda.synchronize()
if __name__ == "__main__":
    # Script entry point: the first command-line argument selects a workload.
    # Each workload name maps to the keyword arguments passed to run().
    workloads = {
        # Small tensors, many iterations.
        "cpu_bound": {"nelems": 1000, "iters": 1000000},
        # Large tensors, fewer iterations.
        "gpu_bound": {"nelems": 100000000, "iters": 10000},
    }
    # A missing argument used to raise IndexError and an unknown name used to
    # exit silently without running anything; report both explicitly.
    if len(sys.argv) < 2 or sys.argv[1] not in workloads:
        choices = "|".join(workloads)
        sys.exit(f"usage: {sys.argv[0]} <{choices}>")
    run(**workloads[sys.argv[1]])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment