Dot Product Benchmark (NVIDIA A100-PCIE-40GB)
# /// script
# requires-python = ">=3.10"
# dependencies = ["numpy", "torch"]
# ///
import torch; torch.manual_seed(42)
# Check that CUDA is available
assert torch.cuda.is_available(), "Please run on a CUDA-enabled GPU"
# Hardware parameters (A100-PCIE-40GB): peak bf16 FLOP/s and HBM bandwidth in B/s
flops, bandwidth = 312e12, 1555e9
# Theoretical throughput: a length-N dot product does 2N FLOPs and reads 4N bytes (two bf16 vectors)
N = 2 ** 30
flops_per_matmul = 2 * N
bytes_per_matmul = 4 * N
compute_time = flops_per_matmul / flops
comm_time = bytes_per_matmul / bandwidth
theoretical_time = max(compute_time, comm_time)
# Create CUDA events for precise timing
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
# Create input vectors on GPU
a = torch.randn(N, device="cuda", dtype=torch.bfloat16)
b = torch.randn(N, device="cuda", dtype=torch.bfloat16)
c = torch.empty((), device="cuda", dtype=torch.bfloat16)
# Warm up
for _ in range(10):
    torch.matmul(a, b, out=c)
torch.cuda.synchronize()
# Benchmark
num_iterations = 1000
elapsed_times = []
for _ in range(num_iterations):
    start_event.record()
    torch.matmul(a, b, out=c)
    end_event.record()
    torch.cuda.synchronize()
    elapsed_times.append(start_event.elapsed_time(end_event) / 1000)  # ms -> s
elapsed_times = torch.tensor(elapsed_times)
avg_time, std_time = elapsed_times.mean(), elapsed_times.std()
flop_achieved = flops_per_matmul / avg_time
print(f"N={N}")
print(f"\nTheoretical time: {theoretical_time * 1e3:.2f} ms (compute: {compute_time * 1e3:.2f} ms, comm: {comm_time * 1e3:.2f} ms)")
print(f"Time achieved: {avg_time * 1e3:.2f} ms ± {std_time * 1e3:.2f} ms ({avg_time / theoretical_time:.2f}x theoretical)")
print(f"\nTheoretical FLOPs: {flops / 1e12:.0f} TFlops/s")
print(f"FLOPs achieved: {flop_achieved / 1e12:.2f} TFlops/s (Efficiency: {flop_achieved / flops * 100:.1f}%)")
Run it with

wget https://gist.githubusercontent.com/mikasenghaas/f3663a1f26acbb95cc880db12e9547ea/raw/4e04df8971de2581ab5de9e7179eb572c7986ea6/dot-product-intensity -O dot-product-intensity.py && uv run dot-product-intensity.py

to get the following output (empirical time may vary slightly):