@mikasenghaas
Last active April 25, 2025 23:44
Dot Product Benchmark (NVIDIA A100-PCIE-40GB)
# /// script
# requires-python = ">=3.10"
# dependencies = ["numpy", "torch"]
# ///
import torch; torch.manual_seed(42)
# Check if CUDA is available
assert torch.cuda.is_available(), "Please run on CUDA-enabled GPU"
# Hardware parameters
flops, bandwidth = 312e12, 1555e9
# Theoretical throughput
N = 2 ** 30
flops_per_matmul = 2 * N
bytes_per_matmul = 4 * N
compute_time = flops_per_matmul / flops
comm_time = bytes_per_matmul / bandwidth
theoretical_time = max(compute_time, comm_time)
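# Roofline assumption: compute and data movement overlap perfectly, so the
# kernel can run no faster than whichever of the two times is larger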
# Create CUDA events for precise timing
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
# Create input vectors and output scalar on GPU
a = torch.randn(N, device="cuda", dtype=torch.bfloat16)
b = torch.randn(N, device="cuda", dtype=torch.bfloat16)
c = torch.empty((), device="cuda", dtype=torch.bfloat16)
# Warm up
for _ in range(10):
    torch.matmul(a, b, out=c)
torch.cuda.synchronize()
# Benchmark
num_iterations = 1000
elapsed_times = []
for _ in range(num_iterations):
    start_event.record()
    torch.matmul(a, b, out=c)
    end_event.record()
    torch.cuda.synchronize()
    elapsed_times.append(start_event.elapsed_time(end_event) / 1000)
elapsed_times = torch.tensor(elapsed_times)
avg_time, std_time = elapsed_times.mean(), elapsed_times.std()
flop_achieved = flops_per_matmul / avg_time
print(f"N={N}")
print(f"\nTheoretical time: {theoretical_time * 1e3:.2f} ms (compute: {compute_time * 1e3:.2f} ms, comm: {comm_time * 1e3:.2f} ms)")
print(f"Time achieved: {avg_time * 1e3:.2f} ms ± {std_time * 1e3:.2f} μs (Efficiency: {avg_time / theoretical_time:.2f}x)")
print(f"\nTheoretical FLOPs: {flops // 1e12} TFlops/s")
print(f"FLOPs achieved: {flop_achieved / 1e12:.2f} TFlops/s (Efficiency: {flop_achieved / flops * 100:.1f}%)")
mikasenghaas commented Apr 22, 2025

Run with

wget https://gist.githubusercontent.com/mikasenghaas/f3663a1f26acbb95cc880db12e9547ea/raw/4e04df8971de2581ab5de9e7179eb572c7986ea6/dot-product-intensity -O dot-product-intensity.py && uv run dot-product-intensity.py

to get the following output (empirical time may vary slightly):

N=1073741824

Theoretical time: 2.76 ms (compute: 0.01 ms, comm: 2.76 ms)
Time achieved: 3.42 ms ± 0.04 ms (Slowdown: 1.24x)

Theoretical FLOPs: 312.0 TFlops/s
FLOPs achieved: 0.63 TFlops/s (Efficiency: 0.2%)
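The low FLOP efficiency is expected: a dot product does 2N FLOPs while moving 4N bytes (two bf16 vectors), an arithmetic intensity of 0.5 FLOP/byte, whereas the A100 needs roughly 312e12 / 1555e9 ≈ 200 FLOP/byte to become compute-bound. A minimal back-of-the-envelope check (standalone sketch, not part of the gist):

# Roofline sanity check for the numbers above
flops, bandwidth = 312e12, 1555e9      # A100 peak bf16 FLOP/s and HBM bytes/s
N = 2 ** 30
intensity = (2 * N) / (4 * N)          # 0.5 FLOP per byte for a bf16 dot product
balance = flops / bandwidth            # ~200 FLOP per byte needed to saturate compute
ceiling = intensity * bandwidth        # bandwidth-limited FLOP/s ceiling (~0.78 TFLOP/s)
print(f"intensity={intensity} FLOP/B, balance={balance:.0f} FLOP/B, ceiling={ceiling / 1e12:.2f} TFLOP/s")

This puts the measured 0.63 TFlops/s just under the ~0.78 TFlops/s bandwidth ceiling, consistent with the kernel being memory-bound.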
