Skip to content

Instantly share code, notes, and snippets.

@mikasenghaas
Created April 25, 2025 23:48
Show Gist options
  • Save mikasenghaas/394f9b58977b5dc23cd3f908e4057855 to your computer and use it in GitHub Desktop.
Save mikasenghaas/394f9b58977b5dc23cd3f908e4057855 to your computer and use it in GitHub Desktop.
Peak theoretical throughput
# /// script
# requires-python = ">=3.10"
# dependencies = ["numpy", "pandas", "seaborn", "matplotlib"]
# ///
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Use matplotlib's "dark_background" style sheet for everything drawn below
plt.style.use("dark_background")
# Model table, one row per model.
#   P: parameter count (used as 2 * P for the model size further down)
#   H, K, L: factors of the per-position KV-cache size (2 * 2 * H * K * L)
#            -- presumably head dim, KV heads and layers; TODO confirm
#   T: sequence length shared by all models
MODEL = pd.DataFrame.from_dict(
    {
        "Llama-3.2 1B": [1498482688, 64, 8, 16, 4096],
        "Llama-2 7B": [6738415616, 128, 32, 32, 4096],
        "Llama-2 13B": [13015864320, 128, 40, 40, 4096],
    },
    orient="index",
    columns=["P", "H", "K", "L", "T"],
)

# Hardware table, one row per GPU, labelled by memory capacity.
# Values are scaled by 1e9 (memory) and 1e12 (flops, bandwidth)
# -- presumably bytes, FLOP/s and bytes/s; TODO confirm units.
GPU = pd.DataFrame.from_dict(
    {
        "24GB": [24 * 1e9, 165 * 1e12, 1.008 * 1e12],
        "40GB": [40 * 1e9, 312 * 1e12, 1.555 * 1e12],
        "80GB": [80 * 1e9, 989 * 1e12, 3.35 * 1e12],
    },
    orient="index",
    columns=["memory", "flops", "bandwidth"],
)

# Ratio of compute to bandwidth, truncated to an integer
GPU["intensity"] = GPU["flops"].div(GPU["bandwidth"]).astype(int)
# Compute peak theoretical throughput at different batch sizes.
# For every (model, GPU) pair, sweep the batch size from 1 up to the largest
# batch that fits in GPU memory and record the modeled tokens/s.
rows = []
for model in MODEL.index:
    # Model parameters. T is read from the table as well, so the MODEL
    # column is the single source of truth for the sequence length.
    P = MODEL.loc[model, "P"]
    H = MODEL.loc[model, "H"]
    K = MODEL.loc[model, "K"]
    L = MODEL.loc[model, "L"]
    T = MODEL.loc[model, "T"]

    # Memory footprint estimates; these depend only on the model, so they are
    # computed once per model rather than once per GPU.
    # NOTE(review): the factors of 2 presumably mean 2 bytes per value
    # (and K+V for the cache) -- confirm.
    model_size = 2 * P
    kv_cache_size = 2 * 2 * H * K * L  # per sequence position (scaled by T below)
    # Headroom: 10% of (model size + KV cache of one full-length sequence)
    ephemeral_size = 0.1 * (model_size + T * kv_cache_size)

    for gpu in GPU.index:
        # Get hardware parameters
        total_memory = GPU.loc[gpu, "memory"]
        flops = GPU.loc[gpu, "flops"]
        bandwidth = GPU.loc[gpu, "bandwidth"]

        # Maximum batch size: how many full-length KV caches fit in what is
        # left after the weights and the headroom. Clamped at 0 when the
        # model does not fit at all, in which case no rows are produced.
        max_batch_size = (total_memory - model_size - ephemeral_size) // (T * kv_cache_size)
        max_batch_size = int(max(max_batch_size, 0))

        # Compute latencies for every feasible batch size at once
        B = np.arange(1, max_batch_size + 1)
        # Linear layers: the slower of compute time (grows with B) and the
        # time to read the weights once from memory (independent of B)
        time_comp_linear = 2 * B * P / flops
        time_mem_linear = 2 * P / bandwidth
        time_linear = np.maximum(time_comp_linear, time_mem_linear)
        # Attention: modeled purely as memory traffic over the KV cache.
        # NOTE(review): the factor 2 in the denominator is kept from the
        # original formula; presumably it averages the context length to
        # T / 2 -- confirm.
        time_mem_att = B * T * kv_cache_size / (2 * bandwidth)
        time_att = time_mem_att
        step_time = time_linear + time_att
        throughput = B / step_time

        # One record per batch size; only the throughput is kept
        rows.extend(
            {
                "model": model,
                "gpu": gpu,
                "batch_size": int(batch_size),
                "theoretical_throughput": theoretical_throughput,
            }
            for batch_size, theoretical_throughput in zip(B, throughput)
        )

# Save as dataframe (explicit columns keep the schema even if no rows exist)
theoretical_perf = pd.DataFrame(
    rows,
    columns=["model", "gpu", "batch_size", "theoretical_throughput"],
)
# Plot theoretical throughput with dark theme: one facet per (gpu row, model column)
g = sns.FacetGrid(data=theoretical_perf, col="model", row="gpu", height=3, aspect=1, margin_titles=True)
purple, blue = '#8B5CF6', '#3B82F6'
g.map_dataframe(sns.lineplot, x="batch_size", y="theoretical_throughput", color=purple)  # Using a purple color
g.set_xlabels("Batch Size")
g.set_ylabels("Throughput (tokens/s)")
g.set(xscale="log", yscale="log")

# Add vertical dashed line at critical batch size (the GPU's flops/bandwidth
# ratio stored in GPU["intensity"]) and shade the region to its right.
# The shading ends at the largest plotted batch size so that it stays in
# x-axis units and does not stretch the shared axis beyond the data.
largest_batch_size = theoretical_perf["batch_size"].max()
# Each row of g.axes belongs to the GPU named in g.row_names, so the GPU is
# looked up by label rather than by assuming the GPU table's row order.
for gpu_name, row_axes in zip(g.row_names, g.axes):
    critical_batch_size = GPU.loc[gpu_name, "intensity"]
    for ax in row_axes:
        ax.axvline(x=critical_batch_size, color=blue, linestyle='--')
        ax.text(critical_batch_size*1.1, ax.get_ylim()[0]+10, f'Critical BS: {critical_batch_size:.0f}', rotation=90, verticalalignment='bottom', color=blue, alpha=0.7)
        # max(...) gives a zero-width span when no batch reaches the critical size
        ax.axvspan(critical_batch_size, max(critical_batch_size, largest_batch_size), color=blue, alpha=0.3)
# Per-facet annotation: dashed horizontal line and label at the peak throughput
def show_peak_throughput(data, **kws):
    """Mark the highest theoretical throughput on the current axes.

    Args:
        data: Frame with a "theoretical_throughput" column; the maximum of
            that column is the value that gets annotated.
        **kws: Accepted and ignored, so the function can be used as a
            plotting callback that receives extra keyword arguments.
    """
    peak_throughput = data["theoretical_throughput"].max()
    axis = plt.gca()  # draws on whichever axes is current when called
    label = f"Peak T/s: {peak_throughput:.0f}"
    # Label sits at x=1 (data coordinates), slightly above the line
    axis.text(
        1,
        peak_throughput * 1.1,
        label,
        transform=axis.transData,
        color=purple,
    )
    axis.axhline(y=peak_throughput, color=purple, linestyle='--')
# Run the peak-throughput annotation once per facet
g.map_dataframe(show_peak_throughput)

# Write the figure to a PNG file, then open the interactive window
save_options = {"dpi": 300, "bbox_inches": 'tight', "facecolor": 'black'}
plt.savefig('tmp-theoretical-throughput.png', **save_options)
plt.show()
@mikasenghaas
Copy link
Author

mikasenghaas commented Apr 25, 2025

Generate and save the figure by running:

wget https://gist.githubusercontent.com/mikasenghaas/394f9b58977b5dc23cd3f908e4057855/raw/957b774831ac2bc7ffaae4b0a25a5bda3b46fdc0/peak-theoretical-throughput.py && uv run peak-theoretical-throughput.py

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment