@jeromeku
Forked from Chillee/peak_mm_perf.py
Created February 6, 2025 10:05
H100 peak matmul FLOPS
import torch
from triton.testing import do_bench
import torch._inductor.config as config

# Let the inductor autotuner consider CUTLASS GEMM kernels.
config.max_autotune_gemm_backends = "cutlass"
torch.set_default_device('cuda')

a = torch.randn(4224, 8192, dtype=torch.bfloat16)
b = torch.randn(2048, 8192, dtype=torch.bfloat16).t()

def get_flops(f):
    ms = do_bench(f, warmup=100, rep=10000)
    print(ms)
    # A matmul of (M, K) @ (K, N) costs 2 * M * K * N FLOPs; convert to TFLOPS.
    print((1e3 / ms) * a.shape[0] * a.shape[1] * b.shape[1] * 2 / 1e12, 'TF')

f = lambda: torch.mm(a, b)
f = torch.compile(f, mode="max-autotune-no-cudagraphs")
# Also set `sudo nvidia-smi boost-slider --vboost 1`, which shifts more power from the L2 cache to the tensor cores.
get_flops(f)  # 780.1689058368037 TF
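The TF figure printed by `get_flops` is just the matmul FLOP count divided by the measured time. A minimal sketch of that arithmetic, runnable without a GPU (the `matmul_tflops` helper and the ~0.18 ms timing are illustrative, not part of the original snippet):

```python
def matmul_tflops(m: int, k: int, n: int, ms: float) -> float:
    """TFLOPS for an (m, k) @ (k, n) matmul that took `ms` milliseconds."""
    flops = 2 * m * k * n  # one multiply + one add per output element per k-step
    return flops / (ms / 1e3) / 1e12

# Shapes from the benchmark: a is (4224, 8192), b.t() is (8192, 2048).
# At a hypothetical ~0.1817 ms per matmul this comes out to roughly 780 TF,
# matching the result reported above.
print(matmul_tflops(4224, 8192, 2048, 0.1817))
```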