Measure Bandwidth and FLOPs with PyTorch
import torch
import time
import numpy as np
from torch import mps, cuda

num_trials = 10


def flops_benchmark(device):
    # Matrix sizes from 2^8 to 2^13 in quarter-octave steps.
    test_range = 2 ** np.arange(8, 13, 0.25)
    print('n, elapsed_time (s), TFLOPs')
    for n in test_range:
        n = int(n)
        total = 0
        for _ in range(num_trials):
            a = torch.rand(n, n, device=device)
            synchronize(device)
            now = time.time()
            b = torch.matmul(a, a)
            synchronize(device)
            total += time.time() - now
        total = total / num_trials
        # An n x n matmul performs 2 * n^3 floating point operations.
        tflops = 2 * n**3 / total / 1e12
        print(n, total, tflops, sep=', ')


def synchronize(device):
    # GPU kernels launch asynchronously; block until all queued work
    # finishes so that wall-clock timings are meaningful.
    if device.type == 'cuda':
        cuda.synchronize()
    elif device.type == 'mps':
        mps.synchronize()
    elif device.type == 'cpu':
        pass


def memory_bandwidth_benchmark(device):
    # Tensor sizes from 2^20 to 2^28 elements in half-octave steps.
    test_range = 2 ** np.arange(20, 28, 0.5)
    print('size (GB), elapsed_time (s), bandwidth (GB/s)')
    for size in test_range:
        size = int(size)
        elapsed_time = 0
        for _ in range(num_trials):
            # Create random tensors
            a = torch.rand(size, device=device)
            b = torch.rand(size, device=device)

            # Warm-up copy to ensure kernels are initialized if using GPU
            synchronize(device)
            a.copy_(b)
            synchronize(device)

            # Time the copy, synchronizing so the operation is finished
            start_time = time.time()
            a.copy_(b)
            synchronize(device)
            end_time = time.time()
            elapsed_time += end_time - start_time

        elapsed_time = elapsed_time / num_trials
        # copy_ reads b and writes a, so twice the tensor's bytes move.
        bytes_copied = a.nelement() * a.element_size()  # bytes
        bandwidth = 2 * bytes_copied / elapsed_time / 1e9  # GB/s
        print(bytes_copied / 1e9, elapsed_time, bandwidth, sep=', ')
    return bandwidth


if __name__ == "__main__":
    device = torch.device('cpu')
    flops_benchmark(device)
    memory_bandwidth_benchmark(device)
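
The main guard above benchmarks only the CPU. A minimal sketch of how one might pick the fastest available backend instead; the pick_device helper is an illustration added here, not part of the original gist:

import torch

def pick_device():
    # Prefer CUDA, then Apple's MPS backend, else fall back to CPU.
    # (Illustrative helper, not from the gist above.)
    if torch.cuda.is_available():
        return torch.device('cuda')
    if torch.backends.mps.is_available():
        return torch.device('mps')
    return torch.device('cpu')

if __name__ == "__main__":
    device = pick_device()
    flops_benchmark(device)
    memory_bandwidth_benchmark(device)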
chsasank commented Aug 22, 2023
Results on some devices

| Device | Device Type | TFLOPs (FP32) | Memory Bandwidth (GB/s) |
| --- | --- | --- | --- |
| Apple M1 CPU | CPU | 0.8 | 46 |
| Apple M1 GPU | GPU | 1.4 | 90 |
| Apple M2 CPU | CPU | 1 | 60 |
| Apple M2 GPU | GPU | 2 | 56 |
| SteamDeck CPU | CPU | 0.17 | 20 |
| SteamDeck GPU | GPU | 1.22 | 69 |
| Samsung Exynos 2100 | CPU | 0.1 | 16 |
| Intel i7-8559U | CPU | 0.2 | 10 |
| Intel i7-1360P | CPU | 0.4 | 24 |
| Intel i5-12400 | CPU | 0.7 | 26 |
| AMD Ryzen Threadripper PRO 5975WX 32-Cores | CPU | 1.5 | 28 |
| AMD Ryzen 5 4600HS | CPU | 0.4 | 22 |
| Nvidia T4 | GPU | 4 | 240 |
| Nvidia GeForce GTX 1650 Ti Mobile | GPU | 3 | 172 |
| Intel Arc 770 16GB | GPU | 15 | 452 |
| Intel Arc 370m | GPU | 4 | 93 |
| Nvidia 4090 | GPU | 52 | 912 |
| Radeon RX 7900 XTX | GPU | 26 | 792 |
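
As a sanity check, each table entry follows directly from the two formulas in the script above. A quick sketch with made-up placeholder timings (not measurements):

n = 4096
elapsed = 0.1  # hypothetical elapsed time in seconds, for illustration only
# An n x n matmul does 2 * n**3 floating point operations.
tflops = 2 * n**3 / elapsed / 1e12  # -> ~1.37 TFLOPs

size = 2**27  # elements; float32 is 4 bytes each, so a ~537 MB tensor
elapsed = 0.02  # hypothetical elapsed time in seconds
# copy_ reads one tensor and writes another, so 2x the bytes move.
bandwidth = 2 * size * 4 / elapsed / 1e9  # -> ~53.7 GB/s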
