Measure Bandwidth and FLOPs with PyTorch
import torch
import time
import numpy as np
from torch import mps, cuda

num_trials = 10


def flops_benchmark(device):
    # Matrix sizes from 2^8 to 2^13 in quarter-octave steps.
    test_range = 2 ** np.arange(8, 13, 0.25)
    print('n, elapsed_time (s), TFLOPs')
    for n in test_range:
        n = int(n)
        total = 0
        for _ in range(num_trials):
            a = torch.rand(n, n, device=device)
            synchronize(device)
            now = time.time()
            b = torch.matmul(a, a)
            synchronize(device)
            total += time.time() - now
        total = total / num_trials
        # An n x n matmul performs 2 * n^3 floating point operations.
        tflops = 2 * n**3 / total / 1e12
        print(n, total, tflops, sep=', ')


def synchronize(device):
    # GPU kernels launch asynchronously; block until all queued work
    # finishes so that wall-clock timings are meaningful.
    if device.type == 'cuda':
        cuda.synchronize()
    elif device.type == 'mps':
        mps.synchronize()
    elif device.type == 'cpu':
        pass


def memory_bandwidth_benchmark(device):
    # Tensor sizes from 2^20 to 2^28 elements in half-octave steps.
    test_range = 2 ** np.arange(20, 28, 0.5)
    print('size (GB), elapsed_time (s), bandwidth (GB/s)')
    for size in test_range:
        size = int(size)
        elapsed_time = 0
        for _ in range(num_trials):
            # Create random tensors
            a = torch.rand(size, device=device)
            b = torch.rand(size, device=device)

            # Warm-up copy to ensure kernels are initialized if using GPU
            synchronize(device)
            a.copy_(b)
            synchronize(device)

            # Time the copy, synchronizing so the operation is finished
            start_time = time.time()
            a.copy_(b)
            synchronize(device)
            end_time = time.time()
            elapsed_time += end_time - start_time

        elapsed_time = elapsed_time / num_trials
        # copy_ reads b and writes a, so twice the tensor's bytes move.
        bytes_copied = a.nelement() * a.element_size()  # bytes
        bandwidth = 2 * bytes_copied / elapsed_time / 1e9  # GB/s
        print(bytes_copied / 1e9, elapsed_time, bandwidth, sep=', ')
    return bandwidth


if __name__ == "__main__":
    device = torch.device('cpu')
    flops_benchmark(device)
    memory_bandwidth_benchmark(device)
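
The main guard above benchmarks only the CPU. A minimal sketch of how one might pick the fastest available backend instead; the pick_device helper is an illustration added here, not part of the original gist:

import torch

def pick_device():
    # Prefer CUDA, then Apple's MPS backend, else fall back to CPU.
    # (Illustrative helper, not from the gist above.)
    if torch.cuda.is_available():
        return torch.device('cuda')
    if torch.backends.mps.is_available():
        return torch.device('mps')
    return torch.device('cpu')

if __name__ == "__main__":
    device = pick_device()
    flops_benchmark(device)
    memory_bandwidth_benchmark(device)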
chsasank commented Aug 22, 2023
Results on some devices

| Device | Device Type | TFLOPs (FP32) | Memory Bandwidth (GB/s) |
| --- | --- | --- | --- |
| Apple M1 CPU | CPU | 0.8 | 46 |
| Apple M1 GPU | GPU | 1.4 | 90 |
| Apple M2 CPU | CPU | 1 | 60 |
| Apple M2 GPU | GPU | 2 | 56 |
| SteamDeck CPU | CPU | 0.17 | 20 |
| SteamDeck GPU | GPU | 1.22 | 69 |
| Samsung Exynos 2100 | CPU | 0.1 | 16 |
| Intel i7-8559U | CPU | 0.2 | 10 |
| Intel i7-1360P | CPU | 0.4 | 24 |
| Intel i5-12400 | CPU | 0.7 | 26 |
| AMD Ryzen Threadripper PRO 5975WX 32-Cores | CPU | 1.5 | 28 |
| AMD Ryzen 5 4600HS | CPU | 0.4 | 22 |
| Nvidia T4 | GPU | 4 | 240 |
| Nvidia GeForce GTX 1650 Ti Mobile | GPU | 3 | 172 |
| Intel Arc 770 16GB | GPU | 15 | 452 |
| Intel Arc 370m | GPU | 4 | 93 |
| Nvidia 4090 | GPU | 52 | 912 |
| Radeon RX 7900 XTX | GPU | 26 | 792 |
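
As a sanity check, each table entry follows directly from the two formulas in the script above. A quick sketch with made-up placeholder timings (not measurements):

n = 4096
elapsed = 0.1  # hypothetical elapsed time in seconds, for illustration only
# An n x n matmul does 2 * n**3 floating point operations.
tflops = 2 * n**3 / elapsed / 1e12  # -> ~1.37 TFLOPs

size = 2**27  # elements; float32 is 4 bytes each, so a ~537 MB tensor
elapsed = 0.02  # hypothetical elapsed time in seconds
# copy_ reads one tensor and writes another, so 2x the bytes move.
bandwidth = 2 * size * 4 / elapsed / 1e9  # -> ~53.7 GB/s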
