@richbs
Created January 17, 2025 13:18
import platform
import time
from typing import List, Tuple

import torch


def get_available_devices() -> List[str]:
    """Return list of available devices for PyTorch computation.

    Returns:
        List[str]: List containing available devices ('cpu', 'cuda', 'mps').
            CUDA is available on NVIDIA GPUs, MPS on Apple Silicon.
    """
    devices = ['cpu']  # CPU is always available
    # Check for NVIDIA GPU support
    if torch.cuda.is_available():
        devices.append('cuda')
    # Check for Apple Silicon (M1/M2/M3) GPU support
    if torch.backends.mps.is_available():
        devices.append('mps')
    return devices


def create_tensors(size: Tuple[int, int], device: str) -> Tuple[torch.Tensor, torch.Tensor]:
    """Create random tensors of specified size on given device.

    Args:
        size (Tuple[int, int]): Dimensions of the tensors to create (rows, cols)
        device (str): Device to place tensors on ('cpu', 'cuda', or 'mps')

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: Two random tensors of the specified size
    """
    return (
        torch.randn(size, device=device),
        torch.randn(size, device=device)
    )


def benchmark_matmul(size: Tuple[int, int], device: str, num_iterations: int = 100) -> float:
    """Benchmark matrix multiplication for given size and device.

    Args:
        size (Tuple[int, int]): Size of matrices to multiply
        device (str): Device to run benchmark on
        num_iterations (int): Number of iterations to average over

    Returns:
        float: Average time per operation in seconds
    """
    # Create input tensors on the specified device
    a, b = create_tensors(size, device)

    # Perform warmup iterations to ensure the GPU is at full speed
    # and avoid including compilation/optimization time
    for _ in range(10):
        _ = torch.matmul(a, b)

    # Ensure all operations are completed before timing;
    # different devices require different synchronization methods
    if device == 'cuda':
        torch.cuda.synchronize()
    elif device == 'mps':
        torch.mps.synchronize()

    # Time the actual benchmark iterations
    start_time = time.perf_counter()
    for _ in range(num_iterations):
        _ = torch.matmul(a, b)

    # Ensure all operations are completed before stopping the timer
    if device == 'cuda':
        torch.cuda.synchronize()
    elif device == 'mps':
        torch.mps.synchronize()
    end_time = time.perf_counter()

    # Calculate average time per operation
    return (end_time - start_time) / num_iterations
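

# Optional helper, not part of the original gist: convert the averaged time into an
# approximate throughput figure. A square n x n matmul performs roughly 2 * n**3
# floating-point operations, so GFLOP/s ~= 2 * n**3 / (seconds * 1e9).
def matmul_gflops(size: Tuple[int, int], seconds: float) -> float:
    """Approximate GFLOP/s for a square matmul of the given size (illustrative sketch)."""
    n = size[0]  # assumes size[0] == size[1], as in the sizes benchmarked below
    return (2 * n ** 3) / (seconds * 1e9)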


def run_benchmarks():
    """Run benchmarks across all available devices.

    This function:
    1. Detects available devices
    2. Tests multiple matrix sizes
    3. Reports timing results in a formatted table
    4. Handles errors gracefully
    """
    # Get list of available compute devices
    devices = get_available_devices()

    # Define matrix sizes to test
    # Larger sizes will show more pronounced differences between devices
    sizes = [
        (1000, 1000),  # 1K x 1K elements
        (2000, 2000),  # 2K x 2K elements
        (4000, 4000)   # 4K x 4K elements
    ]

    # Print system information and available devices
    print(f"PyTorch version: {torch.__version__}")
    print(f"System: {platform.system()} {platform.machine()}\n")
    print("Available devices:", devices)
    print("\nMatrix Multiplication Benchmark Results (seconds per operation):\n")

    # Create and print table header
    header = "Size".ljust(15)
    for device in devices:
        header += f"{device.upper().ljust(15)}"
    print(header)
    print("-" * (15 + 15 * len(devices)))

    # Run benchmarks for each matrix size
    for size in sizes:
        size_str = f"{size[0]}x{size[1]}".ljust(15)
        # Test each available device
        for device in devices:
            try:
                time_taken = benchmark_matmul(size, device)
                size_str += f"{time_taken:.6f}".ljust(15)
            except Exception as e:
                # Handle any errors (out of memory, device not supported, etc.)
                size_str += f"Error: {str(e)[:10]}".ljust(15)
        print(size_str)


if __name__ == "__main__":
    run_benchmarks()
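

# Usage sketch (assumption: this file is saved as benchmark_matmul.py; the filename is
# illustrative). Besides running the script directly, the functions can be imported and
# timed individually, e.g.:
#
#     from benchmark_matmul import get_available_devices, benchmark_matmul
#
#     devices = get_available_devices()
#     avg = benchmark_matmul((3000, 3000), devices[-1], num_iterations=50)
#     print(f"{devices[-1]}: {avg:.6f} s per matmul")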