Garrett Goon garrett361
garrett361 / matmul_bench.py
Last active April 23, 2024 13:09
xpu and cuda matmul timing
from time import perf_counter
from typing import Optional, Union
import torch
if torch.cuda.is_available():
    from torch import cuda as accel
    device = "cuda"
garrett361 / allgather_test_mp.py
Last active April 12, 2024 03:06
torch allgather test mp
import argparse
import multiprocessing as mp
import os
import socket
from concurrent.futures import ProcessPoolExecutor
import torch
import torch.distributed as dist
if torch.cuda.is_available():
    device = "cuda"  # assumed continuation of the truncated preview
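
A sketch of how a single-node all_gather smoke test along these lines could be launched with ProcessPoolExecutor; the gloo backend, fixed port, and tensor shape are assumptions chosen so the sketch runs on CPU-only machines:

import os
from concurrent.futures import ProcessPoolExecutor

import torch
import torch.distributed as dist

WORLD_SIZE = 2  # illustrative

def run_allgather(rank: int) -> None:
    # Env-var rendezvous on localhost; the port is an arbitrary choice.
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group("gloo", rank=rank, world_size=WORLD_SIZE)
    t = torch.full((4,), float(rank))
    out = [torch.empty_like(t) for _ in range(WORLD_SIZE)]
    dist.all_gather(out, t)
    # Each slot should hold the sending rank's value.
    assert all(o.eq(r).all() for r, o in enumerate(out))
    dist.destroy_process_group()

if __name__ == "__main__":
    with ProcessPoolExecutor(max_workers=WORLD_SIZE) as ex:
        list(ex.map(run_allgather, range(WORLD_SIZE)))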
garrett361 / profile_maybe_with_comms.py
Created April 15, 2024 20:50
pytorch profile with comms
"""
Minimal distributed profiling. Profiles compute and collective communications by default. Pass the
`--no-comms` flag to avoid collectives. Run as in
torchrun --nnodes=1 --nproc-per-node=2 profile_maybe_with_comms.py [--no-comms]
"""
import argparse
import os
from pathlib import Path
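
A rough sketch of the profiling loop such a script might contain, assuming CUDA/NCCL under torchrun; the matmul workload, step count, and trace filename are invented for illustration:

import argparse
import os

import torch
import torch.distributed as dist
from torch.profiler import ProfilerActivity, profile

def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--no-comms", action="store_true")
    args = parser.parse_args()

    dist.init_process_group("nccl")
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
    x = torch.randn(2048, 2048, device="cuda")
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        for _ in range(5):
            y = x @ x
            if not args.no_comms:
                dist.all_reduce(y)  # the collective the flag toggles off
    prof.export_chrome_trace(f"trace_rank_{dist.get_rank()}.json")
    dist.destroy_process_group()

if __name__ == "__main__":
    main()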
garrett361 / profile_comms_compute_overlap.py
Created April 22, 2024 18:30
Minimally profile comms/compute overlap
"""
Minimal script for profiling compute/comms overlap. Run as in
torchrun --nnodes=1 --nproc-per-node=2 profile_comms_compute_overlap.py [--no-comms]
"""
import argparse
import os
from pathlib import Path
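
The overlap pattern presumably being measured is a collective issued with async_op=True followed by independent compute; a sketch under that assumption, with buffer sizes and iteration counts as placeholders:

import os

import torch
import torch.distributed as dist
from torch.profiler import ProfilerActivity, profile

def main() -> None:
    dist.init_process_group("nccl")
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
    comm_buf = torch.randn(1 << 24, device="cuda")  # placeholder size
    x = torch.randn(4096, 4096, device="cuda")
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        for _ in range(5):
            # Issue the collective asynchronously ...
            handle = dist.all_reduce(comm_buf, async_op=True)
            # ... then queue independent compute that can overlap with it.
            for _ in range(4):
                x @ x
            handle.wait()
    prof.export_chrome_trace(f"overlap_trace_rank_{dist.get_rank()}.json")
    dist.destroy_process_group()

if __name__ == "__main__":
    main()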
garrett361 / launch_torchrun.sh
Last active May 16, 2024 12:41
torchrun Sunspot
#!/bin/bash -l
# Minimal torchrun-based launch script
# See https://docs.alcf.anl.gov/aurora/data-science/frameworks/pytorch for more recommendations.
# Usage:
#
# qsub -v SCRIPT_PATH=<your_script_path> [ARGS=...] [NPROC_PER_NODE=...] launch_torchrun.sh
garrett361 / reduce_scatter.py
Created May 29, 2024 21:05
Reduce scatter tests
"""
Raises ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY after the 29th iteration on an Intel Max 1550.
"""
import argparse
import os
import torch
import intel_extension_for_pytorch as ipex # noqa
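
A guess at the shape of the repro loop, assuming the ccl backend registered via oneccl_bindings_for_pytorch and a launcher such as mpiexec or torchrun; the tensor size and iteration count are placeholders, not the values that triggered the error:

import torch
import torch.distributed as dist
import intel_extension_for_pytorch as ipex  # noqa
import oneccl_bindings_for_pytorch  # noqa  # assumed: registers the ccl backend

def main() -> None:
    dist.init_process_group("ccl")
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    torch.xpu.set_device(rank % torch.xpu.device_count())
    numel = 1 << 26  # placeholder size, not the gist's value
    for i in range(100):
        inputs = [torch.randn(numel, device="xpu") for _ in range(world_size)]
        output = torch.empty(numel, device="xpu")
        dist.reduce_scatter(output, inputs)
        torch.xpu.synchronize()
        if not rank:
            print(f"Completed iteration {i}", flush=True)
    dist.destroy_process_group()

if __name__ == "__main__":
    main()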
garrett361 / collectives.py
Last active June 7, 2024 13:54
Collectives timing
from abc import ABC, abstractmethod
from typing import Type
import torch
import torch.distributed as dist
if torch.cuda.is_available():
    accel = torch.cuda
    DEVICE_TYPE = "cuda"
    BACKEND = "nccl"
garrett361 / linear_model_fsdp_ddp.py
Last active June 10, 2024 14:37
fsdp and ddp min tests
"""
Basic FSDP/DDP applied to a linear model.
"""
import argparse
import os
import torch
import torch.distributed as dist
import torch.nn as nn
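
A minimal sketch of toggling between DDP and FSDP around a linear model under torchrun, assuming CUDA/NCCL; the --fsdp flag, model size, and single training step are illustrative:

import argparse
import os

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.nn.parallel import DistributedDataParallel as DDP

def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--fsdp", action="store_true")  # hypothetical flag
    args = parser.parse_args()

    dist.init_process_group("nccl")
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
    model = nn.Linear(1024, 1024, device="cuda")
    wrapped = FSDP(model) if args.fsdp else DDP(model)

    # One illustrative training step.
    optim = torch.optim.SGD(wrapped.parameters(), lr=1e-3)
    loss = wrapped(torch.randn(8, 1024, device="cuda")).pow(2).mean()
    loss.backward()
    optim.step()
    dist.destroy_process_group()

if __name__ == "__main__":
    main()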
garrett361 / collective.py
Created June 4, 2024 18:22
Torch Profile Comms Compute Overlap
from abc import ABC, abstractmethod
import torch
import torch.distributed as dist
if torch.cuda.is_available():
    accel = torch.cuda
    DEVICE_TYPE = "cuda"
    BACKEND = "nccl"
else:
    # Assumed continuation: the companion gists use ipex/xpu on the non-CUDA path.
    accel = torch.xpu
    DEVICE_TYPE = "xpu"
    BACKEND = "ccl"
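
Given the title, this gist presumably drives such collectives under torch.profiler; a sketch of that driver, using the DEVICE_TYPE set above, with invented names (ProfilerActivity.XPU is an assumption that requires a recent PyTorch/ipex build):

import torch
import torch.distributed as dist
from torch.profiler import ProfilerActivity, profile

def profile_collective(n_iters: int = 5) -> None:
    # Assumes the process group and default device were set up elsewhere.
    t = torch.randn(1 << 24, device=DEVICE_TYPE)
    x = torch.randn(2048, 2048, device=DEVICE_TYPE)
    # ProfilerActivity.XPU exists only in sufficiently recent builds.
    accel_activity = (
        ProfilerActivity.CUDA if DEVICE_TYPE == "cuda" else ProfilerActivity.XPU
    )
    with profile(activities=[ProfilerActivity.CPU, accel_activity]) as prof:
        for _ in range(n_iters):
            handle = dist.all_reduce(t, async_op=True)
            x @ x  # independent compute that may overlap with the collective
            handle.wait()
    prof.export_chrome_trace(f"collective_trace_rank_{dist.get_rank()}.json")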
garrett361 / mp_torch_reduce_scatter.py
Created June 6, 2024 13:28
mp reduce scatter xpu
"""
Launch single-node reduce scatter with multiprocessing.
python3 mp_torch_reduce_scatter.py
"""
import os
import socket
from concurrent.futures import ProcessPoolExecutor
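
A sketch of how the multiprocessing launch might tie together with reduce_scatter on xpu; the ccl backend registration via oneccl_bindings_for_pytorch, the world size, and the tensor shapes are assumptions:

import os
import socket
from concurrent.futures import ProcessPoolExecutor

import torch
import torch.distributed as dist
import intel_extension_for_pytorch as ipex  # noqa
import oneccl_bindings_for_pytorch  # noqa  # assumed: registers the ccl backend

WORLD_SIZE = 2  # illustrative

def find_free_port() -> int:
    # Bind to port 0 so the OS picks an unused rendezvous port.
    with socket.socket() as s:
        s.bind(("", 0))
        return s.getsockname()[1]

def run_reduce_scatter(rank: int, port: int) -> None:
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = str(port)
    dist.init_process_group("ccl", rank=rank, world_size=WORLD_SIZE)
    device = f"xpu:{rank}"
    inputs = [torch.full((4,), float(r), device=device) for r in range(WORLD_SIZE)]
    output = torch.empty(4, device=device)
    dist.reduce_scatter(output, inputs)
    # Every rank holds identical inputs, so rank r receives r * WORLD_SIZE.
    assert output.eq(rank * WORLD_SIZE).all()
    dist.destroy_process_group()

if __name__ == "__main__":
    port = find_free_port()
    with ProcessPoolExecutor(max_workers=WORLD_SIZE) as ex:
        futures = [ex.submit(run_reduce_scatter, r, port) for r in range(WORLD_SIZE)]
        for f in futures:
            f.result()  # surface any worker exceptions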