Skip to content

Instantly share code, notes, and snippets.

@muellerzr
Last active March 19, 2024 22:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save muellerzr/902be8cc9edaee6ddbb49502b482c18e to your computer and use it in GitHub Desktop.
Save muellerzr/902be8cc9edaee6ddbb49502b482c18e to your computer and use it in GitHub Desktop.
# Minimal torch.distributed / NCCL sanity-check script: launch one process per
# GPU (e.g. via torchrun) to verify CUDA, NCCL reductions, and rank wiring.
import builtins
import fcntl  # needed by the print override below to serialize output across ranks
import os
import socket
import torch
import torch.distributed as dist
# Still the real builtin print here; the locking override is defined just below.
print("STARTED")
def print(*args, **kwargs):
    """Drop-in replacement for the builtin ``print`` that serializes output.

    Intentionally shadows the builtin: when many ranks print at once their
    lines interleave mid-line.  Holding an exclusive advisory ``flock`` on
    this script file for the duration of the write makes each call atomic
    across processes on the same node.  (The original body delegated straight
    to ``builtins.print`` with no lock, so it did not actually prevent
    interleaving — and left the ``fcntl`` import unused.)

    Accepts exactly the same ``*args``/``**kwargs`` as ``builtins.print``
    and returns ``None``.
    """
    with open(__file__, "r") as fh:
        fcntl.flock(fh, fcntl.LOCK_EX)  # one writer at a time
        try:
            builtins.print(*args, **kwargs)
        finally:
            fcntl.flock(fh, fcntl.LOCK_UN)
# Each process is launched (e.g. by torchrun) with LOCAL_RANK set to its GPU
# index on this node; a missing variable fails fast with KeyError.
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank)
hostname = socket.gethostname()
# Prefix used on every log line to identify host + local GPU, e.g. "[node01-3]".
gpu = f"[{hostname}-{local_rank}]"
try:
    # Distributed init: NCCL backend, one process per GPU.  Rank, world size
    # and rendezvous info come from the launcher's environment variables.
    dist.init_process_group("nccl")
    rank = dist.get_rank()  # global rank across all nodes
    world_size = dist.get_world_size()

    # Reduction test: after a SUM all-reduce of ones, every rank should see
    # world_size.  A wrong value here points at broken NCCL transport.
    t = torch.ones(1, device=device)
    dist.all_reduce(t, op=dist.ReduceOp.SUM)
    dist.barrier()
    print(f"{gpu} Reduction op=sum result: {t.item()}")

    # CUDA allocation test.  The original discarded the result of
    # torch.cuda.is_available(); actually check it and fail loudly.
    if not torch.cuda.is_available():
        raise RuntimeError(f"{gpu} torch reports CUDA unavailable")
    torch.ones(1).cuda(local_rank)
    print(f"{gpu} is OK (global rank: {rank}/{world_size})")
    print(f"{gpu} numa affinity: {sorted(os.sched_getaffinity(0))}")

    dist.barrier()
    if rank == 0:
        # One-time environment report from the lead rank.
        print(f"pt={torch.__version__}, cuda={torch.version.cuda}, nccl={torch.cuda.nccl.version()}")
        print(f"device compute capabilities={torch.cuda.get_device_capability()}")
        print(f"pytorch compute capabilities={torch.cuda.get_arch_list()}")
except Exception:
    print(f"{gpu} is broken")
    raise
finally:
    # Tear down the process group so NCCL resources are released cleanly
    # (skipped if init_process_group itself failed).
    if dist.is_initialized():
        dist.destroy_process_group()
# NOTE(review): this entire script is a verbatim duplicate of the copy above
# (an artifact of the page capture) — deduplicate before committing.
import builtins
import fcntl  # needed by the print override below to serialize output across ranks
import os
import socket
import torch
import torch.distributed as dist
# Still the real builtin print here; the locking override is defined just below.
print("STARTED")
def print(*args, **kwargs):
    """Drop-in replacement for the builtin ``print`` that serializes output.

    Intentionally shadows the builtin: when many ranks print at once their
    lines interleave mid-line.  Holding an exclusive advisory ``flock`` on
    this script file for the duration of the write makes each call atomic
    across processes on the same node.  (The original body delegated straight
    to ``builtins.print`` with no lock, so it did not actually prevent
    interleaving — and left the ``fcntl`` import unused.)

    Accepts exactly the same ``*args``/``**kwargs`` as ``builtins.print``
    and returns ``None``.
    """
    with open(__file__, "r") as fh:
        fcntl.flock(fh, fcntl.LOCK_EX)  # one writer at a time
        try:
            builtins.print(*args, **kwargs)
        finally:
            fcntl.flock(fh, fcntl.LOCK_UN)
# Each process is launched (e.g. by torchrun) with LOCAL_RANK set to its GPU
# index on this node; a missing variable fails fast with KeyError.
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank)
hostname = socket.gethostname()
# Prefix used on every log line to identify host + local GPU, e.g. "[node01-3]".
gpu = f"[{hostname}-{local_rank}]"
try:
    # Distributed init: NCCL backend, one process per GPU.  Rank, world size
    # and rendezvous info come from the launcher's environment variables.
    dist.init_process_group("nccl")
    rank = dist.get_rank()  # global rank across all nodes
    world_size = dist.get_world_size()

    # Reduction test: after a SUM all-reduce of ones, every rank should see
    # world_size.  A wrong value here points at broken NCCL transport.
    t = torch.ones(1, device=device)
    dist.all_reduce(t, op=dist.ReduceOp.SUM)
    dist.barrier()
    print(f"{gpu} Reduction op=sum result: {t.item()}")

    # CUDA allocation test.  The original discarded the result of
    # torch.cuda.is_available(); actually check it and fail loudly.
    if not torch.cuda.is_available():
        raise RuntimeError(f"{gpu} torch reports CUDA unavailable")
    torch.ones(1).cuda(local_rank)
    print(f"{gpu} is OK (global rank: {rank}/{world_size})")
    print(f"{gpu} numa affinity: {sorted(os.sched_getaffinity(0))}")

    dist.barrier()
    if rank == 0:
        # One-time environment report from the lead rank.
        print(f"pt={torch.__version__}, cuda={torch.version.cuda}, nccl={torch.cuda.nccl.version()}")
        print(f"device compute capabilities={torch.cuda.get_device_capability()}")
        print(f"pytorch compute capabilities={torch.cuda.get_arch_list()}")
except Exception:
    print(f"{gpu} is broken")
    raise
finally:
    # Tear down the process group so NCCL resources are released cleanly
    # (skipped if init_process_group itself failed).
    if dist.is_initialized():
        dist.destroy_process_group()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment