Skip to content

Instantly share code, notes, and snippets.

@muellerzr
Last active March 19, 2024 22:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save muellerzr/902be8cc9edaee6ddbb49502b482c18e to your computer and use it in GitHub Desktop.
Save muellerzr/902be8cc9edaee6ddbb49502b482c18e to your computer and use it in GitHub Desktop.
# Minimal torch.distributed / NCCL sanity-check script: launch one process per
# GPU (e.g. via torchrun) to verify CUDA, NCCL reductions, and rank wiring.
import builtins
import fcntl  # needed by the print override below to serialize output across ranks
import os
import socket
import torch
import torch.distributed as dist
# Still the real builtin print here; the locking override is defined just below.
print("STARTED")
def print(*args, **kwargs):
    """Drop-in replacement for the builtin ``print`` that serializes output.

    Intentionally shadows the builtin: when many ranks print at once their
    lines interleave mid-line.  Holding an exclusive advisory ``flock`` on
    this script file for the duration of the write makes each call atomic
    across processes on the same node.  (The original body delegated straight
    to ``builtins.print`` with no lock, so it did not actually prevent
    interleaving — and left the ``fcntl`` import unused.)

    Accepts exactly the same ``*args``/``**kwargs`` as ``builtins.print``
    and returns ``None``.
    """
    with open(__file__, "r") as fh:
        fcntl.flock(fh, fcntl.LOCK_EX)  # one writer at a time
        try:
            builtins.print(*args, **kwargs)
        finally:
            fcntl.flock(fh, fcntl.LOCK_UN)
# Each process is launched (e.g. by torchrun) with LOCAL_RANK set to its GPU
# index on this node; a missing variable fails fast with KeyError.
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank)
hostname = socket.gethostname()
# Prefix used on every log line to identify host + local GPU, e.g. "[node01-3]".
gpu = f"[{hostname}-{local_rank}]"
try:
    # Distributed init: NCCL backend, one process per GPU.  Rank, world size
    # and rendezvous info come from the launcher's environment variables.
    dist.init_process_group("nccl")
    rank = dist.get_rank()  # global rank across all nodes
    world_size = dist.get_world_size()

    # Reduction test: after a SUM all-reduce of ones, every rank should see
    # world_size.  A wrong value here points at broken NCCL transport.
    t = torch.ones(1, device=device)
    dist.all_reduce(t, op=dist.ReduceOp.SUM)
    dist.barrier()
    print(f"{gpu} Reduction op=sum result: {t.item()}")

    # CUDA allocation test.  The original discarded the result of
    # torch.cuda.is_available(); actually check it and fail loudly.
    if not torch.cuda.is_available():
        raise RuntimeError(f"{gpu} torch reports CUDA unavailable")
    torch.ones(1).cuda(local_rank)
    print(f"{gpu} is OK (global rank: {rank}/{world_size})")
    print(f"{gpu} numa affinity: {sorted(os.sched_getaffinity(0))}")

    dist.barrier()
    if rank == 0:
        # One-time environment report from the lead rank.
        print(f"pt={torch.__version__}, cuda={torch.version.cuda}, nccl={torch.cuda.nccl.version()}")
        print(f"device compute capabilities={torch.cuda.get_device_capability()}")
        print(f"pytorch compute capabilities={torch.cuda.get_arch_list()}")
except Exception:
    print(f"{gpu} is broken")
    raise
finally:
    # Tear down the process group so NCCL resources are released cleanly
    # (skipped if init_process_group itself failed).
    if dist.is_initialized():
        dist.destroy_process_group()
# NOTE(review): this entire script is a verbatim duplicate of the copy above
# (an artifact of the page capture) — deduplicate before committing.
import builtins
import fcntl  # needed by the print override below to serialize output across ranks
import os
import socket
import torch
import torch.distributed as dist
# Still the real builtin print here; the locking override is defined just below.
print("STARTED")
def print(*args, **kwargs):
    """Drop-in replacement for the builtin ``print`` that serializes output.

    Intentionally shadows the builtin: when many ranks print at once their
    lines interleave mid-line.  Holding an exclusive advisory ``flock`` on
    this script file for the duration of the write makes each call atomic
    across processes on the same node.  (The original body delegated straight
    to ``builtins.print`` with no lock, so it did not actually prevent
    interleaving — and left the ``fcntl`` import unused.)

    Accepts exactly the same ``*args``/``**kwargs`` as ``builtins.print``
    and returns ``None``.
    """
    with open(__file__, "r") as fh:
        fcntl.flock(fh, fcntl.LOCK_EX)  # one writer at a time
        try:
            builtins.print(*args, **kwargs)
        finally:
            fcntl.flock(fh, fcntl.LOCK_UN)
# Each process is launched (e.g. by torchrun) with LOCAL_RANK set to its GPU
# index on this node; a missing variable fails fast with KeyError.
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank)
hostname = socket.gethostname()
# Prefix used on every log line to identify host + local GPU, e.g. "[node01-3]".
gpu = f"[{hostname}-{local_rank}]"
try:
    # Distributed init: NCCL backend, one process per GPU.  Rank, world size
    # and rendezvous info come from the launcher's environment variables.
    dist.init_process_group("nccl")
    rank = dist.get_rank()  # global rank across all nodes
    world_size = dist.get_world_size()

    # Reduction test: after a SUM all-reduce of ones, every rank should see
    # world_size.  A wrong value here points at broken NCCL transport.
    t = torch.ones(1, device=device)
    dist.all_reduce(t, op=dist.ReduceOp.SUM)
    dist.barrier()
    print(f"{gpu} Reduction op=sum result: {t.item()}")

    # CUDA allocation test.  The original discarded the result of
    # torch.cuda.is_available(); actually check it and fail loudly.
    if not torch.cuda.is_available():
        raise RuntimeError(f"{gpu} torch reports CUDA unavailable")
    torch.ones(1).cuda(local_rank)
    print(f"{gpu} is OK (global rank: {rank}/{world_size})")
    print(f"{gpu} numa affinity: {sorted(os.sched_getaffinity(0))}")

    dist.barrier()
    if rank == 0:
        # One-time environment report from the lead rank.
        print(f"pt={torch.__version__}, cuda={torch.version.cuda}, nccl={torch.cuda.nccl.version()}")
        print(f"device compute capabilities={torch.cuda.get_device_capability()}")
        print(f"pytorch compute capabilities={torch.cuda.get_arch_list()}")
except Exception:
    print(f"{gpu} is broken")
    raise
finally:
    # Tear down the process group so NCCL resources are released cleanly
    # (skipped if init_process_group itself failed).
    if dist.is_initialized():
        dist.destroy_process_group()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment