
@nousr
Created July 11, 2022 21:35
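# start_intelmpi.sh (submitted with sbatch; see the command near the bottom)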
#!/bin/bash
#SBATCH --partition=compute-od-gpu
#SBATCH --job-name=intelmpi_test
#SBATCH --nodes 4
#SBATCH --ntasks-per-node 1
#SBATCH --cpus-per-gpu=6
#SBATCH --gres=gpu:8
#SBATCH --output=%x_%j.out
#SBATCH --comment "Key=Monitoring,Value=ON"
#SBATCH --exclusive
module load intelmpi
source /opt/intel/mpi/latest/env/vars.sh
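# EFA / NCCL / libfabric environment for the aws-ofi-nccl plugin on the p4d nodes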
export LD_LIBRARY_PATH=/opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:$LD_LIBRARY_PATH
export NCCL_PROTO=simple
export PATH=/opt/amazon/efa/bin:$PATH
export LD_PRELOAD="/opt/nccl/build/lib/libnccl.so"
export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_EFA_USE_DEVICE_RDMA=1 # use on p4d instances
#export NCCL_ALGO=ring
export NCCL_DEBUG=info
#export NCCL_DEBUG_SUBSYS=INIT,ENV,GRAPH,COLL
export PYTHONFAULTHANDLER=1
export CUDA_LAUNCH_BLOCKING=0
export OMPI_MCA_mtl_base_verbose=1
export FI_EFA_ENABLE_SHM_TRANSFER=0
export FI_PROVIDER=efa
export FI_EFA_TX_MIN_CREDITS=64
export NCCL_TREE_THRESHOLD=0
#export NCCL_P2P_DISABLE=1
#export NCCL_IBEXT_DISABLE=1
#export NCCL_SOCKET_IFNAME="eth0,en,eth,em,bond"
# exported for use by the sub-script launched below
export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=12802
export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
echo go $COUNT_NODE
echo $HOSTNAMES
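# one task per node; the exported variables above are available inside the sub-script (see its echoes below)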
mpirun -n $COUNT_NODE -perhost 1 /fsx/nousr/start_in_container.sh
#!/usr/bin/env bash
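# start_in_container.sh (run once per node by the mpirun call above)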
#sleep 30
#fi_info -p efa -t FI_EP_RDM
# HOSTNAMES MASTER_ADDR MASTER_PORT COUNT_NODE are coming from the main script
echo myuser=`whoami`
echo COUNT_NODE=$COUNT_NODE
echo LD_LIBRARY_PATH = $LD_LIBRARY_PATH
echo PATH = $PATH
echo which mpicc `which mpicc`
echo HOSTNAMES = $HOSTNAMES
echo hostname = `hostname`
echo MASTER_ADDR= $MASTER_ADDR
echo MASTER_PORT= $MASTER_PORT
source /fsx/nousr/dalle2/bin/activate
H=`hostname`
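# machine rank = position of this node's hostname in the space-separated HOSTNAMES list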
THEID=`echo -e $HOSTNAMES | python3 -c "import sys;[sys.stdout.write(str(i)) for i,line in enumerate(next(sys.stdin).split(' ')) if line.strip() == '$H'.strip()]"`
echo THEID=$THEID
echo python3 version = `python3 --version`
python3 -c "import torch"
accelerate launch --num_processes $(( 8 * $COUNT_NODE )) --num_machines $COUNT_NODE --multi_gpu --machine_rank $THEID --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT /fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py --config_file /fsx/nousr/DALLE2-pytorch/configs/prior.json
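Job output (written to %x_%j.out):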
Loading intelmpi version 2021.4.0
go 4
compute-od-gpu-dy-p4d-24xlarge-25 compute-od-gpu-dy-p4d-24xlarge-26 compute-od-gpu-dy-p4d-24xlarge-28 compute-od-gpu-dy-p4d-24xlarge-29
myuser=zion
COUNT_NODE=4
LD_LIBRARY_PATH = /opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:/opt/intel/mpi/2021.4.0//libfabric/lib:/opt/intel/mpi/2021.4.0//lib/release:/opt/intel/mpi/2021.4.0//lib:/opt/intel/mpi/2021.4.0/libfabric/lib:/opt/intel/mpi/2021.4.0/lib/release:/opt/intel/mpi/2021.4.0/lib
PATH = /opt/amazon/efa/bin:/opt/intel/mpi/2021.4.0//libfabric/bin:/opt/intel/mpi/2021.4.0//bin:/opt/intel/mpi/2021.4.0/libfabric/bin:/opt/intel/mpi/2021.4.0/bin:/fsx/nousr/dalle2/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/home/zion/.vscode-server/bin/92d25e35d9bf1a6b16f7d0758f25d48ace11e5b9/bin/remote-cli:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/lib64/qt-3.3/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/local/bin:/usr/bin:/usr/local/sbin:/sbin:/bin:/usr/sbin:/opt/aws/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/home/zion/.local/bin:/home/zion/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin
myuser=zion
COUNT_NODE=4
LD_LIBRARY_PATH = /opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:/opt/intel/mpi/2021.4.0//libfabric/lib:/opt/intel/mpi/2021.4.0//lib/release:/opt/intel/mpi/2021.4.0//lib:/opt/intel/mpi/2021.4.0/libfabric/lib:/opt/intel/mpi/2021.4.0/lib/release:/opt/intel/mpi/2021.4.0/lib
PATH = /opt/amazon/efa/bin:/opt/intel/mpi/2021.4.0//libfabric/bin:/opt/intel/mpi/2021.4.0//bin:/opt/intel/mpi/2021.4.0/libfabric/bin:/opt/intel/mpi/2021.4.0/bin:/fsx/nousr/dalle2/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/home/zion/.vscode-server/bin/92d25e35d9bf1a6b16f7d0758f25d48ace11e5b9/bin/remote-cli:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/lib64/qt-3.3/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/local/bin:/usr/bin:/usr/local/sbin:/sbin:/bin:/usr/sbin:/opt/aws/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/home/zion/.local/bin:/home/zion/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin
which mpicc /opt/intel/mpi/2021.4.0/bin/mpicc
HOSTNAMES = compute-od-gpu-dy-p4d-24xlarge-25 compute-od-gpu-dy-p4d-24xlarge-26 compute-od-gpu-dy-p4d-24xlarge-28 compute-od-gpu-dy-p4d-24xlarge-29
which mpicc /opt/intel/mpi/2021.4.0/bin/mpicc
HOSTNAMES = compute-od-gpu-dy-p4d-24xlarge-25 compute-od-gpu-dy-p4d-24xlarge-26 compute-od-gpu-dy-p4d-24xlarge-28 compute-od-gpu-dy-p4d-24xlarge-29
hostname = compute-od-gpu-dy-p4d-24xlarge-25
MASTER_ADDR= compute-od-gpu-dy-p4d-24xlarge-25
MASTER_PORT= 12802
hostname = compute-od-gpu-dy-p4d-24xlarge-26
MASTER_ADDR= compute-od-gpu-dy-p4d-24xlarge-25
MASTER_PORT= 12802
THEID=1
myuser=zion
COUNT_NODE=4
LD_LIBRARY_PATH = /opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:/opt/intel/mpi/2021.4.0//libfabric/lib:/opt/intel/mpi/2021.4.0//lib/release:/opt/intel/mpi/2021.4.0//lib:/opt/intel/mpi/2021.4.0/libfabric/lib:/opt/intel/mpi/2021.4.0/lib/release:/opt/intel/mpi/2021.4.0/lib
PATH = /opt/amazon/efa/bin:/opt/intel/mpi/2021.4.0//libfabric/bin:/opt/intel/mpi/2021.4.0//bin:/opt/intel/mpi/2021.4.0/libfabric/bin:/opt/intel/mpi/2021.4.0/bin:/fsx/nousr/dalle2/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/home/zion/.vscode-server/bin/92d25e35d9bf1a6b16f7d0758f25d48ace11e5b9/bin/remote-cli:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/lib64/qt-3.3/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/local/bin:/usr/bin:/usr/local/sbin:/sbin:/bin:/usr/sbin:/opt/aws/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/home/zion/.local/bin:/home/zion/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin
python3 version = Python 3.7.10
which mpicc /opt/intel/mpi/2021.4.0/bin/mpicc
HOSTNAMES = compute-od-gpu-dy-p4d-24xlarge-25 compute-od-gpu-dy-p4d-24xlarge-26 compute-od-gpu-dy-p4d-24xlarge-28 compute-od-gpu-dy-p4d-24xlarge-29
hostname = compute-od-gpu-dy-p4d-24xlarge-28
MASTER_ADDR= compute-od-gpu-dy-p4d-24xlarge-25
MASTER_PORT= 12802
THEID=0
Traceback (most recent call last):
File "<string>", line 1, in <module>
ModuleNotFoundError: No module named 'torch'
python3 version = Python 3.8.5
/fsx/nousr/start_in_container.sh: /fsx/nousr/dalle2/bin/accelerate: /fsx/nousr/dalle2/bin/python3.8: bad interpreter: No such file or directory
The command run was:
$ sbatch --exclude=compute-od-gpu-dy-p4d-24xlarge-[23,24] start_intelmpi.sh
(the problem was originally thought to be specific to nodes 23 & 24, hence the --exclude)
nousr commented Jul 11, 2022

Permissions on the two scripts are:

-rwx--x--x  1 zion Domain Users   996 Jul 11 21:07 start_in_container.sh
-r-x--x--x   1 zion Domain Users  1581 Jul 11 19:53 start_intelmpi.sh
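Since the failing node reports /fsx/nousr/dalle2/bin/python3.8 as missing ("bad interpreter"), a quick sanity check (just a sketch, assuming the same node list is still available) is to confirm the venv's interpreter and accelerate resolve on every allocated node:

# hypothetical check: one task per node, list the venv binaries the job expects
srun --nodes=4 --ntasks-per-node=1 bash -c \
  'hostname; ls -l /fsx/nousr/dalle2/bin/python3.8 /fsx/nousr/dalle2/bin/accelerate'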
