Created
July 11, 2022 21:35
-
-
Save nousr/5dd641a34cb5976f9cde0501af4519fe to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#SBATCH --partition=compute-od-gpu
#SBATCH --job-name=intelmpi_test
#SBATCH --nodes 4
#SBATCH --ntasks-per-node 1
#SBATCH --cpus-per-gpu=6
#SBATCH --gres=gpu:8
#SBATCH --output=%x_%j.out
#SBATCH --comment "Key=Monitoring,Value=ON"
#SBATCH --exclusive

# Launcher: configures the Intel MPI + EFA/NCCL environment on the allocated
# nodes, then starts one copy of /fsx/nousr/start_in_container.sh per node.
# Exports HOSTNAMES, MASTER_ADDR, MASTER_PORT, COUNT_NODE for the sub-script.

# Abort on the first failed setup step. -u is intentionally NOT set: the
# sourced vendor scripts (module system, Intel vars.sh) reference unset vars.
set -eo pipefail

module load intelmpi
source /opt/intel/mpi/latest/env/vars.sh

# EFA / NCCL runtime libraries must come before the system defaults.
export LD_LIBRARY_PATH=/opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:$LD_LIBRARY_PATH
export NCCL_PROTO=simple
export PATH=/opt/amazon/efa/bin:$PATH
export LD_PRELOAD="/opt/nccl/build/lib/libnccl.so"
export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
#export NCCL_ALGO=ring
export NCCL_DEBUG=info
#export NCCL_DEBUG_SUBSYS=INIT,ENV,GRAPH,COLL
export PYTHONFAULTHANDLER=1
export CUDA_LAUNCH_BLOCKING=0
export OMPI_MCA_mtl_base_verbose=1
export FI_EFA_ENABLE_SHM_TRANSFER=0
export FI_PROVIDER=efa
export FI_EFA_TX_MIN_CREDITS=64
export NCCL_TREE_THRESHOLD=0
#export NCCL_P2P_DISABLE=1
#export NCCL_IBEXT_DISABLE=1
#export NCCL_SOCKET_IFNAME="eth0,en,eth,em,bond"

# Sent to the sub-script. HOSTNAMES is newline-separated (one host per line);
# consumers split it on whitespace, so that layout is fine.
export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=12802
export COUNT_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)

echo "go $COUNT_NODE"
echo "$HOSTNAMES"

mpirun -n "$COUNT_NODE" -perhost 1 /fsx/nousr/start_in_container.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Per-node worker launched by mpirun from the sbatch launcher.
# Expects HOSTNAMES, MASTER_ADDR, MASTER_PORT, COUNT_NODE in the environment.
# Derives this node's machine rank from its position in HOSTNAMES and starts
# `accelerate launch` for the diffusion-prior training run (8 GPUs per node).

# Fail fast: abort before `accelerate launch` if any setup step (venv
# activation, torch import check) fails, instead of launching into a broken
# environment. -u is intentionally NOT set: venv activate scripts reference
# unset vars such as PS1.
set -eo pipefail

#sleep 30
#fi_info -p efa -t FI_EP_RDM

echo "myuser=$(whoami)"
echo "COUNT_NODE=$COUNT_NODE"
echo "LD_LIBRARY_PATH = $LD_LIBRARY_PATH"
echo "PATH = $PATH"
echo "which mpicc $(command -v mpicc)"
echo "HOSTNAMES = $HOSTNAMES"
echo "hostname = $(hostname)"
echo "MASTER_ADDR= $MASTER_ADDR"
echo "MASTER_PORT= $MASTER_PORT"

source /fsx/nousr/dalle2/bin/activate

# Machine rank = zero-based index of this host within HOSTNAMES. A plain
# bash loop replaces the fragile `echo | python3 -c` one-liner and handles
# HOSTNAMES separated by spaces or newlines alike.
H=$(hostname)
THEID=""
rank=0
for node in $HOSTNAMES; do  # intentionally unquoted: split on whitespace
  if [[ "$node" == "$H" ]]; then
    THEID=$rank
    break
  fi
  rank=$((rank + 1))
done
if [[ -z "$THEID" ]]; then
  echo "error: host $H not found in HOSTNAMES ($HOSTNAMES)" >&2
  exit 1
fi
echo "THEID=$THEID"

echo "python3 version = $(python3 --version)"
# Sanity check the venv before launching; under set -e a missing torch
# aborts here rather than failing later inside accelerate.
python3 -c "import torch"

accelerate launch \
  --num_processes $(( 8 * COUNT_NODE )) \
  --num_machines "$COUNT_NODE" \
  --multi_gpu \
  --machine_rank "$THEID" \
  --main_process_ip "$MASTER_ADDR" \
  --main_process_port "$MASTER_PORT" \
  /fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py \
  --config_file /fsx/nousr/DALLE2-pytorch/configs/prior.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Loading intelmpi version 2021.4.0 | |
go 4 | |
compute-od-gpu-dy-p4d-24xlarge-25 compute-od-gpu-dy-p4d-24xlarge-26 compute-od-gpu-dy-p4d-24xlarge-28 compute-od-gpu-dy-p4d-24xlarge-29 | |
myuser=zion | |
COUNT_NODE=4 | |
LD_LIBRARY_PATH = /opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:/opt/intel/mpi/2021.4.0//libfabric/lib:/opt/intel/mpi/2021.4.0//lib/release:/opt/intel/mpi/2021.4.0//lib:/opt/intel/mpi/2021.4.0/libfabric/lib:/opt/intel/mpi/2021.4.0/lib/release:/opt/intel/mpi/2021.4.0/lib | |
PATH = /opt/amazon/efa/bin:/opt/intel/mpi/2021.4.0//libfabric/bin:/opt/intel/mpi/2021.4.0//bin:/opt/intel/mpi/2021.4.0/libfabric/bin:/opt/intel/mpi/2021.4.0/bin:/fsx/nousr/dalle2/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/home/zion/.vscode-server/bin/92d25e35d9bf1a6b16f7d0758f25d48ace11e5b9/bin/remote-cli:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/lib64/qt-3.3/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/local/bin:/usr/bin:/usr/local/sbin:/sbin:/bin:/usr/sbin:/opt/aws/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/home/zion/.local/bin:/home/zion/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin | |
myuser=zion | |
COUNT_NODE=4 | |
LD_LIBRARY_PATH = /opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:/opt/intel/mpi/2021.4.0//libfabric/lib:/opt/intel/mpi/2021.4.0//lib/release:/opt/intel/mpi/2021.4.0//lib:/opt/intel/mpi/2021.4.0/libfabric/lib:/opt/intel/mpi/2021.4.0/lib/release:/opt/intel/mpi/2021.4.0/lib | |
PATH = /opt/amazon/efa/bin:/opt/intel/mpi/2021.4.0//libfabric/bin:/opt/intel/mpi/2021.4.0//bin:/opt/intel/mpi/2021.4.0/libfabric/bin:/opt/intel/mpi/2021.4.0/bin:/fsx/nousr/dalle2/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/home/zion/.vscode-server/bin/92d25e35d9bf1a6b16f7d0758f25d48ace11e5b9/bin/remote-cli:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/lib64/qt-3.3/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/local/bin:/usr/bin:/usr/local/sbin:/sbin:/bin:/usr/sbin:/opt/aws/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/home/zion/.local/bin:/home/zion/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin | |
which mpicc /opt/intel/mpi/2021.4.0/bin/mpicc | |
HOSTNAMES = compute-od-gpu-dy-p4d-24xlarge-25 compute-od-gpu-dy-p4d-24xlarge-26 compute-od-gpu-dy-p4d-24xlarge-28 compute-od-gpu-dy-p4d-24xlarge-29 | |
which mpicc /opt/intel/mpi/2021.4.0/bin/mpicc | |
HOSTNAMES = compute-od-gpu-dy-p4d-24xlarge-25 compute-od-gpu-dy-p4d-24xlarge-26 compute-od-gpu-dy-p4d-24xlarge-28 compute-od-gpu-dy-p4d-24xlarge-29 | |
hostname = compute-od-gpu-dy-p4d-24xlarge-25 | |
MASTER_ADDR= compute-od-gpu-dy-p4d-24xlarge-25 | |
MASTER_PORT= 12802 | |
hostname = compute-od-gpu-dy-p4d-24xlarge-26 | |
MASTER_ADDR= compute-od-gpu-dy-p4d-24xlarge-25 | |
MASTER_PORT= 12802 | |
THEID=1 | |
myuser=zion | |
COUNT_NODE=4 | |
LD_LIBRARY_PATH = /opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:/opt/intel/mpi/2021.4.0//libfabric/lib:/opt/intel/mpi/2021.4.0//lib/release:/opt/intel/mpi/2021.4.0//lib:/opt/intel/mpi/2021.4.0/libfabric/lib:/opt/intel/mpi/2021.4.0/lib/release:/opt/intel/mpi/2021.4.0/lib | |
PATH = /opt/amazon/efa/bin:/opt/intel/mpi/2021.4.0//libfabric/bin:/opt/intel/mpi/2021.4.0//bin:/opt/intel/mpi/2021.4.0/libfabric/bin:/opt/intel/mpi/2021.4.0/bin:/fsx/nousr/dalle2/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/home/zion/.vscode-server/bin/92d25e35d9bf1a6b16f7d0758f25d48ace11e5b9/bin/remote-cli:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/lib64/qt-3.3/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/local/bin:/usr/bin:/usr/local/sbin:/sbin:/bin:/usr/sbin:/opt/aws/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/home/zion/.local/bin:/home/zion/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin | |
python3 version = Python 3.7.10 | |
which mpicc /opt/intel/mpi/2021.4.0/bin/mpicc | |
HOSTNAMES = compute-od-gpu-dy-p4d-24xlarge-25 compute-od-gpu-dy-p4d-24xlarge-26 compute-od-gpu-dy-p4d-24xlarge-28 compute-od-gpu-dy-p4d-24xlarge-29 | |
hostname = compute-od-gpu-dy-p4d-24xlarge-28 | |
MASTER_ADDR= compute-od-gpu-dy-p4d-24xlarge-25 | |
MASTER_PORT= 12802 | |
THEID=0 | |
Traceback (most recent call last): | |
File "<string>", line 1, in <module> | |
ModuleNotFoundError: No module named 'torch' | |
python3 version = Python 3.8.5 | |
/fsx/nousr/start_in_container.sh: /fsx/nousr/dalle2/bin/accelerate: /fsx/nousr/dalle2/bin/python3.8: bad interpreter: No such file or directory |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
command ran was | |
$ sbatch --exclude=compute-od-gpu-dy-p4d-24xlarge-[23,24] start_intelmpi.sh | |
(originally thought to be issues specific to nodes 23 & 24)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
permissions on the scripts are