Don't forget: lines 18 to 25 create a hostfile. You can export its path to a variable
and then launch with `deepspeed --hostfile=$EXPORTED_VARIABLE_NAME <other args>`.
#!/bin/bash
# SLURM batch header for a single-node, 8-GPU "elm" job.
#SBATCH --job-name="elm"
#SBATCH --partition=gpu
#SBATCH --mem-per-cpu=16GB          # Amount of CPU memory per core
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8         # Crucial - only 1 task per dist per node!
#SBATCH --cpus-per-task=6           # Number of cores per task
#SBATCH --hint=nomultithread        # We get physical cores, not logical
#SBATCH --gres=gpu:8                # Number of GPUs per node
#SBATCH --output=%x_%j.out          # Set this dir where you want slurm outs to go
#SBATCH --error=%x_%j.out           # Set this dir where you want slurm outs to go
#SBATCH --exclusive                 # Turn off node sharing
#SBATCH --comment=elm
# Load the MPI and CUDA toolchain needed by the job.
module load openmpi
module load cuda/11.4

# Build a DeepSpeed-style hostfile (one "<host> slots=<gpus>" line per node)
# under the user's /fsx home, keyed by the SLURM job id.
mkdir -p "/fsx/home-$(whoami)/hostfiles"
hostfile="/fsx/home-$(whoami)/hostfiles/hosts_$SLURM_JOBID"
# -f: ignore a missing file (handles consecutive calls to this script in
# interactive jobs without suppressing other errors).
rm -f -- "$hostfile"
# Intentional word-splitting: scontrol emits one whitespace-free hostname per line.
for node in $(scontrol show hostnames "$SLURM_NODELIST"); do
  echo "$node slots=8" >> "$hostfile"
done
# Rendezvous info consumed by torch.distributed / DeepSpeed launchers.
export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=12802
export COUNT_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)

# NCCL / EFA (AWS Elastic Fabric Adapter) runtime configuration.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib
export NCCL_PROTO=simple
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/aws-ofi-nccl/lib
export PATH=$PATH:/opt/amazon/efa/bin:/opt/amazon/openmpi/bin
export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_EFA_USE_DEVICE_RDMA=1     # use for p4dn
export NCCL_DEBUG=info
export OMPI_MCA_mtl_base_verbose=1
export FI_EFA_ENABLE_SHM_TRANSFER=0
export FI_PROVIDER=efa
export FI_EFA_TX_MIN_CREDITS=64
export NCCL_TREE_THRESHOLD=0

# Open MPI transport settings: disable the cm PML, use TCP/self BTLs,
# and skip the loopback and docker interfaces.
export OMPI_MCA_pml="^cm"
export OMPI_MCA_btl="tcp,self"
export OMPI_MCA_btl_tcp_if_exclude="lo,docker1"
export OMPI_MCA_plm_rsh_no_tree_spawn=1

# Cache locations for torch C++ extensions and HuggingFace downloads.
export TORCH_EXTENSIONS_DIR=/fsx/codeSeCodegen/extensions
export XDG_CACHE_HOME=/fsx/codeSeCodegen/hf_cache
# Activate the project virtualenv (path can be changed per deployment).
source /fsx/codeSeCodegen/codeSeEnv/bin/activate
# Launch an interactive shell on the allocation.
# NOTE(review): --pty bash -i is interactive; for a non-interactive batch run,
# replace this with the actual deepspeed launch command (see main2.sh).
srun --comment elm --pty bash -i
main.sh was used as a sanity-check run.
You can use main2.sh instead — it contains everything needed to run multi-node DeepSpeed.