Don't forget: lines 18 to 25 create a hostfile. You can export its path to a variable
and then launch with `deepspeed --hostfile=$EXPORTED_VARIABLE_NAME <other args>`.
#!/bin/bash
# SLURM batch header for a single-node, 8-GPU "elm" job.
#SBATCH --job-name="elm"
#SBATCH --partition=gpu
#SBATCH --mem-per-cpu=16GB          # Amount of CPU memory per core
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8         # Crucial - only 1 task per dist per node!
#SBATCH --cpus-per-task=6           # Number of cores per task
#SBATCH --hint=nomultithread        # We get physical cores, not logical
#SBATCH --gres=gpu:8                # Number of GPUs per node
#SBATCH --output=%x_%j.out          # Set this dir where you want slurm outs to go
#SBATCH --error=%x_%j.out           # Set this dir where you want slurm outs to go
#SBATCH --exclusive                 # Turn off node sharing
#SBATCH --comment=elm
# Load the MPI and CUDA toolchain needed by the job.
module load openmpi
module load cuda/11.4

# Build a DeepSpeed-style hostfile (one "<host> slots=<gpus>" line per node)
# under the user's /fsx home, keyed by the SLURM job id.
mkdir -p "/fsx/home-$(whoami)/hostfiles"
hostfile="/fsx/home-$(whoami)/hostfiles/hosts_$SLURM_JOBID"
# -f: ignore a missing file (handles consecutive calls to this script in
# interactive jobs without suppressing other errors).
rm -f -- "$hostfile"
# Intentional word-splitting: scontrol emits one whitespace-free hostname per line.
for node in $(scontrol show hostnames "$SLURM_NODELIST"); do
  echo "$node slots=8" >> "$hostfile"
done
# Rendezvous info consumed by torch.distributed / DeepSpeed launchers.
export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=12802
export COUNT_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)

# NCCL / EFA (AWS Elastic Fabric Adapter) runtime configuration.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib
export NCCL_PROTO=simple
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/aws-ofi-nccl/lib
export PATH=$PATH:/opt/amazon/efa/bin:/opt/amazon/openmpi/bin
export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_EFA_USE_DEVICE_RDMA=1     # use for p4dn
export NCCL_DEBUG=info
export OMPI_MCA_mtl_base_verbose=1
export FI_EFA_ENABLE_SHM_TRANSFER=0
export FI_PROVIDER=efa
export FI_EFA_TX_MIN_CREDITS=64
export NCCL_TREE_THRESHOLD=0

# Open MPI transport settings: disable the cm PML, use TCP/self BTLs,
# and skip the loopback and docker interfaces.
export OMPI_MCA_pml="^cm"
export OMPI_MCA_btl="tcp,self"
export OMPI_MCA_btl_tcp_if_exclude="lo,docker1"
export OMPI_MCA_plm_rsh_no_tree_spawn=1

# Cache locations for torch C++ extensions and HuggingFace downloads.
export TORCH_EXTENSIONS_DIR=/fsx/codeSeCodegen/extensions
export XDG_CACHE_HOME=/fsx/codeSeCodegen/hf_cache
# Activate the project virtualenv (path can be changed per deployment).
source /fsx/codeSeCodegen/codeSeEnv/bin/activate
# Launch an interactive shell on the allocation.
# NOTE(review): --pty bash -i is interactive; for a non-interactive batch run,
# replace this with the actual deepspeed launch command (see main2.sh).
srun --comment elm --pty bash -i
main.sh was used as a sanity-check run.
You can use main2.sh instead — it contains everything needed to run multi-node DeepSpeed.