Skip to content

Instantly share code, notes, and snippets.

@mehdidc
Created April 17, 2024 07:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mehdidc/a96c80d056a48695dc0db5806de8d95c to your computer and use it in GitHub Desktop.
#!/bin/bash
#SBATCH --nodes=8
#SBATCH --time=00:20:00
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=48
#SBATCH --gres=gpu:4
#SBATCH --partition=booster
#SBATCH --account=transfernetx
#SBATCH --exclude=jwb[0059,0067,0069,0193,0284,0287,0294,0359,0418,0637,0829,0832,0838,0898,0907,0921,0971,1004,1023,1029,1213,1126]
#SBATCH --threads-per-core=1
#SBATCH --mem=0

# Multi-node torch.distributed training launcher for a SLURM cluster
# (JUWELS Booster: 4 GPUs/node, one srun task per node; torchrun spawns
# one worker per GPU inside each task).

# Abort on errors and unset variables; pipefail makes the final
# "srun ... | tee" pipeline report srun's exit status instead of tee's.
set -euo pipefail

REPO_PATH=$(pwd)

export CUDA_DEVICE_MAX_CONNECTIONS=1
export CUDA_VISIBLE_DEVICES="0,1,2,3"
export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK}

# Rendezvous host: first node of the allocation. The trailing "i" selects
# this cluster's IPoIB hostname alias (other systems used ".juwels" /
# ".jureca" suffixes instead) — NOTE(review): site-specific, confirm.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR="${MASTER_ADDR}i"
export MASTER_PORT=12345
export NNODES=$SLURM_JOB_NUM_NODES
export GPUS_PER_NODE=4
echo "$MASTER_ADDR" "$MASTER_PORT"

ml GCC
source /p/project/ccstdl/laion/mamba/bin/activate experimental-torch-nightly
# ${PYTHONPATH:-} keeps "set -u" happy when PYTHONPATH is unset.
export PYTHONPATH=$(pwd)/src:${PYTHONPATH:-}
export OMP_NUM_THREADS=1
export TRITON_CACHE_DIR=cache
export CUDA_LAUNCH_BLOCKING=1      # serialize kernel launches (debugging aid)
export NCCL_ASYNC_ERROR_HANDLING=1 # fail on async NCCL errors instead of hanging
export NCCL_IB_TIMEOUT=20
export NCCL_SOCKET_IFNAME=ib0
export NCCL_DEBUG=INFO

# torchrun with a static rendezvous; --node_rank is appended per task below.
LAUNCHER="python -u -m torch.distributed.run \
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
--rdzv_backend static \
--max_restarts 0 \
--tee 3 \
"
PROGRAM="$REPO_PATH/run_train.py --config-file examples/config_poro_34b.yaml"
# \$SLURM_PROCID stays escaped: it must be expanded inside each srun task,
# where it holds that task's (node's) rank.
export CMD="$LAUNCHER --node_rank \$SLURM_PROCID $PROGRAM"
echo "$CMD"

export WANDB_MODE="offline"

# srun options as an array. The original flat string had a backslash-newline
# *inside double quotes* with no preceding space, which silently fused the
# tokens into "--threads-per-core=1--wait=60"; an array cannot mis-split.
SRUN_ARGS=(
  --threads-per-core=1
  --wait=60
  --kill-on-bad-exit=1
  --jobid "$SLURM_JOB_ID"
)

DATETIME=$(date +'date_%y-%m-%d_time_%H-%M-%S')
mkdir -p logs  # tee fails if the log directory does not exist
LOG_PATH="logs/${SLURM_JOB_NAME}_${DATETIME}.log"

srun "${SRUN_ARGS[@]}" bash -c "$CMD" 2>&1 | tee -a "$LOG_PATH"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment