@malteos
Created April 1, 2024 19:29
Sbatch example
#!/bin/bash
#SBATCH --job-name=oxw-bloom-1b7-twc-german
#SBATCH --ntasks-per-node=1 # crucial - only 1 task per node (torch.distributed.run spawns the per-GPU processes)
#SBATCH --nodes=4
#SBATCH --gres=gpu:4 # ---> does not matter on JUWELS
#SBATCH --cpus-per-task=48 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=0-12:00:00
#SBATCH --output=%j.%x.out
#SBATCH --partition=booster
# Use `develbooster` for debugging, `booster` for "normal" jobs, and
# `largebooster` for jobs on more than 256 nodes.
# send signal 4 mins before time limit
#SBATCH --signal=B:INT@240
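# (`B:` means the signal goes only to the batch shell, so the
# `trap ... USR1 SIGINT SIGTERM` near the end of this script can write the
# kill switch before Slurm enforces the time limit.)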
# Enable verbose logging and abort on errors (uncomment for debugging)
#set -x -e
echo "START TIME: $(date)"
# copy this batch script into log directory for reproducibility (TODO this only works when no extra sbatch args are defined)
if [ -e "$0" ]; then
cp -p "$0" "slurmLog/${SLURM_JOB_ID}.${SLURM_JOB_NAME}.sh"
else
echo 'Please execute the sbatch script from the `run_scripts` directory.'
exit 1
fi
if [[ -z "$EXP_DIR" ]] || [[ -z "$BASE_DIR" ]]; then
echo "Environment variables (BASE_DIR, EXP_DIR...) are not set!" 1>&2
exit 1
fi
# load experiment env
source ${BASE_DIR}/slurm/bloom-1b7-twc-german/exp.sh || exit 1
if ["$EXP_NAME" != "bloom-1b7-twc-german"]; then
echo "Invalid EXP_NAME"
exit 1
fi
# load juwels env
if ! [ -e activate.bash ]; then
echo 'Please execute the sbatch script from the `run_scripts` directory.'
exit 1
fi
source activate.bash || exit 1
# Hardware settings
GPUS_PER_NODE=4
NNODES=$SLURM_JOB_NUM_NODES
# Paths
BIGS_WORKING_DIR=$EXP_DIR/tr1
DATA_OUTPUT_PATH="$BIGS_WORKING_DIR"/output_dir/job$SLURM_JOBID
CHECKPOINT_PATH=$BIGS_WORKING_DIR/checkpoints
##CHECKPOINT_PATH=$EXP_DIR/tr1/checkpoints # for debugging load old stage 2020 checkpoint
TENSORBOARD_PATH=$BIGS_WORKING_DIR/tensorboard
CODECARBON_PATH=$BIGS_WORKING_DIR/codecarbon
LOGS_PATH=$BIGS_WORKING_DIR/logs
KILL_SWITCH_PATH=$BIGS_WORKING_DIR/kill-switch$SLURM_JOBID
mkdir -p "$LOGS_PATH"
### the following is mostly copied from
# https://github.com/bigscience-workshop/bigscience/blob/master/train/tr11-176B-ml/smaller_models/tr11b-1B3-ml.slurm
# (slurm script corresponds to https://huggingface.co/bigscience/bloom-1b7)
PP_SIZE=1
TP_SIZE=1
MICRO_BATCH_SIZE=32 # TODO increase? MICRO_BATCH_SIZE=2 works, MICRO_BATCH_SIZE=3 used to work with the old setup
##GLOBAL_BATCH_SIZE=$(((NNODES * GPUS_PER_NODE / (PP_SIZE * TP_SIZE)) * MICRO_BATCH_SIZE))
#GLOBAL_BATCH_SIZE=240 # gradient accumulation steps = 5
GAS=1
GLOBAL_BATCH_SIZE=$(((NNODES * GPUS_PER_NODE / (PP_SIZE * TP_SIZE)) * MICRO_BATCH_SIZE * GAS))
# goal: 512 = same as gpt2-xl
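# Worked example with the values above:
#   (NNODES * GPUS_PER_NODE / (PP_SIZE * TP_SIZE)) * MICRO_BATCH_SIZE * GAS
#   = (4 * 4 / (1 * 1)) * 32 * 1 = 512, i.e. the gpt2-xl goal is met.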
echo "MICRO_BATCH_SIZE=$MICRO_BATCH_SIZE"
echo "GLOBAL_BATCH_SIZE=$GLOBAL_BATCH_SIZE"
# Model settings
NLAYERS=24
NHIDDEN=2048
NHEADS=16
MAX_POS_EMBEDDING=2048
SEQ_LEN=512 #$MAX_POS_EMBEDDING
## original script
# GLOBAL_BATCH_SIZE=240 # gradient accumulation steps = 5
# NLAYERS=24
# NHIDDEN=1024
# NHEADS=16
# SEQ_LEN=2048
SAVE_INTERVAL=6250 # (WECHSEL = 12500)
LOG_INTERVAL=10
EVAL_INTERVAL=1250 #2500 # (WECHSEL = 12500)
EVAL_ITERS=100 # depends on batch size
#SAVE_INTERVAL=1 # (WECHSEL = 12500)
#LOG_INTERVAL=1
#EVAL_INTERVAL=1 # (WECHSEL = 12500)
#EVAL_ITERS=-1 # depends on batch size
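# Rough cadence, assuming the intervals are counted in optimizer steps and
# using GLOBAL_BATCH_SIZE=512 / SEQ_LEN=512 from above:
#   SAVE_INTERVAL=6250 -> 6250 * 512 = 3_200_000 samples (~1.6B tokens) per checkpoint
#   EVAL_INTERVAL=1250 -> 1250 * 512 =   640_000 samples (~0.33B tokens) between evals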
#TRAIN_SAMPLES=220_000_000 # 450B tokens ## 1.0
#LR_DECAY_SAMPLES=200_000_000 # Decay for the first 410B tokens then continue at fixed --min-lr ## 0.9111111111111111
#LR_WARMUP_SAMPLES=183_105 # 375M tokens ## 0.0008333333333333334
# Train on OSCAR-DE (same as GPT2-WECHSEL-German)
TRAIN_DATA_PATH=${BASE_DIR}/data/gpt2_oscar_unshuffled_deduplicated_de_without_4gb_valid/bigs/train_text_document
VALID_DATA_PATH=${BASE_DIR}/data/gpt2_oscar_unshuffled_deduplicated_de_without_4gb_valid/bigs/validation_text_document
TOKENIZER_NAME_OR_PATH=${EXP_DIR}
FROM_PRETRAINED=${EXP_DIR}
# total = 62234300 documents (split 998,1,1)
# 99.8% for training = 62_109_831
# 0.1% for validation
# 0.1% for test
TRAIN_SAMPLES=62_109_831
LR_DECAY_SAMPLES=55_898_848 # 0.9
LR_WARMUP_SAMPLES=51_758 # 0.0008333333333333334
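# Cross-check of the sample counts above:
#   0.998 * 62_234_300    ~= 62_109_831 (TRAIN_SAMPLES)
#   0.9   * TRAIN_SAMPLES ~= 55_898_848 (LR_DECAY_SAMPLES)
#   TRAIN_SAMPLES * 0.000833... ~= 51_758 (LR_WARMUP_SAMPLES)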
# Network settings
MASTER_PORT=6000
MASTER_ADDR="$(hostname)"
scontrol && MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
# Allow communication over InfiniBand cells. (JUWELS only)
if [ -d "$JUWELS_BASE_DIR" ]; then
echo "JUWELS dected (${JUWELS_BASE_DIR} exists)"
echo "Appending i to MASTER_ADDR=${MASTER_ADDR} ..."
MASTER_ADDR="${MASTER_ADDR}i"
echo "=> MASTER_ADDR=${MASTER_ADDR} "
fi
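# For illustration (hypothetical node name): if the first node in the
# allocation is "jwb0001", the rendezvous endpoint below becomes
# "jwb0001i:6000", i.e. the node's address on the InfiniBand network.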
# NCCL related environment variables
# (from https://github.com/OpenGPTX/BigScience-Setup/blob/main/run_scripts/tr1-13B-round1_juwels_pipe.sbatch)
# do not remove: without this workaround the training will hang and nodes will be lost
#export CUDA_LAUNCH_BLOCKING=1
# hide duplicated errors using this hack - will be properly fixed in pt-1.12
export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
# force crashing on nccl issues like hanging broadcast
#export NCCL_ASYNC_ERROR_HANDLING=1
# handle timeouts
export NCCL_IB_TIMEOUT=50
export UCX_RC_TIMEOUT=4s
export NCCL_IB_RETRY_CNT=10
# setting IB for out of band communication
export NCCL_SOCKET_IFNAME=ib0
# NCCL and Torch debug
export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL
# export TORCH_DISTRIBUTED_DEBUG=INFO
# Change to code base directory
cd "$MEGATRON_DEEPSPEED_REPO"
# Log git status (oxw repo + obmd repo)
git -C ${BASE_DIR} branch -vv
git branch -vv
git remote -v
# Rebuild fused kernels
CLEAN_PREV_JIT_BUILD=0
rm -f megatron/fused_kernels/build/lock
((CLEAN_PREV_JIT_BUILD)) && rm -rf megatron/fused_kernels/{build,__pycache__}
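# A stale lock left behind by an interrupted job would otherwise make the JIT
# build of the fused kernels hang; set CLEAN_PREV_JIT_BUILD=1 to force a full
# rebuild instead of reusing the cached build directory.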
OPTIMIZER_ARGS=" \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--adam-eps 1e-8 \
--lr 3.0e-4 \
--min-lr 1e-5 \
--lr-decay-style cosine \
--lr-decay-samples $LR_DECAY_SAMPLES \
--lr-warmup-samples $LR_WARMUP_SAMPLES \
--clip-grad 1.0 \
--weight-decay 1e-1 \
"
# for 20h 1190, for 100h 5990
# --exit-duration-in-mins 1190 \
EXIT_OPTS=" \
--exit-duration-in-mins 5990 \
--kill-switch-path ${KILL_SWITCH_PATH} \
"
# delete old kill switch
rm -f ${KILL_SWITCH_PATH}
# --pad-vocab-size-to 250880 \
# --rampup-batch-size 192 32 9_765_625 \
# --pp-partition-method 'type:transformer|embedding' \
#
#
GPT_ARGS=" \
--from-pretrained-hf $FROM_PRETRAINED \
--pp-partition-method type:transformer|embedding \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--num-attention-heads $NHEADS \
--seq-length $SEQ_LEN \
--max-position-embeddings $MAX_POS_EMBEDDING \
--micro-batch-size $MICRO_BATCH_SIZE \
--global-batch-size $GLOBAL_BATCH_SIZE \
--train-samples $TRAIN_SAMPLES \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \
--init-method-std 0.0048 \
--embed-layernorm \
--checkpoint-activations \
--bf16 \
--seed 42 \
--position-embedding-type alibi \
--abort-on-unmet-fused-kernel-constraints \
$OPTIMIZER_ARGS \
$EXIT_OPTS \
"
# TODO: decide on efficient eval-interval + eval-iters
OUTPUT_ARGS=" \
--log-interval $LOG_INTERVAL \
--save-interval $SAVE_INTERVAL \
--eval-interval $EVAL_INTERVAL \
--eval-iters $EVAL_ITERS \
--tensorboard-dir $TENSORBOARD_PATH \
--tensorboard-queue-size 5 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
"
ZERO_STAGE=0 # important: bf16 must use z0! it implements its own zero stage 1 equivalent
config_json="$BIGS_WORKING_DIR/ds_config.$SLURM_JOBID.json"
# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
cat <<EOT > $config_json
{
"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
"train_batch_size": $GLOBAL_BATCH_SIZE,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"bf16": {
"enabled": true
},
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
EOT
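# With the values computed above, the generated file should read roughly:
#   { "train_micro_batch_size_per_gpu": 32, "train_batch_size": 512,
#     "gradient_clipping": 1.0, "zero_optimization": { "stage": 0 },
#     "bf16": { "enabled": true }, "steps_per_print": 2000,
#     "wall_clock_breakdown": false }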
DEEPSPEED_ARGS=" \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
export LAUNCHER="python -u -m torch.distributed.run \
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
--rdzv_backend c10d \
--max_restarts 0 \
--tee 3 \
"
export CMD=" \
`pwd`/pretrain_gpt.py \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
$GPT_ARGS \
$OUTPUT_ARGS \
--load $CHECKPOINT_PATH \
--save $CHECKPOINT_PATH \
--data-path $TRAIN_DATA_PATH \
--split 998,1,1 \
--data-impl mmap \
--distributed-backend nccl \
$DEEPSPEED_ARGS \
"
echo $LAUNCHER
echo $CMD
# catch signals
SLEEP_BEFORE_KILL=180
trap 'echo "Signal recieved! Saving kill switch to $KILL_SWITCH_PATH and wait for $SLEEP_BEFORE_KILL seconds"; touch $KILL_SWITCH_PATH; sleep $SLEEP_BEFORE_KILL; echo Done' USR1 SIGINT SIGTERM
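# The trap itself does not terminate training: it only writes the kill-switch
# file that --kill-switch-path (see EXIT_OPTS) points to, so the training loop
# can notice it and shut down cleanly; the sleep keeps the batch shell alive
# long enough for the ranks to react.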
(srun --jobid $SLURM_JOB_ID \
bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 \
| tee -a "$LOGS_PATH"/main_log.txt) & PID="$!"
wait "${PID}"
echo "END TIME: $(date)"