@malteos
Created April 1, 2024 19:29
Sbatch example
#!/bin/bash
#SBATCH --job-name=oxw-bloom-1b7-twc-german
#SBATCH --ntasks-per-node=1 # crucial - only 1 task per node (torch.distributed.run spawns the per-GPU processes)
#SBATCH --nodes=4
#SBATCH --gres=gpu:4 # ---> does not matter on JUWELS
#SBATCH --cpus-per-task=48 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=0-12:00:00
#SBATCH --output=%j.%x.out
#SBATCH --partition=booster
# Use `develbooster` for debugging, `booster` for "normal" jobs, and
# `largebooster` for jobs on more than 256 nodes.
# send signal 4 mins before time limit
#SBATCH --signal=B:INT@240
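# (`B:` means the signal goes only to the batch shell, so the
# `trap ... USR1 SIGINT SIGTERM` near the end of this script can write the
# kill switch before Slurm enforces the time limit.)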
# Enable verbose logging and abort on errors (uncomment for debugging)
#set -x -e
echo "START TIME: $(date)"
# copy this batch script into log directory for reproducibility (TODO this only works when no extra sbatch args are defined)
if [ -e "$0" ]; then
cp -p "$0" "slurmLog/${SLURM_JOB_ID}.${SLURM_JOB_NAME}.sh"
else
echo 'Please execute the sbatch script from the `run_scripts` directory.'
exit 1
fi
if [[ -z "$EXP_DIR" ]] || [[ -z "$BASE_DIR" ]]; then
echo "Environment variables (BASE_DIR, EXP_DIR...) are not set!" 1>&2
exit 1
fi
# load experiment env
source ${BASE_DIR}/slurm/bloom-1b7-twc-german/exp.sh || exit 1
if ["$EXP_NAME" != "bloom-1b7-twc-german"]; then
echo "Invalid EXP_NAME"
exit 1
fi
# load juwels env
if ! [ -e activate.bash ]; then
echo 'Please execute the sbatch script from the `run_scripts` directory.'
exit 1
fi
source activate.bash || exit 1
# Hardware settings
GPUS_PER_NODE=4
NNODES=$SLURM_JOB_NUM_NODES
# Paths
BIGS_WORKING_DIR=$EXP_DIR/tr1
DATA_OUTPUT_PATH="$BIGS_WORKING_DIR"/output_dir/job$SLURM_JOBID
CHECKPOINT_PATH=$BIGS_WORKING_DIR/checkpoints
##CHECKPOINT_PATH=$EXP_DIR/tr1/checkpoints # for debugging load old stage 2020 checkpoint
TENSORBOARD_PATH=$BIGS_WORKING_DIR/tensorboard
CODECARBON_PATH=$BIGS_WORKING_DIR/codecarbon
LOGS_PATH=$BIGS_WORKING_DIR/logs
KILL_SWITCH_PATH=$BIGS_WORKING_DIR/kill-switch$SLURM_JOBID
mkdir -p "$LOGS_PATH"
### the following is mostly copied from
# https://github.com/bigscience-workshop/bigscience/blob/master/train/tr11-176B-ml/smaller_models/tr11b-1B3-ml.slurm
# (slurm script corresponds to https://huggingface.co/bigscience/bloom-1b7)
PP_SIZE=1
TP_SIZE=1
MICRO_BATCH_SIZE=32 # TODO increase? MICRO_BATCH_SIZE=2 works, MICRO_BATCH_SIZE=3 used to work with the old setup
##GLOBAL_BATCH_SIZE=$(((NNODES * GPUS_PER_NODE / (PP_SIZE * TP_SIZE)) * MICRO_BATCH_SIZE))
#GLOBAL_BATCH_SIZE=240 # gradient accumulation steps = 5
GAS=1
GLOBAL_BATCH_SIZE=$(((NNODES * GPUS_PER_NODE / (PP_SIZE * TP_SIZE)) * MICRO_BATCH_SIZE * GAS))
# goal: 512 = same as gpt2-xl
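# Worked example with the values above:
#   (NNODES * GPUS_PER_NODE / (PP_SIZE * TP_SIZE)) * MICRO_BATCH_SIZE * GAS
#   = (4 * 4 / (1 * 1)) * 32 * 1 = 512, i.e. the gpt2-xl goal is met.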
echo "MICRO_BATCH_SIZE=$MICRO_BATCH_SIZE"
echo "GLOBAL_BATCH_SIZE=$GLOBAL_BATCH_SIZE"
# Model settings
NLAYERS=24
NHIDDEN=2048
NHEADS=16
MAX_POS_EMBEDDING=2048
SEQ_LEN=512 #$MAX_POS_EMBEDDING
## original script
# GLOBAL_BATCH_SIZE=240 # gradient accumulation steps = 5
# NLAYERS=24
# NHIDDEN=1024
# NHEADS=16
# SEQ_LEN=2048
SAVE_INTERVAL=6250 # (WECHSEL = 12500)
LOG_INTERVAL=10
EVAL_INTERVAL=1250 #2500 # (WECHSEL = 12500)
EVAL_ITERS=100 # depends on batch size
#SAVE_INTERVAL=1 # (WECHSEL = 12500)
#LOG_INTERVAL=1
#EVAL_INTERVAL=1 # (WECHSEL = 12500)
#EVAL_ITERS=-1 # depends on batch size
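# Rough cadence, assuming the intervals are counted in optimizer steps and
# using GLOBAL_BATCH_SIZE=512 / SEQ_LEN=512 from above:
#   SAVE_INTERVAL=6250 -> 6250 * 512 = 3_200_000 samples (~1.6B tokens) per checkpoint
#   EVAL_INTERVAL=1250 -> 1250 * 512 =   640_000 samples (~0.33B tokens) between evals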
#TRAIN_SAMPLES=220_000_000 # 450B tokens ## 1.0
#LR_DECAY_SAMPLES=200_000_000 # Decay for the first 410B tokens then continue at fixed --min-lr ## 0.9111111111111111
#LR_WARMUP_SAMPLES=183_105 # 375M tokens ## 0.0008333333333333334
# Train on OSCAR-DE (same as GPT2-WECHSEL-German)
TRAIN_DATA_PATH=${BASE_DIR}/data/gpt2_oscar_unshuffled_deduplicated_de_without_4gb_valid/bigs/train_text_document
VALID_DATA_PATH=${BASE_DIR}/data/gpt2_oscar_unshuffled_deduplicated_de_without_4gb_valid/bigs/validation_text_document
TOKENIZER_NAME_OR_PATH=${EXP_DIR}
FROM_PRETRAINED=${EXP_DIR}
# total = 62234300 documents (split 998,1,1)
# 99.8% for training = 62_109_831
# 0.1% for validation
# 0.1% for test
TRAIN_SAMPLES=62_109_831
LR_DECAY_SAMPLES=55_898_848 # 0.9
LR_WARMUP_SAMPLES=51_758 # 0.0008333333333333334
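# Cross-check of the sample counts above:
#   0.998 * 62_234_300    ~= 62_109_831 (TRAIN_SAMPLES)
#   0.9   * TRAIN_SAMPLES ~= 55_898_848 (LR_DECAY_SAMPLES)
#   TRAIN_SAMPLES * 0.000833... ~= 51_758 (LR_WARMUP_SAMPLES)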
# Network settings
MASTER_PORT=6000
MASTER_ADDR="$(hostname)"
scontrol && MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
# Allow communication over InfiniBand cells. (JUWELS only)
if [ -d "$JUWELS_BASE_DIR" ]; then
echo "JUWELS dected (${JUWELS_BASE_DIR} exists)"
echo "Appending i to MASTER_ADDR=${MASTER_ADDR} ..."
MASTER_ADDR="${MASTER_ADDR}i"
echo "=> MASTER_ADDR=${MASTER_ADDR} "
fi
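# For illustration (hypothetical node name): if the first node in the
# allocation is "jwb0001", the rendezvous endpoint below becomes
# "jwb0001i:6000", i.e. the node's address on the InfiniBand network.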
# NCCL related environment variables
# (from https://github.com/OpenGPTX/BigScience-Setup/blob/main/run_scripts/tr1-13B-round1_juwels_pipe.sbatch)
# do not remove: without this workaround the training will hang and nodes will be lost
#export CUDA_LAUNCH_BLOCKING=1
# hide duplicated errors using this hack - will be properly fixed in pt-1.12
export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
# force crashing on nccl issues like hanging broadcast
#export NCCL_ASYNC_ERROR_HANDLING=1
# handle timeouts
export NCCL_IB_TIMEOUT=50
export UCX_RC_TIMEOUT=4s
export NCCL_IB_RETRY_CNT=10
# setting IB for out of band communication
export NCCL_SOCKET_IFNAME=ib0
# NCCL and Torch debug
export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL
# export TORCH_DISTRIBUTED_DEBUG=INFO
# Change to code base directory
cd "$MEGATRON_DEEPSPEED_REPO"
# Log git status (oxw repo + obmd repo)
git -C ${BASE_DIR} branch -vv
git branch -vv
git remote -v
# Rebuild fused kernels
CLEAN_PREV_JIT_BUILD=0
rm -f megatron/fused_kernels/build/lock
((CLEAN_PREV_JIT_BUILD)) && rm -rf megatron/fused_kernels/{build,__pycache__}
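# A stale lock left behind by an interrupted job would otherwise make the JIT
# build of the fused kernels hang; set CLEAN_PREV_JIT_BUILD=1 to force a full
# rebuild instead of reusing the cached build directory.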
OPTIMIZER_ARGS=" \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--adam-eps 1e-8 \
--lr 3.0e-4 \
--min-lr 1e-5 \
--lr-decay-style cosine \
--lr-decay-samples $LR_DECAY_SAMPLES \
--lr-warmup-samples $LR_WARMUP_SAMPLES \
--clip-grad 1.0 \
--weight-decay 1e-1 \
"
# for 20h 1190, for 100h 5990
# --exit-duration-in-mins 1190 \
EXIT_OPTS=" \
--exit-duration-in-mins 5990 \
--kill-switch-path ${KILL_SWITCH_PATH} \
"
# delete old kill switch
rm -f ${KILL_SWITCH_PATH}
# --pad-vocab-size-to 250880 \
# --rampup-batch-size 192 32 9_765_625 \
# --pp-partition-method 'type:transformer|embedding' \
#
#
GPT_ARGS=" \
--from-pretrained-hf $FROM_PRETRAINED \
--pp-partition-method type:transformer|embedding \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--num-attention-heads $NHEADS \
--seq-length $SEQ_LEN \
--max-position-embeddings $MAX_POS_EMBEDDING \
--micro-batch-size $MICRO_BATCH_SIZE \
--global-batch-size $GLOBAL_BATCH_SIZE \
--train-samples $TRAIN_SAMPLES \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \
--init-method-std 0.0048 \
--embed-layernorm \
--checkpoint-activations \
--bf16 \
--seed 42 \
--position-embedding-type alibi \
--abort-on-unmet-fused-kernel-constraints \
$OPTIMIZER_ARGS \
$EXIT_OPTS \
"
# TODO: decide on efficient eval-interval + eval-iters
OUTPUT_ARGS=" \
--log-interval $LOG_INTERVAL \
--save-interval $SAVE_INTERVAL \
--eval-interval $EVAL_INTERVAL \
--eval-iters $EVAL_ITERS \
--tensorboard-dir $TENSORBOARD_PATH \
--tensorboard-queue-size 5 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
"
ZERO_STAGE=0 # important: bf16 must use z0! it implements its own zero stage 1 equivalent
config_json="$BIGS_WORKING_DIR/ds_config.$SLURM_JOBID.json"
# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
cat <<EOT > $config_json
{
"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
"train_batch_size": $GLOBAL_BATCH_SIZE,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"bf16": {
"enabled": true
},
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
EOT
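# With the values computed above, the generated file should read roughly:
#   { "train_micro_batch_size_per_gpu": 32, "train_batch_size": 512,
#     "gradient_clipping": 1.0, "zero_optimization": { "stage": 0 },
#     "bf16": { "enabled": true }, "steps_per_print": 2000,
#     "wall_clock_breakdown": false }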
DEEPSPEED_ARGS=" \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
export LAUNCHER="python -u -m torch.distributed.run \
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
--rdzv_backend c10d \
--max_restarts 0 \
--tee 3 \
"
export CMD=" \
`pwd`/pretrain_gpt.py \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
$GPT_ARGS \
$OUTPUT_ARGS \
--load $CHECKPOINT_PATH \
--save $CHECKPOINT_PATH \
--data-path $TRAIN_DATA_PATH \
--split 998,1,1 \
--data-impl mmap \
--distributed-backend nccl \
$DEEPSPEED_ARGS \
"
echo $LAUNCHER
echo $CMD
# catch signals
SLEEP_BEFORE_KILL=180
trap 'echo "Signal recieved! Saving kill switch to $KILL_SWITCH_PATH and wait for $SLEEP_BEFORE_KILL seconds"; touch $KILL_SWITCH_PATH; sleep $SLEEP_BEFORE_KILL; echo Done' USR1 SIGINT SIGTERM
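# The trap itself does not terminate training: it only writes the kill-switch
# file that --kill-switch-path (see EXIT_OPTS) points to, so the training loop
# can notice it and shut down cleanly; the sleep keeps the batch shell alive
# long enough for the ranks to react.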
(srun --jobid $SLURM_JOB_ID \
bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 \
| tee -a "$LOGS_PATH"/main_log.txt) & PID="$!"
wait "${PID}"
echo "END TIME: $(date)"