Gist by @chengscott, last active July 21, 2023 18:45.
diff --git a/megatron/arguments.py b/megatron/arguments.py
index 4bf1d72..bcea6ce 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -1090,7 +1090,7 @@ def _add_distributed_args(parser):
                        default=False, help='If set, use custom-built ring exchange '
                        'for p2p communications. Note that this option will require '
                        'a custom built image that support ring-exchange p2p.')
-    group.add_argument('--local_rank', type=int, default=None,
+    group.add_argument('--local-rank', type=int, default=None,
                        help='local rank passed from distributed launcher.')
     group.add_argument('--lazy-mpu-init', type=bool, required=False,
                        help='If set to True, initialize_megatron() '
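The hunk above renames Megatron's launcher flag from --local_rank to --local-rank. A minimal sketch of applying it, assuming the diff is saved as local_rank.patch (a filename chosen here for illustration) at the root of a Megatron-LM/Megatron-DeepSpeed checkout:

git apply --check local_rank.patch   # dry run: confirm the hunk still applies to arguments.py
git apply local_rank.patch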
{
  "train_batch_size": GLOBAL_BATCH,
  "train_micro_batch_size_per_gpu": MICRO_BATCH,
  "gradient_accumulation_steps": 1,
  "steps_per_print": 1,
  "wall_clock_breakdown": true,
  "zero_optimization": {
    "stage": ZERO_STAGE,
    "allgather_partitions": true,
    "reduce_scatter": true,
    "allgather_bucket_size": 5e8,
    "overlap_comm": true,
    "contiguous_gradients": true
  },
  "fp16": {
    "enabled": true,
    "initial_scale_power": 12
  }
}
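The uppercase tokens (GLOBAL_BATCH, MICRO_BATCH, ZERO_STAGE) are placeholders, so this template is not valid JSON until the launch script below substitutes them with sed. As a worked example, a hypothetical 2-node run (4 GPUs per node, MICRO_BATCH=1, MPSIZE=1, ZERO_STAGE=1) gives GLOBAL_BATCH = 8 * 1 / 1 = 8, so the rendered config would begin:

{
  "train_batch_size": 8,
  "train_micro_batch_size_per_gpu": 1,
  ...
  "zero_optimization": {
    "stage": 1,
  ...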
#!/bin/bash -xe
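# All run parameters come from the environment; fail fast if any required variable is missing.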
[[ -z "$MODEL_SIZE" ]] && { echo "MODEL_SIZE is not set"; exit 1; }
[[ -z "$NLAYERS" ]] && { echo "NLAYERS is not set"; exit 1; }
[[ -z "$HIDDEN" ]] && { echo "HIDDEN is not set"; exit 1; }
[[ -z "$ATEN_HEADS" ]] && { echo "ATEN_HEADS is not set"; exit 1; }
[[ -z "$PBS_NODEFILE" ]] && { echo "PBS_NODEFILE is not set"; exit 1; }
[[ -z "$ZERO_STAGE" ]] && { echo "ZERO_STAGE is not set"; exit 1; }
[[ -z "$DATA_PREFIX" ]] && { echo "DATA_PREFIX is not set"; exit 1; }
[[ -z "$CHECKPOINT_PATH" ]] && { echo "CHECKPOINT_PATH is not set"; exit 1; }
NNODES=$(wc -l < "${PBS_NODEFILE}")
GPUS_PER_NODE=4
WORLD_SIZE=$((GPUS_PER_NODE * NNODES))
DDP_IMPL="local"
MPSIZE=1
MICRO_BATCH=1
GLOBAL_BATCH=$((WORLD_SIZE * MICRO_BATCH / MPSIZE))
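# Render a per-run DeepSpeed config from the ds.tpl.json template by substituting its placeholders.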
DS_CFG="ds-${NNODES}-${MODEL_SIZE}-zero${ZERO_STAGE}-${DDP_IMPL}.json"
cp ds.tpl.json "${DS_CFG}"
sed -i "s/GLOBAL_BATCH/${GLOBAL_BATCH}/g" "${DS_CFG}"
sed -i "s/MICRO_BATCH/${MICRO_BATCH}/g" "${DS_CFG}"
sed -i "s/ZERO_STAGE/${ZERO_STAGE}/g" "${DS_CFG}"
date -R
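# Launch WORLD_SIZE ranks (GPUS_PER_NODE per node) via MPI; the launching host serves as MASTER_ADDR for rendezvous.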
mpiexec -np "${WORLD_SIZE}" --ppn "${GPUS_PER_NODE}" --hostfile "${PBS_NODEFILE}" \
--env MASTER_ADDR="${HOSTNAME}" \
--env MASTER_PORT=5566 \
python pretrain_gpt.py \
--DDP-impl "${DDP_IMPL}" \
--deepspeed \
--deepspeed_mpi \
--deepspeed_config "${DS_CFG}" \
--zero-stage "${ZERO_STAGE}" \
--tensor-model-parallel-size "${MPSIZE}" \
--no-pipeline-parallel \
--num-layers "${NLAYERS}" \
--hidden-size "${HIDDEN}" \
--num-attention-heads "${ATEN_HEADS}" \
--micro-batch-size "${MICRO_BATCH}" \
--global-batch-size "${GLOBAL_BATCH}" \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 200000 \
--lr-decay-iters 320000 \
--save "${CHECKPOINT_PATH}" \
--num-workers 1 \
--data-path "${DATA_PREFIX}/my-gpt2_text_document" \
--vocab-file "${DATA_PREFIX}/gpt2-vocab.json" \
--merge-file "${DATA_PREFIX}/gpt2-merges.txt" \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
rm -f "${DS_CFG}"
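A hypothetical invocation, assuming the script is saved as run_gpt.sh and runs inside a PBS job where the scheduler has already set PBS_NODEFILE (all values below are illustrative, not taken from the gist):

MODEL_SIZE=1.3b NLAYERS=24 HIDDEN=2048 ATEN_HEADS=16 \
ZERO_STAGE=1 DATA_PREFIX=/path/to/gpt2-data CHECKPOINT_PATH=/path/to/checkpoints \
bash run_gpt.sh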