Gist by @chengscott, last active July 21, 2023 18:45.
diff --git a/megatron/arguments.py b/megatron/arguments.py
index 4bf1d72..bcea6ce 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -1090,7 +1090,7 @@ def _add_distributed_args(parser):
                        default=False, help='If set, use custom-built ring exchange '
                        'for p2p communications. Note that this option will require '
                        'a custom built image that support ring-exchange p2p.')
-    group.add_argument('--local_rank', type=int, default=None,
+    group.add_argument('--local-rank', type=int, default=None,
                        help='local rank passed from distributed launcher.')
     group.add_argument('--lazy-mpu-init', type=bool, required=False,
                        help='If set to True, initialize_megatron() '
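The hunk above renames Megatron's launcher flag from --local_rank to --local-rank. A minimal sketch of applying it, assuming the diff is saved as local_rank.patch (a filename chosen here for illustration) at the root of a Megatron-LM/Megatron-DeepSpeed checkout:

git apply --check local_rank.patch   # dry run: confirm the hunk still applies to arguments.py
git apply local_rank.patch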
{
  "train_batch_size": GLOBAL_BATCH,
  "train_micro_batch_size_per_gpu": MICRO_BATCH,
  "gradient_accumulation_steps": 1,
  "steps_per_print": 1,
  "wall_clock_breakdown": true,
  "zero_optimization": {
    "stage": ZERO_STAGE,
    "allgather_partitions": true,
    "reduce_scatter": true,
    "allgather_bucket_size": 5e8,
    "overlap_comm": true,
    "contiguous_gradients": true
  },
  "fp16": {
    "enabled": true,
    "initial_scale_power": 12
  }
}
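The uppercase tokens (GLOBAL_BATCH, MICRO_BATCH, ZERO_STAGE) are placeholders, so this template is not valid JSON until the launch script below substitutes them with sed. As a worked example, a hypothetical 2-node run (4 GPUs per node, MICRO_BATCH=1, MPSIZE=1, ZERO_STAGE=1) gives GLOBAL_BATCH = 8 * 1 / 1 = 8, so the rendered config would begin:

{
  "train_batch_size": 8,
  "train_micro_batch_size_per_gpu": 1,
  ...
  "zero_optimization": {
    "stage": 1,
  ...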
#!/bin/bash -xe
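# All run parameters come from the environment; fail fast if any required variable is missing.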
[[ -z "$MODEL_SIZE" ]] && { echo "MODEL_SIZE is not set"; exit 1; }
[[ -z "$NLAYERS" ]] && { echo "NLAYERS is not set"; exit 1; }
[[ -z "$HIDDEN" ]] && { echo "HIDDEN is not set"; exit 1; }
[[ -z "$ATEN_HEADS" ]] && { echo "ATEN_HEADS is not set"; exit 1; }
[[ -z "$PBS_NODEFILE" ]] && { echo "PBS_NODEFILE is not set"; exit 1; }
[[ -z "$ZERO_STAGE" ]] && { echo "ZERO_STAGE is not set"; exit 1; }
[[ -z "$DATA_PREFIX" ]] && { echo "DATA_PREFIX is not set"; exit 1; }
[[ -z "$CHECKPOINT_PATH" ]] && { echo "CHECKPOINT_PATH is not set"; exit 1; }
NNODES=$(wc -l < "${PBS_NODEFILE}")
GPUS_PER_NODE=4
WORLD_SIZE=$((GPUS_PER_NODE * NNODES))
DDP_IMPL="local"
MPSIZE=1
MICRO_BATCH=1
GLOBAL_BATCH=$((WORLD_SIZE * MICRO_BATCH / MPSIZE))
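# Render a per-run DeepSpeed config from the ds.tpl.json template by substituting its placeholders.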
DS_CFG="ds-${NNODES}-${MODEL_SIZE}-zero${ZERO_STAGE}-${DDP_IMPL}.json"
cp ds.tpl.json "${DS_CFG}"
sed -i "s/GLOBAL_BATCH/${GLOBAL_BATCH}/g" "${DS_CFG}"
sed -i "s/MICRO_BATCH/${MICRO_BATCH}/g" "${DS_CFG}"
sed -i "s/ZERO_STAGE/${ZERO_STAGE}/g" "${DS_CFG}"
date -R
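# Launch WORLD_SIZE ranks (GPUS_PER_NODE per node) via MPI; the launching host serves as MASTER_ADDR for rendezvous.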
mpiexec -np "${WORLD_SIZE}" --ppn "${GPUS_PER_NODE}" --hostfile "${PBS_NODEFILE}" \
--env MASTER_ADDR="${HOSTNAME}" \
--env MASTER_PORT=5566 \
python pretrain_gpt.py \
--DDP-impl "${DDP_IMPL}" \
--deepspeed \
--deepspeed_mpi \
--deepspeed_config "${DS_CFG}" \
--zero-stage "${ZERO_STAGE}" \
--tensor-model-parallel-size "${MPSIZE}" \
--no-pipeline-parallel \
--num-layers "${NLAYERS}" \
--hidden-size "${HIDDEN}" \
--num-attention-heads "${ATEN_HEADS}" \
--micro-batch-size "${MICRO_BATCH}" \
--global-batch-size "${GLOBAL_BATCH}" \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 200000 \
--lr-decay-iters 320000 \
--save "${CHECKPOINT_PATH}" \
--num-workers 1 \
--data-path "${DATA_PREFIX}/my-gpt2_text_document" \
--vocab-file "${DATA_PREFIX}/gpt2-vocab.json" \
--merge-file "${DATA_PREFIX}/gpt2-merges.txt" \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
rm -f "${DS_CFG}"
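A hypothetical invocation, assuming the script is saved as run_gpt.sh and runs inside a PBS job where the scheduler has already set PBS_NODEFILE (all values below are illustrative, not taken from the gist):

MODEL_SIZE=1.3b NLAYERS=24 HIDDEN=2048 ATEN_HEADS=16 \
ZERO_STAGE=1 DATA_PREFIX=/path/to/gpt2-data CHECKPOINT_PATH=/path/to/checkpoints \
bash run_gpt.sh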