#!/bin/sh
set -x
# == Swarm training (alpha release) ==
# Setup:
#
# git clone https://github.com/shawwn/gpt-2
# cd gpt-2
# git checkout dev-shard
# python3 download_model.py 117M
# python3 download_model.py 1558M
# sudo pip3 install -r requirements.txt
#
# Also install tensorflow==1.15.0 (or install it inside a venv if you don't want to downgrade your global TensorFlow).
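#
# A minimal venv-based setup might look like this (the ~/gpt2-env path is just an illustrative
# choice; any directory works, and inside the venv you don't need sudo):
#
# python3 -m venv ~/gpt2-env
# . ~/gpt2-env/bin/activate
# pip3 install tensorflow==1.15.0
# pip3 install -r requirements.txt
#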
# To prepare a dataset, you don't need to do anything! You can pass the dataset as --dataset foo.txt
#
# However, for performance, when training on dozens or hundreds of TPUs, you will want to tokenize the dataset
# since the tokenizer can only generate about 16k tokens per second. (That's about 4 TPUv3-8's for 117M, or 64 TPUv3-8's for 1.5B.)
#
# To tokenize the dataset, run:
#
# python3 tokenize_dataset.py foo.txt foo.txt.npz
#
# then pass in --dataset foo.txt.npz to train_multi.py.
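#
# If you have several raw text files, the same script can pre-tokenize each one in a loop
# (the data/ directory is just a hypothetical layout):
#
# for f in data/*.txt; do
#   python3 tokenize_dataset.py "$f" "${f}.npz"
# done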
#
# ~~~WARNING~~~: The following scripts create TPUv3-8's in zone europe-west4-a. TFRC doesn't grant access to these TPUs by default.
# To train using the default 100 TPUv2-8's, replace "europe-west4-a" with "us-central1-f" in all following lines, and change
# --accelerator-type "v3-8" to --accelerator-type "v2-8".
#
# Note that training GPT-2 1.5B might not work on TPUv2-8's due to memory limitations: right now the code is set up to
# use only 3 out of 8 TPU cores, and each TPUv2 core has half the HBM of a TPUv3 core. However, there are two ways around this:
#
# 1. You can create up to 5 non-preemptible TPUv3-8's in us-central1-f, and create a swarm consisting of those. (Be sure to
# **remove** the --preemptible flag when creating the five TPUs! A sketch of such a creation script follows after this warning.)
#
# 2. Bug me on twitter to add a command line option so that you can train on 100 preemptible TPUv2-8's by disabling the multicore training.
#
# (This is an alpha release, so bear with me. The final release will support all these configurations without much effort.)
# ~~~END WARNING~~~
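#
# For option 1 above, a sketch of the corresponding creation script (call it `tpu-create-us`; the
# tpus${i} names and the 10.48.${i}.0/29 ranges are just illustrative choices, everything else
# mirrors the `tpu-create-eu` script below, minus --preemptible):
#
# #!/bin/sh
# set -ex
# i=${1}
# shift 1
# exec gcloud compute tpus create tpus${i} --zone us-central1-f --network default --range 10.48.${i}.0/29 --version 1.15 --accelerator-type "v3-8" "$@"
#
# Then create the five TPUs with:
#
# for i in {0..4}; do tpu-create-us $i --async & done
#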
# ~~~ CRUCIAL WARNING ~~~ YOU MUST READ THIS SECTION!! ~~~
#
# Make **ABSOLUTELY CERTAIN** that your VM is in the *exact same region* as the TPUs. If your VM is in a different region,
# then swarm training will accrue hundreds of dollars of bandwidth charges very quickly! I learned this the hard way.
#
# YOU HAVE BEEN WARNED.
#
# On the other hand, this is pretty much the only thing you have to worry about. Everything else is perfectly safe.
# The worst that can happen other than this is that the code won't work.
#
# ~~~ END CRUCIAL WARNING ~~~
# Save the following script as `tpu-create-eu`:
#
# #!/bin/sh
# set -ex
# i=${1}
# shift 1
# exec gcloud compute tpus create tpeu${i} --zone europe-west4-a --network default --range 10.49.${i}.0/29 --version 1.15 --accelerator-type "v3-8" --preemptible "$@"
#
# Now you can create a bunch of TPUs by running:
#
# for i in {0..19}; do tpu-create-eu $i --async & done
#
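# TPU creation is asynchronous, so it helps to keep an eye on the list until every node reports
# READY before moving on (just a convenience; any way of polling the list works):
#
# watch -n 30 'gcloud compute tpus list --zone europe-west4-a'
#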
# Each TPU will preempt after 24 hours. After that, you'll need to delete them and re-create them. Save this script as `tpu-delete-eu`:
#
# #!/bin/sh
# set -ex
# i=${1}
# shift 1
# exec gcloud compute tpus delete --zone europe-west4-a tpeu${i} "$@"
#
# Now you can delete the TPUs by running:
#
# for i in {0..19}; do tpu-delete-eu $i --quiet --async & done
#
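# Since a preempted TPU just needs a delete-then-create cycle, one convenient wrapper is a
# (hypothetical) `tpu-recreate-eu` script that chains the two helpers above; it assumes both are
# on your PATH:
#
# #!/bin/sh
# set -ex
# i=${1}
# shift 1
# tpu-delete-eu ${i} --quiet
# exec tpu-create-eu ${i} "$@"
#
# for i in {0..19}; do tpu-recreate-eu $i & done
#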
# After creating the TPUs, fill in their IP addresses below.
# (The $targets variable should end up as a comma-separated list of grpc:// endpoints; a loop that builds the same list is sketched just after it.)
#
# You can get your TPU ip addresses by running:
#
# gcloud compute tpus list --zone europe-west4-a
#
# (which I've aliased to `tpu-status-eu` for convenience.)
#
# If you run into any problems or have any questions, message me or DM me on twitter:
#
# https://twitter.com/theshawwn
#
targets=
targets="${targets}grpc://10.49.0.2:8470"
targets="${targets},grpc://10.49.1.2:8470"
targets="${targets},grpc://10.49.2.2:8470"
targets="${targets},grpc://10.49.3.2:8470"
targets="${targets},grpc://10.49.4.2:8470"
targets="${targets},grpc://10.49.5.2:8470"
targets="${targets},grpc://10.49.6.2:8470"
targets="${targets},grpc://10.49.7.2:8470"
targets="${targets},grpc://10.49.8.2:8470"
targets="${targets},grpc://10.49.9.2:8470"
targets="${targets},grpc://10.49.10.2:8470"
targets="${targets},grpc://10.49.11.2:8470"
targets="${targets},grpc://10.49.12.2:8470"
targets="${targets},grpc://10.49.13.2:8470"
targets="${targets},grpc://10.49.14.2:8470"
targets="${targets},grpc://10.49.15.2:8470"
targets="${targets},grpc://10.49.16.2:8470"
targets="${targets},grpc://10.49.17.2:8470"
targets="${targets},grpc://10.49.18.2:8470"
targets="${targets},grpc://10.49.19.2:8470"
# 117M
exec python3 -m pdb -c continue train_multi.py --targets "$targets" --dataset ./yourdataset.txt.npz --run_name yourmodelsmall --optimizer adam --model_name 117M --batch_size 28 --learning_rate 0.000055 --only_train_transformer_layers --dtype float32 --device 0 --skip_cores 1 --max_cores 7 --colocate_gradients --memory_saving_gradients --allow_soft_placement --init_tpu "$@"
# 1.5B
#exec python3 -m pdb -c continue train_multi.py --targets "$targets" --dataset ./yourdataset.txt.npz --run_name yourmodelxl --optimizer adam --model_name 1558M --batch_size 6 --learning_rate 0.000055 --only_train_transformer_layers --dtype float32 --device 1 --skip_cores 4 --max_cores 3 --colocate_gradients --memory_saving_gradients --allow_soft_placement --init_tpu "$@"