@bigsnarfdude
Last active July 14, 2024 22:56
GPT-2 1.5B (1558M) training script for llm.c
# install cudnn so we can use FlashAttention and run fast (optional)
# https://developer.nvidia.com/cudnn-downloads
# for me: CUDA 12 (check with `nvcc --version`) on Linux x86_64, Ubuntu 22.04
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get -y install libcudnn9-dev-cuda-12
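# (optional) sanity check that the cuDNN dev package actually landed;
# the package name to grep for is taken from the apt install above
dpkg -l | grep -i cudnn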
# "install" cudnn-frontend to ~/
git clone https://github.com/NVIDIA/cudnn-frontend.git
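# note (my understanding, worth double-checking against llm.c's Makefile): the build
# looks for the cudnn-frontend headers under ~/cudnn-frontend/include by default,
# so clone into your home directory as above, or adjust the Makefile path
ls ~/cudnn-frontend/include/cudnn_frontend.h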
# install MPI (optional, if you intend to use multiple GPUs)
# (you might also have to install NVIDIA NCCL if it doesn't come with your setup)
sudo apt -y install openmpi-bin openmpi-doc libopenmpi-dev
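# (optional) quick check that MPI is installed and all GPUs are visible
mpirun --version
nvidia-smi -L   # should list the 8 GPUs used by the -np 8 run below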
# download and enter llm.c repo
git clone https://github.com/karpathy/llm.c.git
cd llm.c
# download the "starter pack" (~1GB download)
# contains GPT2-124M weights (used in tests), tokenizer, and eval data .bin files
./dev/download_starter_pack.sh
# download the training dataset (FineWeb-Edu, 100B tokens) as .bin data shards
# note: this is a total of 1001 data shards. If you only want to test things
# out and don't want to do an actual run, feel free to append the number of
# training shards to download (e.g. for just 10 shards: ./edu_fineweb.sh 10)
# the full dataset is ~200GB; we store it here in the dev/data directory.
cd dev/data
./edu_fineweb.sh
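# (optional) sanity-check the download from here in dev/data; per the notes above
# we expect roughly 1001 .bin shards and ~200GB on disk
ls edu_fineweb100B/edu_fineweb_*.bin | wc -l
du -sh edu_fineweb100B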
# compile (~1 min the first time, mostly for cuDNN; a few seconds thereafter)
cd ../../
make train_gpt2cu USE_CUDNN=1
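# (optional) smoke-test the build against the 124M weights from the starter pack
# before committing to a 24-hour run; test_gpt2cu is the test target in llm.c's
# Makefile as of writing -- check the Makefile if the target name has changed
make test_gpt2cu USE_CUDNN=1
./test_gpt2cu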
# and train! (wait 24 hours here)
mpirun -np 8 ./train_gpt2cu \
-i "dev/data/edu_fineweb100B/edu_fineweb_train_*.bin" \
-j "dev/data/edu_fineweb100B/edu_fineweb_val_*.bin" \
-o "log_gpt2_1558M" \
-v 250 -s 300000 -g 384 \
-h 1 \
-b 16 -t 1024 \
-d 1048576 \
-r 0 \
-z 1 \
-c 0.1 \
-k "cosine" \
-l 0.0006 \
-q 0.1 \
-u 700 \
-n 2000 \
-x 32000 \
-ge 1 \
-y 1 \
-e "d48"
num_parameters: 1557686400 => bytes: 3115372800
allocated 2971 MiB for model parameters
batch_size B=16 * seq_len T=1024 * num_processes=8 and total_batch_size=1048576
=> setting grad_accum_steps=8
created directory: log_gpt2_1558M
allocating 40409 MiB for activations
val loss 11.129390
allocating 2971 MiB for parameter gradients
allocating 742 MiB for AdamW optimizer state m
allocating 742 MiB for AdamW optimizer state v
allocating 742 MiB for master copy of params
step 1/32000 | loss 11.133732 (+nanz)| norm 52.9732 (+nanz)| lr 8.57e-07 | 3056.36 ms | 42.6% bf16 MFU | 343080 tok/s
step 2/32000 | loss 10.539388 (+nanz)| norm 43.5996 (+nanz)| lr 1.71e-06 | 2747.19 ms | 47.4% bf16 MFU | 381690 tok/s
step 3/32000 | loss 9.894109 (+nanz)| norm 23.2229 (+nanz)| lr 2.57e-06 | 2753.25 ms | 47.3% bf16 MFU | 381259 tok/s
step 4/32000 | loss 9.566241 (+nanz)| norm 28.4920 (+nanz)| lr 3.43e-06 | 2741.47 ms | 47.5% bf16 MFU | 381690 tok/s
step 5/32000 | loss 9.482848 (+nanz)| norm 23.7817 (+nanz)| lr 4.29e-06 | 2752.07 ms | 47.3% bf16 MFU | 381507 tok/s
step 6/32000 | loss 9.332832 (+nanz)| norm 15.9113 (+nanz)| lr 5.14e-06 | 2751.01 ms | 47.3% bf16 MFU | 381431 tok/s
step 7/32000 | loss 9.165650 (+nanz)| norm 10.5941 (+nanz)| lr 6.00e-06 | 2753.03 ms | 47.3% bf16 MFU | 381327 tok/s
step 8/32000 | loss 9.132234 (+nanz)| norm 16.2733 (+nanz)| lr 6.86e-06 | 2748.91 ms | 47.3% bf16 MFU | 381348 tok/s
step 9/32000 | loss 9.097384 (+nanz)| norm 12.1342 (+nanz)| lr 7.71e-06 | 2748.73 ms | 47.3% bf16 MFU | 381367 tok/s
step 10/32000 | loss 9.072879 (+nanz)| norm 10.5923 (+nanz)| lr 8.57e-06 | 2749.40 ms | 47.3% bf16 MFU | 381369 tok/s
...
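# back-of-the-envelope checks on the log above (approximate):
#   tokens per micro-step = B * T * num_processes = 16 * 1024 * 8 = 131,072
#   grad_accum_steps = total_batch_size / 131,072 = 1,048,576 / 131,072 = 8 (matches the log)
#   total tokens = 32,000 steps * 1,048,576 tokens/step ~= 33.6B tokens
#   wall clock ~= 33.6e9 tokens / ~381,000 tok/s ~= 88,000 s ~= 24.5 hours, hence "wait 24 hours here"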