Last active
July 14, 2024 22:56
-
-
Save bigsnarfdude/1f22a71c3b6ab35bf94e89b06a13571f to your computer and use it in GitHub Desktop.
1.8B GPT2 script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# End-to-end setup + training script: installs cuDNN and MPI, fetches llm.c and
# the FineWeb-Edu data shards, builds the CUDA trainer and launches an 8-process
# training run. Intended to be run as a script (e.g. `bash script.sh`).
#
# Fail fast: every step depends on the previous one succeeding and the final
# training run takes ~24 hours, so abort on the first error instead of carrying
# on with a half-finished setup.
set -euo pipefail

# install cudnn so we can use FlashAttention and run fast (optional)
# https://developer.nvidia.com/cudnn-downloads
# for me, CUDA 12 (run `nvcc --version`) running on Linux x86_64 Ubuntu 22.04
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get -y install libcudnn9-dev-cuda-12

# "install" cudnn-frontend to ~/
# Clone to an explicit path under $HOME so the checkout really lands in ~/ no
# matter which directory the script is started from; skip if already cloned so
# the script can be re-run.
[[ -d "${HOME}/cudnn-frontend" ]] \
  || git clone https://github.com/NVIDIA/cudnn-frontend.git "${HOME}/cudnn-frontend"

# install MPI (optional, if you intend to use multiple GPUs)
# (you might also have to install NVIDIA NCCL if it doesn't come with your setup)
# apt-get (not apt) for consistency with the installs above.
sudo apt-get -y install openmpi-bin openmpi-doc libopenmpi-dev

# download and enter llm.c repo (skip the clone if it is already present)
[[ -d llm.c ]] || git clone https://github.com/karpathy/llm.c.git
cd llm.c

# download the "starter pack" (~1GB download)
# contains GPT2-124M weights (used in tests), tokenizer, eval data .bin s
./dev/download_starter_pack.sh

# download the training dataset (FineWeb-Edu 100B token) .bin data shards
# note: this is a total of 1001 data shards. If you only want to test things
# out and don't want to do an actual run, feel free to append the number of
# training shards to download (e.g. for just 10 shards: ./edu_fineweb.sh 10)
# the full dataset is ~200GB, we can store it here in dev/data directory.
# Run in a subshell so the working directory is back at the repo root
# afterwards without a fragile `cd ../../`.
(
  cd dev/data
  ./edu_fineweb.sh
)

# compile (~1 min 1st time for cuDNN mostly, few sec from then on)
make train_gpt2cu USE_CUDNN=1

# and train! (wait 24 hours here)
# The -i/-j patterns are quoted on purpose: the shell must pass the glob
# through unexpanded (presumably train_gpt2cu expands it itself — confirm
# against the llm.c docs).
mpirun -np 8 ./train_gpt2cu \
  -i "dev/data/edu_fineweb100B/edu_fineweb_train_*.bin" \
  -j "dev/data/edu_fineweb100B/edu_fineweb_val_*.bin" \
  -o "log_gpt2_1558M" \
  -v 250 -s 300000 -g 384 \
  -h 1 \
  -b 16 -t 1024 \
  -d 1048576 \
  -r 0 \
  -z 1 \
  -c 0.1 \
  -k "cosine" \
  -l 0.0006 \
  -q 0.1 \
  -u 700 \
  -n 2000 \
  -x 32000 \
  -ge 1 \
  -y 1 \
  -e "d48"
num_parameters: 1557686400 => bytes: 3115372800
allocated 2971 MiB for model parameters
batch_size B=16 * seq_len T=1024 * num_processes=8 and total_batch_size=1048576
=> setting grad_accum_steps=8
created directory: log_gpt2_1558M
allocating 40409 MiB for activations
val loss 11.129390
allocating 2971 MiB for parameter gradients
allocating 742 MiB for AdamW optimizer state m
allocating 742 MiB for AdamW optimizer state v
allocating 742 MiB for master copy of params
step 1/32000 | loss 11.133732 (+nanz)| norm 52.9732 (+nanz)| lr 8.57e-07 | 3056.36 ms | 42.6% bf16 MFU | 343080 tok/s
step 2/32000 | loss 10.539388 (+nanz)| norm 43.5996 (+nanz)| lr 1.71e-06 | 2747.19 ms | 47.4% bf16 MFU | 381690 tok/s
step 3/32000 | loss 9.894109 (+nanz)| norm 23.2229 (+nanz)| lr 2.57e-06 | 2753.25 ms | 47.3% bf16 MFU | 381259 tok/s
step 4/32000 | loss 9.566241 (+nanz)| norm 28.4920 (+nanz)| lr 3.43e-06 | 2741.47 ms | 47.5% bf16 MFU | 381690 tok/s
step 5/32000 | loss 9.482848 (+nanz)| norm 23.7817 (+nanz)| lr 4.29e-06 | 2752.07 ms | 47.3% bf16 MFU | 381507 tok/s
step 6/32000 | loss 9.332832 (+nanz)| norm 15.9113 (+nanz)| lr 5.14e-06 | 2751.01 ms | 47.3% bf16 MFU | 381431 tok/s
step 7/32000 | loss 9.165650 (+nanz)| norm 10.5941 (+nanz)| lr 6.00e-06 | 2753.03 ms | 47.3% bf16 MFU | 381327 tok/s
step 8/32000 | loss 9.132234 (+nanz)| norm 16.2733 (+nanz)| lr 6.86e-06 | 2748.91 ms | 47.3% bf16 MFU | 381348 tok/s
step 9/32000 | loss 9.097384 (+nanz)| norm 12.1342 (+nanz)| lr 7.71e-06 | 2748.73 ms | 47.3% bf16 MFU | 381367 tok/s
step 10/32000 | loss 9.072879 (+nanz)| norm 10.5923 (+nanz)| lr 8.57e-06 | 2749.40 ms | 47.3% bf16 MFU | 381369 tok/s
...
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment