# Download the FineWeb-Edu "sample-100BT" split (a ~100B-token sample, per the
# config name) from the Hugging Face Hub. Used as training data for the llm.c
# runs below. NOTE(review): this materializes/downloads the full split — it is
# a large transfer; presumably run once on the training machine.
from datasets import load_dataset
fineweb100b = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-100BT", split="train")
# GPT-2 124M training run on the FineWeb10B sample (single process, no MPI).
# Flags are collected in an array so each one can be annotated; the resulting
# command line is identical to passing them inline.
# Flag meanings below are per llm.c's train_gpt2.cu CLI — confirm against its usage text.
run_124m_flags=(
    -i "dev/data/fineweb10B/fineweb_train_*.bin"  # training data shard glob (quoted: binary expands it)
    -j "dev/data/fineweb10B/fineweb_val_*.bin"    # validation data shard glob
    -o log124M                                    # output/log directory
    -e "d12"                                      # model preset: depth 12 (GPT-2 124M)
    -b 64 -t 1024                                 # micro-batch size 64, sequence length 1024
    -d 524288                                     # total batch size in tokens per step (2^19)
    -r 1                                          # recompute (activation checkpointing) on — TODO confirm
    -z 1                                          # ZeRO-style optimizer sharding stage — TODO confirm
    -c 0.1                                        # weight decay
    -l 0.0006                                     # max learning rate
    -q 0.0                                        # final LR as a fraction of max — TODO confirm
    -u 700                                        # LR warmup iterations
    -n 5000                                       # checkpoint every N steps — TODO confirm
    -v 250                                        # validate every N steps
    -s 20000                                      # sample/generate every N steps
    -h 1                                          # HellaSwag eval enabled — TODO confirm
)
./train_gpt2cu "${run_124m_flags[@]}"
# GPT-2 124M long run ("300B"-scale token budget) on FineWeb100B, 2 GPUs via MPI.
# Same binary as above; flags gathered in an array purely for readability —
# the expanded command line is unchanged.
run_124m_300b_flags=(
    -i "dev/data/fineweb100B/fineweb_train_*.bin"  # FineWeb100B training shards
    -j "dev/data/fineweb100B/fineweb_val_*.bin"    # FineWeb100B validation shards
    -o log124M_300B                                # output/log directory
    -v 250 -s 20000 -g 144                         # val every 250, sample every 20000, gen length 144 — TODO confirm -g
    -h 1                                           # HellaSwag eval enabled — TODO confirm
    -b 64 -t 1024                                  # micro-batch 64, sequence length 1024
    -d 524288                                      # tokens per optimizer step (2^19)
    -r 0                                           # recompute off (vs 1 in the 10B run above) — TODO confirm
    -z 1                                           # ZeRO-style sharding stage — TODO confirm
    -c 0.1                                         # weight decay
    -l 0.0006                                      # max learning rate
    -q 0.0                                         # final LR fraction — TODO confirm
    -u 700                                         # warmup iterations
    -n 10000                                       # checkpoint interval — TODO confirm
    -y 1                                           # resume from checkpoint if present — TODO confirm
    -x 565950                                      # total training steps
    -e "d12"                                       # model preset: depth 12 (124M)
)
mpirun -np 2 ./train_gpt2cu "${run_124m_300b_flags[@]}"
# GPT-2 350M run on the FineWeb10B sample, 8 GPUs via MPI. Matches the training
# log below (step N/60000). Array form keeps the exact same command line while
# allowing per-flag notes.
run_350m_flags=(
    -i "dev/data/fineweb10B/fineweb_train_*.bin"  # training shards
    -j "dev/data/fineweb10B/fineweb_val_*.bin"    # validation shards
    -o log350M                                    # output/log directory
    -v 250 -s 100000 -g 144                       # val every 250, sample every 100000, gen length 144 — TODO confirm -g
    -h 1                                          # HellaSwag eval enabled — TODO confirm
    -b 8 -t 1024                                  # micro-batch 8 (bigger model), sequence length 1024
    -d 524288                                     # tokens per optimizer step (2^19)
    -r 0                                          # recompute off — TODO confirm
    -z 1                                          # ZeRO-style sharding stage — TODO confirm
    -c 0.1                                        # weight decay
    -l 0.0003                                     # max LR (halved vs the 124M runs)
    -q 0.0                                        # final LR fraction — TODO confirm
    -u 700                                        # warmup iterations
    -n 2000                                       # checkpoint interval — TODO confirm
    -x 60000                                      # total training steps (log shows step N/60000)
    -y 1                                          # resume from checkpoint if present — TODO confirm
    -e "d24"                                      # model preset: depth 24 (GPT-2 350M)
)
mpirun -np 8 ./train_gpt2cu "${run_350m_flags[@]}"
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.78 Driver Version: 550.78 CUDA Version: 12.4 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA GeForce RTX 4070 ... Off | 00000000:02:00.0 On | N/A |
| 50% 64C P2 272W / 285W | 13911MiB / 16376MiB | 100% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| 0 N/A N/A 12994 C ./train_gpt2cu 13904MiB |
+-----------------------------------------------------------------------------------------+
step 251/60000 | train loss 6.302280 | norm 1.1727 | lr 1.08e-04 | 16401.14 ms | 24.9% A100 fp16 MFU | 31968 tok/s
step 252/60000 | train loss 6.254554 | norm 1.4881 | lr 1.08e-04 | 16400.51 ms | 24.9% A100 fp16 MFU | 31968 tok/s
step 253/60000 | train loss 6.251244 | norm 1.2173 | lr 1.08e-04 | 16399.55 ms | 24.9% A100 fp16 MFU | 31968 tok/s
step 254/60000 | train loss 6.307840 | norm 1.6588 | lr 1.09e-04 | 16401.02 ms | 24.9% A100 fp16 MFU | 31968 tok/s
step 255/60000 | train loss 6.336817 | norm 1.0084 | lr 1.09e-04 | 16402.46 ms | 24.9% A100 fp16 MFU | 31967 tok/s
step 256/60000 | train loss 6.323288 | norm 1.7937 | lr 1.10e-04 | 16400.88 ms | 24.9% A100 fp16 MFU | 31967 tok/s
step 257/60000 | train loss 6.339058 | norm 1.2263 | lr 1.10e-04 | 16399.45 ms | 24.9% A100 fp16 MFU | 31968 tok/s
step 258/60000 | train loss 6.288810 | norm 0.9651 | lr 1.11e-04 | 16400.91 ms | 24.9% A100 fp16 MFU | 31968 tok/s
step 259/60000 | train loss 6.263037 | norm 1.1670 | lr 1.11e-04 | 16401.31 ms | 24.9% A100 fp16 MFU | 31967 tok/s
step 260/60000 | train loss 6.341669 | norm 1.3367 | lr 1.11e-04 | 16400.97 ms | 24.9% A100 fp16 MFU | 31967 tok/s
step 261/60000 | train loss 6.278675 | norm 1.7383 | lr 1.12e-04 | 16401.04 ms | 24.9% A100 fp16 MFU | 31967 tok/s
step 262/60000 | train loss 6.262439 | norm 0.9100 | lr 1.12e-04 | 16401.74 ms | 24.9% A100 fp16 MFU | 31967 tok/s
step 263/60000 | train loss 6.255550 | norm 1.0782 | lr 1.13e-04 | 16400.43 ms | 24.9% A100 fp16 MFU | 31967 tok/s
step 264/60000 | train loss 6.247169 | norm 1.5063 | lr 1.13e-04 | 16400.03 ms | 24.9% A100 fp16 MFU | 31967 tok/s
step 265/60000 | train loss 6.312371 | norm 0.9950 | lr 1.14e-04 | 16400.39 ms | 24.9% A100 fp16 MFU | 31967 tok/s
step 266/60000 | train loss 6.287064 | norm 1.1452 | lr 1.14e-04 | 16401.79 ms | 24.9% A100 fp16 MFU | 31967 tok/s
step 267/60000 | train loss 6.290064 | norm 1.1453 | lr 1.14e-04 | 16402.01 ms | 24.9% A100 fp16 MFU | 31967 tok/s
step 268/60000 | train loss 6.268078 | norm 1.3768 | lr 1.15e-04 | 16400.70 ms | 24.9% A100 fp16 MFU | 31967 tok/s
step 269/60000 | train loss 6.224830 | norm 1.4903 | lr 1.15e-04 | 16400.79 ms | 24.9% A100 fp16 MFU | 31967 tok/s
step 270/60000 | train loss 6.239734 | norm 1.3091 | lr 1.16e-04 | 16400.57 ms | 24.9% A100 fp16 MFU | 31967 tok/s
step 271/60000 | train loss 6.287425 | norm 1.3941 | lr 1.16e-04 | 16400.86 ms | 24.9% A100 fp16 MFU | 31967 tok/s
step 272/60000 | train loss 6.185336 | norm 1.1778 | lr 1.17e-04 | 16399.42 ms | 24.9% A100 fp16 MFU | 31967 tok/s
step 273/60000 | train loss 6.195509 | norm 1.3949 | lr 1.17e-04 | 16401.39 ms | 24.9% A100 fp16 MFU | 31967 tok/s
step 274/60000 | train loss 6.178558 | norm 1.0532 | lr 1.17e-04 | 16401.70 ms | 24.9% A100 fp16 MFU | 31967 tok/s
step 275/60000 | train loss 6.331632 | norm 1.3139 | lr 1.18e-04 | 16400.40 ms | 24.9% A100 fp16 MFU | 31967 tok/s
step 276/60000 | train loss 6.250790 | norm 1.2131 | lr 1.18e-04 | 16401.06 ms | 24.9% A100 fp16 MFU | 31967 tok/s
step 277/60000 | train loss 6.163811 | norm 0.8408 | lr 1.19e-04 | 16400.38 ms | 24.9% A100 fp16 MFU | 31967 tok/s
step 278/60000 | train loss 6.192279 | norm 0.9928 | lr 1.19e-04 | 16402.65 ms | 24.9% A100 fp16 MFU | 31967 tok/s
Estimated time to finish: 11.39 days for all 60,000 iterations at the step time shown above.
```
Last active
June 4, 2024 17:55
-
-
Save bigsnarfdude/4c1455b1db4b3f4c681fa9b8827367ec to your computer and use it in GitHub Desktop.
llm.c-fineweb-larger-models.md
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment