ZeRO-3 BERT Training (with Sampled Dataset) Output Log with NCCL_DEBUG=INFO
(ds-huggingface) root@jupyter-beomi:~/2021.05.17.DeepSpeed_Huggingface# ./zero3_gpu_run_mlm.sh | |
[2021-05-16 22:44:59,482] [WARNING] [runner.py:122:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. | |
[2021-05-16 22:44:59,520] [INFO] [runner.py:360:main] cmd = /home/jovyan/anaconda3/envs/ds-huggingface/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMF19 --master_addr=127.0.0.1 --master_port=29500 run_mlm.py --seed 42 --model_type bert --tokenizer_name beomi/KcELECTRA-base --train_file ./sampled_20190101_20200611_v2.txt --num_train_epochs 2 --per_device_train_batch_size 32 --per_device_eval_batch_size 32 --do_train --output_dir ./test-bert-zero3 --fp16 --logging_first_step --max_seq_length 300 --deepspeed ./ds_zero3_1gpu.json | |
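(Note: the contents of ./zero3_gpu_run_mlm.sh are not included in this log. A minimal sketch that would reproduce the launcher command logged above, assuming the standard `deepspeed` CLI entry point with a single GPU selected via --num_gpus, is:)

    #!/usr/bin/env bash
    # Sketch only: reconstructed from the "cmd = ..." line above, not the original script.
    export NCCL_DEBUG=INFO        # produces the NCCL INFO lines seen later in this log
    deepspeed --num_gpus=1 run_mlm.py \
        --seed 42 \
        --model_type bert \
        --tokenizer_name beomi/KcELECTRA-base \
        --train_file ./sampled_20190101_20200611_v2.txt \
        --num_train_epochs 2 \
        --per_device_train_batch_size 32 \
        --per_device_eval_batch_size 32 \
        --do_train \
        --output_dir ./test-bert-zero3 \
        --fp16 \
        --logging_first_step \
        --max_seq_length 300 \
        --deepspeed ./ds_zero3_1gpu.json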
[2021-05-16 22:45:01,372] [INFO] [launch.py:73:main] 0 NCCL_DEBUG INFO | |
[2021-05-16 22:45:01,372] [INFO] [launch.py:80:main] WORLD INFO DICT: {'localhost': [0]} | |
[2021-05-16 22:45:01,372] [INFO] [launch.py:86:main] nnodes=1, num_local_procs=1, node_rank=0 | |
[2021-05-16 22:45:01,372] [INFO] [launch.py:101:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0]}) | |
[2021-05-16 22:45:01,372] [INFO] [launch.py:102:main] dist_world_size=1 | |
[2021-05-16 22:45:01,372] [INFO] [launch.py:104:main] Setting CUDA_VISIBLE_DEVICES=0 | |
[2021-05-16 22:45:05,033] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl | |
WARNING:__main__:Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, 16-bits training: True | |
INFO:__main__:Training/evaluation parameters TrainingArguments(output_dir=./test-bert-zero3, overwrite_output_dir=False, do_train=True, do_eval=False, do_predict=False, evaluation_strategy=IntervalStrategy.NO, prediction_loss_only=False, per_device_train_batch_size=32, per_device_eval_batch_size=32, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=2.0, max_steps=-1, lr_scheduler_type=SchedulerType.LINEAR, warmup_ratio=0.0, warmup_steps=0, logging_dir=runs/May16_22-45-04_jupyter-beomi, logging_strategy=IntervalStrategy.STEPS, logging_first_step=True, logging_steps=500, save_strategy=IntervalStrategy.STEPS, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=True, fp16_opt_level=O1, fp16_backend=auto, fp16_full_eval=False, local_rank=0, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=500, dataloader_num_workers=0, past_index=-1, run_name=./test-bert-zero3, disable_tqdm=False, remove_unused_columns=True, label_names=None, load_best_model_at_end=False, metric_for_best_model=None, greater_is_better=None, ignore_data_skip=False, sharded_ddp=[], deepspeed=./ds_zero3_1gpu.json, label_smoothing_factor=0.0, adafactor=False, group_by_length=False, length_column_name=length, report_to=['tensorboard'], ddp_find_unused_parameters=None, dataloader_pin_memory=True, skip_memory_metrics=False, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, _n_gpu=1, mp_parameters=) | |
WARNING:datasets.builder:Using custom data configuration default-43493ca3484df8f8 | |
Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/jovyan/.cache/huggingface/datasets/text/default-43493ca3484df8f8/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5... | |
Dataset text downloaded and prepared to /home/jovyan/.cache/huggingface/datasets/text/default-43493ca3484df8f8/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data. | |
WARNING:__main__:You are instantiating a new config instance from scratch. | |
[INFO|configuration_utils.py:517] 2021-05-16 22:45:06,713 >> loading configuration file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/61dd2bdbb7e56ff51fdc66b6f0d1973d2d806cd616d38a149f1bfd2753babc3c.ba488f0d9624511a98ed83af3e8f6b33fe20b502e2cfb16ee9858a6b6f521982 | |
[INFO|configuration_utils.py:553] 2021-05-16 22:45:06,715 >> Model config ElectraConfig { | |
"architectures": [ | |
"ElectraForPreTraining" | |
], | |
"attention_probs_dropout_prob": 0.1, | |
"embedding_size": 768, | |
"hidden_act": "gelu", | |
"hidden_dropout_prob": 0.1, | |
"hidden_size": 768, | |
"initializer_range": 0.02, | |
"intermediate_size": 3072, | |
"layer_norm_eps": 1e-12, | |
"max_position_embeddings": 512, | |
"model_type": "electra", | |
"num_attention_heads": 12, | |
"num_hidden_layers": 12, | |
"pad_token_id": 0, | |
"position_embedding_type": "absolute", | |
"summary_activation": "gelu", | |
"summary_last_dropout": 0.1, | |
"summary_type": "first", | |
"summary_use_proj": true, | |
"tokenizer_class": "BertTokenizer", | |
"transformers_version": "4.7.0.dev0", | |
"type_vocab_size": 2, | |
"vocab_size": 50135 | |
} | |
[INFO|tokenization_utils_base.py:1717] 2021-05-16 22:45:10,218 >> loading file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/vocab.txt from cache at /home/jovyan/.cache/huggingface/transformers/5852208f13e8ee71a994c414a90812974669123460d08b55ead80024d9a2e025.2e854075a5d70b111391280c0fdbeeab1ad11deed5a20c29cbdaff40f39422c9 | |
[INFO|tokenization_utils_base.py:1717] 2021-05-16 22:45:10,218 >> loading file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/tokenizer.json from cache at None | |
[INFO|tokenization_utils_base.py:1717] 2021-05-16 22:45:10,218 >> loading file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/added_tokens.json from cache at None | |
[INFO|tokenization_utils_base.py:1717] 2021-05-16 22:45:10,218 >> loading file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/special_tokens_map.json from cache at /home/jovyan/.cache/huggingface/transformers/6bddca875f34b8afbae26136b9594ea80793c9598640f0bc94017555a0a1c113.31b83c6ab34462cefd974ed0df8dd4189e7b7b81b47315b7a10627f7ae120002 | |
[INFO|tokenization_utils_base.py:1717] 2021-05-16 22:45:10,218 >> loading file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/tokenizer_config.json from cache at /home/jovyan/.cache/huggingface/transformers/7263de953a4cd2b1f102b17d66d2138ab74d46fbf419d588e523f2e8189a5fbf.3cf6a609d624dad9e48921ddd3d07764cb2f8f3fc2a84d956416cf643eb1be18 | |
INFO:__main__:Training new model from scratch | |
100%|████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 24.11ba/s] | |
100%|████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00, 4.44ba/s] | |
[INFO|trainer.py:415] 2021-05-16 22:45:17,267 >> Using amp fp16 backend | |
[INFO|trainer.py:515] 2021-05-16 22:45:17,361 >> The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. | |
[2021-05-16 22:45:17,366] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.3.16, git-hash=unknown, git-branch=unknown | |
[2021-05-16 22:45:21,598] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 | |
jupyter-beomi:12928:12928 [0] NCCL INFO Bootstrap : Using [0]eth0:192.168.11.194<0> | |
jupyter-beomi:12928:12928 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation | |
jupyter-beomi:12928:12928 [0] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1] | |
jupyter-beomi:12928:12928 [0] NCCL INFO NET/Socket : Using [0]eth0:192.168.11.194<0> | |
jupyter-beomi:12928:12928 [0] NCCL INFO Using network Socket | |
NCCL version 2.7.8+cuda11.1 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 00/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 01/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 02/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 03/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 04/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 05/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 06/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 07/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 08/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 09/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 10/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 11/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 12/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 13/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 14/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 15/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 16/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 17/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 18/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 19/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 20/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 21/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 22/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 23/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 24/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 25/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 26/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 27/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 28/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 29/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 30/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 31/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [1] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [2] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [3] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [4] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [5] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [6] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [7] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [8] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [9] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [10] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [11] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [12] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [13] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [14] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [15] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [16] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [17] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [18] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [19] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [20] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [21] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [22] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [23] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [24] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [25] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [26] -1/-1/-1->0->-1|-1-> | |
jupyter-beomi:12928:13113 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,ffff0000 | |
jupyter-beomi:12928:13113 [0] NCCL INFO 32 coll channels, 32 p2p channels, 32 p2p channels per peer | |
jupyter-beomi:12928:13113 [0] NCCL INFO comm 0x7f1e94002dd0 rank 0 nranks 1 cudaDev 0 busId 86000 - Init COMPLETE | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
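(Note: this tokenizers warning repeats below once per forked worker process and is harmless for this run. As the message itself suggests, it can be silenced by exporting the variable before launch, e.g. at the top of the launch script:)

    # Optional, purely cosmetic: silence the repeated huggingface/tokenizers fork warning.
    export TOKENIZERS_PARALLELISM=false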
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
Using /home/jovyan/.cache/torch_extensions as PyTorch extensions root... | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
Detected CUDA files, patching ldflags | |
Emitting ninja build file /home/jovyan/.cache/torch_extensions/cpu_adam/build.ninja... | |
Building extension module cpu_adam... | |
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
ninja: no work to do. | |
Loading extension module cpu_adam... | |
Time to load cpu_adam op: 0.9731655120849609 seconds | |
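(Note: the cpu_adam extension was already compiled in this environment, so ninja had no work to do and the op loaded in under a second. On a fresh environment this JIT build can take several minutes; one hedged alternative is to pre-build the op when installing DeepSpeed:)

    # Assumption/sketch: pre-compile the CPU Adam op at install time instead of JIT-compiling on first use.
    # DS_BUILD_CPU_ADAM is a DeepSpeed build flag; it may require a CUDA toolkit matching your torch build.
    DS_BUILD_CPU_ADAM=1 pip install deepspeed --no-cache-dir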
Adam Optimizer #0 is created with AVX512 arithmetic capability. | |
Config: alpha=0.000050, betas=(0.900000, 0.999000), weight_decay=0.000000, adam_w=1 | |
[2021-05-16 22:45:24,577] [INFO] [engine.py:610:_configure_optimizer] Using DeepSpeed Optimizer param name adamw as basic optimizer | |
[2021-05-16 22:45:24,578] [INFO] [engine.py:615:_configure_optimizer] DeepSpeed Basic Optimizer = DeepSpeedCPUAdam | |
Checking ZeRO support for optimizer=DeepSpeedCPUAdam type=<class 'deepspeed.ops.adam.cpu_adam.DeepSpeedCPUAdam'> | |
[2021-05-16 22:45:24,578] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer | |
Initializing ZeRO Stage 3 | |
[2021-05-16 22:45:24,664] [INFO] [utils.py:583:see_memory_usage] Stage 3 initialize beginning | |
/home/jovyan/anaconda3/envs/ds-huggingface/lib/python3.8/site-packages/torch/cuda/memory.py:373: FutureWarning: torch.cuda.memory_cached has been renamed to torch.cuda.memory_reserved | |
warnings.warn( | |
/home/jovyan/anaconda3/envs/ds-huggingface/lib/python3.8/site-packages/torch/cuda/memory.py:381: FutureWarning: torch.cuda.max_memory_cached has been renamed to torch.cuda.max_memory_reserved | |
warnings.warn( | |
[2021-05-16 22:45:24,666] [INFO] [utils.py:584:see_memory_usage] MA 0.24 GB Max_MA 0.24 GB CA 0.25 GB Max_CA 0 GB | |
[2021-05-16 22:45:24,667] [INFO] [utils.py:592:see_memory_usage] CPU Virtual Memory: used = 151.6 GB, percent = 32.1% | |
[2021-05-16 22:45:24,667] [INFO] [stage3.py:624:__init__] Reduce bucket size 589824 | |
[2021-05-16 22:45:24,667] [INFO] [stage3.py:625:__init__] Allgather bucket size 530841.6000000001 | |
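(Note: these bucket sizes appear to be derived from the model's hidden size of 768, matching the values the Hugging Face DeepSpeed integration suggests: reduce bucket = hidden_size^2, allgather/prefetch bucket = 0.9 * hidden_size^2, and param persistence threshold = 10 * hidden_size. A quick check:)

    # Sketch: confirm where 589824, 530841.6 and 7680 (seen above and in the ZeRO config below) come from.
    python -c "h = 768; print(h*h, 0.9*h*h, 10*h)"   # -> 589824 530841.6 7680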
Using /home/jovyan/.cache/torch_extensions as PyTorch extensions root... | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
Emitting ninja build file /home/jovyan/.cache/torch_extensions/utils/build.ninja... | |
Building extension module utils... | |
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
ninja: no work to do. | |
Loading extension module utils... | |
Time to load utils op: 0.6948151588439941 seconds | |
[2021-05-16 22:45:25,450] [INFO] [stage3.py:39:print_rank_0] FP16 params swapping is False, Max params in CPU is 1000000000.0 | |
[2021-05-16 22:45:25,527] [INFO] [utils.py:583:see_memory_usage] Before creating fp16 partitions | |
/home/jovyan/anaconda3/envs/ds-huggingface/lib/python3.8/site-packages/torch/cuda/memory.py:373: FutureWarning: torch.cuda.memory_cached has been renamed to torch.cuda.memory_reserved | |
warnings.warn( | |
/home/jovyan/anaconda3/envs/ds-huggingface/lib/python3.8/site-packages/torch/cuda/memory.py:381: FutureWarning: torch.cuda.max_memory_cached has been renamed to torch.cuda.max_memory_reserved | |
warnings.warn( | |
[2021-05-16 22:45:25,529] [INFO] [utils.py:584:see_memory_usage] MA 0.24 GB Max_MA 0.31 GB CA 0.32 GB Max_CA 0 GB | |
[2021-05-16 22:45:25,529] [INFO] [utils.py:592:see_memory_usage] CPU Virtual Memory: used = 151.6 GB, percent = 32.1% | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 00/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 01/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 02/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 03/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 04/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 05/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 06/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 07/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 08/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 09/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 10/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 11/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 12/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 13/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 14/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 15/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 16/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 17/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 18/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 19/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 20/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 21/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 22/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 23/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 24/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 25/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 26/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 27/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 28/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 29/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 30/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 31/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [1] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [2] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [3] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [4] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [5] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [6] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [7] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [8] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [9] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [10] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [11] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [12] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [13] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [14] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [15] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [16] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [17] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [18] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [19] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [20] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [21] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [22] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [23] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [24] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [25] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [26] -1/-1/-1->0->-1|-1-> | |
jupyter-beomi:12928:13146 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,ffff0000 | |
jupyter-beomi:12928:13146 [0] NCCL INFO 32 coll channels, 32 p2p channels, 32 p2p channels per peer | |
jupyter-beomi:12928:13146 [0] NCCL INFO comm 0x7f1e90002e20 rank 0 nranks 1 cudaDev 0 busId 86000 - Init COMPLETE | |
[2021-05-16 22:45:25,705] [INFO] [stage3.py:39:print_rank_0] fp16 group 0 has 1 subgroups | |
[2021-05-16 22:45:25,891] [INFO] [stage3.py:39:print_rank_0] Swappable FP32 Partitions: count=0 size= 0.00 GB | |
[2021-05-16 22:45:25,891] [INFO] [stage3.py:39:print_rank_0] In-Memory FP32 Partitions: count=1 size= 0.46 GB | |
[2021-05-16 22:45:26,590] [INFO] [stage3.py:819:__init__] optimizer state initialized | |
[2021-05-16 22:45:26,590] [INFO] [stage3.py:39:print_rank_0] Largest partitioned param numel = 124596695 | |
[2021-05-16 22:45:26,783] [INFO] [utils.py:583:see_memory_usage] After initializing ZeRO optimizer | |
[2021-05-16 22:45:26,784] [INFO] [utils.py:584:see_memory_usage] MA 0.46 GB Max_MA 0.61 GB CA 0.86 GB Max_CA 1 GB | |
[2021-05-16 22:45:26,785] [INFO] [utils.py:592:see_memory_usage] CPU Virtual Memory: used = 153.7 GB, percent = 32.6% | |
[2021-05-16 22:45:26,785] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adamw | |
[2021-05-16 22:45:26,785] [INFO] [engine.py:439:_configure_lr_scheduler] DeepSpeed using configured LR scheduler = WarmupLR | |
[2021-05-16 22:45:26,785] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = <deepspeed.runtime.lr_schedules.WarmupLR object at 0x7f1f4916eca0> | |
[2021-05-16 22:45:26,785] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[5e-05], mom=[[0.9, 0.999]] | |
[2021-05-16 22:45:26,785] [INFO] [config.py:747:print] DeepSpeedEngine configuration: | |
[2021-05-16 22:45:26,785] [INFO] [config.py:751:print] activation_checkpointing_config { | |
"partition_activations": false, | |
"contiguous_memory_optimization": false, | |
"cpu_checkpointing": false, | |
"number_checkpoints": null, | |
"synchronize_checkpoint_boundary": false, | |
"profile": false | |
} | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] allreduce_always_fp32 ........ False | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] amp_enabled .................. False | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] amp_params ................... False | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] checkpoint_tag_validation_enabled True | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] checkpoint_tag_validation_fail False | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] disable_allgather ............ False | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] dump_state ................... False | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] dynamic_loss_scale_args ...... {'init_scale': 65536, 'scale_window': 1000, 'delayed_shift': 2, 'min_scale': 1} | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] elasticity_enabled ........... False | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] flops_profiler_config ........ { | |
"enabled": false, | |
"profile_step": 1, | |
"module_depth": -1, | |
"top_modules": 3, | |
"detailed": true | |
} | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] fp16_enabled ................. True | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] global_rank .................. 0 | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] gradient_accumulation_steps .. 1 | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] gradient_clipping ............ 1.0 | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] gradient_predivide_factor .... 1.0 | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] initial_dynamic_scale ........ 65536 | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] loss_scale ................... 0 | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] memory_breakdown ............. False | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] optimizer_legacy_fusion ...... False | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] optimizer_name ............... adamw | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] optimizer_params ............. {'lr': 5e-05, 'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0.0} | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] pld_enabled .................. False | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] pld_params ................... False | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] prescale_gradients ........... False | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] scheduler_name ............... WarmupLR | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] scheduler_params ............. {'warmup_min_lr': 0, 'warmup_max_lr': 5e-05, 'warmup_num_steps': 0} | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] sparse_attention ............. None | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] sparse_gradients_enabled ..... False | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] steps_per_print .............. 2000 | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] tensorboard_enabled .......... False | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] tensorboard_job_name ......... DeepSpeedJobName | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] tensorboard_output_path ...... | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] train_batch_size ............. 32 | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] train_micro_batch_size_per_gpu 32 | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] wall_clock_breakdown ......... False | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] world_size ................... 1 | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] zero_allow_untested_optimizer False | |
[2021-05-16 22:45:26,788] [INFO] [config.py:751:print] zero_config .................. { | |
"stage": 3, | |
"contiguous_gradients": true, | |
"reduce_scatter": false, | |
"reduce_bucket_size": 5.898240e+05, | |
"allgather_partitions": true, | |
"allgather_bucket_size": 5.000000e+08, | |
"overlap_comm": true, | |
"load_from_fp32_weights": true, | |
"elastic_checkpoint": true, | |
"offload_param": { | |
"device": "cpu", | |
"nvme_path": null, | |
"buffer_count": 5, | |
"buffer_size": 1.000000e+08, | |
"max_in_cpu": 1.000000e+09, | |
"pin_memory": true | |
}, | |
"offload_optimizer": { | |
"device": "cpu", | |
"nvme_path": null, | |
"buffer_count": 4, | |
"pin_memory": true, | |
"pipeline_read": false, | |
"pipeline_write": false, | |
"fast_init": false, | |
"pipeline": false | |
}, | |
"sub_group_size": 1.000000e+14, | |
"prefetch_bucket_size": 5.308416e+05, | |
"param_persistence_threshold": 7.680000e+03, | |
"max_live_parameters": 1.000000e+09, | |
"max_reuse_distance": 1.000000e+09, | |
"gather_fp16_weights_on_model_save": true, | |
"find_unused_parameters": false | |
} | |
[2021-05-16 22:45:26,788] [INFO] [config.py:751:print] zero_enabled ................. True | |
[2021-05-16 22:45:26,788] [INFO] [config.py:751:print] zero_optimization_stage ...... 3 | |
[2021-05-16 22:45:26,788] [INFO] [config.py:753:print] json = { | |
"fp16": { | |
"enabled": true, | |
"loss_scale": 0, | |
"loss_scale_window": 1000, | |
"initial_scale_power": 16, | |
"hysteresis": 2, | |
"min_loss_scale": 1 | |
}, | |
"optimizer": { | |
"type": "AdamW", | |
"params": { | |
"lr": 5e-05, | |
"betas": [0.9, 0.999], | |
"eps": 1e-08, | |
"weight_decay": 0.0 | |
} | |
}, | |
"scheduler": { | |
"type": "WarmupLR", | |
"params": { | |
"warmup_min_lr": 0, | |
"warmup_max_lr": 5e-05, | |
"warmup_num_steps": 0 | |
} | |
}, | |
"zero_optimization": { | |
"stage": 3, | |
"offload_optimizer": { | |
"device": "cpu", | |
"pin_memory": true | |
}, | |
"offload_param": { | |
"device": "cpu", | |
"pin_memory": true | |
}, | |
"overlap_comm": true, | |
"contiguous_gradients": true, | |
"sub_group_size": 1.000000e+14, | |
"reduce_bucket_size": 5.898240e+05, | |
"stage3_prefetch_bucket_size": 5.308416e+05, | |
"stage3_param_persistence_threshold": 7.680000e+03, | |
"stage3_max_live_parameters": 1.000000e+09, | |
"stage3_max_reuse_distance": 1.000000e+09, | |
"stage3_gather_fp16_weights_on_model_save": true | |
}, | |
"gradient_accumulation_steps": 1, | |
"gradient_clipping": 1.0, | |
"steps_per_print": 2.000000e+03, | |
"train_batch_size": 32, | |
"train_micro_batch_size_per_gpu": 32, | |
"wall_clock_breakdown": false | |
} | |
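(For reference, the ds_zero3_1gpu.json passed via --deepspeed is not itself shown in this log. The sketch below simply re-serializes the configuration echoed above, with the numeric values written out in plain form; the original file may have differed, e.g. by using the integration's "auto" placeholders.)

    # Sketch: recreate a ds_zero3_1gpu.json equivalent to the configuration printed above.
    cat > ds_zero3_1gpu.json <<'EOF'
    {
      "fp16": {
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
      },
      "optimizer": {
        "type": "AdamW",
        "params": { "lr": 5e-05, "betas": [0.9, 0.999], "eps": 1e-08, "weight_decay": 0.0 }
      },
      "scheduler": {
        "type": "WarmupLR",
        "params": { "warmup_min_lr": 0, "warmup_max_lr": 5e-05, "warmup_num_steps": 0 }
      },
      "zero_optimization": {
        "stage": 3,
        "offload_optimizer": { "device": "cpu", "pin_memory": true },
        "offload_param": { "device": "cpu", "pin_memory": true },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e14,
        "reduce_bucket_size": 589824,
        "stage3_prefetch_bucket_size": 530841.6,
        "stage3_param_persistence_threshold": 7680,
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_fp16_weights_on_model_save": true
      },
      "gradient_accumulation_steps": 1,
      "gradient_clipping": 1.0,
      "steps_per_print": 2000,
      "train_batch_size": 32,
      "train_micro_batch_size_per_gpu": 32,
      "wall_clock_breakdown": false
    }
    EOF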
Using /home/jovyan/.cache/torch_extensions as PyTorch extensions root... | |
No modifications detected for re-loaded extension module utils, skipping build step... | |
Loading extension module utils... | |
Time to load utils op: 0.003198385238647461 seconds | |
[INFO|trainer.py:1145] 2021-05-16 22:45:26,792 >> ***** Running training ***** | |
[INFO|trainer.py:1146] 2021-05-16 22:45:26,792 >> Num examples = 893 | |
[INFO|trainer.py:1147] 2021-05-16 22:45:26,792 >> Num Epochs = 2 | |
[INFO|trainer.py:1148] 2021-05-16 22:45:26,792 >> Instantaneous batch size per device = 32 | |
[INFO|trainer.py:1149] 2021-05-16 22:45:26,792 >> Total train batch size (w. parallel, distributed & accumulation) = 32 | |
[INFO|trainer.py:1150] 2021-05-16 22:45:26,793 >> Gradient Accumulation steps = 1 | |
[INFO|trainer.py:1151] 2021-05-16 22:45:26,793 >> Total optimization steps = 56 | |
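(The 56 optimization steps follow from the numbers above: ceil(893 examples / 32 per-device batch) = 28 update steps per epoch, times 2 epochs, with gradient accumulation of 1. A quick check:)

    python -c "import math; print(math.ceil(893/32) * 2)"   # -> 56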
0%| | 0/56 [00:00<?, ?it/s][2021-05-16 22:45:27,806] [INFO] [stage3.py:2700:_overflow_clean_up] [deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 65536 | |
2%|█▏ | 1/56 [00:00<00:53, 1.03it/s][WARNING|trainer_pt_utils.py:777] 2021-05-16 22:45:27,807 >> tried to get lr value before scheduler/optimizer started stepping, returning lr=0 | |
{'loss': 10.9922, 'learning_rate': 0, 'epoch': 0.04} | |
2%|█▏ | 1/56 [00:00<00:53, 1.03it/s][2021-05-16 22:45:28,722] [INFO] [stage3.py:2700:_overflow_clean_up] [deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768.0 | |
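(Note: the OVERFLOW/skipped-step messages at the start are expected with dynamic fp16 loss scaling: the scaler starts at 2^16, per "initial_scale_power": 16 above, and backs off until it finds a workable scale. Only if overflows persisted well into training would lowering the starting scale be worth trying; a hedged sketch, assuming the config path used in this run:)

    python - <<'EOF'
    # Sketch: lower the initial fp16 loss scale in ds_zero3_1gpu.json from 2^16 to 2^12.
    import json
    path = "ds_zero3_1gpu.json"
    cfg = json.load(open(path))
    cfg["fp16"]["initial_scale_power"] = 12
    json.dump(cfg, open(path, "w"), indent=2)
    EOF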
100%|████████████████████████████████████████████████████████████████████| 56/56 [00:53<00:00, 1.08it/s][INFO|trainer.py:1341] 2021-05-16 22:46:19,931 >> | |
Training completed. Do not forget to share your model on huggingface.co/models =) | |
{'train_runtime': 53.1384, 'train_samples_per_second': 1.054, 'epoch': 2.0} | |
100%|████████████████████████████████████████████████████████████████████| 56/56 [00:53<00:00, 1.05it/s] | |
[INFO|trainer.py:1885] 2021-05-16 22:46:20,225 >> Saving model checkpoint to ./test-bert-zero3 | |
[INFO|configuration_utils.py:351] 2021-05-16 22:46:20,229 >> Configuration saved in ./test-bert-zero3/config.json | |
[INFO|modeling_utils.py:889] 2021-05-16 22:46:20,244 >> Model weights saved in ./test-bert-zero3/pytorch_model.bin | |
[INFO|tokenization_utils_base.py:1924] 2021-05-16 22:46:20,247 >> tokenizer config file saved in ./test-bert-zero3/tokenizer_config.json | |
[INFO|tokenization_utils_base.py:1930] 2021-05-16 22:46:20,249 >> Special tokens file saved in ./test-bert-zero3/special_tokens_map.json | |
[2021-05-16 22:46:20,569] [INFO] [engine.py:1842:save_fp16_model] Saving model weights to ./test-bert-zero3/pytorch_model.bin | |
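(Because "stage3_gather_fp16_weights_on_model_save" is true in the config above, DeepSpeed gathers the full fp16 weights before writing pytorch_model.bin, so ./test-bert-zero3 can be loaded back as an ordinary checkpoint without DeepSpeed. A usage sketch, assuming plain transformers:)

    # Assumption/sketch: reload the saved fp16 checkpoint with plain transformers and count its parameters.
    python -c "from transformers import BertForMaskedLM; m = BertForMaskedLM.from_pretrained('./test-bert-zero3'); print(sum(p.numel() for p in m.parameters()))"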
[INFO|trainer_pt_utils.py:907] 2021-05-16 22:46:20,971 >> ***** train metrics ***** | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> epoch = 2.0 | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> init_mem_cpu_alloc_delta = 0MB | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> init_mem_cpu_peaked_delta = 0MB | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> init_mem_gpu_alloc_delta = 0MB | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> init_mem_gpu_peaked_delta = 0MB | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> train_mem_cpu_alloc_delta = 5813MB | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> train_mem_cpu_peaked_delta = 465MB | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> train_mem_gpu_alloc_delta = 476MB | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> train_mem_gpu_peaked_delta = 9633MB | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> train_runtime = 0:00:53.13 | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> train_samples = 893 | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> train_samples_per_second = 1.054 |