ZeRO-2 BERT Training (with Sampled Dataset) Output Log with NCCL_DEBUG=INFO
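The log below was produced by running ./gpu_run_mlm.sh with NCCL_DEBUG=INFO. The script itself is not part of this gist; the following is a minimal sketch of what it could look like, assembled only from the "cmd = ..." line recorded by the deepspeed launcher at the top of the log. Treat it as a reconstruction, not the original script.

#!/bin/bash
# Hypothetical reconstruction of gpu_run_mlm.sh, based on the command logged
# by deepspeed.launcher.runner below; the actual script may differ.
export NCCL_DEBUG=INFO

deepspeed run_mlm.py \
  --seed 42 \
  --model_type bert \
  --tokenizer_name beomi/KcELECTRA-base \
  --train_file ./sampled_20190101_20200611_v2.txt \
  --num_train_epochs 2 \
  --per_device_train_batch_size 32 \
  --per_device_eval_batch_size 32 \
  --do_train \
  --output_dir ./test-bert-zero2 \
  --fp16 \
  --logging_first_step \
  --max_seq_length 300 \
  --deepspeed ./ds_zero2_1gpu.json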
(ds-huggingface) root@jupyter-beomi:~/2021.05.17.DeepSpeed_Huggingface# ./gpu_run_mlm.sh
[2021-05-16 22:50:42,950] [WARNING] [runner.py:122:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
[2021-05-16 22:50:42,994] [INFO] [runner.py:360:main] cmd = /home/jovyan/anaconda3/envs/ds-huggingface/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMF19 --master_addr=127.0.0.1 --master_port=29500 run_mlm.py --seed 42 --model_type bert --tokenizer_name beomi/KcELECTRA-base --train_file ./sampled_20190101_20200611_v2.txt --num_train_epochs 2 --per_device_train_batch_size 32 --per_device_eval_batch_size 32 --do_train --output_dir ./test-bert-zero2 --fp16 --logging_first_step --max_seq_length 300 --deepspeed ./ds_zero2_1gpu.json
[2021-05-16 22:50:44,708] [INFO] [launch.py:73:main] 0 NCCL_DEBUG INFO
[2021-05-16 22:50:44,708] [INFO] [launch.py:80:main] WORLD INFO DICT: {'localhost': [0]}
[2021-05-16 22:50:44,708] [INFO] [launch.py:86:main] nnodes=1, num_local_procs=1, node_rank=0
[2021-05-16 22:50:44,709] [INFO] [launch.py:101:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0]})
[2021-05-16 22:50:44,709] [INFO] [launch.py:102:main] dist_world_size=1
[2021-05-16 22:50:44,709] [INFO] [launch.py:104:main] Setting CUDA_VISIBLE_DEVICES=0
[2021-05-16 22:50:48,263] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl
WARNING:__main__:Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: True
INFO:__main__:Training/evaluation parameters TrainingArguments(output_dir=./test-bert-zero2, overwrite_output_dir=False, do_train=True, do_eval=False, do_predict=False, evaluation_strategy=IntervalStrategy.NO, prediction_loss_only=False, per_device_train_batch_size=32, per_device_eval_batch_size=32, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=2.0, max_steps=-1, lr_scheduler_type=SchedulerType.LINEAR, warmup_ratio=0.0, warmup_steps=0, logging_dir=runs/May16_22-50-48_jupyter-beomi, logging_strategy=IntervalStrategy.STEPS, logging_first_step=True, logging_steps=500, save_strategy=IntervalStrategy.STEPS, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=True, fp16_opt_level=O1, fp16_backend=auto, fp16_full_eval=False, local_rank=0, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=500, dataloader_num_workers=0, past_index=-1, run_name=./test-bert-zero2, disable_tqdm=False, remove_unused_columns=True, label_names=None, load_best_model_at_end=False, metric_for_best_model=None, greater_is_better=None, ignore_data_skip=False, sharded_ddp=[], deepspeed=./ds_zero2_1gpu.json, label_smoothing_factor=0.0, adafactor=False, group_by_length=False, length_column_name=length, report_to=['tensorboard'], ddp_find_unused_parameters=None, dataloader_pin_memory=True, skip_memory_metrics=False, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, _n_gpu=1, mp_parameters=)
WARNING:datasets.builder:Using custom data configuration default-43493ca3484df8f8
WARNING:datasets.builder:Reusing dataset text (/home/jovyan/.cache/huggingface/datasets/text/default-43493ca3484df8f8/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)
WARNING:__main__:You are instantiating a new config instance from scratch.
[INFO|configuration_utils.py:517] 2021-05-16 22:50:49,855 >> loading configuration file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/61dd2bdbb7e56ff51fdc66b6f0d1973d2d806cd616d38a149f1bfd2753babc3c.ba488f0d9624511a98ed83af3e8f6b33fe20b502e2cfb16ee9858a6b6f521982
[INFO|configuration_utils.py:553] 2021-05-16 22:50:49,856 >> Model config ElectraConfig {
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "tokenizer_class": "BertTokenizer",
  "transformers_version": "4.7.0.dev0",
  "type_vocab_size": 2,
  "vocab_size": 50135
}
[INFO|tokenization_utils_base.py:1717] 2021-05-16 22:50:53,312 >> loading file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/vocab.txt from cache at /home/jovyan/.cache/huggingface/transformers/5852208f13e8ee71a994c414a90812974669123460d08b55ead80024d9a2e025.2e854075a5d70b111391280c0fdbeeab1ad11deed5a20c29cbdaff40f39422c9
[INFO|tokenization_utils_base.py:1717] 2021-05-16 22:50:53,312 >> loading file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/tokenizer.json from cache at None
[INFO|tokenization_utils_base.py:1717] 2021-05-16 22:50:53,313 >> loading file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:1717] 2021-05-16 22:50:53,313 >> loading file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/special_tokens_map.json from cache at /home/jovyan/.cache/huggingface/transformers/6bddca875f34b8afbae26136b9594ea80793c9598640f0bc94017555a0a1c113.31b83c6ab34462cefd974ed0df8dd4189e7b7b81b47315b7a10627f7ae120002
[INFO|tokenization_utils_base.py:1717] 2021-05-16 22:50:53,313 >> loading file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/tokenizer_config.json from cache at /home/jovyan/.cache/huggingface/transformers/7263de953a4cd2b1f102b17d66d2138ab74d46fbf419d588e523f2e8189a5fbf.3cf6a609d624dad9e48921ddd3d07764cb2f8f3fc2a84d956416cf643eb1be18
INFO:__main__:Training new model from scratch
100%|████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 21.38ba/s]
100%|████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00, 4.46ba/s]
[INFO|trainer.py:415] 2021-05-16 22:51:00,376 >> Using amp fp16 backend
[INFO|trainer.py:515] 2021-05-16 22:51:00,472 >> The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask.
[2021-05-16 22:51:00,476] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.3.16, git-hash=unknown, git-branch=unknown
[2021-05-16 22:51:00,476] [WARNING] [config.py:79:_sanity_check] DeepSpeedConfig: cpu_offload is deprecated. Please use offload_optimizer.
[2021-05-16 22:51:04,728] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1
jupyter-beomi:14204:14204 [0] NCCL INFO Bootstrap : Using [0]eth0:192.168.11.194<0>
jupyter-beomi:14204:14204 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
jupyter-beomi:14204:14204 [0] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
jupyter-beomi:14204:14204 [0] NCCL INFO NET/Socket : Using [0]eth0:192.168.11.194<0>
jupyter-beomi:14204:14204 [0] NCCL INFO Using network Socket
NCCL version 2.7.8+cuda11.1
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 00/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 01/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 02/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 03/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 04/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 05/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 06/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 07/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 08/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 09/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 10/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 11/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 12/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 13/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 14/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 15/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 16/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 17/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 18/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 19/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 20/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 21/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 22/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 23/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 24/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 25/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 26/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 27/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 28/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 29/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 30/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 31/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [1] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [2] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [3] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [4] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [5] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [6] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [7] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [8] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [9] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [10] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [11] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [12] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [13] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [14] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [15] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [16] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [17] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [18] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [19] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [20] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [21] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [22] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [23] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [24] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [25] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [26] -1/-1/-1->0->-1|-1->
jupyter-beomi:14204:14388 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,ffff0000
jupyter-beomi:14204:14388 [0] NCCL INFO 32 coll channels, 32 p2p channels, 32 p2p channels per peer
jupyter-beomi:14204:14388 [0] NCCL INFO comm 0x7fea50002dd0 rank 0 nranks 1 cudaDev 0 busId 86000 - Init COMPLETE
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Using /home/jovyan/.cache/torch_extensions as PyTorch extensions root...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected CUDA files, patching ldflags
Emitting ninja build file /home/jovyan/.cache/torch_extensions/cpu_adam/build.ninja...
Building extension module cpu_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
ninja: no work to do.
Loading extension module cpu_adam...
Time to load cpu_adam op: 1.0352756977081299 seconds
Adam Optimizer #0 is created with AVX512 arithmetic capability.
Config: alpha=0.000050, betas=(0.900000, 0.999000), weight_decay=0.000000, adam_w=1
[2021-05-16 22:51:07,855] [INFO] [engine.py:610:_configure_optimizer] Using DeepSpeed Optimizer param name adamw as basic optimizer
[2021-05-16 22:51:07,856] [INFO] [engine.py:615:_configure_optimizer] DeepSpeed Basic Optimizer = DeepSpeedCPUAdam
Checking ZeRO support for optimizer=DeepSpeedCPUAdam type=<class 'deepspeed.ops.adam.cpu_adam.DeepSpeedCPUAdam'>
[2021-05-16 22:51:07,856] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer
[2021-05-16 22:51:07,856] [INFO] [stage2.py:102:__init__] Reduce bucket size 200000000.0
[2021-05-16 22:51:07,856] [INFO] [stage2.py:103:__init__] Allgather bucket size 200000000.0
[2021-05-16 22:51:07,856] [INFO] [stage2.py:104:__init__] CPU Offload: True
Using /home/jovyan/.cache/torch_extensions as PyTorch extensions root...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Emitting ninja build file /home/jovyan/.cache/torch_extensions/utils/build.ninja...
Building extension module utils...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
ninja: no work to do.
Loading extension module utils...
Time to load utils op: 0.6900577545166016 seconds
[2021-05-16 22:51:09,779] [INFO] [stage2.py:381:__init__] optimizer state initialized
[2021-05-16 22:51:09,779] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adamw
[2021-05-16 22:51:09,779] [INFO] [engine.py:439:_configure_lr_scheduler] DeepSpeed using configured LR scheduler = WarmupLR
[2021-05-16 22:51:09,779] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = <deepspeed.runtime.lr_schedules.WarmupLR object at 0x7fedd40d79a0>
[2021-05-16 22:51:09,780] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[5e-05], mom=[[0.9, 0.999]]
[2021-05-16 22:51:09,780] [INFO] [config.py:747:print] DeepSpeedEngine configuration:
[2021-05-16 22:51:09,780] [INFO] [config.py:751:print] activation_checkpointing_config {
  "partition_activations": false,
  "contiguous_memory_optimization": false,
  "cpu_checkpointing": false,
  "number_checkpoints": null,
  "synchronize_checkpoint_boundary": false,
  "profile": false
}
[2021-05-16 22:51:09,780] [INFO] [config.py:751:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
[2021-05-16 22:51:09,780] [INFO] [config.py:751:print] allreduce_always_fp32 ........ False
[2021-05-16 22:51:09,780] [INFO] [config.py:751:print] amp_enabled .................. False
[2021-05-16 22:51:09,780] [INFO] [config.py:751:print] amp_params ................... False
[2021-05-16 22:51:09,780] [INFO] [config.py:751:print] checkpoint_tag_validation_enabled True
[2021-05-16 22:51:09,780] [INFO] [config.py:751:print] checkpoint_tag_validation_fail False
[2021-05-16 22:51:09,780] [INFO] [config.py:751:print] disable_allgather ............ False
[2021-05-16 22:51:09,780] [INFO] [config.py:751:print] dump_state ................... False
[2021-05-16 22:51:09,780] [INFO] [config.py:751:print] dynamic_loss_scale_args ...... {'init_scale': 65536, 'scale_window': 1000, 'delayed_shift': 2, 'min_scale': 1}
[2021-05-16 22:51:09,780] [INFO] [config.py:751:print] elasticity_enabled ........... False
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] flops_profiler_config ........ {
  "enabled": false,
  "profile_step": 1,
  "module_depth": -1,
  "top_modules": 3,
  "detailed": true
}
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] fp16_enabled ................. True
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] global_rank .................. 0
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] gradient_accumulation_steps .. 1
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] gradient_clipping ............ 1.0
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] gradient_predivide_factor .... 1.0
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] initial_dynamic_scale ........ 65536
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] loss_scale ................... 0
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] memory_breakdown ............. False
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] optimizer_legacy_fusion ...... False
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] optimizer_name ............... adamw
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] optimizer_params ............. {'lr': 5e-05, 'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0.0}
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] pld_enabled .................. False
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] pld_params ................... False
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] prescale_gradients ........... False
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] scheduler_name ............... WarmupLR
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] scheduler_params ............. {'warmup_min_lr': 0, 'warmup_max_lr': 5e-05, 'warmup_num_steps': 0}
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] sparse_attention ............. None
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] sparse_gradients_enabled ..... False
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] steps_per_print .............. 10
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] tensorboard_enabled .......... False
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] tensorboard_job_name ......... DeepSpeedJobName
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] tensorboard_output_path ......
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] train_batch_size ............. 32
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] train_micro_batch_size_per_gpu 32
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] wall_clock_breakdown ......... False
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] world_size ................... 1
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] zero_allow_untested_optimizer False
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] zero_config .................. {
  "stage": 2,
  "contiguous_gradients": true,
  "reduce_scatter": true,
  "reduce_bucket_size": 2.000000e+08,
  "allgather_partitions": true,
  "allgather_bucket_size": 2.000000e+08,
  "overlap_comm": true,
  "load_from_fp32_weights": true,
  "elastic_checkpoint": true,
  "offload_param": null,
  "offload_optimizer": {
    "device": "cpu",
    "nvme_path": null,
    "buffer_count": 4,
    "pin_memory": false,
    "pipeline_read": false,
    "pipeline_write": false,
    "fast_init": false
  },
  "sub_group_size": 1.000000e+12,
  "prefetch_bucket_size": 5.000000e+07,
  "param_persistence_threshold": 1.000000e+05,
  "max_live_parameters": 1.000000e+09,
  "max_reuse_distance": 1.000000e+09,
  "gather_fp16_weights_on_model_save": false,
  "find_unused_parameters": false
}
[2021-05-16 22:51:09,782] [INFO] [config.py:751:print] zero_enabled ................. True
[2021-05-16 22:51:09,782] [INFO] [config.py:751:print] zero_optimization_stage ...... 2
[2021-05-16 22:51:09,782] [INFO] [config.py:753:print] json = {
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 16,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": 5e-05,
      "betas": [0.9, 0.999],
      "eps": 1e-08,
      "weight_decay": 0.0
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": 0,
      "warmup_max_lr": 5e-05,
      "warmup_num_steps": 0
    }
  },
  "zero_optimization": {
    "stage": 2,
    "allgather_partitions": true,
    "allgather_bucket_size": 2.000000e+08,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 2.000000e+08,
    "contiguous_gradients": true,
    "cpu_offload": true
  },
  "gradient_accumulation_steps": 1,
  "gradient_clipping": 1.0,
  "train_batch_size": 32,
  "train_micro_batch_size_per_gpu": 32
}
Using /home/jovyan/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.002638101577758789 seconds
[INFO|trainer.py:1145] 2021-05-16 22:51:09,786 >> ***** Running training *****
[INFO|trainer.py:1146] 2021-05-16 22:51:09,786 >> Num examples = 893
[INFO|trainer.py:1147] 2021-05-16 22:51:09,786 >> Num Epochs = 2
[INFO|trainer.py:1148] 2021-05-16 22:51:09,786 >> Instantaneous batch size per device = 32
[INFO|trainer.py:1149] 2021-05-16 22:51:09,786 >> Total train batch size (w. parallel, distributed & accumulation) = 32
[INFO|trainer.py:1150] 2021-05-16 22:51:09,786 >> Gradient Accumulation steps = 1
[INFO|trainer.py:1151] 2021-05-16 22:51:09,786 >> Total optimization steps = 56
0%| | 0/56 [00:00<?, ?it/s][2021-05-16 22:51:10,280] [INFO] [stage2.py:1407:step] [deepspeed] fp16 dynamic loss scale overflow! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 65536
2%|█▏ | 1/56 [00:00<00:25, 2.18it/s][WARNING|trainer_pt_utils.py:777] 2021-05-16 22:51:10,281 >> tried to get lr value before scheduler/optimizer started stepping, returning lr=0
{'loss': 10.9922, 'learning_rate': 0, 'epoch': 0.04}
2%|█▏ | 1/56 [00:00<00:25, 2.18it/s][2021-05-16 22:51:10,643] [INFO] [stage2.py:1407:step] [deepspeed] fp16 dynamic loss scale overflow! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768.0
16%|███████████ | 9/56 [00:04<00:26, 1.75it/s][2021-05-16 22:51:15,240] [INFO] [logging.py:60:log_dist] [Rank 0] step=10, skipped=2, lr=[5e-05], mom=[[0.9, 0.999]]
[2021-05-16 22:51:15,241] [INFO] [timer.py:154:stop] 0/10, SamplesPerSec=58.39567776783072
34%|███████████████████████ | 19/56 [00:10<00:20, 1.82it/s][2021-05-16 22:51:20,733] [INFO] [logging.py:60:log_dist] [Rank 0] step=20, skipped=2, lr=[5e-05], mom=[[0.9, 0.999]]
[2021-05-16 22:51:20,734] [INFO] [timer.py:154:stop] 0/20, SamplesPerSec=59.91298349642106
52%|███████████████████████████████████▏ | 29/56 [00:15<00:14, 1.89it/s][2021-05-16 22:51:26,115] [INFO] [logging.py:60:log_dist] [Rank 0] step=30, skipped=2, lr=[5e-05], mom=[[0.9, 0.999]]
[2021-05-16 22:51:26,116] [INFO] [timer.py:154:stop] 0/30, SamplesPerSec=60.660391420264496
70%|███████████████████████████████████████████████▎ | 39/56 [00:21<00:09, 1.84it/s][2021-05-16 22:51:31,562] [INFO] [logging.py:60:log_dist] [Rank 0] step=40, skipped=2, lr=[5e-05], mom=[[0.9, 0.999]]
[2021-05-16 22:51:31,562] [INFO] [timer.py:154:stop] 0/40, SamplesPerSec=60.921882866787925
88%|███████████████████████████████████████████████████████████▌ | 49/56 [00:26<00:03, 1.86it/s][2021-05-16 22:51:37,007] [INFO] [logging.py:60:log_dist] [Rank 0] step=50, skipped=2, lr=[5e-05], mom=[[0.9, 0.999]]
[2021-05-16 22:51:37,007] [INFO] [timer.py:154:stop] 0/50, SamplesPerSec=61.07305240178895
100%|████████████████████████████████████████████████████████████████████| 56/56 [00:30<00:00, 1.87it/s][INFO|trainer.py:1341] 2021-05-16 22:51:40,241 >>
Training completed. Do not forget to share your model on huggingface.co/models =)
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 00/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 01/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 02/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 03/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 04/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 05/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 06/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 07/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 08/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 09/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 10/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 11/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 12/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 13/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 14/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 15/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 16/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 17/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 18/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 19/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 20/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 21/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 22/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 23/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 24/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 25/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 26/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 27/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 28/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 29/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 30/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 31/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [1] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [2] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [3] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [4] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [5] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [6] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [7] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [8] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [9] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [10] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [11] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [12] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [13] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [14] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [15] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [16] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [17] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [18] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [19] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [20] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [21] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [22] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [23] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [24] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [25] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [26] -1/-1/-1->0->-1|-1->
jupyter-beomi:14204:14515 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,ffff0000
jupyter-beomi:14204:14515 [0] NCCL INFO 32 coll channels, 32 p2p channels, 32 p2p channels per peer
jupyter-beomi:14204:14515 [0] NCCL INFO comm 0x7fe5f0002dd0 rank 0 nranks 1 cudaDev 0 busId 86000 - Init COMPLETE
{'train_runtime': 30.4552, 'train_samples_per_second': 1.839, 'epoch': 2.0}
100%|████████████████████████████████████████████████████████████████████| 56/56 [00:30<00:00, 1.84it/s]
[INFO|trainer.py:1885] 2021-05-16 22:51:40,526 >> Saving model checkpoint to ./test-bert-zero2
[INFO|configuration_utils.py:351] 2021-05-16 22:51:40,530 >> Configuration saved in ./test-bert-zero2/config.json
[INFO|modeling_utils.py:889] 2021-05-16 22:51:41,285 >> Model weights saved in ./test-bert-zero2/pytorch_model.bin
[INFO|tokenization_utils_base.py:1924] 2021-05-16 22:51:41,288 >> tokenizer config file saved in ./test-bert-zero2/tokenizer_config.json
[INFO|tokenization_utils_base.py:1930] 2021-05-16 22:51:41,289 >> Special tokens file saved in ./test-bert-zero2/special_tokens_map.json
[INFO|trainer_pt_utils.py:907] 2021-05-16 22:51:41,402 >> ***** train metrics *****
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,402 >>   epoch                      = 2.0
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,402 >>   init_mem_cpu_alloc_delta   = 0MB
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,402 >>   init_mem_cpu_peaked_delta  = 0MB
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,403 >>   init_mem_gpu_alloc_delta   = 0MB
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,403 >>   init_mem_gpu_peaked_delta  = 0MB
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,403 >>   train_mem_cpu_alloc_delta  = 5737MB
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,403 >>   train_mem_cpu_peaked_delta = 462MB
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,403 >>   train_mem_gpu_alloc_delta  = 312MB
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,403 >>   train_mem_gpu_peaked_delta = 12823MB
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,403 >>   train_runtime              = 0:00:30.45
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,403 >>   train_samples              = 893
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,403 >>   train_samples_per_second   = 1.839
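
Note: the ds_zero2_1gpu.json file passed via --deepspeed is not included in this gist, but DeepSpeed echoes the fully resolved configuration in the "json = { ... }" block above. A config file roughly equivalent to that dump would look like the sketch below. This is a reconstruction, not the original file: the original may have used HF Trainer "auto" placeholders that were resolved to these literal values, and "cpu_offload" under zero_optimization is the deprecated spelling that triggers the offload_optimizer deprecation warning near the top of the log.

{
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 16,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": 5e-05,
      "betas": [0.9, 0.999],
      "eps": 1e-08,
      "weight_decay": 0.0
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": 0,
      "warmup_max_lr": 5e-05,
      "warmup_num_steps": 0
    }
  },
  "zero_optimization": {
    "stage": 2,
    "allgather_partitions": true,
    "allgather_bucket_size": 200000000,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 200000000,
    "contiguous_gradients": true,
    "cpu_offload": true
  },
  "gradient_accumulation_steps": 1,
  "gradient_clipping": 1.0,
  "train_batch_size": 32,
  "train_micro_batch_size_per_gpu": 32
}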