@Beomi
Created May 16, 2021 22:52
ZeRO-2 BERT Training (with Sampled Dataset): Output Log with NCCL_DEBUG=INFO
(ds-huggingface) root@jupyter-beomi:~/2021.05.17.DeepSpeed_Huggingface# ./gpu_run_mlm.sh
[2021-05-16 22:50:42,950] [WARNING] [runner.py:122:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
[2021-05-16 22:50:42,994] [INFO] [runner.py:360:main] cmd = /home/jovyan/anaconda3/envs/ds-huggingface/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMF19 --master_addr=127.0.0.1 --master_port=29500 run_mlm.py --seed 42 --model_type bert --tokenizer_name beomi/KcELECTRA-base --train_file ./sampled_20190101_20200611_v2.txt --num_train_epochs 2 --per_device_train_batch_size 32 --per_device_eval_batch_size 32 --do_train --output_dir ./test-bert-zero2 --fp16 --logging_first_step --max_seq_length 300 --deepspeed ./ds_zero2_1gpu.json
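The launcher command above contains every argument passed to run_mlm.py, so gpu_run_mlm.sh is presumably a thin wrapper along these lines (a sketch reconstructed from the cmd line above; the actual script is not included in this gist):

    #!/bin/bash
    # Reconstructed from the launcher cmd printed above; not the original gpu_run_mlm.sh.
    export NCCL_DEBUG=INFO   # produces the NCCL INFO lines later in this log
    deepspeed run_mlm.py \
      --seed 42 \
      --model_type bert \
      --tokenizer_name beomi/KcELECTRA-base \
      --train_file ./sampled_20190101_20200611_v2.txt \
      --num_train_epochs 2 \
      --per_device_train_batch_size 32 \
      --per_device_eval_batch_size 32 \
      --do_train \
      --output_dir ./test-bert-zero2 \
      --fp16 \
      --logging_first_step \
      --max_seq_length 300 \
      --deepspeed ./ds_zero2_1gpu.json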
[2021-05-16 22:50:44,708] [INFO] [launch.py:73:main] 0 NCCL_DEBUG INFO
[2021-05-16 22:50:44,708] [INFO] [launch.py:80:main] WORLD INFO DICT: {'localhost': [0]}
[2021-05-16 22:50:44,708] [INFO] [launch.py:86:main] nnodes=1, num_local_procs=1, node_rank=0
[2021-05-16 22:50:44,709] [INFO] [launch.py:101:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0]})
[2021-05-16 22:50:44,709] [INFO] [launch.py:102:main] dist_world_size=1
[2021-05-16 22:50:44,709] [INFO] [launch.py:104:main] Setting CUDA_VISIBLE_DEVICES=0
[2021-05-16 22:50:48,263] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl
WARNING:__main__:Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, 16-bits training: True
INFO:__main__:Training/evaluation parameters TrainingArguments(output_dir=./test-bert-zero2, overwrite_output_dir=False, do_train=True, do_eval=False, do_predict=False, evaluation_strategy=IntervalStrategy.NO, prediction_loss_only=False, per_device_train_batch_size=32, per_device_eval_batch_size=32, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=2.0, max_steps=-1, lr_scheduler_type=SchedulerType.LINEAR, warmup_ratio=0.0, warmup_steps=0, logging_dir=runs/May16_22-50-48_jupyter-beomi, logging_strategy=IntervalStrategy.STEPS, logging_first_step=True, logging_steps=500, save_strategy=IntervalStrategy.STEPS, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=True, fp16_opt_level=O1, fp16_backend=auto, fp16_full_eval=False, local_rank=0, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=500, dataloader_num_workers=0, past_index=-1, run_name=./test-bert-zero2, disable_tqdm=False, remove_unused_columns=True, label_names=None, load_best_model_at_end=False, metric_for_best_model=None, greater_is_better=None, ignore_data_skip=False, sharded_ddp=[], deepspeed=./ds_zero2_1gpu.json, label_smoothing_factor=0.0, adafactor=False, group_by_length=False, length_column_name=length, report_to=['tensorboard'], ddp_find_unused_parameters=None, dataloader_pin_memory=True, skip_memory_metrics=False, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, _n_gpu=1, mp_parameters=)
WARNING:datasets.builder:Using custom data configuration default-43493ca3484df8f8
WARNING:datasets.builder:Reusing dataset text (/home/jovyan/.cache/huggingface/datasets/text/default-43493ca3484df8f8/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)
WARNING:__main__:You are instantiating a new config instance from scratch.
[INFO|configuration_utils.py:517] 2021-05-16 22:50:49,855 >> loading configuration file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/61dd2bdbb7e56ff51fdc66b6f0d1973d2d806cd616d38a149f1bfd2753babc3c.ba488f0d9624511a98ed83af3e8f6b33fe20b502e2cfb16ee9858a6b6f521982
[INFO|configuration_utils.py:553] 2021-05-16 22:50:49,856 >> Model config ElectraConfig {
"architectures": [
"ElectraForPreTraining"
],
"attention_probs_dropout_prob": 0.1,
"embedding_size": 768,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "electra",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"position_embedding_type": "absolute",
"summary_activation": "gelu",
"summary_last_dropout": 0.1,
"summary_type": "first",
"summary_use_proj": true,
"tokenizer_class": "BertTokenizer",
"transformers_version": "4.7.0.dev0",
"type_vocab_size": 2,
"vocab_size": 50135
}
[INFO|tokenization_utils_base.py:1717] 2021-05-16 22:50:53,312 >> loading file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/vocab.txt from cache at /home/jovyan/.cache/huggingface/transformers/5852208f13e8ee71a994c414a90812974669123460d08b55ead80024d9a2e025.2e854075a5d70b111391280c0fdbeeab1ad11deed5a20c29cbdaff40f39422c9
[INFO|tokenization_utils_base.py:1717] 2021-05-16 22:50:53,312 >> loading file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/tokenizer.json from cache at None
[INFO|tokenization_utils_base.py:1717] 2021-05-16 22:50:53,313 >> loading file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:1717] 2021-05-16 22:50:53,313 >> loading file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/special_tokens_map.json from cache at /home/jovyan/.cache/huggingface/transformers/6bddca875f34b8afbae26136b9594ea80793c9598640f0bc94017555a0a1c113.31b83c6ab34462cefd974ed0df8dd4189e7b7b81b47315b7a10627f7ae120002
[INFO|tokenization_utils_base.py:1717] 2021-05-16 22:50:53,313 >> loading file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/tokenizer_config.json from cache at /home/jovyan/.cache/huggingface/transformers/7263de953a4cd2b1f102b17d66d2138ab74d46fbf419d588e523f2e8189a5fbf.3cf6a609d624dad9e48921ddd3d07764cb2f8f3fc2a84d956416cf643eb1be18
INFO:__main__:Training new model from scratch
100%|████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 21.38ba/s]
100%|████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00, 4.46ba/s]
[INFO|trainer.py:415] 2021-05-16 22:51:00,376 >> Using amp fp16 backend
[INFO|trainer.py:515] 2021-05-16 22:51:00,472 >> The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask.
[2021-05-16 22:51:00,476] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.3.16, git-hash=unknown, git-branch=unknown
[2021-05-16 22:51:00,476] [WARNING] [config.py:79:_sanity_check] DeepSpeedConfig: cpu_offload is deprecated. Please use offload_optimizer.
[2021-05-16 22:51:04,728] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1
jupyter-beomi:14204:14204 [0] NCCL INFO Bootstrap : Using [0]eth0:192.168.11.194<0>
jupyter-beomi:14204:14204 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
jupyter-beomi:14204:14204 [0] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
jupyter-beomi:14204:14204 [0] NCCL INFO NET/Socket : Using [0]eth0:192.168.11.194<0>
jupyter-beomi:14204:14204 [0] NCCL INFO Using network Socket
NCCL version 2.7.8+cuda11.1
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 00/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 01/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 02/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 03/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 04/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 05/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 06/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 07/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 08/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 09/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 10/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 11/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 12/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 13/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 14/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 15/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 16/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 17/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 18/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 19/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 20/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 21/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 22/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 23/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 24/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 25/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 26/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 27/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 28/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 29/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 30/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Channel 31/32 : 0
jupyter-beomi:14204:14388 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [1] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [2] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [3] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [4] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [5] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [6] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [7] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [8] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [9] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [10] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [11] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [12] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [13] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [14] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [15] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [16] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [17] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [18] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [19] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [20] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [21] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [22] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [23] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [24] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [25] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [26] -1/-1/-1->0->-1|-1->
jupyter-beomi:14204:14388 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,ffff0000
jupyter-beomi:14204:14388 [0] NCCL INFO 32 coll channels, 32 p2p channels, 32 p2p channels per peer
jupyter-beomi:14204:14388 [0] NCCL INFO comm 0x7fea50002dd0 rank 0 nranks 1 cudaDev 0 busId 86000 - Init COMPLETE
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
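The huggingface/tokenizers fork warnings above (repeated several more times below) are harmless in this single-process run; as the message itself suggests, they can be silenced by exporting the variable before launching, for example in gpu_run_mlm.sh:

    # Silence the repeated fork warning from huggingface/tokenizers;
    # tokenizer parallelism is not needed once the fast tokenization pass has run.
    export TOKENIZERS_PARALLELISM=false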
Using /home/jovyan/.cache/torch_extensions as PyTorch extensions root...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected CUDA files, patching ldflags
Emitting ninja build file /home/jovyan/.cache/torch_extensions/cpu_adam/build.ninja...
Building extension module cpu_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
ninja: no work to do.
Loading extension module cpu_adam...
Time to load cpu_adam op: 1.0352756977081299 seconds
Adam Optimizer #0 is created with AVX512 arithmetic capability.
Config: alpha=0.000050, betas=(0.900000, 0.999000), weight_decay=0.000000, adam_w=1
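The cpu_adam op above is JIT-compiled with ninja on first use and cached under ~/.cache/torch_extensions (hence "ninja: no work to do" on this warm cache). If the JIT step is unwanted, DeepSpeed ops can also be prebuilt at install time; a hedged example, assuming the DS_BUILD_CPU_ADAM build flag is available in this DeepSpeed version:

    # Optional: prebuild the CPU Adam op when installing DeepSpeed instead of
    # JIT-compiling it on first run (verify the flag against your DeepSpeed version).
    DS_BUILD_CPU_ADAM=1 pip install deepspeed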
[2021-05-16 22:51:07,855] [INFO] [engine.py:610:_configure_optimizer] Using DeepSpeed Optimizer param name adamw as basic optimizer
[2021-05-16 22:51:07,856] [INFO] [engine.py:615:_configure_optimizer] DeepSpeed Basic Optimizer = DeepSpeedCPUAdam
Checking ZeRO support for optimizer=DeepSpeedCPUAdam type=<class 'deepspeed.ops.adam.cpu_adam.DeepSpeedCPUAdam'>
[2021-05-16 22:51:07,856] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer
[2021-05-16 22:51:07,856] [INFO] [stage2.py:102:__init__] Reduce bucket size 200000000.0
[2021-05-16 22:51:07,856] [INFO] [stage2.py:103:__init__] Allgather bucket size 200000000.0
[2021-05-16 22:51:07,856] [INFO] [stage2.py:104:__init__] CPU Offload: True
Using /home/jovyan/.cache/torch_extensions as PyTorch extensions root...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Emitting ninja build file /home/jovyan/.cache/torch_extensions/utils/build.ninja...
Building extension module utils...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
ninja: no work to do.
Loading extension module utils...
Time to load utils op: 0.6900577545166016 seconds
[2021-05-16 22:51:09,779] [INFO] [stage2.py:381:__init__] optimizer state initialized
[2021-05-16 22:51:09,779] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adamw
[2021-05-16 22:51:09,779] [INFO] [engine.py:439:_configure_lr_scheduler] DeepSpeed using configured LR scheduler = WarmupLR
[2021-05-16 22:51:09,779] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = <deepspeed.runtime.lr_schedules.WarmupLR object at 0x7fedd40d79a0>
[2021-05-16 22:51:09,780] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[5e-05], mom=[[0.9, 0.999]]
[2021-05-16 22:51:09,780] [INFO] [config.py:747:print] DeepSpeedEngine configuration:
[2021-05-16 22:51:09,780] [INFO] [config.py:751:print] activation_checkpointing_config {
"partition_activations": false,
"contiguous_memory_optimization": false,
"cpu_checkpointing": false,
"number_checkpoints": null,
"synchronize_checkpoint_boundary": false,
"profile": false
}
[2021-05-16 22:51:09,780] [INFO] [config.py:751:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
[2021-05-16 22:51:09,780] [INFO] [config.py:751:print] allreduce_always_fp32 ........ False
[2021-05-16 22:51:09,780] [INFO] [config.py:751:print] amp_enabled .................. False
[2021-05-16 22:51:09,780] [INFO] [config.py:751:print] amp_params ................... False
[2021-05-16 22:51:09,780] [INFO] [config.py:751:print] checkpoint_tag_validation_enabled True
[2021-05-16 22:51:09,780] [INFO] [config.py:751:print] checkpoint_tag_validation_fail False
[2021-05-16 22:51:09,780] [INFO] [config.py:751:print] disable_allgather ............ False
[2021-05-16 22:51:09,780] [INFO] [config.py:751:print] dump_state ................... False
[2021-05-16 22:51:09,780] [INFO] [config.py:751:print] dynamic_loss_scale_args ...... {'init_scale': 65536, 'scale_window': 1000, 'delayed_shift': 2, 'min_scale': 1}
[2021-05-16 22:51:09,780] [INFO] [config.py:751:print] elasticity_enabled ........... False
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] flops_profiler_config ........ {
"enabled": false,
"profile_step": 1,
"module_depth": -1,
"top_modules": 3,
"detailed": true
}
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] fp16_enabled ................. True
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] global_rank .................. 0
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] gradient_accumulation_steps .. 1
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] gradient_clipping ............ 1.0
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] gradient_predivide_factor .... 1.0
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] initial_dynamic_scale ........ 65536
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] loss_scale ................... 0
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] memory_breakdown ............. False
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] optimizer_legacy_fusion ...... False
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] optimizer_name ............... adamw
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] optimizer_params ............. {'lr': 5e-05, 'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0.0}
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] pld_enabled .................. False
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] pld_params ................... False
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] prescale_gradients ........... False
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] scheduler_name ............... WarmupLR
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] scheduler_params ............. {'warmup_min_lr': 0, 'warmup_max_lr': 5e-05, 'warmup_num_steps': 0}
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] sparse_attention ............. None
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] sparse_gradients_enabled ..... False
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] steps_per_print .............. 10
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] tensorboard_enabled .......... False
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] tensorboard_job_name ......... DeepSpeedJobName
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] tensorboard_output_path ......
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] train_batch_size ............. 32
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] train_micro_batch_size_per_gpu 32
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] wall_clock_breakdown ......... False
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] world_size ................... 1
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] zero_allow_untested_optimizer False
[2021-05-16 22:51:09,781] [INFO] [config.py:751:print] zero_config .................. {
"stage": 2,
"contiguous_gradients": true,
"reduce_scatter": true,
"reduce_bucket_size": 2.000000e+08,
"allgather_partitions": true,
"allgather_bucket_size": 2.000000e+08,
"overlap_comm": true,
"load_from_fp32_weights": true,
"elastic_checkpoint": true,
"offload_param": null,
"offload_optimizer": {
"device": "cpu",
"nvme_path": null,
"buffer_count": 4,
"pin_memory": false,
"pipeline_read": false,
"pipeline_write": false,
"fast_init": false
},
"sub_group_size": 1.000000e+12,
"prefetch_bucket_size": 5.000000e+07,
"param_persistence_threshold": 1.000000e+05,
"max_live_parameters": 1.000000e+09,
"max_reuse_distance": 1.000000e+09,
"gather_fp16_weights_on_model_save": false,
"find_unused_parameters": false
}
[2021-05-16 22:51:09,782] [INFO] [config.py:751:print] zero_enabled ................. True
[2021-05-16 22:51:09,782] [INFO] [config.py:751:print] zero_optimization_stage ...... 2
[2021-05-16 22:51:09,782] [INFO] [config.py:753:print] json = {
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": 5e-05,
"betas": [0.9, 0.999],
"eps": 1e-08,
"weight_decay": 0.0
}
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": 0,
"warmup_max_lr": 5e-05,
"warmup_num_steps": 0
}
},
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 2.000000e+08,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 2.000000e+08,
"contiguous_gradients": true,
"cpu_offload": true
},
"gradient_accumulation_steps": 1,
"gradient_clipping": 1.0,
"train_batch_size": 32,
"train_micro_batch_size_per_gpu": 32
}
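The "cpu_offload is deprecated" warning logged at startup refers to the "cpu_offload": true key in this zero_optimization block; newer DeepSpeed releases express the same behaviour with an offload_optimizer sub-config, matching the resolved zero_config printed above. A sketch of the updated section (other keys unchanged, exact schema may vary by version):

    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": true,
        "offload_optimizer": {
            "device": "cpu"
        }
    }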
Using /home/jovyan/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.002638101577758789 seconds
[INFO|trainer.py:1145] 2021-05-16 22:51:09,786 >> ***** Running training *****
[INFO|trainer.py:1146] 2021-05-16 22:51:09,786 >> Num examples = 893
[INFO|trainer.py:1147] 2021-05-16 22:51:09,786 >> Num Epochs = 2
[INFO|trainer.py:1148] 2021-05-16 22:51:09,786 >> Instantaneous batch size per device = 32
[INFO|trainer.py:1149] 2021-05-16 22:51:09,786 >> Total train batch size (w. parallel, distributed & accumulation) = 32
[INFO|trainer.py:1150] 2021-05-16 22:51:09,786 >> Gradient Accumulation steps = 1
[INFO|trainer.py:1151] 2021-05-16 22:51:09,786 >> Total optimization steps = 56
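The step count follows directly from the numbers above: ceil(893 examples / 32 per-device batch) = 28 optimizer steps per epoch, and 28 × 2 epochs = 56 total optimization steps.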
0%| | 0/56 [00:00<?, ?it/s][2021-05-16 22:51:10,280] [INFO] [stage2.py:1407:step] [deepspeed] fp16 dynamic loss scale overflow! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 65536
2%|█▏ | 1/56 [00:00<00:25, 2.18it/s][WARNING|trainer_pt_utils.py:777] 2021-05-16 22:51:10,281 >> tried to get lr value before scheduler/optimizer started stepping, returning lr=0
{'loss': 10.9922, 'learning_rate': 0, 'epoch': 0.04}
2%|█▏ | 1/56 [00:00<00:25, 2.18it/s][2021-05-16 22:51:10,643] [INFO] [stage2.py:1407:step] [deepspeed] fp16 dynamic loss scale overflow! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768.0
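The two fp16 dynamic loss scale overflows here are expected at the start of mixed-precision training: each overflow skips the optimizer step (which is why the step logs below report skipped=2), and because dynamic_loss_scale_args sets delayed_shift=2, the scale is only halved from 65536 to 32768 on the second overflow.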
16%|███████████ | 9/56 [00:04<00:26, 1.75it/s][2021-05-16 22:51:15,240] [INFO] [logging.py:60:log_dist] [Rank 0] step=10, skipped=2, lr=[5e-05], mom=[[0.9, 0.999]]
[2021-05-16 22:51:15,241] [INFO] [timer.py:154:stop] 0/10, SamplesPerSec=58.39567776783072
34%|███████████████████████ | 19/56 [00:10<00:20, 1.82it/s][2021-05-16 22:51:20,733] [INFO] [logging.py:60:log_dist] [Rank 0] step=20, skipped=2, lr=[5e-05], mom=[[0.9, 0.999]]
[2021-05-16 22:51:20,734] [INFO] [timer.py:154:stop] 0/20, SamplesPerSec=59.91298349642106
52%|███████████████████████████████████▏ | 29/56 [00:15<00:14, 1.89it/s][2021-05-16 22:51:26,115] [INFO] [logging.py:60:log_dist] [Rank 0] step=30, skipped=2, lr=[5e-05], mom=[[0.9, 0.999]]
[2021-05-16 22:51:26,116] [INFO] [timer.py:154:stop] 0/30, SamplesPerSec=60.660391420264496
70%|███████████████████████████████████████████████▎ | 39/56 [00:21<00:09, 1.84it/s][2021-05-16 22:51:31,562] [INFO] [logging.py:60:log_dist] [Rank 0] step=40, skipped=2, lr=[5e-05], mom=[[0.9, 0.999]]
[2021-05-16 22:51:31,562] [INFO] [timer.py:154:stop] 0/40, SamplesPerSec=60.921882866787925
88%|███████████████████████████████████████████████████████████▌ | 49/56 [00:26<00:03, 1.86it/s][2021-05-16 22:51:37,007] [INFO] [logging.py:60:log_dist] [Rank 0] step=50, skipped=2, lr=[5e-05], mom=[[0.9, 0.999]]
[2021-05-16 22:51:37,007] [INFO] [timer.py:154:stop] 0/50, SamplesPerSec=61.07305240178895
100%|████████████████████████████████████████████████████████████████████| 56/56 [00:30<00:00, 1.87it/s][INFO|trainer.py:1341] 2021-05-16 22:51:40,241 >>
Training completed. Do not forget to share your model on huggingface.co/models =)
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 00/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 01/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 02/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 03/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 04/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 05/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 06/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 07/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 08/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 09/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 10/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 11/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 12/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 13/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 14/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 15/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 16/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 17/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 18/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 19/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 20/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 21/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 22/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 23/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 24/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 25/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 26/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 27/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 28/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 29/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 30/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Channel 31/32 : 0
jupyter-beomi:14204:14515 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [1] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [2] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [3] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [4] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [5] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [6] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [7] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [8] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [9] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [10] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [11] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [12] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [13] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [14] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [15] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [16] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [17] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [18] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [19] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [20] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [21] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [22] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [23] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [24] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [25] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [26] -1/-1/-1->0->-1|-1->
jupyter-beomi:14204:14515 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,ffff0000
jupyter-beomi:14204:14515 [0] NCCL INFO 32 coll channels, 32 p2p channels, 32 p2p channels per peer
jupyter-beomi:14204:14515 [0] NCCL INFO comm 0x7fe5f0002dd0 rank 0 nranks 1 cudaDev 0 busId 86000 - Init COMPLETE
{'train_runtime': 30.4552, 'train_samples_per_second': 1.839, 'epoch': 2.0}
100%|████████████████████████████████████████████████████████████████████| 56/56 [00:30<00:00, 1.84it/s]
[INFO|trainer.py:1885] 2021-05-16 22:51:40,526 >> Saving model checkpoint to ./test-bert-zero2
[INFO|configuration_utils.py:351] 2021-05-16 22:51:40,530 >> Configuration saved in ./test-bert-zero2/config.json
[INFO|modeling_utils.py:889] 2021-05-16 22:51:41,285 >> Model weights saved in ./test-bert-zero2/pytorch_model.bin
[INFO|tokenization_utils_base.py:1924] 2021-05-16 22:51:41,288 >> tokenizer config file saved in ./test-bert-zero2/tokenizer_config.json
[INFO|tokenization_utils_base.py:1930] 2021-05-16 22:51:41,289 >> Special tokens file saved in ./test-bert-zero2/special_tokens_map.json
[INFO|trainer_pt_utils.py:907] 2021-05-16 22:51:41,402 >> ***** train metrics *****
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,402 >> epoch = 2.0
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,402 >> init_mem_cpu_alloc_delta = 0MB
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,402 >> init_mem_cpu_peaked_delta = 0MB
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,403 >> init_mem_gpu_alloc_delta = 0MB
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,403 >> init_mem_gpu_peaked_delta = 0MB
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,403 >> train_mem_cpu_alloc_delta = 5737MB
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,403 >> train_mem_cpu_peaked_delta = 462MB
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,403 >> train_mem_gpu_alloc_delta = 312MB
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,403 >> train_mem_gpu_peaked_delta = 12823MB
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,403 >> train_runtime = 0:00:30.45
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,403 >> train_samples = 893
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:51:41,403 >> train_samples_per_second = 1.839