ZeRO-3 BERT Training (with Sampled Dataset) Output Log with NCCL_DEBUG=INFO
(ds-huggingface) root@jupyter-beomi:~/2021.05.17.DeepSpeed_Huggingface# ./zero3_gpu_run_mlm.sh | |
[2021-05-16 22:44:59,482] [WARNING] [runner.py:122:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. | |
[2021-05-16 22:44:59,520] [INFO] [runner.py:360:main] cmd = /home/jovyan/anaconda3/envs/ds-huggingface/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMF19 --master_addr=127.0.0.1 --master_port=29500 run_mlm.py --seed 42 --model_type bert --tokenizer_name beomi/KcELECTRA-base --train_file ./sampled_20190101_20200611_v2.txt --num_train_epochs 2 --per_device_train_batch_size 32 --per_device_eval_batch_size 32 --do_train --output_dir ./test-bert-zero3 --fp16 --logging_first_step --max_seq_length 300 --deepspeed ./ds_zero3_1gpu.json | |
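(Note: the contents of ./zero3_gpu_run_mlm.sh are not included in this log. A minimal sketch that would reproduce the launcher command logged above, assuming the standard `deepspeed` CLI entry point with a single GPU selected via --num_gpus, is:)

    #!/usr/bin/env bash
    # Sketch only: reconstructed from the "cmd = ..." line above, not the original script.
    export NCCL_DEBUG=INFO        # produces the NCCL INFO lines seen later in this log
    deepspeed --num_gpus=1 run_mlm.py \
        --seed 42 \
        --model_type bert \
        --tokenizer_name beomi/KcELECTRA-base \
        --train_file ./sampled_20190101_20200611_v2.txt \
        --num_train_epochs 2 \
        --per_device_train_batch_size 32 \
        --per_device_eval_batch_size 32 \
        --do_train \
        --output_dir ./test-bert-zero3 \
        --fp16 \
        --logging_first_step \
        --max_seq_length 300 \
        --deepspeed ./ds_zero3_1gpu.json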
[2021-05-16 22:45:01,372] [INFO] [launch.py:73:main] 0 NCCL_DEBUG INFO | |
[2021-05-16 22:45:01,372] [INFO] [launch.py:80:main] WORLD INFO DICT: {'localhost': [0]} | |
[2021-05-16 22:45:01,372] [INFO] [launch.py:86:main] nnodes=1, num_local_procs=1, node_rank=0 | |
[2021-05-16 22:45:01,372] [INFO] [launch.py:101:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0]}) | |
[2021-05-16 22:45:01,372] [INFO] [launch.py:102:main] dist_world_size=1 | |
[2021-05-16 22:45:01,372] [INFO] [launch.py:104:main] Setting CUDA_VISIBLE_DEVICES=0 | |
[2021-05-16 22:45:05,033] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl | |
WARNING:__main__:Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, 16-bits training: True | |
INFO:__main__:Training/evaluation parameters TrainingArguments(output_dir=./test-bert-zero3, overwrite_output_dir=False, do_train=True, do_eval=False, do_predict=False, evaluation_strategy=IntervalStrategy.NO, prediction_loss_only=False, per_device_train_batch_size=32, per_device_eval_batch_size=32, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=2.0, max_steps=-1, lr_scheduler_type=SchedulerType.LINEAR, warmup_ratio=0.0, warmup_steps=0, logging_dir=runs/May16_22-45-04_jupyter-beomi, logging_strategy=IntervalStrategy.STEPS, logging_first_step=True, logging_steps=500, save_strategy=IntervalStrategy.STEPS, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=True, fp16_opt_level=O1, fp16_backend=auto, fp16_full_eval=False, local_rank=0, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=500, dataloader_num_workers=0, past_index=-1, run_name=./test-bert-zero3, disable_tqdm=False, remove_unused_columns=True, label_names=None, load_best_model_at_end=False, metric_for_best_model=None, greater_is_better=None, ignore_data_skip=False, sharded_ddp=[], deepspeed=./ds_zero3_1gpu.json, label_smoothing_factor=0.0, adafactor=False, group_by_length=False, length_column_name=length, report_to=['tensorboard'], ddp_find_unused_parameters=None, dataloader_pin_memory=True, skip_memory_metrics=False, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, _n_gpu=1, mp_parameters=) | |
WARNING:datasets.builder:Using custom data configuration default-43493ca3484df8f8 | |
Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/jovyan/.cache/huggingface/datasets/text/default-43493ca3484df8f8/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5... | |
Dataset text downloaded and prepared to /home/jovyan/.cache/huggingface/datasets/text/default-43493ca3484df8f8/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data. | |
WARNING:__main__:You are instantiating a new config instance from scratch. | |
[INFO|configuration_utils.py:517] 2021-05-16 22:45:06,713 >> loading configuration file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/61dd2bdbb7e56ff51fdc66b6f0d1973d2d806cd616d38a149f1bfd2753babc3c.ba488f0d9624511a98ed83af3e8f6b33fe20b502e2cfb16ee9858a6b6f521982 | |
[INFO|configuration_utils.py:553] 2021-05-16 22:45:06,715 >> Model config ElectraConfig { | |
"architectures": [ | |
"ElectraForPreTraining" | |
], | |
"attention_probs_dropout_prob": 0.1, | |
"embedding_size": 768, | |
"hidden_act": "gelu", | |
"hidden_dropout_prob": 0.1, | |
"hidden_size": 768, | |
"initializer_range": 0.02, | |
"intermediate_size": 3072, | |
"layer_norm_eps": 1e-12, | |
"max_position_embeddings": 512, | |
"model_type": "electra", | |
"num_attention_heads": 12, | |
"num_hidden_layers": 12, | |
"pad_token_id": 0, | |
"position_embedding_type": "absolute", | |
"summary_activation": "gelu", | |
"summary_last_dropout": 0.1, | |
"summary_type": "first", | |
"summary_use_proj": true, | |
"tokenizer_class": "BertTokenizer", | |
"transformers_version": "4.7.0.dev0", | |
"type_vocab_size": 2, | |
"vocab_size": 50135 | |
} | |
[INFO|tokenization_utils_base.py:1717] 2021-05-16 22:45:10,218 >> loading file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/vocab.txt from cache at /home/jovyan/.cache/huggingface/transformers/5852208f13e8ee71a994c414a90812974669123460d08b55ead80024d9a2e025.2e854075a5d70b111391280c0fdbeeab1ad11deed5a20c29cbdaff40f39422c9 | |
[INFO|tokenization_utils_base.py:1717] 2021-05-16 22:45:10,218 >> loading file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/tokenizer.json from cache at None | |
[INFO|tokenization_utils_base.py:1717] 2021-05-16 22:45:10,218 >> loading file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/added_tokens.json from cache at None | |
[INFO|tokenization_utils_base.py:1717] 2021-05-16 22:45:10,218 >> loading file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/special_tokens_map.json from cache at /home/jovyan/.cache/huggingface/transformers/6bddca875f34b8afbae26136b9594ea80793c9598640f0bc94017555a0a1c113.31b83c6ab34462cefd974ed0df8dd4189e7b7b81b47315b7a10627f7ae120002 | |
[INFO|tokenization_utils_base.py:1717] 2021-05-16 22:45:10,218 >> loading file https://huggingface.co/beomi/KcELECTRA-base/resolve/main/tokenizer_config.json from cache at /home/jovyan/.cache/huggingface/transformers/7263de953a4cd2b1f102b17d66d2138ab74d46fbf419d588e523f2e8189a5fbf.3cf6a609d624dad9e48921ddd3d07764cb2f8f3fc2a84d956416cf643eb1be18 | |
INFO:__main__:Training new model from scratch | |
100%|████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 24.11ba/s] | |
100%|████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00, 4.44ba/s] | |
[INFO|trainer.py:415] 2021-05-16 22:45:17,267 >> Using amp fp16 backend | |
[INFO|trainer.py:515] 2021-05-16 22:45:17,361 >> The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. | |
[2021-05-16 22:45:17,366] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.3.16, git-hash=unknown, git-branch=unknown | |
[2021-05-16 22:45:21,598] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 | |
jupyter-beomi:12928:12928 [0] NCCL INFO Bootstrap : Using [0]eth0:192.168.11.194<0> | |
jupyter-beomi:12928:12928 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation | |
jupyter-beomi:12928:12928 [0] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1] | |
jupyter-beomi:12928:12928 [0] NCCL INFO NET/Socket : Using [0]eth0:192.168.11.194<0> | |
jupyter-beomi:12928:12928 [0] NCCL INFO Using network Socket | |
NCCL version 2.7.8+cuda11.1 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 00/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 01/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 02/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 03/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 04/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 05/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 06/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 07/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 08/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 09/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 10/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 11/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 12/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 13/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 14/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 15/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 16/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 17/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 18/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 19/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 20/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 21/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 22/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 23/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 24/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 25/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 26/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 27/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 28/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 29/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 30/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Channel 31/32 : 0 | |
jupyter-beomi:12928:13113 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [1] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [2] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [3] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [4] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [5] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [6] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [7] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [8] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [9] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [10] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [11] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [12] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [13] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [14] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [15] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [16] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [17] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [18] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [19] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [20] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [21] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [22] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [23] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [24] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [25] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [26] -1/-1/-1->0->-1|-1-> | |
jupyter-beomi:12928:13113 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,ffff0000 | |
jupyter-beomi:12928:13113 [0] NCCL INFO 32 coll channels, 32 p2p channels, 32 p2p channels per peer | |
jupyter-beomi:12928:13113 [0] NCCL INFO comm 0x7f1e94002dd0 rank 0 nranks 1 cudaDev 0 busId 86000 - Init COMPLETE | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
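(Note: this tokenizers warning repeats below once per forked worker process and is harmless for this run. As the message itself suggests, it can be silenced by exporting the variable before launch, e.g. at the top of the launch script:)

    # Optional, purely cosmetic: silence the repeated huggingface/tokenizers fork warning.
    export TOKENIZERS_PARALLELISM=false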
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
Using /home/jovyan/.cache/torch_extensions as PyTorch extensions root... | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
Detected CUDA files, patching ldflags | |
Emitting ninja build file /home/jovyan/.cache/torch_extensions/cpu_adam/build.ninja... | |
Building extension module cpu_adam... | |
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
ninja: no work to do. | |
Loading extension module cpu_adam... | |
Time to load cpu_adam op: 0.9731655120849609 seconds | |
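(Note: the cpu_adam extension was already compiled in this environment, so ninja had no work to do and the op loaded in under a second. On a fresh environment this JIT build can take several minutes; one hedged alternative is to pre-build the op when installing DeepSpeed:)

    # Assumption/sketch: pre-compile the CPU Adam op at install time instead of JIT-compiling on first use.
    # DS_BUILD_CPU_ADAM is a DeepSpeed build flag; it may require a CUDA toolkit matching your torch build.
    DS_BUILD_CPU_ADAM=1 pip install deepspeed --no-cache-dir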
Adam Optimizer #0 is created with AVX512 arithmetic capability. | |
Config: alpha=0.000050, betas=(0.900000, 0.999000), weight_decay=0.000000, adam_w=1 | |
[2021-05-16 22:45:24,577] [INFO] [engine.py:610:_configure_optimizer] Using DeepSpeed Optimizer param name adamw as basic optimizer | |
[2021-05-16 22:45:24,578] [INFO] [engine.py:615:_configure_optimizer] DeepSpeed Basic Optimizer = DeepSpeedCPUAdam | |
Checking ZeRO support for optimizer=DeepSpeedCPUAdam type=<class 'deepspeed.ops.adam.cpu_adam.DeepSpeedCPUAdam'> | |
[2021-05-16 22:45:24,578] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer | |
Initializing ZeRO Stage 3 | |
[2021-05-16 22:45:24,664] [INFO] [utils.py:583:see_memory_usage] Stage 3 initialize beginning | |
/home/jovyan/anaconda3/envs/ds-huggingface/lib/python3.8/site-packages/torch/cuda/memory.py:373: FutureWarning: torch.cuda.memory_cached has been renamed to torch.cuda.memory_reserved | |
warnings.warn( | |
/home/jovyan/anaconda3/envs/ds-huggingface/lib/python3.8/site-packages/torch/cuda/memory.py:381: FutureWarning: torch.cuda.max_memory_cached has been renamed to torch.cuda.max_memory_reserved | |
warnings.warn( | |
[2021-05-16 22:45:24,666] [INFO] [utils.py:584:see_memory_usage] MA 0.24 GB Max_MA 0.24 GB CA 0.25 GB Max_CA 0 GB | |
[2021-05-16 22:45:24,667] [INFO] [utils.py:592:see_memory_usage] CPU Virtual Memory: used = 151.6 GB, percent = 32.1% | |
[2021-05-16 22:45:24,667] [INFO] [stage3.py:624:__init__] Reduce bucket size 589824 | |
[2021-05-16 22:45:24,667] [INFO] [stage3.py:625:__init__] Allgather bucket size 530841.6000000001 | |
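(Note: these bucket sizes appear to be derived from the model's hidden size of 768, matching the values the Hugging Face DeepSpeed integration suggests: reduce bucket = hidden_size^2, allgather/prefetch bucket = 0.9 * hidden_size^2, and param persistence threshold = 10 * hidden_size. A quick check:)

    # Sketch: confirm where 589824, 530841.6 and 7680 (seen above and in the ZeRO config below) come from.
    python -c "h = 768; print(h*h, 0.9*h*h, 10*h)"   # -> 589824 530841.6 7680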
Using /home/jovyan/.cache/torch_extensions as PyTorch extensions root... | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
Emitting ninja build file /home/jovyan/.cache/torch_extensions/utils/build.ninja... | |
Building extension module utils... | |
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) | |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... | |
To disable this warning, you can either: | |
- Avoid using `tokenizers` before the fork if possible | |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) | |
ninja: no work to do. | |
Loading extension module utils... | |
Time to load utils op: 0.6948151588439941 seconds | |
[2021-05-16 22:45:25,450] [INFO] [stage3.py:39:print_rank_0] FP16 params swapping is False, Max params in CPU is 1000000000.0 | |
[2021-05-16 22:45:25,527] [INFO] [utils.py:583:see_memory_usage] Before creating fp16 partitions | |
/home/jovyan/anaconda3/envs/ds-huggingface/lib/python3.8/site-packages/torch/cuda/memory.py:373: FutureWarning: torch.cuda.memory_cached has been renamed to torch.cuda.memory_reserved | |
warnings.warn( | |
/home/jovyan/anaconda3/envs/ds-huggingface/lib/python3.8/site-packages/torch/cuda/memory.py:381: FutureWarning: torch.cuda.max_memory_cached has been renamed to torch.cuda.max_memory_reserved | |
warnings.warn( | |
[2021-05-16 22:45:25,529] [INFO] [utils.py:584:see_memory_usage] MA 0.24 GB Max_MA 0.31 GB CA 0.32 GB Max_CA 0 GB | |
[2021-05-16 22:45:25,529] [INFO] [utils.py:592:see_memory_usage] CPU Virtual Memory: used = 151.6 GB, percent = 32.1% | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 00/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 01/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 02/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 03/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 04/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 05/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 06/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 07/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 08/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 09/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 10/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 11/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 12/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 13/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 14/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 15/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 16/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 17/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 18/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 19/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 20/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 21/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 22/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 23/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 24/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 25/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 26/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 27/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 28/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 29/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 30/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Channel 31/32 : 0 | |
jupyter-beomi:12928:13146 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [1] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [2] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [3] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [4] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [5] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [6] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [7] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [8] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [9] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [10] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [11] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [12] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [13] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [14] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [15] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [16] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [17] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [18] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [19] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [20] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [21] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [22] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [23] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [24] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [25] -1/-1/-1->0->-1|-1->0->-1/-1/-1 [26] -1/-1/-1->0->-1|-1-> | |
jupyter-beomi:12928:13146 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,ffff0000 | |
jupyter-beomi:12928:13146 [0] NCCL INFO 32 coll channels, 32 p2p channels, 32 p2p channels per peer | |
jupyter-beomi:12928:13146 [0] NCCL INFO comm 0x7f1e90002e20 rank 0 nranks 1 cudaDev 0 busId 86000 - Init COMPLETE | |
[2021-05-16 22:45:25,705] [INFO] [stage3.py:39:print_rank_0] fp16 group 0 has 1 subgroups | |
[2021-05-16 22:45:25,891] [INFO] [stage3.py:39:print_rank_0] Swappable FP32 Partitions: count=0 size= 0.00 GB | |
[2021-05-16 22:45:25,891] [INFO] [stage3.py:39:print_rank_0] In-Memory FP32 Partitions: count=1 size= 0.46 GB | |
[2021-05-16 22:45:26,590] [INFO] [stage3.py:819:__init__] optimizer state initialized | |
[2021-05-16 22:45:26,590] [INFO] [stage3.py:39:print_rank_0] Largest partitioned param numel = 124596695 | |
[2021-05-16 22:45:26,783] [INFO] [utils.py:583:see_memory_usage] After initializing ZeRO optimizer | |
[2021-05-16 22:45:26,784] [INFO] [utils.py:584:see_memory_usage] MA 0.46 GB Max_MA 0.61 GB CA 0.86 GB Max_CA 1 GB | |
[2021-05-16 22:45:26,785] [INFO] [utils.py:592:see_memory_usage] CPU Virtual Memory: used = 153.7 GB, percent = 32.6% | |
[2021-05-16 22:45:26,785] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adamw | |
[2021-05-16 22:45:26,785] [INFO] [engine.py:439:_configure_lr_scheduler] DeepSpeed using configured LR scheduler = WarmupLR | |
[2021-05-16 22:45:26,785] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = <deepspeed.runtime.lr_schedules.WarmupLR object at 0x7f1f4916eca0> | |
[2021-05-16 22:45:26,785] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[5e-05], mom=[[0.9, 0.999]] | |
[2021-05-16 22:45:26,785] [INFO] [config.py:747:print] DeepSpeedEngine configuration: | |
[2021-05-16 22:45:26,785] [INFO] [config.py:751:print] activation_checkpointing_config { | |
"partition_activations": false, | |
"contiguous_memory_optimization": false, | |
"cpu_checkpointing": false, | |
"number_checkpoints": null, | |
"synchronize_checkpoint_boundary": false, | |
"profile": false | |
} | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] allreduce_always_fp32 ........ False | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] amp_enabled .................. False | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] amp_params ................... False | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] checkpoint_tag_validation_enabled True | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] checkpoint_tag_validation_fail False | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] disable_allgather ............ False | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] dump_state ................... False | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] dynamic_loss_scale_args ...... {'init_scale': 65536, 'scale_window': 1000, 'delayed_shift': 2, 'min_scale': 1} | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] elasticity_enabled ........... False | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] flops_profiler_config ........ { | |
"enabled": false, | |
"profile_step": 1, | |
"module_depth": -1, | |
"top_modules": 3, | |
"detailed": true | |
} | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] fp16_enabled ................. True | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] global_rank .................. 0 | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] gradient_accumulation_steps .. 1 | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] gradient_clipping ............ 1.0 | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] gradient_predivide_factor .... 1.0 | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] initial_dynamic_scale ........ 65536 | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] loss_scale ................... 0 | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] memory_breakdown ............. False | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] optimizer_legacy_fusion ...... False | |
[2021-05-16 22:45:26,786] [INFO] [config.py:751:print] optimizer_name ............... adamw | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] optimizer_params ............. {'lr': 5e-05, 'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0.0} | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] pld_enabled .................. False | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] pld_params ................... False | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] prescale_gradients ........... False | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] scheduler_name ............... WarmupLR | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] scheduler_params ............. {'warmup_min_lr': 0, 'warmup_max_lr': 5e-05, 'warmup_num_steps': 0} | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] sparse_attention ............. None | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] sparse_gradients_enabled ..... False | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] steps_per_print .............. 2000 | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] tensorboard_enabled .......... False | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] tensorboard_job_name ......... DeepSpeedJobName | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] tensorboard_output_path ...... | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] train_batch_size ............. 32 | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] train_micro_batch_size_per_gpu 32 | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] wall_clock_breakdown ......... False | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] world_size ................... 1 | |
[2021-05-16 22:45:26,787] [INFO] [config.py:751:print] zero_allow_untested_optimizer False | |
[2021-05-16 22:45:26,788] [INFO] [config.py:751:print] zero_config .................. { | |
"stage": 3, | |
"contiguous_gradients": true, | |
"reduce_scatter": false, | |
"reduce_bucket_size": 5.898240e+05, | |
"allgather_partitions": true, | |
"allgather_bucket_size": 5.000000e+08, | |
"overlap_comm": true, | |
"load_from_fp32_weights": true, | |
"elastic_checkpoint": true, | |
"offload_param": { | |
"device": "cpu", | |
"nvme_path": null, | |
"buffer_count": 5, | |
"buffer_size": 1.000000e+08, | |
"max_in_cpu": 1.000000e+09, | |
"pin_memory": true | |
}, | |
"offload_optimizer": { | |
"device": "cpu", | |
"nvme_path": null, | |
"buffer_count": 4, | |
"pin_memory": true, | |
"pipeline_read": false, | |
"pipeline_write": false, | |
"fast_init": false, | |
"pipeline": false | |
}, | |
"sub_group_size": 1.000000e+14, | |
"prefetch_bucket_size": 5.308416e+05, | |
"param_persistence_threshold": 7.680000e+03, | |
"max_live_parameters": 1.000000e+09, | |
"max_reuse_distance": 1.000000e+09, | |
"gather_fp16_weights_on_model_save": true, | |
"find_unused_parameters": false | |
} | |
[2021-05-16 22:45:26,788] [INFO] [config.py:751:print] zero_enabled ................. True | |
[2021-05-16 22:45:26,788] [INFO] [config.py:751:print] zero_optimization_stage ...... 3 | |
[2021-05-16 22:45:26,788] [INFO] [config.py:753:print] json = { | |
"fp16": { | |
"enabled": true, | |
"loss_scale": 0, | |
"loss_scale_window": 1000, | |
"initial_scale_power": 16, | |
"hysteresis": 2, | |
"min_loss_scale": 1 | |
}, | |
"optimizer": { | |
"type": "AdamW", | |
"params": { | |
"lr": 5e-05, | |
"betas": [0.9, 0.999], | |
"eps": 1e-08, | |
"weight_decay": 0.0 | |
} | |
}, | |
"scheduler": { | |
"type": "WarmupLR", | |
"params": { | |
"warmup_min_lr": 0, | |
"warmup_max_lr": 5e-05, | |
"warmup_num_steps": 0 | |
} | |
}, | |
"zero_optimization": { | |
"stage": 3, | |
"offload_optimizer": { | |
"device": "cpu", | |
"pin_memory": true | |
}, | |
"offload_param": { | |
"device": "cpu", | |
"pin_memory": true | |
}, | |
"overlap_comm": true, | |
"contiguous_gradients": true, | |
"sub_group_size": 1.000000e+14, | |
"reduce_bucket_size": 5.898240e+05, | |
"stage3_prefetch_bucket_size": 5.308416e+05, | |
"stage3_param_persistence_threshold": 7.680000e+03, | |
"stage3_max_live_parameters": 1.000000e+09, | |
"stage3_max_reuse_distance": 1.000000e+09, | |
"stage3_gather_fp16_weights_on_model_save": true | |
}, | |
"gradient_accumulation_steps": 1, | |
"gradient_clipping": 1.0, | |
"steps_per_print": 2.000000e+03, | |
"train_batch_size": 32, | |
"train_micro_batch_size_per_gpu": 32, | |
"wall_clock_breakdown": false | |
} | |
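(For reference, the ds_zero3_1gpu.json passed via --deepspeed is not itself shown in this log. The sketch below simply re-serializes the configuration echoed above, with the numeric values written out in plain form; the original file may have differed, e.g. by using the integration's "auto" placeholders.)

    # Sketch: recreate a ds_zero3_1gpu.json equivalent to the configuration printed above.
    cat > ds_zero3_1gpu.json <<'EOF'
    {
      "fp16": {
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
      },
      "optimizer": {
        "type": "AdamW",
        "params": { "lr": 5e-05, "betas": [0.9, 0.999], "eps": 1e-08, "weight_decay": 0.0 }
      },
      "scheduler": {
        "type": "WarmupLR",
        "params": { "warmup_min_lr": 0, "warmup_max_lr": 5e-05, "warmup_num_steps": 0 }
      },
      "zero_optimization": {
        "stage": 3,
        "offload_optimizer": { "device": "cpu", "pin_memory": true },
        "offload_param": { "device": "cpu", "pin_memory": true },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e14,
        "reduce_bucket_size": 589824,
        "stage3_prefetch_bucket_size": 530841.6,
        "stage3_param_persistence_threshold": 7680,
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_fp16_weights_on_model_save": true
      },
      "gradient_accumulation_steps": 1,
      "gradient_clipping": 1.0,
      "steps_per_print": 2000,
      "train_batch_size": 32,
      "train_micro_batch_size_per_gpu": 32,
      "wall_clock_breakdown": false
    }
    EOF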
Using /home/jovyan/.cache/torch_extensions as PyTorch extensions root... | |
No modifications detected for re-loaded extension module utils, skipping build step... | |
Loading extension module utils... | |
Time to load utils op: 0.003198385238647461 seconds | |
[INFO|trainer.py:1145] 2021-05-16 22:45:26,792 >> ***** Running training ***** | |
[INFO|trainer.py:1146] 2021-05-16 22:45:26,792 >> Num examples = 893 | |
[INFO|trainer.py:1147] 2021-05-16 22:45:26,792 >> Num Epochs = 2 | |
[INFO|trainer.py:1148] 2021-05-16 22:45:26,792 >> Instantaneous batch size per device = 32 | |
[INFO|trainer.py:1149] 2021-05-16 22:45:26,792 >> Total train batch size (w. parallel, distributed & accumulation) = 32 | |
[INFO|trainer.py:1150] 2021-05-16 22:45:26,793 >> Gradient Accumulation steps = 1 | |
[INFO|trainer.py:1151] 2021-05-16 22:45:26,793 >> Total optimization steps = 56 | |
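(The 56 optimization steps follow from the numbers above: ceil(893 examples / 32 per-device batch) = 28 update steps per epoch, times 2 epochs, with gradient accumulation of 1. A quick check:)

    python -c "import math; print(math.ceil(893/32) * 2)"   # -> 56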
0%| | 0/56 [00:00<?, ?it/s][2021-05-16 22:45:27,806] [INFO] [stage3.py:2700:_overflow_clean_up] [deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 65536 | |
2%|█▏ | 1/56 [00:00<00:53, 1.03it/s][WARNING|trainer_pt_utils.py:777] 2021-05-16 22:45:27,807 >> tried to get lr value before scheduler/optimizer started stepping, returning lr=0 | |
{'loss': 10.9922, 'learning_rate': 0, 'epoch': 0.04} | |
2%|█▏ | 1/56 [00:00<00:53, 1.03it/s][2021-05-16 22:45:28,722] [INFO] [stage3.py:2700:_overflow_clean_up] [deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768.0 | |
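(Note: the OVERFLOW/skipped-step messages at the start are expected with dynamic fp16 loss scaling: the scaler starts at 2^16, per "initial_scale_power": 16 above, and backs off until it finds a workable scale. Only if overflows persisted well into training would lowering the starting scale be worth trying; a hedged sketch, assuming the config path used in this run:)

    python - <<'EOF'
    # Sketch: lower the initial fp16 loss scale in ds_zero3_1gpu.json from 2^16 to 2^12.
    import json
    path = "ds_zero3_1gpu.json"
    cfg = json.load(open(path))
    cfg["fp16"]["initial_scale_power"] = 12
    json.dump(cfg, open(path, "w"), indent=2)
    EOF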
100%|████████████████████████████████████████████████████████████████████| 56/56 [00:53<00:00, 1.08it/s][INFO|trainer.py:1341] 2021-05-16 22:46:19,931 >> | |
Training completed. Do not forget to share your model on huggingface.co/models =) | |
{'train_runtime': 53.1384, 'train_samples_per_second': 1.054, 'epoch': 2.0} | |
100%|████████████████████████████████████████████████████████████████████| 56/56 [00:53<00:00, 1.05it/s] | |
[INFO|trainer.py:1885] 2021-05-16 22:46:20,225 >> Saving model checkpoint to ./test-bert-zero3 | |
[INFO|configuration_utils.py:351] 2021-05-16 22:46:20,229 >> Configuration saved in ./test-bert-zero3/config.json | |
[INFO|modeling_utils.py:889] 2021-05-16 22:46:20,244 >> Model weights saved in ./test-bert-zero3/pytorch_model.bin | |
[INFO|tokenization_utils_base.py:1924] 2021-05-16 22:46:20,247 >> tokenizer config file saved in ./test-bert-zero3/tokenizer_config.json | |
[INFO|tokenization_utils_base.py:1930] 2021-05-16 22:46:20,249 >> Special tokens file saved in ./test-bert-zero3/special_tokens_map.json | |
[2021-05-16 22:46:20,569] [INFO] [engine.py:1842:save_fp16_model] Saving model weights to ./test-bert-zero3/pytorch_model.bin | |
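(Because "stage3_gather_fp16_weights_on_model_save" is true in the config above, DeepSpeed gathers the full fp16 weights before writing pytorch_model.bin, so ./test-bert-zero3 can be loaded back as an ordinary checkpoint without DeepSpeed. A usage sketch, assuming plain transformers:)

    # Assumption/sketch: reload the saved fp16 checkpoint with plain transformers and count its parameters.
    python -c "from transformers import BertForMaskedLM; m = BertForMaskedLM.from_pretrained('./test-bert-zero3'); print(sum(p.numel() for p in m.parameters()))"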
[INFO|trainer_pt_utils.py:907] 2021-05-16 22:46:20,971 >> ***** train metrics ***** | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> epoch = 2.0 | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> init_mem_cpu_alloc_delta = 0MB | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> init_mem_cpu_peaked_delta = 0MB | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> init_mem_gpu_alloc_delta = 0MB | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> init_mem_gpu_peaked_delta = 0MB | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> train_mem_cpu_alloc_delta = 5813MB | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> train_mem_cpu_peaked_delta = 465MB | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> train_mem_gpu_alloc_delta = 476MB | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> train_mem_gpu_peaked_delta = 9633MB | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> train_runtime = 0:00:53.13 | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> train_samples = 893 | |
[INFO|trainer_pt_utils.py:912] 2021-05-16 22:46:20,972 >> train_samples_per_second = 1.054 |