@Beomi
Created June 10, 2021 05:59
exBERT-transformers sample train results
(exbert-transformers) root@jupyter-beomi:~/exbert-transformers/examples/pytorch/language-modeling# ./exbert_pretrain.sh
2021-06-10 05:57:55.339746: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2021-06-10 05:57:55.339796: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
06/10/2021 05:57:58 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: False
06/10/2021 05:57:58 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
greater_is_better=None,
group_by_length=False,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_on_each_node=True,
logging_dir=runs/Jun10_05-57-58_jupyter-beomi,
logging_first_step=True,
logging_steps=500,
logging_strategy=IntervalStrategy.STEPS,
lr_scheduler_type=SchedulerType.LINEAR,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=None,
mp_parameters=,
no_cuda=False,
num_train_epochs=1.0,
output_dir=./exbert-mlm,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=8,
per_device_train_batch_size=8,
prediction_loss_only=False,
push_to_hub=False,
remove_unused_columns=True,
report_to=['tensorboard'],
resume_from_checkpoint=None,
run_name=./exbert-mlm,
save_steps=500,
save_strategy=IntervalStrategy.STEPS,
save_total_limit=None,
seed=42,
sharded_ddp=[],
skip_memory_metrics=True,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_legacy_prediction_loop=False,
warmup_ratio=0.0,
warmup_steps=0,
weight_decay=0.0,
)
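A minimal sketch, assuming the stock transformers TrainingArguments API: the non-default values in the dump above roughly correspond to the construction below, with every other field left at its default. This is inferred from the printed values, not taken from exbert_pretrain.sh itself.

# Sketch (assumption): rebuild the non-default TrainingArguments shown above.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./exbert-mlm",          # also used as run_name by default
    do_train=True,
    num_train_epochs=1.0,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_first_step=True,
    seed=42,
)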
06/10/2021 05:57:59 - WARNING - datasets.builder - Using custom data configuration default-477b0faef5910d5a
Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/jovyan/.cache/huggingface/datasets/text/default-477b0faef5910d5a/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...
Dataset text downloaded and prepared to /home/jovyan/.cache/huggingface/datasets/text/default-477b0faef5910d5a/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.
06/10/2021 05:57:59 - WARNING - __main__ - You are instantiating a new config instance from scratch.
[INFO|tokenization_utils_base.py:1651] 2021-06-10 05:57:59,979 >> Didn't find file exbert/added_tokens.json. We won't load it.
[INFO|tokenization_utils_base.py:1651] 2021-06-10 05:57:59,980 >> Didn't find file exbert/special_tokens_map.json. We won't load it.
[INFO|tokenization_utils_base.py:1651] 2021-06-10 05:57:59,980 >> Didn't find file exbert/tokenizer_config.json. We won't load it.
[INFO|tokenization_utils_base.py:1651] 2021-06-10 05:57:59,980 >> Didn't find file exbert/tokenizer.json. We won't load it.
[INFO|tokenization_utils_base.py:1715] 2021-06-10 05:57:59,981 >> loading file exbert/vocab.txt
[INFO|tokenization_utils_base.py:1715] 2021-06-10 05:57:59,981 >> loading file None
[INFO|tokenization_utils_base.py:1715] 2021-06-10 05:57:59,981 >> loading file None
[INFO|tokenization_utils_base.py:1715] 2021-06-10 05:57:59,981 >> loading file None
[INFO|tokenization_utils_base.py:1715] 2021-06-10 05:57:59,981 >> loading file None
06/10/2021 05:58:00 - INFO - __main__ - Training new model from scratch
[INFO|configuration_utils.py:517] 2021-06-10 05:58:03,805 >> loading configuration file https://huggingface.co/beomi/kcbert-base/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/10de039f2f91b0c6fbd30fad5bf8a7468a20701212ed12f9f5e610edb99c55d1.d8a72131e15fd1d856f1b39abf4eff31d458aeeca0a4192df898ca699ec7d779
[INFO|configuration_utils.py:553] 2021-06-10 05:58:03,807 >> Model config BertConfig {
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"directionality": "bidi",
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 300,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"pooler_fc_size": 768,
"pooler_num_attention_heads": 12,
"pooler_num_fc_layers": 3,
"pooler_size_per_head": 128,
"pooler_type": "first_token_transform",
"position_embedding_type": "absolute",
"transformers_version": "4.7.0.dev0",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 30000
}
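A minimal sketch of fetching the same base configuration shown above, assuming only the public beomi/kcbert-base Hub repo and the stock AutoConfig API:

# Sketch: load the kcbert-base config that the run above starts from.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("beomi/kcbert-base")
print(config.vocab_size)                # 30000
print(config.max_position_embeddings)   # 300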
[INFO|modeling_utils.py:1155] 2021-06-10 05:58:04,609 >> loading weights file https://huggingface.co/beomi/kcbert-base/resolve/main/pytorch_model.bin from cache at /home/jovyan/.cache/huggingface/transformers/1c204bf1f008ee734eeb5ce678b148d14fa298802ce16d879c92a22a52527a0e.6cdf570ee57a7f6a5c727c436a4c26d8e9601ddaa1377ebcb16b7285d76125cd
[WARNING|modeling_utils.py:1330] 2021-06-10 05:58:07,188 >> Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[INFO|modeling_utils.py:1347] 2021-06-10 05:58:07,188 >> All the weights of BertModel were initialized from the model checkpoint at beomi/kcbert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
100%|████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00, 1.40ba/s]
100%|████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00, 3.25ba/s]
[INFO|trainer.py:514] 2021-06-10 05:58:17,960 >> The following columns in the training set don't have a corresponding argument in `exBertForMaskedLM.forward` and have been ignored: special_tokens_mask.
[INFO|trainer.py:1147] 2021-06-10 05:58:17,976 >> ***** Running training *****
[INFO|trainer.py:1148] 2021-06-10 05:58:17,976 >> Num examples = 452
[INFO|trainer.py:1149] 2021-06-10 05:58:17,976 >> Num Epochs = 1
[INFO|trainer.py:1150] 2021-06-10 05:58:17,976 >> Instantaneous batch size per device = 8
[INFO|trainer.py:1151] 2021-06-10 05:58:17,976 >> Total train batch size (w. parallel, distributed & accumulation) = 8
[INFO|trainer.py:1152] 2021-06-10 05:58:17,976 >> Gradient Accumulation steps = 1
[INFO|trainer.py:1153] 2021-06-10 05:58:17,976 >> Total optimization steps = 57
{'loss': 10.7157, 'learning_rate': 4.912280701754386e-05, 'epoch': 0.02}
100%|██████████████████████████████████████████████████████████████████████████████████████| 57/57 [00:18<00:00, 3.41it/s]
[INFO|trainer.py:1343] 2021-06-10 05:58:36,148 >> Training completed. Do not forget to share your model on huggingface.co/models =)
{'train_runtime': 18.1725, 'train_samples_per_second': 24.873, 'train_steps_per_second': 3.137, 'train_loss': 5.573559560273823, 'epoch': 1.0}
100%|██████████████████████████████████████████████████████████████████████████████████████| 57/57 [00:18<00:00, 3.14it/s]
[INFO|trainer.py:1894] 2021-06-10 05:58:36,152 >> Saving model checkpoint to ./exbert-mlm
[INFO|configuration_utils.py:351] 2021-06-10 05:58:36,155 >> Configuration saved in ./exbert-mlm/config.json
[INFO|modeling_utils.py:889] 2021-06-10 05:58:37,568 >> Model weights saved in ./exbert-mlm/pytorch_model.bin
[INFO|tokenization_utils_base.py:1924] 2021-06-10 05:58:37,571 >> tokenizer config file saved in ./exbert-mlm/tokenizer_config.json
[INFO|tokenization_utils_base.py:1930] 2021-06-10 05:58:37,573 >> Special tokens file saved in ./exbert-mlm/special_tokens_map.json
[INFO|trainer_pt_utils.py:907] 2021-06-10 05:58:37,617 >> ***** train metrics *****
[INFO|trainer_pt_utils.py:912] 2021-06-10 05:58:37,617 >> epoch = 1.0
[INFO|trainer_pt_utils.py:912] 2021-06-10 05:58:37,617 >> train_loss = 5.5736
[INFO|trainer_pt_utils.py:912] 2021-06-10 05:58:37,617 >> train_runtime = 0:00:18.17
[INFO|trainer_pt_utils.py:912] 2021-06-10 05:58:37,618 >> train_samples = 452
[INFO|trainer_pt_utils.py:912] 2021-06-10 05:58:37,618 >> train_samples_per_second = 24.873
[INFO|trainer_pt_utils.py:912] 2021-06-10 05:58:37,618 >> train_steps_per_second = 3.137
(exbert-transformers) root@jupyter-beomi:~/exbert-transformers/examples/pytorch/language-modeling#
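A minimal sketch of reusing the checkpoint written to ./exbert-mlm above, assuming the stock Auto* classes can map the saved config to a model class; the exBERT extension layers may instead require the fork's own exBertForMaskedLM class, so treat this as an assumption rather than the fork's documented loading path.

# Sketch (assumption): load the tokenizer and MLM weights saved to ./exbert-mlm.
from transformers import AutoModelForMaskedLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./exbert-mlm")
model = AutoModelForMaskedLM.from_pretrained("./exbert-mlm")
model.eval()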