Created
August 19, 2019 16:51
-
-
Save taylanbil/1710f985564f0831d31701521f12cf08 to your computer and use it in GitHub Desktop.
1 GPU chip on small dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Fri Aug 16 19:07:26 UTC 2019 | |
#!/bin/bash
#
# Launches a fairseq train.py run with --distributed-world-size=1 on the
# dataset directory selected below.
#
# NOTE: the earlier revision of this script assigned other_flags but never
# expanded it on the train.py command line, so a run made with that revision
# did not receive --clip-norm / --num-workers from this script (a Namespace
# dump from such a run reports clip_norm=25 and num_workers=0). The flags are
# now passed explicitly; results from older runs are therefore not directly
# comparable.
set -euo pipefail

# Not referenced anywhere in this script; kept in case an outer wrapper
# greps for it -- TODO confirm and remove if unused.
# shellcheck disable=SC2034
taskname=fairseq_e2e_gpu

# Checkout containing train.py.
vol_fairseq=/home/taylanbil/fairseq/

# Dataset directory: exactly one assignment should be active.
# vol_data=/home/taylanbil/data/wmt18_en_de_bpej32k
vol_data=/home/taylanbil/data/dummy

# Interpreter command, held as an array so a multi-word choice such as
# "ipython -i" is passed as separate arguments without relying on
# word-splitting of an unquoted string.
# python_cli=(ipython -i)
python_cli=(python)

# Extra flags appended after the main flag list.
other_flags=(
  --clip-norm 0.0
  --num-workers=2
)

# "${vol_fairseq%/}" strips one trailing slash so the path has no "//".
"${python_cli[@]}" "${vol_fairseq%/}/train.py" \
  "$vol_data" \
  --arch=transformer_vaswani_wmt_en_de_big \
  --max-source-positions=64 \
  --max-target-positions=64 \
  --required-batch-size-multiple=8 \
  --max-tokens=4096 \
  --no-save \
  --attention-dropout=0.1 \
  --no-progress-bar \
  --criterion=label_smoothed_cross_entropy \
  --log-interval=10 \
  --source-lang=en \
  --lr-scheduler=inverse_sqrt \
  --min-lr 1e-09 \
  --skip-invalid-size-inputs-valid-test \
  --target-lang=de \
  --label-smoothing=0.1 \
  --curriculum=4 \
  --max-epoch=50 \
  --update-freq=1 \
  --optimizer adam \
  --warmup-init-lr 1e-07 \
  --lr 0.0005 \
  --warmup-updates 4000 \
  --adam-betas='(0.9,0.98)' \
  --share-all-embeddings \
  --dropout 0.3 \
  --weight-decay 0.0 \
  --fp16 \
  --distributed-world-size=1 \
  --valid-subset=valid \
  "${other_flags[@]}"
-------------- | |
nohup: ignoring input | |
Namespace(activation_dropout=0.0, activation_fn='relu', adam_betas='(0.9,0.98)', adam_eps=1e-08, adaptive_input=False, adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, arch='transformer_vaswani_wmt_en_de_big', attention_dropout=0.1, best_checkpoint_metric='loss', bucket_cap_mb=25, clip_norm=25, cpu=False, criterion='label_smoothed_cross_entropy', curriculum=4, data='/home/taylanbil/data/dummy', dataset_impl='cached', ddp_backend='c10d', decoder_attention_heads=16, decoder_embed_dim=1024, decoder_embed_path=None, decoder_ffn_embed_dim=4096, decoder_input_dim=1024, decoder_layers=6, decoder_learned_pos=False, decoder_normalize_before=False, decoder_output_dim=1024, device_id=0, disable_validation=False, distributed_backend='nccl', distributed_init_method=None, distributed_no_spawn=False, distributed_port=-1, distributed_rank=0, distributed_world_size=1, dropout=0.3, encoder_attention_heads=16, encoder_embed_dim=1024, encoder_embed_path=None, encoder_ffn_embed_dim=4096, encoder_layers=6, encoder_learned_pos=False, encoder_normalize_before=False, find_unused_parameters=False, fix_batches_to_gpus=False, fp16=True, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, keep_interval_updates=-1, keep_last_epochs=-1, label_smoothing=0.1, lazy_load=False, left_pad_source='True', left_pad_target='False', log_format=None, log_interval=10, lr=[0.0005], lr_scheduler='inverse_sqrt', max_epoch=50, max_sentences=None, max_sentences_valid=None, max_source_positions=64, max_target_positions=64, max_tokens=4096, max_tokens_valid=4096, max_update=0, maximize_best_checkpoint_metric=False, memory_efficient_fp16=False, min_loss_scale=0.0001, min_lr=1e-09, no_epoch_checkpoints=False, no_last_checkpoints=False, no_progress_bar=True, no_save=True, no_save_optimizer_state=False, no_token_positional_embeddings=False, num_workers=0, optimizer='adam', optimizer_overrides='{}', raw_text=False, required_batch_size_multiple=8, reset_dataloader=False, 
reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, restore_file='checkpoint_last.pt', save_dir='checkpoints', save_interval=1, save_interval_updates=0, seed=1, sentence_avg=False, share_all_embeddings=True, share_decoder_input_output_embed=False, skip_invalid_size_inputs_valid_test=True, source_lang='en', target_lang='de', task='translation', tbmf_wrapper=False, tensorboard_logdir='', threshold_loss_scale=None, train_subset='train', update_freq=[1], upsample_primary=1, use_bmuf=False, user_dir=None, valid_subset='valid', validate_interval=1, warmup_init_lr=1e-07, warmup_updates=4000, weight_decay=0.0) | |
| [en] dictionary: 35662 types | |
| [de] dictionary: 35662 types | |
| /home/taylanbil/data/dummy valid en-de 3004 examples | |
TransformerModel( | |
(encoder): TransformerEncoder( | |
(embed_tokens): Embedding(35662, 1024, padding_idx=1) | |
(embed_positions): SinusoidalPositionalEmbedding() | |
(layers): ModuleList( | |
(0): TransformerEncoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
(1): TransformerEncoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
(2): TransformerEncoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
(3): TransformerEncoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
(4): TransformerEncoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
(5): TransformerEncoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
) | |
(decoder): TransformerDecoder( | |
(embed_tokens): Embedding(35662, 1024, padding_idx=1) | |
(embed_positions): SinusoidalPositionalEmbedding() | |
(layers): ModuleList( | |
(0): TransformerDecoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(encoder_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(encoder_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
(1): TransformerDecoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(encoder_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(encoder_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
(2): TransformerDecoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(encoder_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(encoder_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
(3): TransformerDecoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(encoder_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(encoder_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
(4): TransformerDecoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(encoder_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(encoder_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
(5): TransformerDecoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(encoder_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(encoder_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
) | |
) | |
| model transformer_vaswani_wmt_en_de_big, criterion LabelSmoothedCrossEntropyCriterion | |
| num. model params: 212875264 (num. trained: 212875264) | |
| training on 1 GPUs | |
| max tokens per GPU = 4096 and max sentences per GPU = None | |
| no existing checkpoint found checkpoints/checkpoint_last.pt | |
| loading train data for epoch 0 | |
| /home/taylanbil/data/dummy train en-de 3004 examples | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 001: 10 / 28 loss=15.859, nll_loss=15.860, ppl=59486.29, wps=13644, ups=4, wpb=2603.636, bsz=144.727, num_updates=11, lr=1.47473e-06, gnorm=6.899, clip=0.000, oom=0.000, loss_scale=128.000, wall=2, train_wall=2 | |
| epoch 001: 20 / 28 loss=15.582, nll_loss=15.553, ppl=48059.34, wps=14312, ups=5, wpb=2758.095, bsz=118.095, num_updates=21, lr=2.72448e-06, gnorm=6.217, clip=0.000, oom=0.000, loss_scale=128.000, wall=4, train_wall=4 | |
| epoch 001 | loss 15.364 | nll_loss 15.310 | ppl 40624.96 | wps 14778 | ups 5 | wpb 2838.786 | bsz 104.714 | num_updates 28 | lr 3.5993e-06 | gnorm 5.727 | clip 0.000 | oom 0.000 | loss_scale 128.000 | wall 6 | train_wall 5 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 001 | valid on 'valid' subset | loss 14.218 | nll_loss 14.016 | ppl 16566.42 | num_updates 28 | |
| epoch 002: 10 / 28 loss=14.151, nll_loss=13.952, ppl=15852.06, wps=13589, ups=3, wpb=2603.636, bsz=144.727, num_updates=39, lr=4.97403e-06, gnorm=4.301, clip=0.000, oom=0.000, loss_scale=128.000, wall=10, train_wall=7 | |
| epoch 002: 20 / 28 loss=14.065, nll_loss=13.854, ppl=14805.47, wps=14288, ups=3, wpb=2758.095, bsz=118.095, num_updates=49, lr=6.22378e-06, gnorm=3.563, clip=0.000, oom=0.000, loss_scale=128.000, wall=12, train_wall=9 | |
| epoch 002 | loss 14.018 | nll_loss 13.801 | ppl 14274.67 | wps 14764 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 56 | lr 7.0986e-06 | gnorm 3.197 | clip 0.000 | oom 0.000 | loss_scale 128.000 | wall 13 | train_wall 10 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 002 | valid on 'valid' subset | loss 13.544 | nll_loss 13.251 | ppl 9747.46 | num_updates 56 | |
| epoch 003: 10 / 28 loss=13.516, nll_loss=13.237, ppl=9653.57, wps=13626, ups=3, wpb=2603.636, bsz=144.727, num_updates=67, lr=8.47333e-06, gnorm=3.629, clip=0.000, oom=0.000, loss_scale=128.000, wall=17, train_wall=12 | |
| epoch 003: 20 / 28 loss=13.507, nll_loss=13.229, ppl=9602.30, wps=14289, ups=4, wpb=2758.095, bsz=118.095, num_updates=77, lr=9.72308e-06, gnorm=2.824, clip=0.000, oom=0.000, loss_scale=128.000, wall=19, train_wall=14 | |
| epoch 003 | loss 13.497 | nll_loss 13.219 | ppl 9534.48 | wps 14756 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 84 | lr 1.05979e-05 | gnorm 2.525 | clip 0.000 | oom 0.000 | loss_scale 128.000 | wall 20 | train_wall 15 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 003 | valid on 'valid' subset | loss 13.131 | nll_loss 12.790 | ppl 7079.95 | num_updates 84 | |
| epoch 004: 10 / 28 loss=13.107, nll_loss=12.780, ppl=7033.67, wps=13590, ups=3, wpb=2603.636, bsz=144.727, num_updates=95, lr=1.19726e-05, gnorm=2.940, clip=0.000, oom=0.000, loss_scale=128.000, wall=24, train_wall=17 | |
| epoch 004: 20 / 28 loss=13.109, nll_loss=12.786, ppl=7060.79, wps=14269, ups=4, wpb=2758.095, bsz=118.095, num_updates=105, lr=1.32224e-05, gnorm=2.339, clip=0.000, oom=0.000, loss_scale=128.000, wall=26, train_wall=19 | |
| epoch 004 | loss 13.106 | nll_loss 12.782 | ppl 7044.82 | wps 14733 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 112 | lr 1.40972e-05 | gnorm 2.115 | clip 0.000 | oom 0.000 | loss_scale 128.000 | wall 28 | train_wall 20 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 004 | valid on 'valid' subset | loss 12.745 | nll_loss 12.362 | ppl 5262.65 | num_updates 112 | |
| epoch 005: 10 / 28 loss=12.810, nll_loss=12.452, ppl=5601.94, wps=14269, ups=3, wpb=2768.636, bsz=115.636, num_updates=123, lr=1.54719e-05, gnorm=2.277, clip=0.000, oom=0.000, loss_scale=128.000, wall=32, train_wall=22 | |
| epoch 005: 20 / 28 loss=12.770, nll_loss=12.407, ppl=5430.36, wps=14613, ups=4, wpb=2841.667, bsz=107.048, num_updates=133, lr=1.67217e-05, gnorm=2.022, clip=0.000, oom=0.000, loss_scale=128.000, wall=34, train_wall=23 | |
| epoch 005 | loss 12.718 | nll_loss 12.348 | ppl 5213.19 | wps 14652 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 140 | lr 1.75965e-05 | gnorm 1.949 | clip 0.000 | oom 0.000 | loss_scale 128.000 | wall 35 | train_wall 25 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 005 | valid on 'valid' subset | loss 12.268 | nll_loss 11.824 | ppl 3624.94 | num_updates 140 | |
| epoch 006: 10 / 28 loss=12.384, nll_loss=11.970, ppl=4011.47, wps=14643, ups=3, wpb=2850.273, bsz=113.455, num_updates=151, lr=1.89712e-05, gnorm=2.203, clip=0.000, oom=0.000, loss_scale=128.000, wall=39, train_wall=27 | |
| epoch 006: 20 / 28 loss=12.337, nll_loss=11.914, ppl=3858.28, wps=14610, ups=4, wpb=2837.238, bsz=105.333, num_updates=161, lr=2.0221e-05, gnorm=2.370, clip=0.000, oom=0.000, loss_scale=128.000, wall=41, train_wall=28 | |
| epoch 006 | loss 12.282 | nll_loss 11.851 | ppl 3693.25 | wps 14631 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 168 | lr 2.10958e-05 | gnorm 2.219 | clip 0.000 | oom 0.000 | loss_scale 128.000 | wall 42 | train_wall 29 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 006 | valid on 'valid' subset | loss 11.894 | nll_loss 11.367 | ppl 2640.60 | num_updates 168 | |
| epoch 007: 10 / 28 loss=11.945, nll_loss=11.459, ppl=2815.32, wps=14223, ups=3, wpb=2775.182, bsz=110.909, num_updates=179, lr=2.24705e-05, gnorm=1.756, clip=0.000, oom=0.000, loss_scale=128.000, wall=46, train_wall=31 | |
| epoch 007: 20 / 28 loss=11.914, nll_loss=11.420, ppl=2740.27, wps=14591, ups=4, wpb=2849.000, bsz=103.810, num_updates=189, lr=2.37203e-05, gnorm=1.562, clip=0.000, oom=0.000, loss_scale=128.000, wall=48, train_wall=33 | |
| epoch 007 | loss 11.870 | nll_loss 11.369 | ppl 2644.07 | wps 14524 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 196 | lr 2.45951e-05 | gnorm 1.505 | clip 0.000 | oom 0.000 | loss_scale 128.000 | wall 50 | train_wall 34 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 007 | valid on 'valid' subset | loss 11.550 | nll_loss 10.965 | ppl 1999.44 | num_updates 196 | |
| epoch 008: 10 / 28 loss=11.571, nll_loss=11.017, ppl=2071.73, wps=14354, ups=3, wpb=2779.000, bsz=119.273, num_updates=207, lr=2.59698e-05, gnorm=1.344, clip=0.000, oom=0.000, loss_scale=128.000, wall=54, train_wall=36 | |
| epoch 008: 20 / 28 loss=11.573, nll_loss=11.015, ppl=2069.41, wps=14347, ups=4, wpb=2778.619, bsz=110.667, num_updates=217, lr=2.72196e-05, gnorm=1.568, clip=0.000, oom=0.000, loss_scale=128.000, wall=56, train_wall=38 | |
| epoch 008 | loss 11.600 | nll_loss 11.042 | ppl 2108.75 | wps 14624 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 224 | lr 2.80944e-05 | gnorm 1.748 | clip 0.000 | oom 0.000 | loss_scale 128.000 | wall 57 | train_wall 39 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 008 | valid on 'valid' subset | loss 11.384 | nll_loss 10.755 | ppl 1727.65 | num_updates 224 | |
| WARNING: overflow detected, setting loss scale to: 64.0 | |
| epoch 009: 10 / 28 loss=11.483, nll_loss=10.893, ppl=1901.18, wps=13113, ups=2, wpb=2893.900, bsz=113.600, num_updates=234, lr=2.93442e-05, gnorm=2.239, clip=0.000, oom=0.000, loss_scale=64.000, wall=61, train_wall=41 | |
| epoch 009: 20 / 28 loss=11.489, nll_loss=10.894, ppl=1903.25, wps=13946, ups=3, wpb=2889.900, bsz=103.000, num_updates=244, lr=3.05939e-05, gnorm=1.934, clip=0.000, oom=0.000, loss_scale=64.000, wall=63, train_wall=43 | |
| epoch 009 | loss 11.459 | nll_loss 10.858 | ppl 1856.00 | wps 13926 | ups 4 | wpb 2830.556 | bsz 104.741 | num_updates 251 | lr 3.14687e-05 | gnorm 1.908 | clip 0.000 | oom 0.000 | loss_scale 64.000 | wall 64 | train_wall 44 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 009 | valid on 'valid' subset | loss 11.340 | nll_loss 10.659 | ppl 1616.93 | num_updates 251 | |
| epoch 010: 10 / 28 loss=11.333, nll_loss=10.700, ppl=1663.86, wps=14367, ups=3, wpb=2796.273, bsz=102.909, num_updates=262, lr=3.28435e-05, gnorm=1.715, clip=0.000, oom=0.000, loss_scale=64.000, wall=68, train_wall=46 | |
| epoch 010: 20 / 28 loss=11.328, nll_loss=10.692, ppl=1654.74, wps=14430, ups=4, wpb=2811.381, bsz=106.095, num_updates=272, lr=3.40932e-05, gnorm=1.911, clip=0.000, oom=0.000, loss_scale=64.000, wall=70, train_wall=48 | |
| epoch 010 | loss 11.326 | nll_loss 10.687 | ppl 1648.72 | wps 14538 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 279 | lr 3.4968e-05 | gnorm 1.863 | clip 0.000 | oom 0.000 | loss_scale 64.000 | wall 72 | train_wall 49 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 010 | valid on 'valid' subset | loss 11.153 | nll_loss 10.438 | ppl 1387.61 | num_updates 279 | |
| epoch 011: 10 / 28 loss=11.214, nll_loss=10.551, ppl=1500.15, wps=14413, ups=3, wpb=2842.636, bsz=106.182, num_updates=290, lr=3.63428e-05, gnorm=1.602, clip=0.000, oom=0.000, loss_scale=64.000, wall=76, train_wall=51 | |
| epoch 011: 20 / 28 loss=11.248, nll_loss=10.587, ppl=1538.44, wps=14638, ups=4, wpb=2869.048, bsz=101.524, num_updates=300, lr=3.75925e-05, gnorm=1.875, clip=0.000, oom=0.000, loss_scale=64.000, wall=78, train_wall=53 | |
| epoch 011 | loss 11.223 | nll_loss 10.556 | ppl 1505.36 | wps 14473 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 307 | lr 3.84673e-05 | gnorm 1.841 | clip 0.000 | oom 0.000 | loss_scale 64.000 | wall 79 | train_wall 54 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 011 | valid on 'valid' subset | loss 11.061 | nll_loss 10.316 | ppl 1275.01 | num_updates 307 | |
| epoch 012: 10 / 28 loss=11.138, nll_loss=10.455, ppl=1403.79, wps=14411, ups=3, wpb=2824.273, bsz=105.818, num_updates=318, lr=3.98421e-05, gnorm=1.720, clip=0.000, oom=0.000, loss_scale=64.000, wall=83, train_wall=56 | |
| epoch 012: 20 / 28 loss=11.128, nll_loss=10.438, ppl=1387.65, wps=14447, ups=4, wpb=2824.524, bsz=106.095, num_updates=328, lr=4.10918e-05, gnorm=1.626, clip=0.000, oom=0.000, loss_scale=64.000, wall=85, train_wall=57 | |
| epoch 012 | loss 11.121 | nll_loss 10.429 | ppl 1378.87 | wps 14473 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 335 | lr 4.19666e-05 | gnorm 1.596 | clip 0.000 | oom 0.000 | loss_scale 64.000 | wall 86 | train_wall 59 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 012 | valid on 'valid' subset | loss 10.925 | nll_loss 10.158 | ppl 1142.24 | num_updates 335 | |
| epoch 013: 10 / 28 loss=11.040, nll_loss=10.336, ppl=1292.27, wps=14544, ups=3, wpb=2842.273, bsz=114.182, num_updates=346, lr=4.33414e-05, gnorm=2.332, clip=0.000, oom=0.000, loss_scale=64.000, wall=90, train_wall=61 | |
| epoch 013: 20 / 28 loss=11.082, nll_loss=10.377, ppl=1330.04, wps=14590, ups=3, wpb=2845.238, bsz=106.857, num_updates=356, lr=4.45911e-05, gnorm=2.456, clip=0.000, oom=0.000, loss_scale=64.000, wall=92, train_wall=62 | |
| epoch 013 | loss 11.082 | nll_loss 10.377 | ppl 1329.90 | wps 14575 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 363 | lr 4.54659e-05 | gnorm 2.312 | clip 0.000 | oom 0.000 | loss_scale 64.000 | wall 94 | train_wall 64 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 013 | valid on 'valid' subset | loss 10.904 | nll_loss 10.135 | ppl 1124.70 | num_updates 363 | |
| epoch 014: 10 / 28 loss=10.994, nll_loss=10.279, ppl=1242.30, wps=14534, ups=3, wpb=2850.909, bsz=105.091, num_updates=374, lr=4.68407e-05, gnorm=2.242, clip=0.000, oom=0.000, loss_scale=64.000, wall=98, train_wall=66 | |
| epoch 014: 20 / 28 loss=11.035, nll_loss=10.321, ppl=1279.44, wps=14621, ups=4, wpb=2853.571, bsz=99.619, num_updates=384, lr=4.80904e-05, gnorm=2.211, clip=0.000, oom=0.000, loss_scale=64.000, wall=100, train_wall=67 | |
| epoch 014 | loss 11.021 | nll_loss 10.306 | ppl 1266.27 | wps 14521 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 391 | lr 4.89652e-05 | gnorm 2.305 | clip 0.000 | oom 0.000 | loss_scale 64.000 | wall 101 | train_wall 68 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 014 | valid on 'valid' subset | loss 10.846 | nll_loss 10.052 | ppl 1061.62 | num_updates 391 | |
| epoch 015: 10 / 28 loss=10.946, nll_loss=10.215, ppl=1188.31, wps=14725, ups=3, wpb=2838.091, bsz=103.273, num_updates=402, lr=5.034e-05, gnorm=2.227, clip=0.000, oom=0.000, loss_scale=64.000, wall=105, train_wall=70 | |
| epoch 015: 20 / 28 loss=10.976, nll_loss=10.250, ppl=1218.04, wps=14769, ups=4, wpb=2870.476, bsz=101.905, num_updates=412, lr=5.15897e-05, gnorm=2.149, clip=0.000, oom=0.000, loss_scale=64.000, wall=107, train_wall=72 | |
| epoch 015 | loss 10.963 | nll_loss 10.236 | ppl 1206.07 | wps 14607 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 419 | lr 5.24645e-05 | gnorm 2.176 | clip 0.000 | oom 0.000 | loss_scale 64.000 | wall 108 | train_wall 73 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 015 | valid on 'valid' subset | loss 10.732 | nll_loss 9.932 | ppl 977.00 | num_updates 419 | |
| epoch 016: 10 / 28 loss=10.864, nll_loss=10.124, ppl=1116.28, wps=15085, ups=3, wpb=2876.909, bsz=109.818, num_updates=430, lr=5.38393e-05, gnorm=2.000, clip=0.000, oom=0.000, loss_scale=64.000, wall=112, train_wall=75 | |
| epoch 016: 20 / 28 loss=10.846, nll_loss=10.103, ppl=1099.59, wps=14776, ups=4, wpb=2855.762, bsz=112.000, num_updates=440, lr=5.5089e-05, gnorm=2.136, clip=0.000, oom=0.000, loss_scale=64.000, wall=114, train_wall=77 | |
| epoch 016 | loss 10.869 | nll_loss 10.129 | ppl 1119.67 | wps 14750 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 447 | lr 5.59638e-05 | gnorm 2.057 | clip 0.000 | oom 0.000 | loss_scale 64.000 | wall 116 | train_wall 78 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 016 | valid on 'valid' subset | loss 10.633 | nll_loss 9.817 | ppl 902.24 | num_updates 447 | |
| WARNING: overflow detected, setting loss scale to: 32.0 | |
| epoch 017: 10 / 28 loss=10.632, nll_loss=9.866, ppl=933.12, wps=12318, ups=2, wpb=2720.000, bsz=129.600, num_updates=457, lr=5.72136e-05, gnorm=2.825, clip=0.000, oom=0.000, loss_scale=32.000, wall=120, train_wall=80 | |
| epoch 017: 20 / 28 loss=10.748, nll_loss=9.994, ppl=1019.91, wps=13573, ups=3, wpb=2792.650, bsz=111.800, num_updates=467, lr=5.84633e-05, gnorm=2.321, clip=0.000, oom=0.000, loss_scale=32.000, wall=122, train_wall=82 | |
| epoch 017 | loss 10.787 | nll_loss 10.037 | ppl 1050.88 | wps 13848 | ups 4 | wpb 2814.481 | bsz 106.222 | num_updates 474 | lr 5.93382e-05 | gnorm 2.239 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 123 | train_wall 83 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 017 | valid on 'valid' subset | loss 10.515 | nll_loss 9.693 | ppl 827.67 | num_updates 474 | |
| epoch 018: 10 / 28 loss=10.608, nll_loss=9.840, ppl=916.80, wps=14207, ups=3, wpb=2760.818, bsz=106.545, num_updates=485, lr=6.07129e-05, gnorm=1.954, clip=0.000, oom=0.000, loss_scale=32.000, wall=127, train_wall=85 | |
| epoch 018: 20 / 28 loss=10.672, nll_loss=9.909, ppl=961.26, wps=14477, ups=4, wpb=2822.333, bsz=105.333, num_updates=495, lr=6.19626e-05, gnorm=2.118, clip=0.000, oom=0.000, loss_scale=32.000, wall=129, train_wall=87 | |
| epoch 018 | loss 10.686 | nll_loss 9.923 | ppl 970.57 | wps 14534 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 502 | lr 6.28375e-05 | gnorm 2.183 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 130 | train_wall 88 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 018 | valid on 'valid' subset | loss 10.332 | nll_loss 9.492 | ppl 719.95 | num_updates 502 | |
| epoch 019: 10 / 28 loss=10.622, nll_loss=9.853, ppl=924.84, wps=14897, ups=3, wpb=2921.182, bsz=96.364, num_updates=513, lr=6.42122e-05, gnorm=2.083, clip=0.000, oom=0.000, loss_scale=32.000, wall=134, train_wall=90 | |
| epoch 019: 20 / 28 loss=10.533, nll_loss=9.754, ppl=863.55, wps=14335, ups=4, wpb=2809.810, bsz=104.952, num_updates=523, lr=6.54619e-05, gnorm=2.289, clip=0.000, oom=0.000, loss_scale=32.000, wall=136, train_wall=92 | |
| epoch 019 | loss 10.560 | nll_loss 9.784 | ppl 881.50 | wps 14477 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 530 | lr 6.63368e-05 | gnorm 2.310 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 138 | train_wall 93 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 019 | valid on 'valid' subset | loss 10.188 | nll_loss 9.330 | ppl 643.59 | num_updates 530 | |
| epoch 020: 10 / 28 loss=10.463, nll_loss=9.675, ppl=817.45, wps=14599, ups=3, wpb=2841.273, bsz=111.273, num_updates=541, lr=6.77115e-05, gnorm=2.600, clip=0.000, oom=0.000, loss_scale=32.000, wall=142, train_wall=95 | |
| epoch 020: 20 / 28 loss=10.463, nll_loss=9.671, ppl=814.98, wps=14602, ups=4, wpb=2853.048, bsz=106.095, num_updates=551, lr=6.89612e-05, gnorm=2.390, clip=0.000, oom=0.000, loss_scale=32.000, wall=144, train_wall=97 | |
| epoch 020 | loss 10.459 | nll_loss 9.667 | ppl 812.84 | wps 14580 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 558 | lr 6.98361e-05 | gnorm 2.397 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 145 | train_wall 98 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 020 | valid on 'valid' subset | loss 10.011 | nll_loss 9.131 | ppl 560.84 | num_updates 558 | |
| epoch 021: 10 / 28 loss=10.412, nll_loss=9.613, ppl=783.26, wps=14525, ups=3, wpb=2832.364, bsz=99.273, num_updates=569, lr=7.12108e-05, gnorm=2.986, clip=0.000, oom=0.000, loss_scale=32.000, wall=149, train_wall=100 | |
| epoch 021: 20 / 28 loss=10.395, nll_loss=9.595, ppl=773.29, wps=14591, ups=4, wpb=2858.095, bsz=101.143, num_updates=579, lr=7.24605e-05, gnorm=2.954, clip=0.000, oom=0.000, loss_scale=32.000, wall=151, train_wall=101 | |
| epoch 021 | loss 10.365 | nll_loss 9.561 | ppl 755.33 | wps 14474 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 586 | lr 7.33354e-05 | gnorm 2.884 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 152 | train_wall 103 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 021 | valid on 'valid' subset | loss 9.891 | nll_loss 9.007 | ppl 514.63 | num_updates 586 | |
| epoch 022: 10 / 28 loss=10.166, nll_loss=9.345, ppl=650.29, wps=14089, ups=3, wpb=2742.091, bsz=102.182, num_updates=597, lr=7.47101e-05, gnorm=2.507, clip=0.000, oom=0.000, loss_scale=32.000, wall=156, train_wall=105 | |
| epoch 022: 20 / 28 loss=10.160, nll_loss=9.335, ppl=645.67, wps=14392, ups=4, wpb=2830.000, bsz=108.381, num_updates=607, lr=7.59598e-05, gnorm=2.505, clip=0.000, oom=0.000, loss_scale=32.000, wall=158, train_wall=106 | |
| epoch 022 | loss 10.194 | nll_loss 9.372 | ppl 662.68 | wps 14484 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 614 | lr 7.68347e-05 | gnorm 2.540 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 160 | train_wall 108 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 022 | valid on 'valid' subset | loss 9.680 | nll_loss 8.762 | ppl 434.03 | num_updates 614 | |
| epoch 023: 10 / 28 loss=10.091, nll_loss=9.258, ppl=612.15, wps=14969, ups=3, wpb=2889.545, bsz=101.091, num_updates=625, lr=7.82094e-05, gnorm=2.454, clip=0.000, oom=0.000, loss_scale=32.000, wall=164, train_wall=109 | |
| epoch 023: 20 / 28 loss=10.097, nll_loss=9.265, ppl=615.11, wps=15036, ups=4, wpb=2905.857, bsz=94.286, num_updates=635, lr=7.94591e-05, gnorm=2.379, clip=0.000, oom=0.000, loss_scale=32.000, wall=166, train_wall=111 | |
| epoch 023 | loss 10.044 | nll_loss 9.205 | ppl 590.33 | wps 14618 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 642 | lr 8.0334e-05 | gnorm 2.682 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 167 | train_wall 112 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 023 | valid on 'valid' subset | loss 9.556 | nll_loss 8.622 | ppl 393.93 | num_updates 642 | |
| epoch 024: 10 / 28 loss=9.834, nll_loss=8.971, ppl=501.66, wps=14165, ups=3, wpb=2752.091, bsz=112.000, num_updates=653, lr=8.17087e-05, gnorm=2.865, clip=0.000, oom=0.000, loss_scale=32.000, wall=171, train_wall=114 | |
| epoch 024: 20 / 28 loss=9.850, nll_loss=8.986, ppl=507.17, wps=14421, ups=4, wpb=2805.952, bsz=111.048, num_updates=663, lr=8.29584e-05, gnorm=2.781, clip=0.000, oom=0.000, loss_scale=32.000, wall=173, train_wall=116 | |
| epoch 024 | loss 9.903 | nll_loss 9.045 | ppl 528.28 | wps 14609 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 670 | lr 8.38333e-05 | gnorm 2.761 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 174 | train_wall 117 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 024 | valid on 'valid' subset | loss 9.245 | nll_loss 8.265 | ppl 307.63 | num_updates 670 | |
| epoch 025: 10 / 28 loss=9.764, nll_loss=8.893, ppl=475.42, wps=14791, ups=3, wpb=2895.364, bsz=90.182, num_updates=681, lr=8.5208e-05, gnorm=2.493, clip=0.000, oom=0.000, loss_scale=32.000, wall=178, train_wall=119 | |
| epoch 025: 20 / 28 loss=9.654, nll_loss=8.770, ppl=436.65, wps=14309, ups=4, wpb=2797.952, bsz=104.952, num_updates=691, lr=8.64577e-05, gnorm=2.646, clip=0.000, oom=0.000, loss_scale=32.000, wall=180, train_wall=121 | |
| epoch 025 | loss 9.710 | nll_loss 8.830 | ppl 455.00 | wps 14490 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 698 | lr 8.73326e-05 | gnorm 2.839 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 182 | train_wall 122 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 025 | valid on 'valid' subset | loss 9.085 | nll_loss 8.068 | ppl 268.29 | num_updates 698 | |
| epoch 026: 10 / 28 loss=9.655, nll_loss=8.765, ppl=434.92, wps=14779, ups=3, wpb=2856.909, bsz=97.091, num_updates=709, lr=8.87073e-05, gnorm=3.195, clip=0.000, oom=0.000, loss_scale=32.000, wall=186, train_wall=124 | |
| epoch 026: 20 / 28 loss=9.524, nll_loss=8.621, ppl=393.71, wps=14249, ups=4, wpb=2787.286, bsz=108.762, num_updates=719, lr=8.9957e-05, gnorm=3.126, clip=0.000, oom=0.000, loss_scale=32.000, wall=188, train_wall=126 | |
| epoch 026 | loss 9.571 | nll_loss 8.674 | ppl 408.46 | wps 14483 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 726 | lr 9.08319e-05 | gnorm 3.039 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 189 | train_wall 127 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 026 | valid on 'valid' subset | loss 8.794 | nll_loss 7.757 | ppl 216.36 | num_updates 726 | |
| epoch 027: 10 / 28 loss=9.373, nll_loss=8.455, ppl=351.01, wps=14716, ups=3, wpb=2873.182, bsz=98.545, num_updates=737, lr=9.22066e-05, gnorm=2.757, clip=0.000, oom=0.000, loss_scale=32.000, wall=193, train_wall=129 | |
| epoch 027: 20 / 28 loss=9.404, nll_loss=8.488, ppl=359.10, wps=14917, ups=4, wpb=2905.905, bsz=96.952, num_updates=747, lr=9.34563e-05, gnorm=2.818, clip=0.000, oom=0.000, loss_scale=32.000, wall=195, train_wall=131 | |
| epoch 027 | loss 9.356 | nll_loss 8.435 | ppl 346.01 | wps 14539 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 754 | lr 9.43312e-05 | gnorm 3.002 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 196 | train_wall 132 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 027 | valid on 'valid' subset | loss 8.665 | nll_loss 7.630 | ppl 198.08 | num_updates 754 | |
| epoch 028: 10 / 28 loss=9.342, nll_loss=8.417, ppl=341.74, wps=14947, ups=3, wpb=2862.727, bsz=89.091, num_updates=765, lr=9.57059e-05, gnorm=3.494, clip=0.000, oom=0.000, loss_scale=32.000, wall=200, train_wall=134 | |
| epoch 028: 20 / 28 loss=9.274, nll_loss=8.340, ppl=324.10, wps=14673, ups=4, wpb=2853.095, bsz=97.714, num_updates=775, lr=9.69556e-05, gnorm=3.270, clip=0.000, oom=0.000, loss_scale=32.000, wall=202, train_wall=136 | |
| epoch 028 | loss 9.222 | nll_loss 8.282 | ppl 311.28 | wps 14532 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 782 | lr 9.78305e-05 | gnorm 3.174 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 204 | train_wall 137 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 028 | valid on 'valid' subset | loss 8.387 | nll_loss 7.320 | ppl 159.81 | num_updates 782 | |
| epoch 029: 10 / 28 loss=9.010, nll_loss=8.043, ppl=263.82, wps=14094, ups=3, wpb=2733.545, bsz=101.818, num_updates=793, lr=9.92052e-05, gnorm=3.434, clip=0.000, oom=0.000, loss_scale=32.000, wall=208, train_wall=139 | |
| epoch 029: 20 / 28 loss=9.023, nll_loss=8.058, ppl=266.56, wps=14370, ups=4, wpb=2818.762, bsz=106.286, num_updates=803, lr=0.000100455, gnorm=3.319, clip=0.000, oom=0.000, loss_scale=32.000, wall=210, train_wall=140 | |
| epoch 029 | loss 9.030 | nll_loss 8.067 | ppl 268.11 | wps 14502 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 810 | lr 0.00010133 | gnorm 3.265 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 211 | train_wall 142 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 029 | valid on 'valid' subset | loss 8.120 | nll_loss 7.021 | ppl 129.85 | num_updates 810 | |
| epoch 030: 10 / 28 loss=8.821, nll_loss=7.836, ppl=228.44, wps=14355, ups=3, wpb=2803.909, bsz=114.182, num_updates=821, lr=0.000102704, gnorm=3.589, clip=0.000, oom=0.000, loss_scale=32.000, wall=215, train_wall=144 | |
| epoch 030: 20 / 28 loss=8.887, nll_loss=7.906, ppl=239.90, wps=14453, ups=4, wpb=2823.190, bsz=106.476, num_updates=831, lr=0.000103954, gnorm=3.584, clip=0.000, oom=0.000, loss_scale=32.000, wall=217, train_wall=145 | |
| epoch 030 | loss 8.914 | nll_loss 7.936 | ppl 244.87 | wps 14504 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 838 | lr 0.000104829 | gnorm 3.527 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 218 | train_wall 147 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 030 | valid on 'valid' subset | loss 7.961 | nll_loss 6.835 | ppl 114.14 | num_updates 838 | |
| epoch 031: 10 / 28 loss=8.737, nll_loss=7.737, ppl=213.27, wps=14267, ups=3, wpb=2809.545, bsz=99.273, num_updates=849, lr=0.000106204, gnorm=3.521, clip=0.000, oom=0.000, loss_scale=32.000, wall=222, train_wall=149 | |
| epoch 031: 20 / 28 loss=8.710, nll_loss=7.707, ppl=208.89, wps=14478, ups=4, wpb=2840.238, bsz=104.571, num_updates=859, lr=0.000107454, gnorm=3.398, clip=0.000, oom=0.000, loss_scale=32.000, wall=224, train_wall=150 | |
| epoch 031 | loss 8.717 | nll_loss 7.716 | ppl 210.30 | wps 14478 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 866 | lr 0.000108328 | gnorm 3.480 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 226 | train_wall 151 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 031 | valid on 'valid' subset | loss 7.775 | nll_loss 6.598 | ppl 96.89 | num_updates 866 | |
| epoch 032: 10 / 28 loss=8.667, nll_loss=7.660, ppl=202.27, wps=14560, ups=3, wpb=2790.182, bsz=98.545, num_updates=877, lr=0.000109703, gnorm=4.003, clip=0.000, oom=0.000, loss_scale=32.000, wall=230, train_wall=153 | |
| epoch 032: 20 / 28 loss=8.630, nll_loss=7.615, ppl=196.03, wps=14511, ups=4, wpb=2822.000, bsz=101.143, num_updates=887, lr=0.000110953, gnorm=3.827, clip=0.000, oom=0.000, loss_scale=32.000, wall=232, train_wall=155 | |
| epoch 032 | loss 8.627 | nll_loss 7.612 | ppl 195.70 | wps 14523 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 894 | lr 0.000111828 | gnorm 3.841 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 233 | train_wall 156 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 032 | valid on 'valid' subset | loss 7.645 | nll_loss 6.478 | ppl 89.14 | num_updates 894 | |
| epoch 033: 10 / 28 loss=8.444, nll_loss=7.410, ppl=170.03, wps=14383, ups=3, wpb=2820.273, bsz=98.545, num_updates=905, lr=0.000113202, gnorm=3.562, clip=0.000, oom=0.000, loss_scale=32.000, wall=237, train_wall=158 | |
| epoch 033: 20 / 28 loss=8.410, nll_loss=7.370, ppl=165.37, wps=14484, ups=4, wpb=2824.190, bsz=103.429, num_updates=915, lr=0.000114452, gnorm=3.459, clip=0.000, oom=0.000, loss_scale=32.000, wall=239, train_wall=160 | |
| epoch 033 | loss 8.418 | nll_loss 7.379 | ppl 166.41 | wps 14523 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 922 | lr 0.000115327 | gnorm 3.451 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 240 | train_wall 161 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 033 | valid on 'valid' subset | loss 7.418 | nll_loss 6.205 | ppl 73.76 | num_updates 922 | |
| epoch 034: 10 / 28 loss=8.120, nll_loss=7.048, ppl=132.29, wps=14462, ups=3, wpb=2809.727, bsz=108.000, num_updates=933, lr=0.000116702, gnorm=3.229, clip=0.000, oom=0.000, loss_scale=32.000, wall=244, train_wall=163 | |
| epoch 034: 20 / 28 loss=8.194, nll_loss=7.129, ppl=139.95, wps=14720, ups=3, wpb=2875.905, bsz=100.000, num_updates=943, lr=0.000117951, gnorm=3.176, clip=0.000, oom=0.000, loss_scale=32.000, wall=246, train_wall=165 | |
| epoch 034 | loss 8.172 | nll_loss 7.104 | ppl 137.56 | wps 14544 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 950 | lr 0.000118826 | gnorm 3.303 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 248 | train_wall 166 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 034 | valid on 'valid' subset | loss 7.091 | nll_loss 5.847 | ppl 57.57 | num_updates 950 | |
| epoch 035: 10 / 28 loss=8.110, nll_loss=7.033, ppl=130.93, wps=15224, ups=3, wpb=3016.636, bsz=96.000, num_updates=961, lr=0.000120201, gnorm=3.301, clip=0.000, oom=0.000, loss_scale=32.000, wall=252, train_wall=168 | |
| epoch 035: 20 / 28 loss=8.013, nll_loss=6.925, ppl=121.55, wps=14565, ups=4, wpb=2861.429, bsz=97.143, num_updates=971, lr=0.000121451, gnorm=3.371, clip=0.000, oom=0.000, loss_scale=32.000, wall=254, train_wall=170 | |
| epoch 035 | loss 7.977 | nll_loss 6.883 | ppl 118.03 | wps 14472 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 978 | lr 0.000122326 | gnorm 3.444 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 255 | train_wall 171 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 035 | valid on 'valid' subset | loss 6.890 | nll_loss 5.620 | ppl 49.19 | num_updates 978 | |
| epoch 036: 10 / 28 loss=7.694, nll_loss=6.570, ppl=95.00, wps=13936, ups=3, wpb=2745.636, bsz=118.909, num_updates=989, lr=0.0001237, gnorm=3.472, clip=0.000, oom=0.000, loss_scale=32.000, wall=259, train_wall=173 | |
| epoch 036: 20 / 28 loss=7.786, nll_loss=6.670, ppl=101.82, wps=14367, ups=4, wpb=2825.190, bsz=108.381, num_updates=999, lr=0.00012495, gnorm=3.510, clip=0.000, oom=0.000, loss_scale=32.000, wall=261, train_wall=175 | |
| epoch 036 | loss 7.812 | nll_loss 6.698 | ppl 103.83 | wps 14501 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1006 | lr 0.000125825 | gnorm 3.488 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 263 | train_wall 176 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 036 | valid on 'valid' subset | loss 6.618 | nll_loss 5.285 | ppl 38.99 | num_updates 1006 | |
| epoch 037: 10 / 28 loss=7.566, nll_loss=6.423, ppl=85.79, wps=14406, ups=3, wpb=2779.364, bsz=99.636, num_updates=1017, lr=0.0001272, gnorm=3.583, clip=0.000, oom=0.000, loss_scale=32.000, wall=267, train_wall=178 | |
| epoch 037: 20 / 28 loss=7.630, nll_loss=6.494, ppl=90.11, wps=14563, ups=4, wpb=2836.714, bsz=100.190, num_updates=1027, lr=0.000128449, gnorm=3.776, clip=0.000, oom=0.000, loss_scale=32.000, wall=269, train_wall=180 | |
| epoch 037 | loss 7.632 | nll_loss 6.494 | ppl 90.12 | wps 14539 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1034 | lr 0.000129324 | gnorm 3.768 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 270 | train_wall 181 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 037 | valid on 'valid' subset | loss 6.553 | nll_loss 5.254 | ppl 38.17 | num_updates 1034 | |
| epoch 038: 10 / 28 loss=7.506, nll_loss=6.356, ppl=81.91, wps=15031, ups=3, wpb=2912.636, bsz=96.000, num_updates=1045, lr=0.000130699, gnorm=3.704, clip=0.000, oom=0.000, loss_scale=32.000, wall=274, train_wall=183 | |
| epoch 038: 20 / 28 loss=7.447, nll_loss=6.288, ppl=78.15, wps=14738, ups=4, wpb=2876.762, bsz=102.476, num_updates=1055, lr=0.000131949, gnorm=3.597, clip=0.000, oom=0.000, loss_scale=32.000, wall=276, train_wall=184 | |
| epoch 038 | loss 7.444 | nll_loss 6.284 | ppl 77.92 | wps 14563 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1062 | lr 0.000132823 | gnorm 3.645 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 277 | train_wall 186 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 038 | valid on 'valid' subset | loss 6.316 | nll_loss 4.942 | ppl 30.74 | num_updates 1062 | |
| epoch 039: 10 / 28 loss=7.321, nll_loss=6.142, ppl=70.63, wps=15042, ups=3, wpb=2891.636, bsz=98.545, num_updates=1073, lr=0.000134198, gnorm=3.872, clip=0.000, oom=0.000, loss_scale=32.000, wall=281, train_wall=188 | |
| epoch 039: 20 / 28 loss=7.314, nll_loss=6.135, ppl=70.29, wps=14696, ups=4, wpb=2846.810, bsz=100.000, num_updates=1083, lr=0.000135448, gnorm=3.816, clip=0.000, oom=0.000, loss_scale=32.000, wall=283, train_wall=189 | |
| epoch 039 | loss 7.314 | nll_loss 6.135 | ppl 70.26 | wps 14601 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1090 | lr 0.000136323 | gnorm 3.905 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 285 | train_wall 191 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 039 | valid on 'valid' subset | loss 6.244 | nll_loss 4.870 | ppl 29.24 | num_updates 1090 | |
| epoch 040: 10 / 28 loss=7.121, nll_loss=5.918, ppl=60.47, wps=14728, ups=3, wpb=2857.909, bsz=94.545, num_updates=1101, lr=0.000137697, gnorm=3.568, clip=0.000, oom=0.000, loss_scale=32.000, wall=289, train_wall=192 | |
| epoch 040: 20 / 28 loss=7.157, nll_loss=5.959, ppl=62.22, wps=14902, ups=4, wpb=2893.333, bsz=91.238, num_updates=1111, lr=0.000138947, gnorm=3.512, clip=0.000, oom=0.000, loss_scale=32.000, wall=291, train_wall=194 | |
| epoch 040 | loss 7.070 | nll_loss 5.861 | ppl 58.13 | wps 14497 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1118 | lr 0.000139822 | gnorm 3.493 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 292 | train_wall 195 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 040 | valid on 'valid' subset | loss 5.846 | nll_loss 4.434 | ppl 21.62 | num_updates 1118 | |
| epoch 041: 10 / 28 loss=6.883, nll_loss=5.654, ppl=50.35, wps=14404, ups=3, wpb=2838.273, bsz=101.818, num_updates=1129, lr=0.000141197, gnorm=3.607, clip=0.000, oom=0.000, loss_scale=32.000, wall=296, train_wall=197 | |
| epoch 041: 20 / 28 loss=6.859, nll_loss=5.624, ppl=49.33, wps=14399, ups=3, wpb=2841.667, bsz=108.952, num_updates=1139, lr=0.000142447, gnorm=3.577, clip=0.000, oom=0.000, loss_scale=32.000, wall=298, train_wall=199 | |
| epoch 041 | loss 6.865 | nll_loss 5.630 | ppl 49.52 | wps 14456 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1146 | lr 0.000143321 | gnorm 3.531 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 299 | train_wall 200 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 041 | valid on 'valid' subset | loss 5.553 | nll_loss 4.065 | ppl 16.74 | num_updates 1146 | |
| epoch 042: 10 / 28 loss=6.597, nll_loss=5.330, ppl=40.24, wps=14598, ups=3, wpb=2826.091, bsz=95.636, num_updates=1157, lr=0.000144696, gnorm=3.448, clip=0.000, oom=0.000, loss_scale=32.000, wall=303, train_wall=202 | |
| epoch 042: 20 / 28 loss=6.617, nll_loss=5.352, ppl=40.85, wps=14561, ups=4, wpb=2846.476, bsz=104.571, num_updates=1167, lr=0.000145946, gnorm=3.477, clip=0.000, oom=0.000, loss_scale=32.000, wall=305, train_wall=204 | |
| epoch 042 | loss 6.639 | nll_loss 5.376 | ppl 41.52 | wps 14474 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1174 | lr 0.000146821 | gnorm 3.480 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 307 | train_wall 205 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 042 | valid on 'valid' subset | loss 5.427 | nll_loss 3.936 | ppl 15.31 | num_updates 1174 | |
| epoch 043: 10 / 28 loss=6.411, nll_loss=5.120, ppl=34.78, wps=14474, ups=3, wpb=2850.455, bsz=109.818, num_updates=1185, lr=0.000148195, gnorm=3.499, clip=0.000, oom=0.000, loss_scale=32.000, wall=311, train_wall=207 | |
| epoch 043: 20 / 28 loss=6.443, nll_loss=5.154, ppl=35.60, wps=14550, ups=4, wpb=2844.238, bsz=104.381, num_updates=1195, lr=0.000149445, gnorm=3.578, clip=0.000, oom=0.000, loss_scale=32.000, wall=313, train_wall=209 | |
| epoch 043 | loss 6.507 | nll_loss 5.226 | ppl 37.42 | wps 14526 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1202 | lr 0.00015032 | gnorm 3.831 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 314 | train_wall 210 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 043 | valid on 'valid' subset | loss 5.231 | nll_loss 3.691 | ppl 12.91 | num_updates 1202 | |
| epoch 044: 10 / 28 loss=6.364, nll_loss=5.065, ppl=33.47, wps=15029, ups=3, wpb=2880.545, bsz=86.182, num_updates=1213, lr=0.000151695, gnorm=3.492, clip=0.000, oom=0.000, loss_scale=32.000, wall=318, train_wall=212 | |
| epoch 044: 20 / 28 loss=6.306, nll_loss=4.997, ppl=31.94, wps=14670, ups=4, wpb=2858.952, bsz=100.000, num_updates=1223, lr=0.000152944, gnorm=3.490, clip=0.000, oom=0.000, loss_scale=32.000, wall=320, train_wall=214 | |
| epoch 044 | loss 6.295 | nll_loss 4.985 | ppl 31.67 | wps 14544 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1230 | lr 0.000153819 | gnorm 3.499 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 321 | train_wall 215 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 044 | valid on 'valid' subset | loss 5.039 | nll_loss 3.459 | ppl 11.00 | num_updates 1230 | |
| epoch 045: 10 / 28 loss=5.930, nll_loss=4.575, ppl=23.83, wps=14298, ups=3, wpb=2734.636, bsz=112.364, num_updates=1241, lr=0.000155194, gnorm=3.567, clip=0.000, oom=0.000, loss_scale=32.000, wall=325, train_wall=217 | |
| epoch 045: 20 / 28 loss=6.081, nll_loss=4.743, ppl=26.78, wps=14514, ups=4, wpb=2817.476, bsz=107.238, num_updates=1251, lr=0.000156444, gnorm=3.670, clip=0.000, oom=0.000, loss_scale=32.000, wall=327, train_wall=219 | |
| epoch 045 | loss 6.106 | nll_loss 4.770 | ppl 27.28 | wps 14566 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1258 | lr 0.000157319 | gnorm 3.569 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 329 | train_wall 220 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 045 | valid on 'valid' subset | loss 4.858 | nll_loss 3.257 | ppl 9.56 | num_updates 1258 | |
| epoch 046: 10 / 28 loss=5.741, nll_loss=4.362, ppl=20.57, wps=13919, ups=3, wpb=2737.545, bsz=118.545, num_updates=1269, lr=0.000158693, gnorm=3.653, clip=0.000, oom=0.000, loss_scale=32.000, wall=333, train_wall=222 | |
| epoch 046: 20 / 28 loss=5.878, nll_loss=4.514, ppl=22.85, wps=14388, ups=4, wpb=2824.429, bsz=108.571, num_updates=1279, lr=0.000159943, gnorm=3.603, clip=0.000, oom=0.000, loss_scale=32.000, wall=335, train_wall=224 | |
| epoch 046 | loss 5.910 | nll_loss 4.549 | ppl 23.41 | wps 14514 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1286 | lr 0.000160818 | gnorm 3.575 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 336 | train_wall 225 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 046 | valid on 'valid' subset | loss 4.582 | nll_loss 2.946 | ppl 7.71 | num_updates 1286 | |
| epoch 047: 10 / 28 loss=5.770, nll_loss=4.394, ppl=21.02, wps=14554, ups=3, wpb=2873.455, bsz=109.091, num_updates=1297, lr=0.000162193, gnorm=3.954, clip=0.000, oom=0.000, loss_scale=32.000, wall=340, train_wall=227 | |
| epoch 047: 20 / 28 loss=5.787, nll_loss=4.411, ppl=21.27, wps=14704, ups=4, wpb=2901.810, bsz=103.619, num_updates=1307, lr=0.000163442, gnorm=3.611, clip=0.000, oom=0.000, loss_scale=32.000, wall=342, train_wall=228 | |
| epoch 047 | loss 5.741 | nll_loss 4.358 | ppl 20.51 | wps 14438 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1314 | lr 0.000164317 | gnorm 3.586 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 343 | train_wall 230 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 047 | valid on 'valid' subset | loss 4.529 | nll_loss 2.891 | ppl 7.42 | num_updates 1314 | |
| epoch 048: 10 / 28 loss=5.595, nll_loss=4.191, ppl=18.26, wps=14540, ups=3, wpb=2885.364, bsz=101.091, num_updates=1325, lr=0.000165692, gnorm=4.080, clip=0.000, oom=0.000, loss_scale=32.000, wall=347, train_wall=232 | |
| epoch 048: 20 / 28 loss=5.657, nll_loss=4.260, ppl=19.16, wps=14215, ups=4, wpb=2799.714, bsz=109.524, num_updates=1335, lr=0.000166942, gnorm=4.461, clip=0.000, oom=0.000, loss_scale=32.000, wall=349, train_wall=233 | |
| epoch 048 | loss 5.710 | nll_loss 4.318 | ppl 19.95 | wps 14444 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1342 | lr 0.000167816 | gnorm 4.202 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 351 | train_wall 235 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 048 | valid on 'valid' subset | loss 4.372 | nll_loss 2.673 | ppl 6.38 | num_updates 1342 | |
| epoch 049: 10 / 28 loss=5.585, nll_loss=4.179, ppl=18.12, wps=15364, ups=3, wpb=3006.000, bsz=83.273, num_updates=1353, lr=0.000169191, gnorm=3.424, clip=0.000, oom=0.000, loss_scale=32.000, wall=355, train_wall=236 | |
| epoch 049: 20 / 28 loss=5.471, nll_loss=4.049, ppl=16.56, wps=14929, ups=4, wpb=2918.238, bsz=94.286, num_updates=1363, lr=0.000170441, gnorm=3.431, clip=0.000, oom=0.000, loss_scale=32.000, wall=357, train_wall=238 | |
| epoch 049 | loss 5.387 | nll_loss 3.955 | ppl 15.51 | wps 14481 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1370 | lr 0.000171316 | gnorm 3.358 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 358 | train_wall 239 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 049 | valid on 'valid' subset | loss 4.085 | nll_loss 2.354 | ppl 5.11 | num_updates 1370 | |
| epoch 050: 10 / 28 loss=4.973, nll_loss=3.484, ppl=11.19, wps=14040, ups=3, wpb=2763.909, bsz=118.909, num_updates=1381, lr=0.00017269, gnorm=3.269, clip=0.000, oom=0.000, loss_scale=32.000, wall=362, train_wall=241 | |
| epoch 050: 20 / 28 loss=5.080, nll_loss=3.607, ppl=12.18, wps=14452, ups=4, wpb=2823.619, bsz=104.190, num_updates=1391, lr=0.00017394, gnorm=3.159, clip=0.000, oom=0.000, loss_scale=32.000, wall=364, train_wall=243 | |
| epoch 050 | loss 5.146 | nll_loss 3.681 | ppl 12.82 | wps 14521 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1398 | lr 0.000174815 | gnorm 3.281 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 365 | train_wall 244 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 050 | valid on 'valid' subset | loss 3.945 | nll_loss 2.180 | ppl 4.53 | num_updates 1398 | |
| done training in 367.2 seconds | |
Fri Aug 16 19:13:45 UTC 2019 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Fri Aug 16 19:07:26 UTC 2019 | |
#!/bin/bash | |
taskname=fairseq_e2e_gpu | |
vol_fairseq=/home/taylanbil/fairseq/ | |
vol_data=/home/taylanbil/data/wmt18_en_de_bpej32k | |
vol_data=/home/taylanbil/data/dummy | |
python_cli="ipython -i" | |
python_cli="python" | |
other_flags=" | |
--clip-norm 0.0 \ | |
--num-workers=2 \ | |
" | |
$python_cli $vol_fairseq/train.py \ | |
$vol_data \ | |
--arch=transformer_vaswani_wmt_en_de_big \ | |
--max-source-positions=64 \ | |
--max-target-positions=64 \ | |
--required-batch-size-multiple=8 \ | |
--max-tokens=4096 \ | |
--no-save \ | |
--attention-dropout=0.1 \ | |
--no-progress-bar \ | |
--criterion=label_smoothed_cross_entropy \ | |
--log-interval=10 \ | |
--source-lang=en \ | |
--lr-scheduler=inverse_sqrt \ | |
--min-lr 1e-09 \ | |
--skip-invalid-size-inputs-valid-test \ | |
--target-lang=de \ | |
--label-smoothing=0.1 \ | |
--curriculum=4 \ | |
--max-epoch=50 \ | |
--update-freq=1 \ | |
--optimizer adam \ | |
--warmup-init-lr 1e-07 \ | |
--lr 0.0005 \ | |
--warmup-updates 4000 \ | |
--adam-betas='(0.9,0.98)' \ | |
--share-all-embeddings \ | |
--dropout 0.3 \ | |
--weight-decay 0.0 \ | |
--fp16 \ | |
--distributed-world-size=1 \ | |
--valid-subset=valid | |
-------------- | |
nohup: ignoring input | |
Namespace(activation_dropout=0.0, activation_fn='relu', adam_betas='(0.9,0.98)', adam_eps=1e-08, adaptive_input=False, adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, arch='transformer_vaswani_wmt_en_de_big', attention_dropout=0.1, best_checkpoint_metric='loss', bucket_cap_mb=25, clip_norm=25, cpu=False, criterion='label_smoothed_cross_entropy', curriculum=4, data='/home/taylanbil/data/dummy', dataset_impl='cached', ddp_backend='c10d', decoder_attention_heads=16, decoder_embed_dim=1024, decoder_embed_path=None, decoder_ffn_embed_dim=4096, decoder_input_dim=1024, decoder_layers=6, decoder_learned_pos=False, decoder_normalize_before=False, decoder_output_dim=1024, device_id=0, disable_validation=False, distributed_backend='nccl', distributed_init_method=None, distributed_no_spawn=False, distributed_port=-1, distributed_rank=0, distributed_world_size=1, dropout=0.3, encoder_attention_heads=16, encoder_embed_dim=1024, encoder_embed_path=None, encoder_ffn_embed_dim=4096, encoder_layers=6, encoder_learned_pos=False, encoder_normalize_before=False, find_unused_parameters=False, fix_batches_to_gpus=False, fp16=True, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, keep_interval_updates=-1, keep_last_epochs=-1, label_smoothing=0.1, lazy_load=False, left_pad_source='True', left_pad_target='False', log_format=None, log_interval=10, lr=[0.0005], lr_scheduler='inverse_sqrt', max_epoch=50, max_sentences=None, max_sentences_valid=None, max_source_positions=64, max_target_positions=64, max_tokens=4096, max_tokens_valid=4096, max_update=0, maximize_best_checkpoint_metric=False, memory_efficient_fp16=False, min_loss_scale=0.0001, min_lr=1e-09, no_epoch_checkpoints=False, no_last_checkpoints=False, no_progress_bar=True, no_save=True, no_save_optimizer_state=False, no_token_positional_embeddings=False, num_workers=0, optimizer='adam', optimizer_overrides='{}', raw_text=False, required_batch_size_multiple=8, reset_dataloader=False, 
reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, restore_file='checkpoint_last.pt', save_dir='checkpoints', save_interval=1, save_interval_updates=0, seed=1, sentence_avg=False, share_all_embeddings=True, share_decoder_input_output_embed=False, skip_invalid_size_inputs_valid_test=True, source_lang='en', target_lang='de', task='translation', tbmf_wrapper=False, tensorboard_logdir='', threshold_loss_scale=None, train_subset='train', update_freq=[1], upsample_primary=1, use_bmuf=False, user_dir=None, valid_subset='valid', validate_interval=1, warmup_init_lr=1e-07, warmup_updates=4000, weight_decay=0.0) | |
| [en] dictionary: 35662 types | |
| [de] dictionary: 35662 types | |
| /home/taylanbil/data/dummy valid en-de 3004 examples | |
TransformerModel( | |
(encoder): TransformerEncoder( | |
(embed_tokens): Embedding(35662, 1024, padding_idx=1) | |
(embed_positions): SinusoidalPositionalEmbedding() | |
(layers): ModuleList( | |
(0): TransformerEncoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
(1): TransformerEncoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
(2): TransformerEncoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
(3): TransformerEncoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
(4): TransformerEncoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
(5): TransformerEncoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
) | |
(decoder): TransformerDecoder( | |
(embed_tokens): Embedding(35662, 1024, padding_idx=1) | |
(embed_positions): SinusoidalPositionalEmbedding() | |
(layers): ModuleList( | |
(0): TransformerDecoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(encoder_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(encoder_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
(1): TransformerDecoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(encoder_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(encoder_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
(2): TransformerDecoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(encoder_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(encoder_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
(3): TransformerDecoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(encoder_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(encoder_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
(4): TransformerDecoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(encoder_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(encoder_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
(5): TransformerDecoderLayer( | |
(self_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(self_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(encoder_attn): MultiheadAttention( | |
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
) | |
(encoder_attn_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
(fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
(fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
(final_layer_norm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) | |
) | |
) | |
) | |
) | |
| model transformer_vaswani_wmt_en_de_big, criterion LabelSmoothedCrossEntropyCriterion | |
| num. model params: 212875264 (num. trained: 212875264) | |
| training on 1 GPUs | |
| max tokens per GPU = 4096 and max sentences per GPU = None | |
| no existing checkpoint found checkpoints/checkpoint_last.pt | |
| loading train data for epoch 0 | |
| /home/taylanbil/data/dummy train en-de 3004 examples | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 001: 10 / 28 loss=15.859, nll_loss=15.860, ppl=59486.29, wps=13644, ups=4, wpb=2603.636, bsz=144.727, num_updates=11, lr=1.47473e-06, gnorm=6.899, clip=0.000, oom=0.000, loss_scale=128.000, wall=2, train_wall=2 | |
| epoch 001: 20 / 28 loss=15.582, nll_loss=15.553, ppl=48059.34, wps=14312, ups=5, wpb=2758.095, bsz=118.095, num_updates=21, lr=2.72448e-06, gnorm=6.217, clip=0.000, oom=0.000, loss_scale=128.000, wall=4, train_wall=4 | |
| epoch 001 | loss 15.364 | nll_loss 15.310 | ppl 40624.96 | wps 14778 | ups 5 | wpb 2838.786 | bsz 104.714 | num_updates 28 | lr 3.5993e-06 | gnorm 5.727 | clip 0.000 | oom 0.000 | loss_scale 128.000 | wall 6 | train_wall 5 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 001 | valid on 'valid' subset | loss 14.218 | nll_loss 14.016 | ppl 16566.42 | num_updates 28 | |
| epoch 002: 10 / 28 loss=14.151, nll_loss=13.952, ppl=15852.06, wps=13589, ups=3, wpb=2603.636, bsz=144.727, num_updates=39, lr=4.97403e-06, gnorm=4.301, clip=0.000, oom=0.000, loss_scale=128.000, wall=10, train_wall=7 | |
| epoch 002: 20 / 28 loss=14.065, nll_loss=13.854, ppl=14805.47, wps=14288, ups=3, wpb=2758.095, bsz=118.095, num_updates=49, lr=6.22378e-06, gnorm=3.563, clip=0.000, oom=0.000, loss_scale=128.000, wall=12, train_wall=9 | |
| epoch 002 | loss 14.018 | nll_loss 13.801 | ppl 14274.67 | wps 14764 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 56 | lr 7.0986e-06 | gnorm 3.197 | clip 0.000 | oom 0.000 | loss_scale 128.000 | wall 13 | train_wall 10 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 002 | valid on 'valid' subset | loss 13.544 | nll_loss 13.251 | ppl 9747.46 | num_updates 56 | |
| epoch 003: 10 / 28 loss=13.516, nll_loss=13.237, ppl=9653.57, wps=13626, ups=3, wpb=2603.636, bsz=144.727, num_updates=67, lr=8.47333e-06, gnorm=3.629, clip=0.000, oom=0.000, loss_scale=128.000, wall=17, train_wall=12 | |
| epoch 003: 20 / 28 loss=13.507, nll_loss=13.229, ppl=9602.30, wps=14289, ups=4, wpb=2758.095, bsz=118.095, num_updates=77, lr=9.72308e-06, gnorm=2.824, clip=0.000, oom=0.000, loss_scale=128.000, wall=19, train_wall=14 | |
| epoch 003 | loss 13.497 | nll_loss 13.219 | ppl 9534.48 | wps 14756 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 84 | lr 1.05979e-05 | gnorm 2.525 | clip 0.000 | oom 0.000 | loss_scale 128.000 | wall 20 | train_wall 15 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 003 | valid on 'valid' subset | loss 13.131 | nll_loss 12.790 | ppl 7079.95 | num_updates 84 | |
| epoch 004: 10 / 28 loss=13.107, nll_loss=12.780, ppl=7033.67, wps=13590, ups=3, wpb=2603.636, bsz=144.727, num_updates=95, lr=1.19726e-05, gnorm=2.940, clip=0.000, oom=0.000, loss_scale=128.000, wall=24, train_wall=17 | |
| epoch 004: 20 / 28 loss=13.109, nll_loss=12.786, ppl=7060.79, wps=14269, ups=4, wpb=2758.095, bsz=118.095, num_updates=105, lr=1.32224e-05, gnorm=2.339, clip=0.000, oom=0.000, loss_scale=128.000, wall=26, train_wall=19 | |
| epoch 004 | loss 13.106 | nll_loss 12.782 | ppl 7044.82 | wps 14733 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 112 | lr 1.40972e-05 | gnorm 2.115 | clip 0.000 | oom 0.000 | loss_scale 128.000 | wall 28 | train_wall 20 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 004 | valid on 'valid' subset | loss 12.745 | nll_loss 12.362 | ppl 5262.65 | num_updates 112 | |
| epoch 005: 10 / 28 loss=12.810, nll_loss=12.452, ppl=5601.94, wps=14269, ups=3, wpb=2768.636, bsz=115.636, num_updates=123, lr=1.54719e-05, gnorm=2.277, clip=0.000, oom=0.000, loss_scale=128.000, wall=32, train_wall=22 | |
| epoch 005: 20 / 28 loss=12.770, nll_loss=12.407, ppl=5430.36, wps=14613, ups=4, wpb=2841.667, bsz=107.048, num_updates=133, lr=1.67217e-05, gnorm=2.022, clip=0.000, oom=0.000, loss_scale=128.000, wall=34, train_wall=23 | |
| epoch 005 | loss 12.718 | nll_loss 12.348 | ppl 5213.19 | wps 14652 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 140 | lr 1.75965e-05 | gnorm 1.949 | clip 0.000 | oom 0.000 | loss_scale 128.000 | wall 35 | train_wall 25 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 005 | valid on 'valid' subset | loss 12.268 | nll_loss 11.824 | ppl 3624.94 | num_updates 140 | |
| epoch 006: 10 / 28 loss=12.384, nll_loss=11.970, ppl=4011.47, wps=14643, ups=3, wpb=2850.273, bsz=113.455, num_updates=151, lr=1.89712e-05, gnorm=2.203, clip=0.000, oom=0.000, loss_scale=128.000, wall=39, train_wall=27 | |
| epoch 006: 20 / 28 loss=12.337, nll_loss=11.914, ppl=3858.28, wps=14610, ups=4, wpb=2837.238, bsz=105.333, num_updates=161, lr=2.0221e-05, gnorm=2.370, clip=0.000, oom=0.000, loss_scale=128.000, wall=41, train_wall=28 | |
| epoch 006 | loss 12.282 | nll_loss 11.851 | ppl 3693.25 | wps 14631 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 168 | lr 2.10958e-05 | gnorm 2.219 | clip 0.000 | oom 0.000 | loss_scale 128.000 | wall 42 | train_wall 29 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 006 | valid on 'valid' subset | loss 11.894 | nll_loss 11.367 | ppl 2640.60 | num_updates 168 | |
| epoch 007: 10 / 28 loss=11.945, nll_loss=11.459, ppl=2815.32, wps=14223, ups=3, wpb=2775.182, bsz=110.909, num_updates=179, lr=2.24705e-05, gnorm=1.756, clip=0.000, oom=0.000, loss_scale=128.000, wall=46, train_wall=31 | |
| epoch 007: 20 / 28 loss=11.914, nll_loss=11.420, ppl=2740.27, wps=14591, ups=4, wpb=2849.000, bsz=103.810, num_updates=189, lr=2.37203e-05, gnorm=1.562, clip=0.000, oom=0.000, loss_scale=128.000, wall=48, train_wall=33 | |
| epoch 007 | loss 11.870 | nll_loss 11.369 | ppl 2644.07 | wps 14524 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 196 | lr 2.45951e-05 | gnorm 1.505 | clip 0.000 | oom 0.000 | loss_scale 128.000 | wall 50 | train_wall 34 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 007 | valid on 'valid' subset | loss 11.550 | nll_loss 10.965 | ppl 1999.44 | num_updates 196 | |
| epoch 008: 10 / 28 loss=11.571, nll_loss=11.017, ppl=2071.73, wps=14354, ups=3, wpb=2779.000, bsz=119.273, num_updates=207, lr=2.59698e-05, gnorm=1.344, clip=0.000, oom=0.000, loss_scale=128.000, wall=54, train_wall=36 | |
| epoch 008: 20 / 28 loss=11.573, nll_loss=11.015, ppl=2069.41, wps=14347, ups=4, wpb=2778.619, bsz=110.667, num_updates=217, lr=2.72196e-05, gnorm=1.568, clip=0.000, oom=0.000, loss_scale=128.000, wall=56, train_wall=38 | |
| epoch 008 | loss 11.600 | nll_loss 11.042 | ppl 2108.75 | wps 14624 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 224 | lr 2.80944e-05 | gnorm 1.748 | clip 0.000 | oom 0.000 | loss_scale 128.000 | wall 57 | train_wall 39 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 008 | valid on 'valid' subset | loss 11.384 | nll_loss 10.755 | ppl 1727.65 | num_updates 224 | |
| WARNING: overflow detected, setting loss scale to: 64.0 | |
| epoch 009: 10 / 28 loss=11.483, nll_loss=10.893, ppl=1901.18, wps=13113, ups=2, wpb=2893.900, bsz=113.600, num_updates=234, lr=2.93442e-05, gnorm=2.239, clip=0.000, oom=0.000, loss_scale=64.000, wall=61, train_wall=41 | |
| epoch 009: 20 / 28 loss=11.489, nll_loss=10.894, ppl=1903.25, wps=13946, ups=3, wpb=2889.900, bsz=103.000, num_updates=244, lr=3.05939e-05, gnorm=1.934, clip=0.000, oom=0.000, loss_scale=64.000, wall=63, train_wall=43 | |
| epoch 009 | loss 11.459 | nll_loss 10.858 | ppl 1856.00 | wps 13926 | ups 4 | wpb 2830.556 | bsz 104.741 | num_updates 251 | lr 3.14687e-05 | gnorm 1.908 | clip 0.000 | oom 0.000 | loss_scale 64.000 | wall 64 | train_wall 44 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 009 | valid on 'valid' subset | loss 11.340 | nll_loss 10.659 | ppl 1616.93 | num_updates 251 | |
| epoch 010: 10 / 28 loss=11.333, nll_loss=10.700, ppl=1663.86, wps=14367, ups=3, wpb=2796.273, bsz=102.909, num_updates=262, lr=3.28435e-05, gnorm=1.715, clip=0.000, oom=0.000, loss_scale=64.000, wall=68, train_wall=46 | |
| epoch 010: 20 / 28 loss=11.328, nll_loss=10.692, ppl=1654.74, wps=14430, ups=4, wpb=2811.381, bsz=106.095, num_updates=272, lr=3.40932e-05, gnorm=1.911, clip=0.000, oom=0.000, loss_scale=64.000, wall=70, train_wall=48 | |
| epoch 010 | loss 11.326 | nll_loss 10.687 | ppl 1648.72 | wps 14538 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 279 | lr 3.4968e-05 | gnorm 1.863 | clip 0.000 | oom 0.000 | loss_scale 64.000 | wall 72 | train_wall 49 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 010 | valid on 'valid' subset | loss 11.153 | nll_loss 10.438 | ppl 1387.61 | num_updates 279 | |
| epoch 011: 10 / 28 loss=11.214, nll_loss=10.551, ppl=1500.15, wps=14413, ups=3, wpb=2842.636, bsz=106.182, num_updates=290, lr=3.63428e-05, gnorm=1.602, clip=0.000, oom=0.000, loss_scale=64.000, wall=76, train_wall=51 | |
| epoch 011: 20 / 28 loss=11.248, nll_loss=10.587, ppl=1538.44, wps=14638, ups=4, wpb=2869.048, bsz=101.524, num_updates=300, lr=3.75925e-05, gnorm=1.875, clip=0.000, oom=0.000, loss_scale=64.000, wall=78, train_wall=53 | |
| epoch 011 | loss 11.223 | nll_loss 10.556 | ppl 1505.36 | wps 14473 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 307 | lr 3.84673e-05 | gnorm 1.841 | clip 0.000 | oom 0.000 | loss_scale 64.000 | wall 79 | train_wall 54 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 011 | valid on 'valid' subset | loss 11.061 | nll_loss 10.316 | ppl 1275.01 | num_updates 307 | |
| epoch 012: 10 / 28 loss=11.138, nll_loss=10.455, ppl=1403.79, wps=14411, ups=3, wpb=2824.273, bsz=105.818, num_updates=318, lr=3.98421e-05, gnorm=1.720, clip=0.000, oom=0.000, loss_scale=64.000, wall=83, train_wall=56 | |
| epoch 012: 20 / 28 loss=11.128, nll_loss=10.438, ppl=1387.65, wps=14447, ups=4, wpb=2824.524, bsz=106.095, num_updates=328, lr=4.10918e-05, gnorm=1.626, clip=0.000, oom=0.000, loss_scale=64.000, wall=85, train_wall=57 | |
| epoch 012 | loss 11.121 | nll_loss 10.429 | ppl 1378.87 | wps 14473 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 335 | lr 4.19666e-05 | gnorm 1.596 | clip 0.000 | oom 0.000 | loss_scale 64.000 | wall 86 | train_wall 59 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 012 | valid on 'valid' subset | loss 10.925 | nll_loss 10.158 | ppl 1142.24 | num_updates 335 | |
| epoch 013: 10 / 28 loss=11.040, nll_loss=10.336, ppl=1292.27, wps=14544, ups=3, wpb=2842.273, bsz=114.182, num_updates=346, lr=4.33414e-05, gnorm=2.332, clip=0.000, oom=0.000, loss_scale=64.000, wall=90, train_wall=61 | |
| epoch 013: 20 / 28 loss=11.082, nll_loss=10.377, ppl=1330.04, wps=14590, ups=3, wpb=2845.238, bsz=106.857, num_updates=356, lr=4.45911e-05, gnorm=2.456, clip=0.000, oom=0.000, loss_scale=64.000, wall=92, train_wall=62 | |
| epoch 013 | loss 11.082 | nll_loss 10.377 | ppl 1329.90 | wps 14575 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 363 | lr 4.54659e-05 | gnorm 2.312 | clip 0.000 | oom 0.000 | loss_scale 64.000 | wall 94 | train_wall 64 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 013 | valid on 'valid' subset | loss 10.904 | nll_loss 10.135 | ppl 1124.70 | num_updates 363 | |
| epoch 014: 10 / 28 loss=10.994, nll_loss=10.279, ppl=1242.30, wps=14534, ups=3, wpb=2850.909, bsz=105.091, num_updates=374, lr=4.68407e-05, gnorm=2.242, clip=0.000, oom=0.000, loss_scale=64.000, wall=98, train_wall=66 | |
| epoch 014: 20 / 28 loss=11.035, nll_loss=10.321, ppl=1279.44, wps=14621, ups=4, wpb=2853.571, bsz=99.619, num_updates=384, lr=4.80904e-05, gnorm=2.211, clip=0.000, oom=0.000, loss_scale=64.000, wall=100, train_wall=67 | |
| epoch 014 | loss 11.021 | nll_loss 10.306 | ppl 1266.27 | wps 14521 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 391 | lr 4.89652e-05 | gnorm 2.305 | clip 0.000 | oom 0.000 | loss_scale 64.000 | wall 101 | train_wall 68 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 014 | valid on 'valid' subset | loss 10.846 | nll_loss 10.052 | ppl 1061.62 | num_updates 391 | |
| epoch 015: 10 / 28 loss=10.946, nll_loss=10.215, ppl=1188.31, wps=14725, ups=3, wpb=2838.091, bsz=103.273, num_updates=402, lr=5.034e-05, gnorm=2.227, clip=0.000, oom=0.000, loss_scale=64.000, wall=105, train_wall=70 | |
| epoch 015: 20 / 28 loss=10.976, nll_loss=10.250, ppl=1218.04, wps=14769, ups=4, wpb=2870.476, bsz=101.905, num_updates=412, lr=5.15897e-05, gnorm=2.149, clip=0.000, oom=0.000, loss_scale=64.000, wall=107, train_wall=72 | |
| epoch 015 | loss 10.963 | nll_loss 10.236 | ppl 1206.07 | wps 14607 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 419 | lr 5.24645e-05 | gnorm 2.176 | clip 0.000 | oom 0.000 | loss_scale 64.000 | wall 108 | train_wall 73 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 015 | valid on 'valid' subset | loss 10.732 | nll_loss 9.932 | ppl 977.00 | num_updates 419 | |
| epoch 016: 10 / 28 loss=10.864, nll_loss=10.124, ppl=1116.28, wps=15085, ups=3, wpb=2876.909, bsz=109.818, num_updates=430, lr=5.38393e-05, gnorm=2.000, clip=0.000, oom=0.000, loss_scale=64.000, wall=112, train_wall=75 | |
| epoch 016: 20 / 28 loss=10.846, nll_loss=10.103, ppl=1099.59, wps=14776, ups=4, wpb=2855.762, bsz=112.000, num_updates=440, lr=5.5089e-05, gnorm=2.136, clip=0.000, oom=0.000, loss_scale=64.000, wall=114, train_wall=77 | |
| epoch 016 | loss 10.869 | nll_loss 10.129 | ppl 1119.67 | wps 14750 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 447 | lr 5.59638e-05 | gnorm 2.057 | clip 0.000 | oom 0.000 | loss_scale 64.000 | wall 116 | train_wall 78 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 016 | valid on 'valid' subset | loss 10.633 | nll_loss 9.817 | ppl 902.24 | num_updates 447 | |
| WARNING: overflow detected, setting loss scale to: 32.0 | |
| epoch 017: 10 / 28 loss=10.632, nll_loss=9.866, ppl=933.12, wps=12318, ups=2, wpb=2720.000, bsz=129.600, num_updates=457, lr=5.72136e-05, gnorm=2.825, clip=0.000, oom=0.000, loss_scale=32.000, wall=120, train_wall=80 | |
| epoch 017: 20 / 28 loss=10.748, nll_loss=9.994, ppl=1019.91, wps=13573, ups=3, wpb=2792.650, bsz=111.800, num_updates=467, lr=5.84633e-05, gnorm=2.321, clip=0.000, oom=0.000, loss_scale=32.000, wall=122, train_wall=82 | |
| epoch 017 | loss 10.787 | nll_loss 10.037 | ppl 1050.88 | wps 13848 | ups 4 | wpb 2814.481 | bsz 106.222 | num_updates 474 | lr 5.93382e-05 | gnorm 2.239 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 123 | train_wall 83 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 017 | valid on 'valid' subset | loss 10.515 | nll_loss 9.693 | ppl 827.67 | num_updates 474 | |
| epoch 018: 10 / 28 loss=10.608, nll_loss=9.840, ppl=916.80, wps=14207, ups=3, wpb=2760.818, bsz=106.545, num_updates=485, lr=6.07129e-05, gnorm=1.954, clip=0.000, oom=0.000, loss_scale=32.000, wall=127, train_wall=85 | |
| epoch 018: 20 / 28 loss=10.672, nll_loss=9.909, ppl=961.26, wps=14477, ups=4, wpb=2822.333, bsz=105.333, num_updates=495, lr=6.19626e-05, gnorm=2.118, clip=0.000, oom=0.000, loss_scale=32.000, wall=129, train_wall=87 | |
| epoch 018 | loss 10.686 | nll_loss 9.923 | ppl 970.57 | wps 14534 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 502 | lr 6.28375e-05 | gnorm 2.183 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 130 | train_wall 88 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 018 | valid on 'valid' subset | loss 10.332 | nll_loss 9.492 | ppl 719.95 | num_updates 502 | |
| epoch 019: 10 / 28 loss=10.622, nll_loss=9.853, ppl=924.84, wps=14897, ups=3, wpb=2921.182, bsz=96.364, num_updates=513, lr=6.42122e-05, gnorm=2.083, clip=0.000, oom=0.000, loss_scale=32.000, wall=134, train_wall=90 | |
| epoch 019: 20 / 28 loss=10.533, nll_loss=9.754, ppl=863.55, wps=14335, ups=4, wpb=2809.810, bsz=104.952, num_updates=523, lr=6.54619e-05, gnorm=2.289, clip=0.000, oom=0.000, loss_scale=32.000, wall=136, train_wall=92 | |
| epoch 019 | loss 10.560 | nll_loss 9.784 | ppl 881.50 | wps 14477 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 530 | lr 6.63368e-05 | gnorm 2.310 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 138 | train_wall 93 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 019 | valid on 'valid' subset | loss 10.188 | nll_loss 9.330 | ppl 643.59 | num_updates 530 | |
| epoch 020: 10 / 28 loss=10.463, nll_loss=9.675, ppl=817.45, wps=14599, ups=3, wpb=2841.273, bsz=111.273, num_updates=541, lr=6.77115e-05, gnorm=2.600, clip=0.000, oom=0.000, loss_scale=32.000, wall=142, train_wall=95 | |
| epoch 020: 20 / 28 loss=10.463, nll_loss=9.671, ppl=814.98, wps=14602, ups=4, wpb=2853.048, bsz=106.095, num_updates=551, lr=6.89612e-05, gnorm=2.390, clip=0.000, oom=0.000, loss_scale=32.000, wall=144, train_wall=97 | |
| epoch 020 | loss 10.459 | nll_loss 9.667 | ppl 812.84 | wps 14580 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 558 | lr 6.98361e-05 | gnorm 2.397 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 145 | train_wall 98 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 020 | valid on 'valid' subset | loss 10.011 | nll_loss 9.131 | ppl 560.84 | num_updates 558 | |
| epoch 021: 10 / 28 loss=10.412, nll_loss=9.613, ppl=783.26, wps=14525, ups=3, wpb=2832.364, bsz=99.273, num_updates=569, lr=7.12108e-05, gnorm=2.986, clip=0.000, oom=0.000, loss_scale=32.000, wall=149, train_wall=100 | |
| epoch 021: 20 / 28 loss=10.395, nll_loss=9.595, ppl=773.29, wps=14591, ups=4, wpb=2858.095, bsz=101.143, num_updates=579, lr=7.24605e-05, gnorm=2.954, clip=0.000, oom=0.000, loss_scale=32.000, wall=151, train_wall=101 | |
| epoch 021 | loss 10.365 | nll_loss 9.561 | ppl 755.33 | wps 14474 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 586 | lr 7.33354e-05 | gnorm 2.884 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 152 | train_wall 103 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 021 | valid on 'valid' subset | loss 9.891 | nll_loss 9.007 | ppl 514.63 | num_updates 586 | |
| epoch 022: 10 / 28 loss=10.166, nll_loss=9.345, ppl=650.29, wps=14089, ups=3, wpb=2742.091, bsz=102.182, num_updates=597, lr=7.47101e-05, gnorm=2.507, clip=0.000, oom=0.000, loss_scale=32.000, wall=156, train_wall=105 | |
| epoch 022: 20 / 28 loss=10.160, nll_loss=9.335, ppl=645.67, wps=14392, ups=4, wpb=2830.000, bsz=108.381, num_updates=607, lr=7.59598e-05, gnorm=2.505, clip=0.000, oom=0.000, loss_scale=32.000, wall=158, train_wall=106 | |
| epoch 022 | loss 10.194 | nll_loss 9.372 | ppl 662.68 | wps 14484 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 614 | lr 7.68347e-05 | gnorm 2.540 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 160 | train_wall 108 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 022 | valid on 'valid' subset | loss 9.680 | nll_loss 8.762 | ppl 434.03 | num_updates 614 | |
| epoch 023: 10 / 28 loss=10.091, nll_loss=9.258, ppl=612.15, wps=14969, ups=3, wpb=2889.545, bsz=101.091, num_updates=625, lr=7.82094e-05, gnorm=2.454, clip=0.000, oom=0.000, loss_scale=32.000, wall=164, train_wall=109 | |
| epoch 023: 20 / 28 loss=10.097, nll_loss=9.265, ppl=615.11, wps=15036, ups=4, wpb=2905.857, bsz=94.286, num_updates=635, lr=7.94591e-05, gnorm=2.379, clip=0.000, oom=0.000, loss_scale=32.000, wall=166, train_wall=111 | |
| epoch 023 | loss 10.044 | nll_loss 9.205 | ppl 590.33 | wps 14618 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 642 | lr 8.0334e-05 | gnorm 2.682 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 167 | train_wall 112 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 023 | valid on 'valid' subset | loss 9.556 | nll_loss 8.622 | ppl 393.93 | num_updates 642 | |
| epoch 024: 10 / 28 loss=9.834, nll_loss=8.971, ppl=501.66, wps=14165, ups=3, wpb=2752.091, bsz=112.000, num_updates=653, lr=8.17087e-05, gnorm=2.865, clip=0.000, oom=0.000, loss_scale=32.000, wall=171, train_wall=114 | |
| epoch 024: 20 / 28 loss=9.850, nll_loss=8.986, ppl=507.17, wps=14421, ups=4, wpb=2805.952, bsz=111.048, num_updates=663, lr=8.29584e-05, gnorm=2.781, clip=0.000, oom=0.000, loss_scale=32.000, wall=173, train_wall=116 | |
| epoch 024 | loss 9.903 | nll_loss 9.045 | ppl 528.28 | wps 14609 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 670 | lr 8.38333e-05 | gnorm 2.761 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 174 | train_wall 117 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 024 | valid on 'valid' subset | loss 9.245 | nll_loss 8.265 | ppl 307.63 | num_updates 670 | |
| epoch 025: 10 / 28 loss=9.764, nll_loss=8.893, ppl=475.42, wps=14791, ups=3, wpb=2895.364, bsz=90.182, num_updates=681, lr=8.5208e-05, gnorm=2.493, clip=0.000, oom=0.000, loss_scale=32.000, wall=178, train_wall=119 | |
| epoch 025: 20 / 28 loss=9.654, nll_loss=8.770, ppl=436.65, wps=14309, ups=4, wpb=2797.952, bsz=104.952, num_updates=691, lr=8.64577e-05, gnorm=2.646, clip=0.000, oom=0.000, loss_scale=32.000, wall=180, train_wall=121 | |
| epoch 025 | loss 9.710 | nll_loss 8.830 | ppl 455.00 | wps 14490 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 698 | lr 8.73326e-05 | gnorm 2.839 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 182 | train_wall 122 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 025 | valid on 'valid' subset | loss 9.085 | nll_loss 8.068 | ppl 268.29 | num_updates 698 | |
| epoch 026: 10 / 28 loss=9.655, nll_loss=8.765, ppl=434.92, wps=14779, ups=3, wpb=2856.909, bsz=97.091, num_updates=709, lr=8.87073e-05, gnorm=3.195, clip=0.000, oom=0.000, loss_scale=32.000, wall=186, train_wall=124 | |
| epoch 026: 20 / 28 loss=9.524, nll_loss=8.621, ppl=393.71, wps=14249, ups=4, wpb=2787.286, bsz=108.762, num_updates=719, lr=8.9957e-05, gnorm=3.126, clip=0.000, oom=0.000, loss_scale=32.000, wall=188, train_wall=126 | |
| epoch 026 | loss 9.571 | nll_loss 8.674 | ppl 408.46 | wps 14483 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 726 | lr 9.08319e-05 | gnorm 3.039 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 189 | train_wall 127 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 026 | valid on 'valid' subset | loss 8.794 | nll_loss 7.757 | ppl 216.36 | num_updates 726 | |
| epoch 027: 10 / 28 loss=9.373, nll_loss=8.455, ppl=351.01, wps=14716, ups=3, wpb=2873.182, bsz=98.545, num_updates=737, lr=9.22066e-05, gnorm=2.757, clip=0.000, oom=0.000, loss_scale=32.000, wall=193, train_wall=129 | |
| epoch 027: 20 / 28 loss=9.404, nll_loss=8.488, ppl=359.10, wps=14917, ups=4, wpb=2905.905, bsz=96.952, num_updates=747, lr=9.34563e-05, gnorm=2.818, clip=0.000, oom=0.000, loss_scale=32.000, wall=195, train_wall=131 | |
| epoch 027 | loss 9.356 | nll_loss 8.435 | ppl 346.01 | wps 14539 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 754 | lr 9.43312e-05 | gnorm 3.002 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 196 | train_wall 132 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 027 | valid on 'valid' subset | loss 8.665 | nll_loss 7.630 | ppl 198.08 | num_updates 754 | |
| epoch 028: 10 / 28 loss=9.342, nll_loss=8.417, ppl=341.74, wps=14947, ups=3, wpb=2862.727, bsz=89.091, num_updates=765, lr=9.57059e-05, gnorm=3.494, clip=0.000, oom=0.000, loss_scale=32.000, wall=200, train_wall=134 | |
| epoch 028: 20 / 28 loss=9.274, nll_loss=8.340, ppl=324.10, wps=14673, ups=4, wpb=2853.095, bsz=97.714, num_updates=775, lr=9.69556e-05, gnorm=3.270, clip=0.000, oom=0.000, loss_scale=32.000, wall=202, train_wall=136 | |
| epoch 028 | loss 9.222 | nll_loss 8.282 | ppl 311.28 | wps 14532 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 782 | lr 9.78305e-05 | gnorm 3.174 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 204 | train_wall 137 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 028 | valid on 'valid' subset | loss 8.387 | nll_loss 7.320 | ppl 159.81 | num_updates 782 | |
| epoch 029: 10 / 28 loss=9.010, nll_loss=8.043, ppl=263.82, wps=14094, ups=3, wpb=2733.545, bsz=101.818, num_updates=793, lr=9.92052e-05, gnorm=3.434, clip=0.000, oom=0.000, loss_scale=32.000, wall=208, train_wall=139 | |
| epoch 029: 20 / 28 loss=9.023, nll_loss=8.058, ppl=266.56, wps=14370, ups=4, wpb=2818.762, bsz=106.286, num_updates=803, lr=0.000100455, gnorm=3.319, clip=0.000, oom=0.000, loss_scale=32.000, wall=210, train_wall=140 | |
| epoch 029 | loss 9.030 | nll_loss 8.067 | ppl 268.11 | wps 14502 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 810 | lr 0.00010133 | gnorm 3.265 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 211 | train_wall 142 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 029 | valid on 'valid' subset | loss 8.120 | nll_loss 7.021 | ppl 129.85 | num_updates 810 | |
| epoch 030: 10 / 28 loss=8.821, nll_loss=7.836, ppl=228.44, wps=14355, ups=3, wpb=2803.909, bsz=114.182, num_updates=821, lr=0.000102704, gnorm=3.589, clip=0.000, oom=0.000, loss_scale=32.000, wall=215, train_wall=144 | |
| epoch 030: 20 / 28 loss=8.887, nll_loss=7.906, ppl=239.90, wps=14453, ups=4, wpb=2823.190, bsz=106.476, num_updates=831, lr=0.000103954, gnorm=3.584, clip=0.000, oom=0.000, loss_scale=32.000, wall=217, train_wall=145 | |
| epoch 030 | loss 8.914 | nll_loss 7.936 | ppl 244.87 | wps 14504 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 838 | lr 0.000104829 | gnorm 3.527 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 218 | train_wall 147 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 030 | valid on 'valid' subset | loss 7.961 | nll_loss 6.835 | ppl 114.14 | num_updates 838 | |
| epoch 031: 10 / 28 loss=8.737, nll_loss=7.737, ppl=213.27, wps=14267, ups=3, wpb=2809.545, bsz=99.273, num_updates=849, lr=0.000106204, gnorm=3.521, clip=0.000, oom=0.000, loss_scale=32.000, wall=222, train_wall=149 | |
| epoch 031: 20 / 28 loss=8.710, nll_loss=7.707, ppl=208.89, wps=14478, ups=4, wpb=2840.238, bsz=104.571, num_updates=859, lr=0.000107454, gnorm=3.398, clip=0.000, oom=0.000, loss_scale=32.000, wall=224, train_wall=150 | |
| epoch 031 | loss 8.717 | nll_loss 7.716 | ppl 210.30 | wps 14478 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 866 | lr 0.000108328 | gnorm 3.480 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 226 | train_wall 151 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 031 | valid on 'valid' subset | loss 7.775 | nll_loss 6.598 | ppl 96.89 | num_updates 866 | |
| epoch 032: 10 / 28 loss=8.667, nll_loss=7.660, ppl=202.27, wps=14560, ups=3, wpb=2790.182, bsz=98.545, num_updates=877, lr=0.000109703, gnorm=4.003, clip=0.000, oom=0.000, loss_scale=32.000, wall=230, train_wall=153 | |
| epoch 032: 20 / 28 loss=8.630, nll_loss=7.615, ppl=196.03, wps=14511, ups=4, wpb=2822.000, bsz=101.143, num_updates=887, lr=0.000110953, gnorm=3.827, clip=0.000, oom=0.000, loss_scale=32.000, wall=232, train_wall=155 | |
| epoch 032 | loss 8.627 | nll_loss 7.612 | ppl 195.70 | wps 14523 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 894 | lr 0.000111828 | gnorm 3.841 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 233 | train_wall 156 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 032 | valid on 'valid' subset | loss 7.645 | nll_loss 6.478 | ppl 89.14 | num_updates 894 | |
| epoch 033: 10 / 28 loss=8.444, nll_loss=7.410, ppl=170.03, wps=14383, ups=3, wpb=2820.273, bsz=98.545, num_updates=905, lr=0.000113202, gnorm=3.562, clip=0.000, oom=0.000, loss_scale=32.000, wall=237, train_wall=158 | |
| epoch 033: 20 / 28 loss=8.410, nll_loss=7.370, ppl=165.37, wps=14484, ups=4, wpb=2824.190, bsz=103.429, num_updates=915, lr=0.000114452, gnorm=3.459, clip=0.000, oom=0.000, loss_scale=32.000, wall=239, train_wall=160 | |
| epoch 033 | loss 8.418 | nll_loss 7.379 | ppl 166.41 | wps 14523 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 922 | lr 0.000115327 | gnorm 3.451 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 240 | train_wall 161 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 033 | valid on 'valid' subset | loss 7.418 | nll_loss 6.205 | ppl 73.76 | num_updates 922 | |
| epoch 034: 10 / 28 loss=8.120, nll_loss=7.048, ppl=132.29, wps=14462, ups=3, wpb=2809.727, bsz=108.000, num_updates=933, lr=0.000116702, gnorm=3.229, clip=0.000, oom=0.000, loss_scale=32.000, wall=244, train_wall=163 | |
| epoch 034: 20 / 28 loss=8.194, nll_loss=7.129, ppl=139.95, wps=14720, ups=3, wpb=2875.905, bsz=100.000, num_updates=943, lr=0.000117951, gnorm=3.176, clip=0.000, oom=0.000, loss_scale=32.000, wall=246, train_wall=165 | |
| epoch 034 | loss 8.172 | nll_loss 7.104 | ppl 137.56 | wps 14544 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 950 | lr 0.000118826 | gnorm 3.303 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 248 | train_wall 166 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 034 | valid on 'valid' subset | loss 7.091 | nll_loss 5.847 | ppl 57.57 | num_updates 950 | |
| epoch 035: 10 / 28 loss=8.110, nll_loss=7.033, ppl=130.93, wps=15224, ups=3, wpb=3016.636, bsz=96.000, num_updates=961, lr=0.000120201, gnorm=3.301, clip=0.000, oom=0.000, loss_scale=32.000, wall=252, train_wall=168 | |
| epoch 035: 20 / 28 loss=8.013, nll_loss=6.925, ppl=121.55, wps=14565, ups=4, wpb=2861.429, bsz=97.143, num_updates=971, lr=0.000121451, gnorm=3.371, clip=0.000, oom=0.000, loss_scale=32.000, wall=254, train_wall=170 | |
| epoch 035 | loss 7.977 | nll_loss 6.883 | ppl 118.03 | wps 14472 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 978 | lr 0.000122326 | gnorm 3.444 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 255 | train_wall 171 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 035 | valid on 'valid' subset | loss 6.890 | nll_loss 5.620 | ppl 49.19 | num_updates 978 | |
| epoch 036: 10 / 28 loss=7.694, nll_loss=6.570, ppl=95.00, wps=13936, ups=3, wpb=2745.636, bsz=118.909, num_updates=989, lr=0.0001237, gnorm=3.472, clip=0.000, oom=0.000, loss_scale=32.000, wall=259, train_wall=173 | |
| epoch 036: 20 / 28 loss=7.786, nll_loss=6.670, ppl=101.82, wps=14367, ups=4, wpb=2825.190, bsz=108.381, num_updates=999, lr=0.00012495, gnorm=3.510, clip=0.000, oom=0.000, loss_scale=32.000, wall=261, train_wall=175 | |
| epoch 036 | loss 7.812 | nll_loss 6.698 | ppl 103.83 | wps 14501 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1006 | lr 0.000125825 | gnorm 3.488 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 263 | train_wall 176 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 036 | valid on 'valid' subset | loss 6.618 | nll_loss 5.285 | ppl 38.99 | num_updates 1006 | |
| epoch 037: 10 / 28 loss=7.566, nll_loss=6.423, ppl=85.79, wps=14406, ups=3, wpb=2779.364, bsz=99.636, num_updates=1017, lr=0.0001272, gnorm=3.583, clip=0.000, oom=0.000, loss_scale=32.000, wall=267, train_wall=178 | |
| epoch 037: 20 / 28 loss=7.630, nll_loss=6.494, ppl=90.11, wps=14563, ups=4, wpb=2836.714, bsz=100.190, num_updates=1027, lr=0.000128449, gnorm=3.776, clip=0.000, oom=0.000, loss_scale=32.000, wall=269, train_wall=180 | |
| epoch 037 | loss 7.632 | nll_loss 6.494 | ppl 90.12 | wps 14539 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1034 | lr 0.000129324 | gnorm 3.768 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 270 | train_wall 181 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 037 | valid on 'valid' subset | loss 6.553 | nll_loss 5.254 | ppl 38.17 | num_updates 1034 | |
| epoch 038: 10 / 28 loss=7.506, nll_loss=6.356, ppl=81.91, wps=15031, ups=3, wpb=2912.636, bsz=96.000, num_updates=1045, lr=0.000130699, gnorm=3.704, clip=0.000, oom=0.000, loss_scale=32.000, wall=274, train_wall=183 | |
| epoch 038: 20 / 28 loss=7.447, nll_loss=6.288, ppl=78.15, wps=14738, ups=4, wpb=2876.762, bsz=102.476, num_updates=1055, lr=0.000131949, gnorm=3.597, clip=0.000, oom=0.000, loss_scale=32.000, wall=276, train_wall=184 | |
| epoch 038 | loss 7.444 | nll_loss 6.284 | ppl 77.92 | wps 14563 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1062 | lr 0.000132823 | gnorm 3.645 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 277 | train_wall 186 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 038 | valid on 'valid' subset | loss 6.316 | nll_loss 4.942 | ppl 30.74 | num_updates 1062 | |
| epoch 039: 10 / 28 loss=7.321, nll_loss=6.142, ppl=70.63, wps=15042, ups=3, wpb=2891.636, bsz=98.545, num_updates=1073, lr=0.000134198, gnorm=3.872, clip=0.000, oom=0.000, loss_scale=32.000, wall=281, train_wall=188 | |
| epoch 039: 20 / 28 loss=7.314, nll_loss=6.135, ppl=70.29, wps=14696, ups=4, wpb=2846.810, bsz=100.000, num_updates=1083, lr=0.000135448, gnorm=3.816, clip=0.000, oom=0.000, loss_scale=32.000, wall=283, train_wall=189 | |
| epoch 039 | loss 7.314 | nll_loss 6.135 | ppl 70.26 | wps 14601 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1090 | lr 0.000136323 | gnorm 3.905 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 285 | train_wall 191 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 039 | valid on 'valid' subset | loss 6.244 | nll_loss 4.870 | ppl 29.24 | num_updates 1090 | |
| epoch 040: 10 / 28 loss=7.121, nll_loss=5.918, ppl=60.47, wps=14728, ups=3, wpb=2857.909, bsz=94.545, num_updates=1101, lr=0.000137697, gnorm=3.568, clip=0.000, oom=0.000, loss_scale=32.000, wall=289, train_wall=192 | |
| epoch 040: 20 / 28 loss=7.157, nll_loss=5.959, ppl=62.22, wps=14902, ups=4, wpb=2893.333, bsz=91.238, num_updates=1111, lr=0.000138947, gnorm=3.512, clip=0.000, oom=0.000, loss_scale=32.000, wall=291, train_wall=194 | |
| epoch 040 | loss 7.070 | nll_loss 5.861 | ppl 58.13 | wps 14497 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1118 | lr 0.000139822 | gnorm 3.493 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 292 | train_wall 195 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 040 | valid on 'valid' subset | loss 5.846 | nll_loss 4.434 | ppl 21.62 | num_updates 1118 | |
| epoch 041: 10 / 28 loss=6.883, nll_loss=5.654, ppl=50.35, wps=14404, ups=3, wpb=2838.273, bsz=101.818, num_updates=1129, lr=0.000141197, gnorm=3.607, clip=0.000, oom=0.000, loss_scale=32.000, wall=296, train_wall=197 | |
| epoch 041: 20 / 28 loss=6.859, nll_loss=5.624, ppl=49.33, wps=14399, ups=3, wpb=2841.667, bsz=108.952, num_updates=1139, lr=0.000142447, gnorm=3.577, clip=0.000, oom=0.000, loss_scale=32.000, wall=298, train_wall=199 | |
| epoch 041 | loss 6.865 | nll_loss 5.630 | ppl 49.52 | wps 14456 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1146 | lr 0.000143321 | gnorm 3.531 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 299 | train_wall 200 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 041 | valid on 'valid' subset | loss 5.553 | nll_loss 4.065 | ppl 16.74 | num_updates 1146 | |
| epoch 042: 10 / 28 loss=6.597, nll_loss=5.330, ppl=40.24, wps=14598, ups=3, wpb=2826.091, bsz=95.636, num_updates=1157, lr=0.000144696, gnorm=3.448, clip=0.000, oom=0.000, loss_scale=32.000, wall=303, train_wall=202 | |
| epoch 042: 20 / 28 loss=6.617, nll_loss=5.352, ppl=40.85, wps=14561, ups=4, wpb=2846.476, bsz=104.571, num_updates=1167, lr=0.000145946, gnorm=3.477, clip=0.000, oom=0.000, loss_scale=32.000, wall=305, train_wall=204 | |
| epoch 042 | loss 6.639 | nll_loss 5.376 | ppl 41.52 | wps 14474 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1174 | lr 0.000146821 | gnorm 3.480 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 307 | train_wall 205 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 042 | valid on 'valid' subset | loss 5.427 | nll_loss 3.936 | ppl 15.31 | num_updates 1174 | |
| epoch 043: 10 / 28 loss=6.411, nll_loss=5.120, ppl=34.78, wps=14474, ups=3, wpb=2850.455, bsz=109.818, num_updates=1185, lr=0.000148195, gnorm=3.499, clip=0.000, oom=0.000, loss_scale=32.000, wall=311, train_wall=207 | |
| epoch 043: 20 / 28 loss=6.443, nll_loss=5.154, ppl=35.60, wps=14550, ups=4, wpb=2844.238, bsz=104.381, num_updates=1195, lr=0.000149445, gnorm=3.578, clip=0.000, oom=0.000, loss_scale=32.000, wall=313, train_wall=209 | |
| epoch 043 | loss 6.507 | nll_loss 5.226 | ppl 37.42 | wps 14526 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1202 | lr 0.00015032 | gnorm 3.831 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 314 | train_wall 210 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 043 | valid on 'valid' subset | loss 5.231 | nll_loss 3.691 | ppl 12.91 | num_updates 1202 | |
| epoch 044: 10 / 28 loss=6.364, nll_loss=5.065, ppl=33.47, wps=15029, ups=3, wpb=2880.545, bsz=86.182, num_updates=1213, lr=0.000151695, gnorm=3.492, clip=0.000, oom=0.000, loss_scale=32.000, wall=318, train_wall=212 | |
| epoch 044: 20 / 28 loss=6.306, nll_loss=4.997, ppl=31.94, wps=14670, ups=4, wpb=2858.952, bsz=100.000, num_updates=1223, lr=0.000152944, gnorm=3.490, clip=0.000, oom=0.000, loss_scale=32.000, wall=320, train_wall=214 | |
| epoch 044 | loss 6.295 | nll_loss 4.985 | ppl 31.67 | wps 14544 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1230 | lr 0.000153819 | gnorm 3.499 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 321 | train_wall 215 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 044 | valid on 'valid' subset | loss 5.039 | nll_loss 3.459 | ppl 11.00 | num_updates 1230 | |
| epoch 045: 10 / 28 loss=5.930, nll_loss=4.575, ppl=23.83, wps=14298, ups=3, wpb=2734.636, bsz=112.364, num_updates=1241, lr=0.000155194, gnorm=3.567, clip=0.000, oom=0.000, loss_scale=32.000, wall=325, train_wall=217 | |
| epoch 045: 20 / 28 loss=6.081, nll_loss=4.743, ppl=26.78, wps=14514, ups=4, wpb=2817.476, bsz=107.238, num_updates=1251, lr=0.000156444, gnorm=3.670, clip=0.000, oom=0.000, loss_scale=32.000, wall=327, train_wall=219 | |
| epoch 045 | loss 6.106 | nll_loss 4.770 | ppl 27.28 | wps 14566 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1258 | lr 0.000157319 | gnorm 3.569 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 329 | train_wall 220 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 045 | valid on 'valid' subset | loss 4.858 | nll_loss 3.257 | ppl 9.56 | num_updates 1258 | |
| epoch 046: 10 / 28 loss=5.741, nll_loss=4.362, ppl=20.57, wps=13919, ups=3, wpb=2737.545, bsz=118.545, num_updates=1269, lr=0.000158693, gnorm=3.653, clip=0.000, oom=0.000, loss_scale=32.000, wall=333, train_wall=222 | |
| epoch 046: 20 / 28 loss=5.878, nll_loss=4.514, ppl=22.85, wps=14388, ups=4, wpb=2824.429, bsz=108.571, num_updates=1279, lr=0.000159943, gnorm=3.603, clip=0.000, oom=0.000, loss_scale=32.000, wall=335, train_wall=224 | |
| epoch 046 | loss 5.910 | nll_loss 4.549 | ppl 23.41 | wps 14514 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1286 | lr 0.000160818 | gnorm 3.575 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 336 | train_wall 225 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 046 | valid on 'valid' subset | loss 4.582 | nll_loss 2.946 | ppl 7.71 | num_updates 1286 | |
| epoch 047: 10 / 28 loss=5.770, nll_loss=4.394, ppl=21.02, wps=14554, ups=3, wpb=2873.455, bsz=109.091, num_updates=1297, lr=0.000162193, gnorm=3.954, clip=0.000, oom=0.000, loss_scale=32.000, wall=340, train_wall=227 | |
| epoch 047: 20 / 28 loss=5.787, nll_loss=4.411, ppl=21.27, wps=14704, ups=4, wpb=2901.810, bsz=103.619, num_updates=1307, lr=0.000163442, gnorm=3.611, clip=0.000, oom=0.000, loss_scale=32.000, wall=342, train_wall=228 | |
| epoch 047 | loss 5.741 | nll_loss 4.358 | ppl 20.51 | wps 14438 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1314 | lr 0.000164317 | gnorm 3.586 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 343 | train_wall 230 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 047 | valid on 'valid' subset | loss 4.529 | nll_loss 2.891 | ppl 7.42 | num_updates 1314 | |
| epoch 048: 10 / 28 loss=5.595, nll_loss=4.191, ppl=18.26, wps=14540, ups=3, wpb=2885.364, bsz=101.091, num_updates=1325, lr=0.000165692, gnorm=4.080, clip=0.000, oom=0.000, loss_scale=32.000, wall=347, train_wall=232 | |
| epoch 048: 20 / 28 loss=5.657, nll_loss=4.260, ppl=19.16, wps=14215, ups=4, wpb=2799.714, bsz=109.524, num_updates=1335, lr=0.000166942, gnorm=4.461, clip=0.000, oom=0.000, loss_scale=32.000, wall=349, train_wall=233 | |
| epoch 048 | loss 5.710 | nll_loss 4.318 | ppl 19.95 | wps 14444 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1342 | lr 0.000167816 | gnorm 4.202 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 351 | train_wall 235 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 048 | valid on 'valid' subset | loss 4.372 | nll_loss 2.673 | ppl 6.38 | num_updates 1342 | |
| epoch 049: 10 / 28 loss=5.585, nll_loss=4.179, ppl=18.12, wps=15364, ups=3, wpb=3006.000, bsz=83.273, num_updates=1353, lr=0.000169191, gnorm=3.424, clip=0.000, oom=0.000, loss_scale=32.000, wall=355, train_wall=236 | |
| epoch 049: 20 / 28 loss=5.471, nll_loss=4.049, ppl=16.56, wps=14929, ups=4, wpb=2918.238, bsz=94.286, num_updates=1363, lr=0.000170441, gnorm=3.431, clip=0.000, oom=0.000, loss_scale=32.000, wall=357, train_wall=238 | |
| epoch 049 | loss 5.387 | nll_loss 3.955 | ppl 15.51 | wps 14481 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1370 | lr 0.000171316 | gnorm 3.358 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 358 | train_wall 239 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 049 | valid on 'valid' subset | loss 4.085 | nll_loss 2.354 | ppl 5.11 | num_updates 1370 | |
| epoch 050: 10 / 28 loss=4.973, nll_loss=3.484, ppl=11.19, wps=14040, ups=3, wpb=2763.909, bsz=118.909, num_updates=1381, lr=0.00017269, gnorm=3.269, clip=0.000, oom=0.000, loss_scale=32.000, wall=362, train_wall=241 | |
| epoch 050: 20 / 28 loss=5.080, nll_loss=3.607, ppl=12.18, wps=14452, ups=4, wpb=2823.619, bsz=104.190, num_updates=1391, lr=0.00017394, gnorm=3.159, clip=0.000, oom=0.000, loss_scale=32.000, wall=364, train_wall=243 | |
| epoch 050 | loss 5.146 | nll_loss 3.681 | ppl 12.82 | wps 14521 | ups 4 | wpb 2838.786 | bsz 104.714 | num_updates 1398 | lr 0.000174815 | gnorm 3.281 | clip 0.000 | oom 0.000 | loss_scale 32.000 | wall 365 | train_wall 244 | |
| WARNING: 72 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1808, 612, 1824, 2185, 1244, 723, 1226, 388, 1791, 378] | |
| epoch 050 | valid on 'valid' subset | loss 3.945 | nll_loss 2.180 | ppl 4.53 | num_updates 1398 | |
| done training in 367.2 seconds | |
Fri Aug 16 19:13:45 UTC 2019 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment