Created
April 22, 2019 19:05
-
-
Save pipibjc/5014656d9bee25d2beb78b296fd5849b to your computer and use it in GitHub Desktop.
Flores reproduce issue 7
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Namespace(adam_betas='(0.9, 0.98)', adam_eps=1e-08, adaptive_input=False, adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, arch='transformer', attention_dropout=0.2, bucket_cap_mb=25, clip_norm=0.0, cpu=False, criterion='label_smoothed_cross_entropy', curriculum=0, data=['/private/home/pipibjc/tools/flores/data-bin/wiki_ne_en_bpe5000/'], ddp_backend='no_c10d', decoder_attention_heads=2, decoder_embed_dim=512, decoder_embed_path=None, decoder_ffn_embed_dim=2048, decoder_input_dim=512, decoder_layers=5, decoder_learned_pos=False, decoder_normalize_before=True, decoder_output_dim=512, device_id=3, distributed_backend='nccl', distributed_init_method='tcp://localhost:14992', distributed_port=-1, distributed_rank=3, distributed_world_size=4, dropout=0.4, encoder_attention_heads=2, encoder_embed_dim=512, encoder_embed_path=None, encoder_ffn_embed_dim=2048, encoder_layers=5, encoder_learned_pos=False, encoder_normalize_before=True, fix_batches_to_gpus=False, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, keep_interval_updates=-1, keep_last_epochs=-1, label_smoothing=0.2, lazy_load=False, left_pad_source='True', left_pad_target='False', log_format='json', log_interval=100, lr=[0.001], lr_scheduler='inverse_sqrt', lr_shrink=0.1, max_epoch=100, max_sentences=None, max_sentences_valid=None, max_source_positions=1024, max_target_positions=1024, max_tokens=4000, max_update=0, memory_efficient_fp16=False, min_loss_scale=0.0001, min_lr=1e-09, momentum=0.99, no_epoch_checkpoints=False, no_progress_bar=False, no_save=False, no_token_positional_embeddings=False, num_workers=0, optimizer='adam', optimizer_overrides='{}', raw_text=False, relu_dropout=0.2, required_batch_size_multiple=8, reset_lr_scheduler=False, reset_optimizer=False, restore_file='checkpoint_last.pt', save_dir='/checkpoint/pipibjc/2019-04-16/flores_issue7.ne.en.no_c10d.transformer.shareemb.layers5.emb512.ffndim2048.heads2.normbefore.attndrop0.2.reludrop0.2.adam.beta0.9_0.98.initlr1e-07.warmup4000.lr0.001.clip0.0.drop0.4.wd0.0001.ls0.2.maxtok4000.upfreq8.seed1.ngpu4', save_interval=10, save_interval_updates=0, seed=1, sentence_avg=False, share_all_embeddings=True, share_decoder_input_output_embed=False, skip_invalid_size_inputs_valid_test=False, source_lang='ne', target_lang='en', task='translation', tensorboard_logdir='/checkpoint/pipibjc/tensorboard_logs/2019-04-16/flores_issue7.ne.en.no_c10d.transformer.shareemb.layers5.emb512.ffndim2048.heads2.normbefore.attndrop0.2.reludrop0.2.adam.beta0.9_0.98.initlr1e-07.warmup4000.lr0.001.clip0.0.drop0.4.wd0.0001.ls0.2.maxtok4000.upfreq8.seed1.ngpu4', threshold_loss_scale=None, train_subset='train', update_freq=[8], upsample_primary=1, user_dir=None, valid_subset='valid', validate_interval=1, warmup_init_lr=1e-07, warmup_updates=4000, weight_decay=0.0001) | |
{"epoch": 1, "train_loss": "12.433", "train_nll_loss": "12.214", "train_ppl": "4751.10", "train_wps": "104512", "train_ups": "1", "train_wpb": "102198.653", "train_bsz": "7820.500", "train_num_updates": "72", "train_lr": "1.80982e-05", "train_gnorm": "1.354", "train_clip": "0.000", "train_oom": "0.000", "train_wall": "84", "train_train_wall": "61"} | |
{"epoch": 1, "valid_loss": "11.465", "valid_nll_loss": "11.032", "valid_ppl": "2094.62", "valid_num_updates": "72"} | |
{"epoch": 2, "train_loss": "10.852", "train_nll_loss": "10.279", "train_ppl": "1242.06", "train_wps": "105081", "train_ups": "1", "train_wpb": "102198.653", "train_bsz": "7820.500", "train_num_updates": "144", "train_lr": "3.60964e-05", "train_gnorm": "0.494", "train_clip": "0.000", "train_oom": "0.000", "train_wall": "155", "train_train_wall": "121"} | |
{"epoch": 2, "valid_loss": "10.556", "valid_nll_loss": "9.878", "valid_ppl": "940.81", "valid_num_updates": "144"} | |
{"epoch": 3, "train_loss": "10.213", "train_nll_loss": "9.411", "train_ppl": "680.82", "train_wps": "104749", "train_ups": "1", "train_wpb": "102198.653", "train_bsz": "7820.500", "train_num_updates": "216", "train_lr": "5.40946e-05", "train_gnorm": "0.518", "train_clip": "0.000", "train_oom": "0.000", "train_wall": "227", "train_train_wall": "183"} | |
{"epoch": 3, "valid_loss": "10.227", "valid_nll_loss": "9.362", "valid_ppl": "658.10", "valid_num_updates": "216"} | |
{"epoch": 4, "train_loss": "9.882", "train_nll_loss": "8.930", "train_ppl": "487.74", "train_wps": "104661", "train_ups": "1", "train_wpb": "102198.653", "train_bsz": "7820.500", "train_num_updates": "288", "train_lr": "7.20928e-05", "train_gnorm": "0.351", "train_clip": "0.000", "train_oom": "0.000", "train_wall": "299", "train_train_wall": "244"} | |
{"epoch": 4, "valid_loss": "10.308", "valid_nll_loss": "9.358", "valid_ppl": "656.27", "valid_num_updates": "288"} | |
{"epoch": 5, "train_loss": "9.549", "train_nll_loss": "8.477", "train_ppl": "356.38", "train_wps": "105566", "train_ups": "1", "train_wpb": "102198.653", "train_bsz": "7820.500", "train_num_updates": "360", "train_lr": "9.0091e-05", "train_gnorm": "0.466", "train_clip": "0.000", "train_oom": "0.000", "train_wall": "371", "train_train_wall": "304"} | |
{"epoch": 5, "valid_loss": "9.781", "valid_nll_loss": "8.743", "valid_ppl": "428.58", "valid_num_updates": "360"} | |
{"epoch": 6, "train_loss": "9.177", "train_nll_loss": "7.973", "train_ppl": "251.21", "train_wps": "104942", "train_ups": "1", "train_wpb": "102198.653", "train_bsz": "7820.500", "train_num_updates": "432", "train_lr": "0.000108089", "train_gnorm": "0.320", "train_clip": "0.000", "train_oom": "0.000", "train_wall": "443", "train_train_wall": "365"} | |
{"epoch": 6, "valid_loss": "9.501", "valid_nll_loss": "8.310", "valid_ppl": "317.32", "valid_num_updates": "432"} | |
{"epoch": 7, "train_loss": "8.835", "train_nll_loss": "7.506", "train_ppl": "181.80", "train_wps": "103532", "train_ups": "1", "train_wpb": "102198.653", "train_bsz": "7820.500", "train_num_updates": "504", "train_lr": "0.000126087", "train_gnorm": "0.356", "train_clip": "0.000", "train_oom": "0.000", "train_wall": "516", "train_train_wall": "427"} | |
{"epoch": 7, "valid_loss": "9.311", "valid_nll_loss": "8.031", "valid_ppl": "261.52", "valid_num_updates": "504"} | |
{"epoch": 8, "train_loss": "8.517", "train_nll_loss": "7.079", "train_ppl": "135.20", "train_wps": "104221", "train_ups": "1", "train_wpb": "102198.653", "train_bsz": "7820.500", "train_num_updates": "576", "train_lr": "0.000144086", "train_gnorm": "0.306", "train_clip": "0.000", "train_oom": "0.000", "train_wall": "588", "train_train_wall": "488"} | |
{"epoch": 8, "valid_loss": "9.122", "valid_nll_loss": "7.734", "valid_ppl": "212.87", "valid_num_updates": "576"} | |
{"epoch": 9, "train_loss": "8.209", "train_nll_loss": "6.668", "train_ppl": "101.68", "train_wps": "105118", "train_ups": "1", "train_wpb": "102198.653", "train_bsz": "7820.500", "train_num_updates": "648", "train_lr": "0.000162084", "train_gnorm": "0.340", "train_clip": "0.000", "train_oom": "0.000", "train_wall": "660", "train_train_wall": "548"} | |
{"epoch": 9, "valid_loss": "8.973", "valid_nll_loss": "7.501", "valid_ppl": "181.11", "valid_num_updates": "648"} | |
{"epoch": 10, "train_loss": "7.923", "train_nll_loss": "6.287", "train_ppl": "78.06", "train_wps": "104820", "train_ups": "1", "train_wpb": "102198.653", "train_bsz": "7820.500", "train_num_updates": "720", "train_lr": "0.000180082", "train_gnorm": "0.294", "train_clip": "0.000", "train_oom": "0.000", "train_wall": "732", "train_train_wall": "609"} | |
{"epoch": 10, "valid_loss": "8.801", "valid_nll_loss": "7.267", "valid_ppl": "154.04", "valid_num_updates": "720"} | |
| saved checkpoint /checkpoint/pipibjc/2019-04-16/flores_issue7.ne.en.no_c10d.transformer.shareemb.layers5.emb512.ffndim2048.heads2.normbefore.attndrop0.2.reludrop0.2.adam.beta0.9_0.98.initlr1e-07.warmup4000.lr0.001.clip0.0.drop0.4.wd0.0001.ls0.2.maxtok4000.upfreq8.seed1.ngpu4/checkpoint10.pt (epoch 10 @ 720 updates) (writing took 6.133517742156982 seconds) | |
{"epoch": 11, "train_loss": "7.677", "train_nll_loss": "5.958", "train_ppl": "62.17", "train_wps": "103842", "train_ups": "1", "train_wpb": "102198.653", "train_bsz": "7820.500", "train_num_updates": "792", "train_lr": "0.00019808", "train_gnorm": "0.330", "train_clip": "0.000", "train_oom": "0.000", "train_wall": "811", "train_train_wall": "670"} | |
{"epoch": 11, "valid_loss": "8.735", "valid_nll_loss": "7.148", "valid_ppl": "141.85", "valid_num_updates": "792", "valid_best_loss": "8.73534"} | |
{"epoch": 12, "train_loss": "7.460", "train_nll_loss": "5.670", "train_ppl": "50.91", "train_wps": "104500", "train_ups": "1", "train_wpb": "102198.653", "train_bsz": "7820.500", "train_num_updates": "864", "train_lr": "0.000216078", "train_gnorm": "0.299", "train_clip": "0.000", "train_oom": "0.000", "train_wall": "883", "train_train_wall": "732"} | |
{"epoch": 12, "valid_loss": "8.637", "valid_nll_loss": "7.036", "valid_ppl": "131.24", "valid_num_updates": "864", "valid_best_loss": "8.63726"} | |
{"epoch": 13, "train_loss": "7.270", "train_nll_loss": "5.416", "train_ppl": "42.69", "train_wps": "104768", "train_ups": "1", "train_wpb": "102198.653", "train_bsz": "7820.500", "train_num_updates": "936", "train_lr": "0.000234077", "train_gnorm": "0.289", "train_clip": "0.000", "train_oom": "0.000", "train_wall": "955", "train_train_wall": "793"} | |
{"epoch": 13, "valid_loss": "8.559", "valid_nll_loss": "6.928", "valid_ppl": "121.78", "valid_num_updates": "936", "valid_best_loss": "8.55852"} | |
{"epoch": 14, "train_loss": "7.102", "train_nll_loss": "5.193", "train_ppl": "36.58", "train_wps": "103783", "train_ups": "1", "train_wpb": "102198.653", "train_bsz": "7820.500", "train_num_updates": "1008", "train_lr": "0.000252075", "train_gnorm": "0.300", "train_clip": "0.000", "train_oom": "0.000", "train_wall": "1028", "train_train_wall": "855"} | |
{"epoch": 14, "valid_loss": "8.479", "valid_nll_loss": "6.797", "valid_ppl": "111.18", "valid_num_updates": "1008", "valid_best_loss": "8.47912"} | |
{"epoch": 92, "train_loss": "4.775", "train_nll_loss": "2.198", "train_ppl": "4.59", "train_wps": "105025", "train_ups": "1", "train_wpb": "102198.653", "train_bsz": "7820.500", "train_num_updates": "6624", "train_lr": "0.000777087", "train_gnorm": "0.123", "train_clip": "0.000", "train_oom": "0.000", "train_wall": "6698", "train_train_wall": "5602"} | |
{"epoch": 92, "valid_loss": "6.933", "valid_nll_loss": "4.712", "valid_ppl": "26.21", "valid_num_updates": "6624", "valid_best_loss": "6.91486"} | |
{"epoch": 93, "train_loss": "4.771", "train_nll_loss": "2.192", "train_ppl": "4.57", "train_wps": "104347", "train_ups": "1", "train_wpb": "102198.653", "train_bsz": "7820.500", "train_num_updates": "6696", "train_lr": "0.000772898", "train_gnorm": "0.119", "train_clip": "0.000", "train_oom": "0.000", "train_wall": "6770", "train_train_wall": "5663"} | |
{"epoch": 93, "valid_loss": "6.917", "valid_nll_loss": "4.682", "valid_ppl": "25.67", "valid_num_updates": "6696", "valid_best_loss": "6.91486"} | |
{"epoch": 94, "train_loss": "4.766", "train_nll_loss": "2.187", "train_ppl": "4.55", "train_wps": "98189", "train_ups": "1", "train_wpb": "102198.653", "train_bsz": "7820.500", "train_num_updates": "6768", "train_lr": "0.000768776", "train_gnorm": "0.119", "train_clip": "0.000", "train_oom": "0.000", "train_wall": "6847", "train_train_wall": "5727"} | |
{"epoch": 94, "valid_loss": "6.917", "valid_nll_loss": "4.687", "valid_ppl": "25.75", "valid_num_updates": "6768", "valid_best_loss": "6.91486"} | |
{"epoch": 95, "train_loss": "4.762", "train_nll_loss": "2.181", "train_ppl": "4.54", "train_wps": "104912", "train_ups": "1", "train_wpb": "102198.653", "train_bsz": "7820.500", "train_num_updates": "6840", "train_lr": "0.000764719", "train_gnorm": "0.121", "train_clip": "0.000", "train_oom": "0.000", "train_wall": "6919", "train_train_wall": "5787"} | |
{"epoch": 95, "valid_loss": "6.917", "valid_nll_loss": "4.678", "valid_ppl": "25.60", "valid_num_updates": "6840", "valid_best_loss": "6.91486"} | |
{"epoch": 96, "train_loss": "4.757", "train_nll_loss": "2.176", "train_ppl": "4.52", "train_wps": "105657", "train_ups": "1", "train_wpb": "102198.653", "train_bsz": "7820.500", "train_num_updates": "6912", "train_lr": "0.000760726", "train_gnorm": "0.120", "train_clip": "0.000", "train_oom": "0.000", "train_wall": "6990", "train_train_wall": "5848"} | |
{"epoch": 96, "valid_loss": "6.893", "valid_nll_loss": "4.665", "valid_ppl": "25.37", "valid_num_updates": "6912", "valid_best_loss": "6.89268"} | |
{"epoch": 97, "train_loss": "4.753", "train_nll_loss": "2.170", "train_ppl": "4.50", "train_wps": "104768", "train_ups": "1", "train_wpb": "102198.653", "train_bsz": "7820.500", "train_num_updates": "6984", "train_lr": "0.000756794", "train_gnorm": "0.122", "train_clip": "0.000", "train_oom": "0.000", "train_wall": "7062", "train_train_wall": "5909"} | |
{"epoch": 97, "valid_loss": "6.894", "valid_nll_loss": "4.650", "valid_ppl": "25.12", "valid_num_updates": "6984", "valid_best_loss": "6.89435"} | |
{"epoch": 98, "train_loss": "4.747", "train_nll_loss": "2.163", "train_ppl": "4.48", "train_wps": "104787", "train_ups": "1", "train_wpb": "102198.653", "train_bsz": "7820.500", "train_num_updates": "7056", "train_lr": "0.000752923", "train_gnorm": "0.117", "train_clip": "0.000", "train_oom": "0.000", "train_wall": "7134", "train_train_wall": "5969"} | |
{"epoch": 98, "valid_loss": "6.912", "valid_nll_loss": "4.664", "valid_ppl": "25.35", "valid_num_updates": "7056", "valid_best_loss": "6.91219"} | |
{"epoch": 99, "train_loss": "4.746", "train_nll_loss": "2.161", "train_ppl": "4.47", "train_wps": "104791", "train_ups": "1", "train_wpb": "102198.653", "train_bsz": "7820.500", "train_num_updates": "7128", "train_lr": "0.000749111", "train_gnorm": "0.122", "train_clip": "0.000", "train_oom": "0.000", "train_wall": "7206", "train_train_wall": "6030"} | |
{"epoch": 99, "valid_loss": "6.890", "valid_nll_loss": "4.652", "valid_ppl": "25.14", "valid_num_updates": "7128", "valid_best_loss": "6.8898"} | |
{"epoch": 100, "train_loss": "4.740", "train_nll_loss": "2.154", "train_ppl": "4.45", "train_wps": "105986", "train_ups": "1", "train_wpb": "102198.653", "train_bsz": "7820.500", "train_num_updates": "7200", "train_lr": "0.000745356", "train_gnorm": "0.117", "train_clip": "0.000", "train_oom": "0.000", "train_wall": "7277", "train_train_wall": "6090"} | |
{"epoch": 100, "valid_loss": "6.904", "valid_nll_loss": "4.666", "valid_ppl": "25.39", "valid_num_updates": "7200", "valid_best_loss": "6.90396"} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment