@zheyuye
Last active July 17, 2020 01:18
Speed comparison: huggingface + torch.distributed (13.6 hours) vs. gluonnlp + horovod (8.76 hours). Resources: AWS g4dn.12xlarge, CUDA 10.1 (V10.1.243). Model: RoBERTa-large. Hyper-parameters: global batch size = 48.
Results: gluon em/f1 = 85.88/88.73; huggingface em/f1 = 84.88/88.08.
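The log below interleaves output from four Horovod workers (hence the repeated lines) and covers several launches: a short 750-step trial over the cached dev features, then full 8161-step runs. For context, here is a minimal sketch of the Horovod + MXNet pattern behind the "GPU communication supported by horovod" and "Creating distributed trainer..." lines; the `Dense` stand-in model and the optimizer settings are illustrative assumptions, not the actual run_squad.py code.

```python
# Minimal sketch of the Horovod + MXNet setup behind the log lines below;
# the Dense stand-in model and optimizer settings are illustrative, not
# the actual run_squad.py code. Launch with one process per GPU, e.g.:
#   horovodrun -np 4 python this_script.py
import mxnet as mx
import horovod.mxnet as hvd

hvd.init()                           # -> "GPU communication supported by horovod"
ctx = mx.gpu(hvd.local_rank())       # pin each worker to its own GPU

# Stand-in for the RoBERTa-large QA network loaded from
# fairseq_roberta_large/model-6b043b91.params in the real run.
net = mx.gluon.nn.Dense(2)
net.initialize(ctx=ctx)

params = net.collect_params()
hvd.broadcast_parameters(params, root_rank=0)   # keep workers in sync

# -> "Creating distributed trainer...": gradients are all-reduced each step.
trainer = hvd.DistributedTrainer(params, 'adam', {'learning_rate': 3e-5})
```

Because each of the four processes logs independently to the same stream, every setup message appears four times with near-identical timestamps.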
2020-07-14 08:24:58,197 - root - INFO - GPU communication supported by horovod
2020-07-14 08:24:58,197 - root - INFO - GPU communication supported by horovod
2020-07-14 08:24:58,197 - root - INFO - GPU communication supported by horovod
2020-07-14 08:24:58,197 - root - INFO - GPU communication supported by horovod
2020-07-14 08:25:06,274 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0
2020-07-14 08:25:06,286 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0
2020-07-14 08:25:06,298 - root - INFO - Prepare training data
2020-07-14 08:25:06,317 - root - INFO - Prepare training data
2020-07-14 08:25:06,340 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0
2020-07-14 08:25:06,381 - root - INFO - Prepare training data
2020-07-14 08:25:06,413 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0
2020-07-14 08:25:06,500 - root - INFO - Prepare training data
2020-07-14 08:25:08,757 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson
2020-07-14 08:25:08,758 - root - INFO - Processing the Training data:
2020-07-14 08:25:08,771 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson
2020-07-14 08:25:08,772 - root - INFO - Processing the Training data:
2020-07-14 08:25:08,793 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson
2020-07-14 08:25:08,794 - root - INFO - Processing the Training data:
2020-07-14 08:25:08,810 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson
2020-07-14 08:25:08,811 - root - INFO - Processing the Training data:
2020-07-14 08:25:09,365 - root - INFO - Done! #Unreliable Span=0 / #Mismatched Answer=0 / #Total=11873
2020-07-14 08:25:09,367 - root - INFO - Before Chunking, #Train/Is Impossible = 11873/5945
2020-07-14 08:25:09,367 - root - INFO - After Chunking, #Train Sample/Is Impossible = 12006/12006
2020-07-14 08:25:09,367 - root - INFO - Creating distributed trainer...
2020-07-14 08:25:09,371 - root - INFO - Done! #Unreliable Span=0 / #Mismatched Answer=0 / #Total=11873
2020-07-14 08:25:09,373 - root - INFO - Before Chunking, #Train/Is Impossible = 11873/5945
2020-07-14 08:25:09,373 - root - INFO - After Chunking, #Train Sample/Is Impossible = 12006/12006
2020-07-14 08:25:09,374 - root - INFO - Creating distributed trainer...
2020-07-14 08:25:09,380 - root - INFO - Using gradient accumulation. Effective global batch size = 48
2020-07-14 08:25:09,386 - root - INFO - Using gradient accumulation. Effective global batch size = 48
2020-07-14 08:25:09,400 - root - INFO - Done! #Unreliable Span=0 / #Mismatched Answer=0 / #Total=11873
2020-07-14 08:25:09,402 - root - INFO - Before Chunking, #Train/Is Impossible = 11873/5945
2020-07-14 08:25:09,403 - root - INFO - After Chunking, #Train Sample/Is Impossible = 12006/12006
2020-07-14 08:25:09,403 - root - INFO - Creating distributed trainer...
2020-07-14 08:25:09,416 - root - INFO - Using gradient accumulation. Effective global batch size = 48
2020-07-14 08:25:09,418 - root - INFO - Done! #Unreliable Span=0 / #Mismatched Answer=0 / #Total=11873
2020-07-14 08:25:09,420 - root - INFO - Before Chunking, #Train/Is Impossible = 11873/5945
2020-07-14 08:25:09,420 - root - INFO - After Chunking, #Train Sample/Is Impossible = 12006/12006
2020-07-14 08:25:09,420 - root - INFO - Creating distributed trainer...
2020-07-14 08:25:09,422 - root - INFO - #Total Training Steps=750, Warmup=150, Save Interval=200
2020-07-14 08:25:09,430 - root - INFO - #Total Training Steps=750, Warmup=150, Save Interval=200
2020-07-14 08:25:09,433 - root - INFO - Using gradient accumulation. Effective global batch size = 48
2020-07-14 08:25:09,458 - root - INFO - #Total Training Steps=750, Warmup=150, Save Interval=200
2020-07-14 08:25:09,476 - root - INFO - #Total Training Steps=750, Warmup=150, Save Interval=200
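"Using gradient accumulation. Effective global batch size = 48" means each optimizer step consumes per-device batch × accumulation factor × workers samples. Only the product (48) and the worker count (4) are visible in the log; the factorization below is an assumption for illustration.

```python
# "Effective global batch size = 48" decomposed; only the product (48) and
# the worker count (4 GPUs) are visible in the log, so the per-device batch
# size and accumulation factor below are assumptions.
per_device_batch_size = 4   # assumption
num_accumulated = 3         # assumption (gradient-accumulation factor)
num_workers = 4             # one Horovod worker per GPU

effective_batch_size = per_device_batch_size * num_accumulated * num_workers
assert effective_batch_size == 48
```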
2020-07-14 08:28:37,071 - root - INFO - Step: 50/750, Loss span/answer/total=1.5229/0.0954/1.6184, LR=0.00001000, grad_norm=0.0102. Time cost=207.61, Throughput=2.89 samples/s ETA=0.81h
2020-07-14 08:32:13,367 - root - INFO - Step: 100/750, Loss span/answer/total=0.1254/0.0058/0.1312, LR=0.00002000, grad_norm=0.0170. Time cost=216.30, Throughput=2.77 samples/s ETA=0.77h
2020-07-14 08:35:48,985 - root - INFO - Step: 150/750, Loss span/answer/total=0.0079/0.0005/0.0083, LR=0.00003000, grad_norm=0.0235. Time cost=215.62, Throughput=2.78 samples/s ETA=0.71h
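Each step line reports "Loss span/answer/total", and the total is the plain sum of the two terms (e.g. 1.5229 + 0.0954 ≈ 1.6184 at step 50 above). A rough sketch of the two-part SQuAD 2.0 objective, a start/end span loss plus a binary answerability loss; the exact head and weighting are gluon-nlp's, so treat this as an approximation.

```python
# Rough sketch of the two-part SQuAD 2.0 objective behind
# "Loss span/answer/total": a start/end span loss plus a binary
# answerability loss, summed into the reported total. The real head is
# gluon-nlp's; shapes and weighting here are an approximation.
import mxnet as mx

ce = mx.gluon.loss.SoftmaxCELoss()

def qa_loss(start_logits, end_logits, answerable_logits,
            start_pos, end_pos, is_answerable):
    """All logits are (batch, num_classes); labels are class indices."""
    span_loss = (ce(start_logits, start_pos) + ce(end_logits, end_pos)) / 2
    answer_loss = ce(answerable_logits, is_answerable)
    return span_loss, answer_loss, span_loss + answer_loss
```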
2020-07-14 08:42:47,430 - root - INFO - GPU communication supported by horovod
2020-07-14 08:42:47,430 - root - INFO - GPU communication supported by horovod
2020-07-14 08:42:47,430 - root - INFO - GPU communication supported by horovod
2020-07-14 08:42:47,431 - root - INFO - GPU communication supported by horovod
2020-07-14 08:42:55,415 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0
2020-07-14 08:42:55,433 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0
2020-07-14 08:42:55,441 - root - INFO - Prepare training data
2020-07-14 08:42:55,458 - root - INFO - Prepare training data
2020-07-14 08:42:55,572 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0
2020-07-14 08:42:55,618 - root - INFO - Prepare training data
2020-07-14 08:42:55,631 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0
2020-07-14 08:42:55,670 - root - INFO - Prepare training data
2020-07-14 08:43:22,759 - root - INFO - GPU communication supported by horovod
2020-07-14 08:43:22,759 - root - INFO - GPU communication supported by horovod
2020-07-14 08:43:22,759 - root - INFO - GPU communication supported by horovod
2020-07-14 08:43:22,759 - root - INFO - GPU communication supported by horovod
2020-07-14 08:43:30,723 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0
2020-07-14 08:43:30,748 - root - INFO - Prepare training data
2020-07-14 08:43:30,796 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0
2020-07-14 08:43:30,824 - root - INFO - Prepare training data
2020-07-14 08:43:30,832 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0
2020-07-14 08:43:30,870 - root - INFO - Prepare training data
2020-07-14 08:43:30,872 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0
2020-07-14 08:43:30,927 - root - INFO - Prepare training data
2020-07-14 08:43:51,700 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson
2020-07-14 08:43:51,701 - root - INFO - Processing the Training data:
2020-07-14 08:43:51,870 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson
2020-07-14 08:43:51,870 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson
2020-07-14 08:43:51,871 - root - INFO - Processing the Training data:
2020-07-14 08:43:51,871 - root - INFO - Processing the Training data:
2020-07-14 08:43:51,934 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson
2020-07-14 08:43:51,935 - root - INFO - Processing the Training data:
2020-07-14 08:43:57,968 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319
2020-07-14 08:43:57,989 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498
2020-07-14 08:43:57,990 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697
2020-07-14 08:43:57,990 - root - INFO - Creating distributed trainer...
2020-07-14 08:43:58,002 - root - INFO - Using gradient accumulation. Effective global batch size = 48
2020-07-14 08:43:58,044 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200
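The "Before Chunking" / "After Chunking" lines reflect sliding-window chunking of contexts longer than the model's window: a long context becomes several overlapping training samples (130319 examples -> 130580 samples), and windows that lose the answer span are marked impossible (43498 -> 43697). A generic sketch with assumed window and stride sizes; the real feature code lives in gluon-nlp's question_answering scripts.

```python
# Sketch of sliding-window chunking for long contexts (window and stride
# sizes are assumptions; the real logic is in gluon-nlp's feature code).
def chunk_positions(context_len, max_chunk_len=384, doc_stride=128):
    """Yield (start, length) windows covering the tokenized context."""
    start = 0
    while True:
        length = min(max_chunk_len, context_len - start)
        yield start, length
        if start + length >= context_len:
            break
        start += doc_stride

# A context longer than the window becomes several overlapping samples,
# which is why the sample count grows slightly after chunking.
print(list(chunk_positions(600)))   # [(0, 384), (128, 384), (256, 344)]
```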
2020-07-14 08:43:58,094 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319
2020-07-14 08:43:58,116 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498
2020-07-14 08:43:58,116 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697
2020-07-14 08:43:58,116 - root - INFO - Creating distributed trainer...
2020-07-14 08:43:58,129 - root - INFO - Using gradient accumulation. Effective global batch size = 48
2020-07-14 08:43:58,140 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319
2020-07-14 08:43:58,161 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498
2020-07-14 08:43:58,161 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697
2020-07-14 08:43:58,161 - root - INFO - Creating distributed trainer...
2020-07-14 08:43:58,163 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319
2020-07-14 08:43:58,171 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200
2020-07-14 08:43:58,174 - root - INFO - Using gradient accumulation. Effective global batch size = 48
2020-07-14 08:43:58,185 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498
2020-07-14 08:43:58,185 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697
2020-07-14 08:43:58,185 - root - INFO - Creating distributed trainer...
2020-07-14 08:43:58,198 - root - INFO - Using gradient accumulation. Effective global batch size = 48
2020-07-14 08:43:58,218 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200
2020-07-14 08:43:58,242 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200
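The step budget is reproducible from the logged sample count: 130580 samples × 3 epochs / 48 per step ≈ 8161 optimizer steps, with warmup = 20% of that. The epoch count (3) and warmup ratio (0.2) are read off the hyper-parameter string in the checkpoint path below, so they are inferences rather than logged values.

```python
# Reconstructing "#Total Training Steps=8161, Warmup=1632" from the logged
# sample count. epochs=3 and warmup_ratio=0.2 are read off the checkpoint
# path (..._3e-5_0.01_3_1.0_0.2_28), so they are inferences.
num_train_samples = 130580   # "After Chunking, #Train Sample..." above
global_batch_size = 48
epochs = 3                   # inferred
warmup_ratio = 0.2           # inferred

total_steps = num_train_samples * epochs // global_batch_size
warmup_steps = int(total_steps * warmup_ratio)
print(total_steps, warmup_steps)   # -> 8161 1632
```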
2020-07-14 08:47:11,580 - root - INFO - Step: 50/8161, Loss span/answer/total=4.5936/0.3121/4.9057, LR=0.00000092, grad_norm=0.8805. Time cost=193.50, Throughput=3.10 samples/s ETA=8.72h
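Throughput and ETA on each step line can be reconstructed from the other fields. The numbers work out if throughput is per-worker samples per second (48 global samples per step split across 4 workers); that interpretation is an inference from the arithmetic, not something the log states.

```python
# Reproducing "Throughput=3.10 samples/s ETA=8.72h" from the Step 50/8161
# line above. Throughput appears to be per worker (an inference: 48 global
# samples/step split over 4 Horovod workers).
log_interval, step, total_steps = 50, 50, 8161
global_batch_size, num_workers = 48, 4
time_cost = 193.50            # seconds for the last 50 steps

throughput = log_interval * global_batch_size / num_workers / time_cost
eta_hours = (total_steps - step) * (time_cost / log_interval) / 3600
print(f"Throughput={throughput:.2f} samples/s ETA={eta_hours:.2f}h")
# -> Throughput=3.10 samples/s ETA=8.72h
```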
2020-07-14 08:50:21,126 - root - INFO - Step: 100/8161, Loss span/answer/total=3.9519/0.2963/4.2482, LR=0.00000184, grad_norm=0.8129. Time cost=189.55, Throughput=3.17 samples/s ETA=8.58h
2020-07-14 08:53:40,545 - root - INFO - Step: 150/8161, Loss span/answer/total=3.5912/0.2979/3.8891, LR=0.00000276, grad_norm=1.1070. Time cost=199.42, Throughput=3.01 samples/s ETA=8.64h
2020-07-14 08:56:52,530 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_200.params
2020-07-14 08:56:52,530 - root - INFO - Step: 200/8161, Loss span/answer/total=2.5279/0.2770/2.8049, LR=0.00000368, grad_norm=1.7652. Time cost=191.98, Throughput=3.13 samples/s ETA=8.56h
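"Params saved in: ..." fires every 200 steps (Save Interval=200), writing gluon parameter files named after the step count. A trivial sketch of the trigger; the helper and its arguments are hypothetical, with only the interval and the file-name pattern taken from the log.

```python
# Hypothetical sketch of the checkpoint trigger behind "Params saved in:";
# only the 200-step interval and the file-name pattern come from the log.
import os

def maybe_save(net, step, output_dir, save_interval=200):
    if step % save_interval == 0:
        path = os.path.join(
            output_dir, f"fairseq_roberta_large_squad2.0_{step}.params")
        net.save_parameters(path)        # gluon Block API
        print("Params saved in:", path)
```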
2020-07-14 09:00:01,731 - root - INFO - Step: 250/8161, Loss span/answer/total=1.6767/0.2792/1.9560, LR=0.00000460, grad_norm=2.1414. Time cost=189.20, Throughput=3.17 samples/s ETA=8.47h
2020-07-14 09:03:10,564 - root - INFO - Step: 300/8161, Loss span/answer/total=1.3609/0.2229/1.5838, LR=0.00000551, grad_norm=2.2598. Time cost=188.83, Throughput=3.18 samples/s ETA=8.39h
2020-07-14 09:06:21,258 - root - INFO - Step: 350/8161, Loss span/answer/total=1.1037/0.2142/1.3180, LR=0.00000643, grad_norm=1.6036. Time cost=190.69, Throughput=3.15 samples/s ETA=8.33h
2020-07-14 09:09:32,098 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_400.params
2020-07-14 09:09:32,099 - root - INFO - Step: 400/8161, Loss span/answer/total=1.0808/0.2131/1.2938, LR=0.00000735, grad_norm=2.0335. Time cost=190.84, Throughput=3.14 samples/s ETA=8.27h
2020-07-14 09:12:43,010 - root - INFO - Step: 450/8161, Loss span/answer/total=1.0440/0.1907/1.2347, LR=0.00000827, grad_norm=1.5600. Time cost=190.91, Throughput=3.14 samples/s ETA=8.21h
2020-07-14 09:15:47,598 - root - INFO - Step: 500/8161, Loss span/answer/total=0.9771/0.1689/1.1460, LR=0.00000919, grad_norm=3.2921. Time cost=184.59, Throughput=3.25 samples/s ETA=8.13h
2020-07-14 09:18:58,111 - root - INFO - Step: 550/8161, Loss span/answer/total=0.9907/0.1919/1.1826, LR=0.00001011, grad_norm=1.9369. Time cost=190.51, Throughput=3.15 samples/s ETA=8.07h
2020-07-14 09:22:12,371 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_600.params
2020-07-14 09:22:12,371 - root - INFO - Step: 600/8161, Loss span/answer/total=0.9575/0.1871/1.1446, LR=0.00001103, grad_norm=2.0379. Time cost=194.26, Throughput=3.09 samples/s ETA=8.03h
2020-07-14 09:25:16,471 - root - INFO - Step: 650/8161, Loss span/answer/total=0.9153/0.2056/1.1209, LR=0.00001195, grad_norm=2.2053. Time cost=184.10, Throughput=3.26 samples/s ETA=7.96h
2020-07-14 09:28:21,716 - root - INFO - Step: 700/8161, Loss span/answer/total=0.9022/0.1800/1.0822, LR=0.00001287, grad_norm=2.0880. Time cost=185.24, Throughput=3.24 samples/s ETA=7.89h
2020-07-14 09:31:34,459 - root - INFO - Step: 750/8161, Loss span/answer/total=0.8562/0.1595/1.0157, LR=0.00001379, grad_norm=2.0007. Time cost=192.74, Throughput=3.11 samples/s ETA=7.84h
2020-07-14 09:34:47,966 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_800.params
2020-07-14 09:34:47,966 - root - INFO - Step: 800/8161, Loss span/answer/total=0.8080/0.1869/0.9949, LR=0.00001471, grad_norm=2.1405. Time cost=193.51, Throughput=3.10 samples/s ETA=7.80h
2020-07-14 09:38:06,716 - root - INFO - Step: 850/8161, Loss span/answer/total=0.8854/0.1687/1.0541, LR=0.00001563, grad_norm=2.4760. Time cost=198.75, Throughput=3.02 samples/s ETA=7.76h
2020-07-14 09:41:22,141 - root - INFO - Step: 900/8161, Loss span/answer/total=0.8633/0.1698/1.0331, LR=0.00001654, grad_norm=2.0626. Time cost=195.42, Throughput=3.07 samples/s ETA=7.72h
2020-07-14 09:44:35,609 - root - INFO - Step: 950/8161, Loss span/answer/total=0.8498/0.1884/1.0382, LR=0.00001746, grad_norm=6.7273. Time cost=193.47, Throughput=3.10 samples/s ETA=7.67h
2020-07-14 09:47:50,034 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_1000.params
2020-07-14 09:47:50,034 - root - INFO - Step: 1000/8161, Loss span/answer/total=0.8966/0.1889/1.0856, LR=0.00001838, grad_norm=1.4678. Time cost=194.42, Throughput=3.09 samples/s ETA=7.62h
2020-07-14 09:50:57,237 - root - INFO - Step: 1050/8161, Loss span/answer/total=0.8836/0.1864/1.0700, LR=0.00001930, grad_norm=2.0181. Time cost=187.20, Throughput=3.21 samples/s ETA=7.56h
2020-07-14 09:54:07,456 - root - INFO - Step: 1100/8161, Loss span/answer/total=0.8858/0.1835/1.0693, LR=0.00002022, grad_norm=1.9293. Time cost=190.22, Throughput=3.15 samples/s ETA=7.51h
2020-07-14 09:57:22,497 - root - INFO - Step: 1150/8161, Loss span/answer/total=0.8753/0.1828/1.0581, LR=0.00002114, grad_norm=2.4784. Time cost=195.04, Throughput=3.08 samples/s ETA=7.46h
2020-07-14 10:00:35,263 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_1200.params
2020-07-14 10:00:35,263 - root - INFO - Step: 1200/8161, Loss span/answer/total=0.8590/0.1843/1.0433, LR=0.00002206, grad_norm=2.2267. Time cost=192.77, Throughput=3.11 samples/s ETA=7.41h
2020-07-14 10:03:46,197 - root - INFO - Step: 1250/8161, Loss span/answer/total=0.8105/0.1847/0.9952, LR=0.00002298, grad_norm=9.9131. Time cost=190.93, Throughput=3.14 samples/s ETA=7.35h
2020-07-14 10:07:01,651 - root - INFO - Step: 1300/8161, Loss span/answer/total=0.7724/0.1644/0.9368, LR=0.00002390, grad_norm=2.0911. Time cost=195.45, Throughput=3.07 samples/s ETA=7.31h
2020-07-14 10:10:10,936 - root - INFO - Step: 1350/8161, Loss span/answer/total=0.9344/0.1920/1.1264, LR=0.00002482, grad_norm=2.0462. Time cost=189.28, Throughput=3.17 samples/s ETA=7.25h
2020-07-14 10:13:30,035 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_1400.params
2020-07-14 10:13:30,035 - root - INFO - Step: 1400/8161, Loss span/answer/total=0.7847/0.1689/0.9536, LR=0.00002574, grad_norm=1.6321. Time cost=199.10, Throughput=3.01 samples/s ETA=7.21h
2020-07-14 10:16:37,514 - root - INFO - Step: 1450/8161, Loss span/answer/total=0.7959/0.1377/0.9336, LR=0.00002665, grad_norm=11.1343. Time cost=187.48, Throughput=3.20 samples/s ETA=7.15h
2020-07-14 10:19:49,740 - root - INFO - Step: 1500/8161, Loss span/answer/total=0.8426/0.1708/1.0134, LR=0.00002757, grad_norm=2.4158. Time cost=192.23, Throughput=3.12 samples/s ETA=7.09h
2020-07-14 10:23:04,349 - root - INFO - Step: 1550/8161, Loss span/answer/total=0.7925/0.1872/0.9797, LR=0.00002849, grad_norm=2.5468. Time cost=194.61, Throughput=3.08 samples/s ETA=7.04h
2020-07-14 10:26:21,144 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_1600.params
2020-07-14 10:26:21,144 - root - INFO - Step: 1600/8161, Loss span/answer/total=0.7817/0.1799/0.9616, LR=0.00002941, grad_norm=1.3321. Time cost=196.79, Throughput=3.05 samples/s ETA=7.00h
2020-07-14 10:29:26,828 - root - INFO - Step: 1650/8161, Loss span/answer/total=0.7816/0.1583/0.9399, LR=0.00002992, grad_norm=2.3034. Time cost=185.68, Throughput=3.23 samples/s ETA=6.94h
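The LR column traces a triangular schedule: linear warmup to the 3e-5 peak (peak value from the checkpoint path) over 1632 steps, then linear decay to zero at step 8161; it peaks between the step 1600 and step 1650 lines above. A reconstruction that matches the logged values to all printed digits:

```python
# Reconstructed triangular LR schedule matching the logged values
# (peak lr 3e-5 from the checkpoint path; warmup/total steps from the log).
def learning_rate(step, peak_lr=3e-5, warmup_steps=1632, total_steps=8161):
    if step < warmup_steps:
        return peak_lr * step / warmup_steps              # linear warmup
    return peak_lr * (total_steps - step) / (total_steps - warmup_steps)

for step in (50, 1600, 1650, 1700):
    print(step, f"{learning_rate(step):.8f}")
# -> 50 0.00000092, 1600 0.00002941, 1650 0.00002992, 1700 0.00002969
```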
2020-07-14 10:32:40,330 - root - INFO - Step: 1700/8161, Loss span/answer/total=0.7563/0.1498/0.9061, LR=0.00002969, grad_norm=1.5616. Time cost=193.50, Throughput=3.10 samples/s ETA=6.89h
2020-07-14 10:35:56,491 - root - INFO - Step: 1750/8161, Loss span/answer/total=0.8360/0.1616/0.9976, LR=0.00002946, grad_norm=1.8526. Time cost=196.16, Throughput=3.06 samples/s ETA=6.84h
2020-07-14 10:39:10,844 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_1800.params
2020-07-14 10:39:10,844 - root - INFO - Step: 1800/8161, Loss span/answer/total=0.8494/0.1844/1.0338, LR=0.00002923, grad_norm=2.0884. Time cost=194.35, Throughput=3.09 samples/s ETA=6.79h
2020-07-14 10:42:28,789 - root - INFO - Step: 1850/8161, Loss span/answer/total=0.8517/0.1800/1.0317, LR=0.00002900, grad_norm=2.0119. Time cost=197.95, Throughput=3.03 samples/s ETA=6.74h
2020-07-14 10:45:40,738 - root - INFO - Step: 1900/8161, Loss span/answer/total=0.7495/0.1561/0.9056, LR=0.00002877, grad_norm=1.3511. Time cost=191.95, Throughput=3.13 samples/s ETA=6.68h
2020-07-14 10:48:50,613 - root - INFO - Step: 1950/8161, Loss span/answer/total=0.7947/0.1719/0.9665, LR=0.00002854, grad_norm=2.0416. Time cost=189.87, Throughput=3.16 samples/s ETA=6.63h
2020-07-14 10:52:09,451 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_2000.params
2020-07-14 10:52:09,451 - root - INFO - Step: 2000/8161, Loss span/answer/total=0.7748/0.1672/0.9420, LR=0.00002831, grad_norm=1.7656. Time cost=198.84, Throughput=3.02 samples/s ETA=6.58h
2020-07-14 10:55:18,951 - root - INFO - Step: 2050/8161, Loss span/answer/total=0.7177/0.1886/0.9063, LR=0.00002808, grad_norm=1.1931. Time cost=189.50, Throughput=3.17 samples/s ETA=6.53h
2020-07-14 10:58:32,587 - root - INFO - Step: 2100/8161, Loss span/answer/total=0.6977/0.1282/0.8259, LR=0.00002785, grad_norm=1.8929. Time cost=193.64, Throughput=3.10 samples/s ETA=6.47h
2020-07-14 11:01:47,010 - root - INFO - Step: 2150/8161, Loss span/answer/total=0.6713/0.1432/0.8145, LR=0.00002762, grad_norm=1.4084. Time cost=194.42, Throughput=3.09 samples/s ETA=6.42h
2020-07-14 11:05:09,671 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_2200.params
2020-07-14 11:05:09,671 - root - INFO - Step: 2200/8161, Loss span/answer/total=0.7879/0.1481/0.9360, LR=0.00002739, grad_norm=1.8508. Time cost=202.66, Throughput=2.96 samples/s ETA=6.38h
2020-07-14 11:08:22,018 - root - INFO - Step: 2250/8161, Loss span/answer/total=0.8174/0.1796/0.9970, LR=0.00002716, grad_norm=1.8646. Time cost=192.35, Throughput=3.12 samples/s ETA=6.32h
2020-07-14 11:11:32,462 - root - INFO - Step: 2300/8161, Loss span/answer/total=0.7551/0.1536/0.9087, LR=0.00002693, grad_norm=0.7570. Time cost=190.44, Throughput=3.15 samples/s ETA=6.27h
2020-07-14 11:14:44,478 - root - INFO - Step: 2350/8161, Loss span/answer/total=0.7227/0.1467/0.8693, LR=0.00002670, grad_norm=1.6918. Time cost=192.02, Throughput=3.12 samples/s ETA=6.21h
2020-07-14 11:17:54,186 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_2400.params
2020-07-14 11:17:54,186 - root - INFO - Step: 2400/8161, Loss span/answer/total=0.7196/0.1537/0.8733, LR=0.00002647, grad_norm=1.4427. Time cost=189.71, Throughput=3.16 samples/s ETA=6.16h
2020-07-14 11:21:09,349 - root - INFO - Step: 2450/8161, Loss span/answer/total=0.7637/0.1652/0.9289, LR=0.00002624, grad_norm=2.0733. Time cost=195.16, Throughput=3.07 samples/s ETA=6.11h
2020-07-14 11:24:19,783 - root - INFO - Step: 2500/8161, Loss span/answer/total=0.7494/0.1332/0.8825, LR=0.00002601, grad_norm=1.4492. Time cost=190.43, Throughput=3.15 samples/s ETA=6.05h
2020-07-14 11:27:40,839 - root - INFO - Step: 2550/8161, Loss span/answer/total=0.6923/0.1732/0.8655, LR=0.00002578, grad_norm=1.6334. Time cost=201.06, Throughput=2.98 samples/s ETA=6.00h
2020-07-14 11:30:47,481 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_2600.params
2020-07-14 11:30:47,481 - root - INFO - Step: 2600/8161, Loss span/answer/total=0.6443/0.1479/0.7922, LR=0.00002555, grad_norm=1.6361. Time cost=186.64, Throughput=3.21 samples/s ETA=5.95h
2020-07-14 11:34:03,299 - root - INFO - Step: 2650/8161, Loss span/answer/total=0.7569/0.1562/0.9131, LR=0.00002532, grad_norm=1.7191. Time cost=195.82, Throughput=3.06 samples/s ETA=5.90h
2020-07-14 11:37:19,334 - root - INFO - Step: 2700/8161, Loss span/answer/total=0.7257/0.1302/0.8559, LR=0.00002509, grad_norm=1.3729. Time cost=196.03, Throughput=3.06 samples/s ETA=5.84h
2020-07-14 11:40:34,398 - root - INFO - Step: 2750/8161, Loss span/answer/total=0.5917/0.1189/0.7106, LR=0.00002486, grad_norm=1.6643. Time cost=195.06, Throughput=3.07 samples/s ETA=5.79h
2020-07-14 11:43:50,551 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_2800.params
2020-07-14 11:43:50,552 - root - INFO - Step: 2800/8161, Loss span/answer/total=0.6178/0.1140/0.7318, LR=0.00002463, grad_norm=1.8395. Time cost=196.15, Throughput=3.06 samples/s ETA=5.74h
2020-07-14 11:46:59,723 - root - INFO - Step: 2850/8161, Loss span/answer/total=0.6697/0.1092/0.7789, LR=0.00002440, grad_norm=1.7596. Time cost=189.17, Throughput=3.17 samples/s ETA=5.68h
2020-07-14 11:50:12,712 - root - INFO - Step: 2900/8161, Loss span/answer/total=0.6541/0.1132/0.7672, LR=0.00002417, grad_norm=1.9585. Time cost=192.99, Throughput=3.11 samples/s ETA=5.63h
2020-07-14 11:53:26,457 - root - INFO - Step: 2950/8161, Loss span/answer/total=0.6785/0.1367/0.8152, LR=0.00002394, grad_norm=0.9884. Time cost=193.74, Throughput=3.10 samples/s ETA=5.58h
2020-07-14 11:56:34,236 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_3000.params
2020-07-14 11:56:34,236 - root - INFO - Step: 3000/8161, Loss span/answer/total=0.5995/0.1241/0.7236, LR=0.00002371, grad_norm=1.6040. Time cost=187.78, Throughput=3.20 samples/s ETA=5.52h
2020-07-14 11:59:45,391 - root - INFO - Step: 3050/8161, Loss span/answer/total=0.5789/0.1026/0.6815, LR=0.00002348, grad_norm=1.7711. Time cost=191.15, Throughput=3.14 samples/s ETA=5.47h
2020-07-14 12:02:50,252 - root - INFO - Step: 3100/8161, Loss span/answer/total=0.6229/0.1321/0.7550, LR=0.00002325, grad_norm=1.6470. Time cost=184.86, Throughput=3.25 samples/s ETA=5.41h
2020-07-14 12:06:08,501 - root - INFO - Step: 3150/8161, Loss span/answer/total=0.5569/0.1089/0.6657, LR=0.00002302, grad_norm=1.3360. Time cost=198.25, Throughput=3.03 samples/s ETA=5.36h
2020-07-14 12:09:23,235 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_3200.params
2020-07-14 12:09:23,235 - root - INFO - Step: 3200/8161, Loss span/answer/total=0.6164/0.1195/0.7359, LR=0.00002280, grad_norm=1.3351. Time cost=194.73, Throughput=3.08 samples/s ETA=5.31h
2020-07-14 12:12:32,239 - root - INFO - Step: 3250/8161, Loss span/answer/total=0.6190/0.1171/0.7360, LR=0.00002257, grad_norm=1.3754. Time cost=189.00, Throughput=3.17 samples/s ETA=5.25h
2020-07-14 12:15:42,266 - root - INFO - Step: 3300/8161, Loss span/answer/total=0.5693/0.1307/0.6999, LR=0.00002234, grad_norm=3.7025. Time cost=190.03, Throughput=3.16 samples/s ETA=5.20h
2020-07-14 12:18:53,482 - root - INFO - Step: 3350/8161, Loss span/answer/total=0.6057/0.1060/0.7117, LR=0.00002211, grad_norm=0.9262. Time cost=191.22, Throughput=3.14 samples/s ETA=5.14h
2020-07-14 12:22:13,217 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_3400.params
2020-07-14 12:22:13,217 - root - INFO - Step: 3400/8161, Loss span/answer/total=0.5592/0.1263/0.6855, LR=0.00002188, grad_norm=1.5454. Time cost=199.73, Throughput=3.00 samples/s ETA=5.09h
2020-07-14 12:25:29,405 - root - INFO - Step: 3450/8161, Loss span/answer/total=0.7056/0.1154/0.8210, LR=0.00002165, grad_norm=1.3269. Time cost=196.19, Throughput=3.06 samples/s ETA=5.04h
2020-07-14 12:28:49,735 - root - INFO - Step: 3500/8161, Loss span/answer/total=0.6430/0.1189/0.7620, LR=0.00002142, grad_norm=2.3717. Time cost=200.33, Throughput=3.00 samples/s ETA=4.99h
2020-07-14 12:32:01,502 - root - INFO - Step: 3550/8161, Loss span/answer/total=0.6160/0.1029/0.7189, LR=0.00002119, grad_norm=2.2066. Time cost=191.77, Throughput=3.13 samples/s ETA=4.94h
2020-07-14 12:35:21,378 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_3600.params
2020-07-14 12:35:21,378 - root - INFO - Step: 3600/8161, Loss span/answer/total=0.6436/0.1139/0.7575, LR=0.00002096, grad_norm=1.7208. Time cost=199.88, Throughput=3.00 samples/s ETA=4.89h
2020-07-14 12:38:27,183 - root - INFO - Step: 3650/8161, Loss span/answer/total=0.5716/0.1296/0.7011, LR=0.00002073, grad_norm=2.7656. Time cost=185.80, Throughput=3.23 samples/s ETA=4.83h
2020-07-14 12:41:37,339 - root - INFO - Step: 3700/8161, Loss span/answer/total=0.5560/0.1315/0.6875, LR=0.00002050, grad_norm=1.5470. Time cost=190.16, Throughput=3.16 samples/s ETA=4.78h
2020-07-14 12:44:53,016 - root - INFO - Step: 3750/8161, Loss span/answer/total=0.6238/0.1626/0.7864, LR=0.00002027, grad_norm=1.3384. Time cost=195.68, Throughput=3.07 samples/s ETA=4.72h
2020-07-14 12:48:10,997 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_3800.params
2020-07-14 12:48:10,998 - root - INFO - Step: 3800/8161, Loss span/answer/total=0.5691/0.0919/0.6610, LR=0.00002004, grad_norm=1.3039. Time cost=197.98, Throughput=3.03 samples/s ETA=4.67h
2020-07-14 12:51:22,231 - root - INFO - Step: 3850/8161, Loss span/answer/total=0.5391/0.1053/0.6444, LR=0.00001981, grad_norm=1.7181. Time cost=191.23, Throughput=3.14 samples/s ETA=4.62h
2020-07-14 12:54:34,710 - root - INFO - Step: 3900/8161, Loss span/answer/total=0.5623/0.0929/0.6552, LR=0.00001958, grad_norm=0.9861. Time cost=192.48, Throughput=3.12 samples/s ETA=4.56h
2020-07-14 12:57:51,075 - root - INFO - Step: 3950/8161, Loss span/answer/total=0.5110/0.1019/0.6129, LR=0.00001935, grad_norm=1.2498. Time cost=196.37, Throughput=3.06 samples/s ETA=4.51h
2020-07-14 13:01:05,949 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_4000.params
2020-07-14 13:01:05,950 - root - INFO - Step: 4000/8161, Loss span/answer/total=0.5466/0.1039/0.6504, LR=0.00001912, grad_norm=1.2746. Time cost=194.87, Throughput=3.08 samples/s ETA=4.46h
2020-07-14 13:04:22,529 - root - INFO - Step: 4050/8161, Loss span/answer/total=0.5686/0.1003/0.6689, LR=0.00001889, grad_norm=1.2573. Time cost=196.58, Throughput=3.05 samples/s ETA=4.41h
2020-07-14 13:07:43,253 - root - INFO - Step: 4100/8161, Loss span/answer/total=0.5485/0.0978/0.6464, LR=0.00001866, grad_norm=1.8622. Time cost=200.72, Throughput=2.99 samples/s ETA=4.35h
2020-07-14 13:10:51,018 - root - INFO - Step: 4150/8161, Loss span/answer/total=0.5211/0.0910/0.6121, LR=0.00001843, grad_norm=1.3872. Time cost=187.76, Throughput=3.20 samples/s ETA=4.30h
2020-07-14 13:14:03,317 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_4200.params
2020-07-14 13:14:03,317 - root - INFO - Step: 4200/8161, Loss span/answer/total=0.5542/0.0973/0.6515, LR=0.00001820, grad_norm=1.5859. Time cost=192.30, Throughput=3.12 samples/s ETA=4.25h
2020-07-14 13:17:11,683 - root - INFO - Step: 4250/8161, Loss span/answer/total=0.5439/0.1210/0.6649, LR=0.00001797, grad_norm=1.0025. Time cost=188.37, Throughput=3.19 samples/s ETA=4.19h
2020-07-14 13:20:17,267 - root - INFO - Step: 4300/8161, Loss span/answer/total=0.5733/0.1004/0.6737, LR=0.00001774, grad_norm=1.1620. Time cost=185.58, Throughput=3.23 samples/s ETA=4.14h
2020-07-14 13:23:30,074 - root - INFO - Step: 4350/8161, Loss span/answer/total=0.5385/0.0984/0.6369, LR=0.00001751, grad_norm=1.2707. Time cost=192.81, Throughput=3.11 samples/s ETA=4.08h
2020-07-14 13:26:44,018 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_4400.params
2020-07-14 13:26:44,018 - root - INFO - Step: 4400/8161, Loss span/answer/total=0.5379/0.1110/0.6489, LR=0.00001728, grad_norm=0.8478. Time cost=193.94, Throughput=3.09 samples/s ETA=4.03h
2020-07-14 13:29:56,690 - root - INFO - Step: 4450/8161, Loss span/answer/total=0.5146/0.1009/0.6156, LR=0.00001705, grad_norm=1.1027. Time cost=192.67, Throughput=3.11 samples/s ETA=3.97h
2020-07-14 13:33:12,646 - root - INFO - Step: 4500/8161, Loss span/answer/total=0.5094/0.0938/0.6032, LR=0.00001682, grad_norm=1.5611. Time cost=195.96, Throughput=3.06 samples/s ETA=3.92h
2020-07-14 13:36:22,556 - root - INFO - Step: 4550/8161, Loss span/answer/total=0.5033/0.0982/0.6015, LR=0.00001659, grad_norm=1.6522. Time cost=189.91, Throughput=3.16 samples/s ETA=3.87h
2020-07-14 13:39:40,829 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_4600.params
2020-07-14 13:39:40,829 - root - INFO - Step: 4600/8161, Loss span/answer/total=0.5157/0.0860/0.6016, LR=0.00001636, grad_norm=1.8351. Time cost=198.27, Throughput=3.03 samples/s ETA=3.82h
2020-07-14 13:42:53,607 - root - INFO - Step: 4650/8161, Loss span/answer/total=0.5027/0.0865/0.5892, LR=0.00001613, grad_norm=1.8733. Time cost=192.78, Throughput=3.11 samples/s ETA=3.76h
2020-07-14 13:46:08,456 - root - INFO - Step: 4700/8161, Loss span/answer/total=0.5850/0.0978/0.6828, LR=0.00001590, grad_norm=1.7329. Time cost=194.85, Throughput=3.08 samples/s ETA=3.71h
2020-07-14 13:49:19,529 - root - INFO - Step: 4750/8161, Loss span/answer/total=0.5094/0.1020/0.6114, LR=0.00001567, grad_norm=1.2870. Time cost=191.07, Throughput=3.14 samples/s ETA=3.65h
2020-07-14 13:52:37,218 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_4800.params
2020-07-14 13:52:37,218 - root - INFO - Step: 4800/8161, Loss span/answer/total=0.5063/0.0959/0.6022, LR=0.00001544, grad_norm=1.8758. Time cost=197.69, Throughput=3.04 samples/s ETA=3.60h
2020-07-14 13:55:46,301 - root - INFO - Step: 4850/8161, Loss span/answer/total=0.5025/0.1018/0.6043, LR=0.00001521, grad_norm=2.3371. Time cost=189.08, Throughput=3.17 samples/s ETA=3.55h
2020-07-14 13:58:56,580 - root - INFO - Step: 4900/8161, Loss span/answer/total=0.5882/0.1120/0.7002, LR=0.00001498, grad_norm=1.2556. Time cost=190.28, Throughput=3.15 samples/s ETA=3.49h
2020-07-14 14:02:03,431 - root - INFO - Step: 4950/8161, Loss span/answer/total=0.5735/0.0958/0.6693, LR=0.00001475, grad_norm=1.2067. Time cost=186.85, Throughput=3.21 samples/s ETA=3.44h
2020-07-14 14:05:20,063 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_5000.params
2020-07-14 14:05:20,063 - root - INFO - Step: 5000/8161, Loss span/answer/total=0.5814/0.0990/0.6804, LR=0.00001452, grad_norm=1.4738. Time cost=196.63, Throughput=3.05 samples/s ETA=3.39h
2020-07-14 14:08:31,317 - root - INFO - Step: 5050/8161, Loss span/answer/total=0.5513/0.0932/0.6446, LR=0.00001429, grad_norm=1.6197. Time cost=191.25, Throughput=3.14 samples/s ETA=3.33h
2020-07-14 14:11:39,658 - root - INFO - Step: 5100/8161, Loss span/answer/total=0.4253/0.0981/0.5234, LR=0.00001406, grad_norm=1.7552. Time cost=188.34, Throughput=3.19 samples/s ETA=3.28h
2020-07-14 14:14:55,279 - root - INFO - Step: 5150/8161, Loss span/answer/total=0.4877/0.0816/0.5693, LR=0.00001384, grad_norm=1.3550. Time cost=195.62, Throughput=3.07 samples/s ETA=3.22h
2020-07-14 14:18:13,879 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_5200.params
2020-07-14 14:18:13,879 - root - INFO - Step: 5200/8161, Loss span/answer/total=0.5461/0.1109/0.6571, LR=0.00001361, grad_norm=2.4072. Time cost=198.60, Throughput=3.02 samples/s ETA=3.17h
2020-07-14 14:21:24,121 - root - INFO - Step: 5250/8161, Loss span/answer/total=0.5578/0.0978/0.6556, LR=0.00001338, grad_norm=1.7026. Time cost=190.24, Throughput=3.15 samples/s ETA=3.12h
2020-07-14 14:24:41,272 - root - INFO - Step: 5300/8161, Loss span/answer/total=0.5165/0.0762/0.5927, LR=0.00001315, grad_norm=2.2658. Time cost=197.15, Throughput=3.04 samples/s ETA=3.07h
2020-07-14 14:27:45,172 - root - INFO - Step: 5350/8161, Loss span/answer/total=0.5096/0.1067/0.6163, LR=0.00001292, grad_norm=1.2365. Time cost=183.90, Throughput=3.26 samples/s ETA=3.01h
2020-07-14 14:31:00,387 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_5400.params
2020-07-14 14:31:00,387 - root - INFO - Step: 5400/8161, Loss span/answer/total=0.5312/0.0831/0.6143, LR=0.00001269, grad_norm=1.4942. Time cost=195.21, Throughput=3.07 samples/s ETA=2.96h
2020-07-14 14:34:17,245 - root - INFO - Step: 5450/8161, Loss span/answer/total=0.4819/0.1096/0.5914, LR=0.00001246, grad_norm=1.1723. Time cost=196.86, Throughput=3.04 samples/s ETA=2.90h
2020-07-14 14:37:35,542 - root - INFO - Step: 5500/8161, Loss span/answer/total=0.3815/0.0786/0.4601, LR=0.00001223, grad_norm=1.7482. Time cost=198.30, Throughput=3.03 samples/s ETA=2.85h
2020-07-14 14:40:56,047 - root - INFO - Step: 5550/8161, Loss span/answer/total=0.4540/0.0679/0.5218, LR=0.00001200, grad_norm=1.0213. Time cost=200.50, Throughput=2.99 samples/s ETA=2.80h
2020-07-14 14:44:05,191 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_5600.params
2020-07-14 14:44:05,192 - root - INFO - Step: 5600/8161, Loss span/answer/total=0.3761/0.0647/0.4408, LR=0.00001177, grad_norm=1.6066. Time cost=189.14, Throughput=3.17 samples/s ETA=2.74h
2020-07-14 14:47:18,057 - root - INFO - Step: 5650/8161, Loss span/answer/total=0.4483/0.0466/0.4949, LR=0.00001154, grad_norm=1.1719. Time cost=192.87, Throughput=3.11 samples/s ETA=2.69h
2020-07-14 14:50:32,480 - root - INFO - Step: 5700/8161, Loss span/answer/total=0.4071/0.0735/0.4806, LR=0.00001131, grad_norm=0.9141. Time cost=194.42, Throughput=3.09 samples/s ETA=2.64h
2020-07-14 14:53:43,863 - root - INFO - Step: 5750/8161, Loss span/answer/total=0.4998/0.0811/0.5809, LR=0.00001108, grad_norm=1.7222. Time cost=191.38, Throughput=3.14 samples/s ETA=2.58h
2020-07-14 14:56:50,952 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_5800.params
2020-07-14 14:56:50,952 - root - INFO - Step: 5800/8161, Loss span/answer/total=0.4418/0.0593/0.5011, LR=0.00001085, grad_norm=1.5506. Time cost=187.09, Throughput=3.21 samples/s ETA=2.53h
2020-07-14 15:00:05,308 - root - INFO - Step: 5850/8161, Loss span/answer/total=0.3896/0.0692/0.4588, LR=0.00001062, grad_norm=1.0585. Time cost=194.36, Throughput=3.09 samples/s ETA=2.48h
2020-07-14 15:03:15,127 - root - INFO - Step: 5900/8161, Loss span/answer/total=0.3892/0.0616/0.4508, LR=0.00001039, grad_norm=1.7596. Time cost=189.82, Throughput=3.16 samples/s ETA=2.42h
2020-07-14 15:06:35,571 - root - INFO - Step: 5950/8161, Loss span/answer/total=0.4063/0.0653/0.4716, LR=0.00001016, grad_norm=1.4484. Time cost=200.44, Throughput=2.99 samples/s ETA=2.37h
2020-07-14 15:09:44,694 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_6000.params
2020-07-14 15:09:44,694 - root - INFO - Step: 6000/8161, Loss span/answer/total=0.3973/0.0681/0.4654, LR=0.00000993, grad_norm=1.6632. Time cost=189.12, Throughput=3.17 samples/s ETA=2.32h
2020-07-14 15:12:57,243 - root - INFO - Step: 6050/8161, Loss span/answer/total=0.4605/0.0628/0.5233, LR=0.00000970, grad_norm=1.5790. Time cost=192.55, Throughput=3.12 samples/s ETA=2.26h
2020-07-14 15:16:12,702 - root - INFO - Step: 6100/8161, Loss span/answer/total=0.3830/0.0520/0.4351, LR=0.00000947, grad_norm=0.9119. Time cost=195.46, Throughput=3.07 samples/s ETA=2.21h
2020-07-14 15:19:21,712 - root - INFO - Step: 6150/8161, Loss span/answer/total=0.3837/0.0588/0.4425, LR=0.00000924, grad_norm=0.8564. Time cost=189.01, Throughput=3.17 samples/s ETA=2.15h
2020-07-14 15:22:30,802 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_6200.params
2020-07-14 15:22:30,802 - root - INFO - Step: 6200/8161, Loss span/answer/total=0.4001/0.0646/0.4647, LR=0.00000901, grad_norm=2.1057. Time cost=189.09, Throughput=3.17 samples/s ETA=2.10h
2020-07-14 15:25:42,962 - root - INFO - Step: 6250/8161, Loss span/answer/total=0.3856/0.0626/0.4482, LR=0.00000878, grad_norm=1.3489. Time cost=192.16, Throughput=3.12 samples/s ETA=2.05h
2020-07-14 15:28:53,283 - root - INFO - Step: 6300/8161, Loss span/answer/total=0.4150/0.0671/0.4821, LR=0.00000855, grad_norm=1.3946. Time cost=190.32, Throughput=3.15 samples/s ETA=1.99h
2020-07-14 15:32:00,157 - root - INFO - GPU communication supported by horovod
2020-07-14 15:32:00,157 - root - INFO - GPU communication supported by horovod
2020-07-14 15:32:00,158 - root - INFO - GPU communication supported by horovod
2020-07-14 15:32:00,158 - root - INFO - GPU communication supported by horovod
2020-07-14 15:32:08,155 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0
2020-07-14 15:32:08,182 - root - INFO - Prepare training data
2020-07-14 15:32:08,204 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0
2020-07-14 15:32:08,206 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0
2020-07-14 15:32:08,218 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0
2020-07-14 15:32:08,229 - root - INFO - Prepare training data
2020-07-14 15:32:08,261 - root - INFO - Prepare training data
2020-07-14 15:32:08,276 - root - INFO - Prepare training data
2020-07-14 15:32:29,201 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson
2020-07-14 15:32:29,202 - root - INFO - Processing the Training data:
2020-07-14 15:32:29,251 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson
2020-07-14 15:32:29,252 - root - INFO - Processing the Training data:
2020-07-14 15:32:29,291 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson
2020-07-14 15:32:29,292 - root - INFO - Processing the Training data:
2020-07-14 15:32:29,489 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson
2020-07-14 15:32:29,489 - root - INFO - Processing the Training data:
2020-07-14 15:32:35,456 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319
2020-07-14 15:32:35,478 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498
2020-07-14 15:32:35,478 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697
2020-07-14 15:32:35,478 - root - INFO - Creating distributed trainer...
2020-07-14 15:32:35,491 - root - INFO - Using gradient accumulation. Effective global batch size = 48
2020-07-14 15:32:35,505 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319
2020-07-14 15:32:35,515 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319
2020-07-14 15:32:35,527 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498
2020-07-14 15:32:35,527 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697
2020-07-14 15:32:35,527 - root - INFO - Creating distributed trainer...
2020-07-14 15:32:35,533 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200
2020-07-14 15:32:35,537 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498
2020-07-14 15:32:35,537 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697
2020-07-14 15:32:35,537 - root - INFO - Creating distributed trainer...
2020-07-14 15:32:35,540 - root - INFO - Using gradient accumulation. Effective global batch size = 48
2020-07-14 15:32:35,550 - root - INFO - Using gradient accumulation. Effective global batch size = 48
2020-07-14 15:32:35,582 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200
2020-07-14 15:32:35,591 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200
2020-07-14 15:32:35,977 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319
2020-07-14 15:32:36,025 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498
2020-07-14 15:32:36,025 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697
2020-07-14 15:32:36,025 - root - INFO - Creating distributed trainer...
2020-07-14 15:32:36,053 - root - INFO - Using gradient accumulation. Effective global batch size = 48
2020-07-14 15:32:36,156 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200
2020-07-14 15:34:29,605 - root - INFO - GPU communication supported by horovod
2020-07-14 15:34:29,605 - root - INFO - GPU communication supported by horovod
2020-07-14 15:34:29,605 - root - INFO - GPU communication supported by horovod
2020-07-14 15:34:29,605 - root - INFO - GPU communication supported by horovod
2020-07-14 15:34:37,602 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0
2020-07-14 15:34:37,621 - root - INFO - Prepare training data
2020-07-14 15:34:37,633 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0
2020-07-14 15:34:37,672 - root - INFO - Prepare training data
2020-07-14 15:34:37,729 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0
2020-07-14 15:34:37,768 - root - INFO - Prepare training data
2020-07-14 15:34:37,828 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0
2020-07-14 15:34:37,868 - root - INFO - Prepare training data
2020-07-14 15:34:58,672 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson
2020-07-14 15:34:58,673 - root - INFO - Processing the Training data:
2020-07-14 15:34:58,703 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson
2020-07-14 15:34:58,703 - root - INFO - Processing the Training data:
2020-07-14 15:34:58,843 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson
2020-07-14 15:34:58,844 - root - INFO - Processing the Training data:
2020-07-14 15:34:58,845 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson
2020-07-14 15:34:58,846 - root - INFO - Processing the Training data:
2020-07-14 15:35:04,931 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319
2020-07-14 15:35:04,949 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319
2020-07-14 15:35:04,953 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498
2020-07-14 15:35:04,953 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697
2020-07-14 15:35:04,953 - root - INFO - Creating distributed trainer...
2020-07-14 15:35:04,966 - root - INFO - Using gradient accumulation. Effective global batch size = 48
2020-07-14 15:35:04,970 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498
2020-07-14 15:35:04,970 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697
2020-07-14 15:35:04,970 - root - INFO - Creating distributed trainer...
2020-07-14 15:35:04,983 - root - INFO - Using gradient accumulation. Effective global batch size = 48
2020-07-14 15:35:05,008 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200
2020-07-14 15:35:05,025 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200
2020-07-14 15:35:05,081 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319
2020-07-14 15:35:05,082 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319
2020-07-14 15:35:05,103 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498
2020-07-14 15:35:05,103 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697
2020-07-14 15:35:05,103 - root - INFO - Creating distributed trainer...
2020-07-14 15:35:05,104 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498
2020-07-14 15:35:05,104 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697
2020-07-14 15:35:05,104 - root - INFO - Creating distributed trainer...
2020-07-14 15:35:05,116 - root - INFO - Using gradient accumulation. Effective global batch size = 48
2020-07-14 15:35:05,117 - root - INFO - Using gradient accumulation. Effective global batch size = 48
2020-07-14 15:35:05,163 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200
2020-07-14 15:35:05,176 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200
2020-07-14 15:38:19,569 - root - INFO - Step: 50/8161, Loss span/answer/total=4.5740/0.3139/4.8879, LR=0.00000092, grad_norm=0.9138. Time cost=194.30, Throughput=3.09 samples/s ETA=8.76h
2020-07-14 15:41:27,980 - root - INFO - Step: 100/8161, Loss span/answer/total=3.9481/0.2985/4.2466, LR=0.00000184, grad_norm=0.9825. Time cost=188.41, Throughput=3.18 samples/s ETA=8.57h
2020-07-14 15:44:46,282 - root - INFO - Step: 150/8161, Loss span/answer/total=3.5266/0.2936/3.8203, LR=0.00000276, grad_norm=1.0096. Time cost=198.30, Throughput=3.03 samples/s ETA=8.62h
2020-07-14 15:48:00,005 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_200.params
2020-07-14 15:48:00,005 - root - INFO - Step: 200/8161, Loss span/answer/total=2.4381/0.2765/2.7145, LR=0.00000368, grad_norm=2.1141. Time cost=193.72, Throughput=3.10 samples/s ETA=8.57h
2020-07-14 15:51:09,429 - root - INFO - Step: 250/8161, Loss span/answer/total=1.6377/0.2685/1.9062, LR=0.00000460, grad_norm=2.5878. Time cost=189.42, Throughput=3.17 samples/s ETA=8.47h
2020-07-14 15:54:18,691 - root - INFO - Step: 300/8161, Loss span/answer/total=1.4293/0.2237/1.6530, LR=0.00000551, grad_norm=2.2391. Time cost=189.26, Throughput=3.17 samples/s ETA=8.40h
2020-07-14 15:57:28,142 - root - INFO - Step: 350/8161, Loss span/answer/total=1.0939/0.1943/1.2882, LR=0.00000643, grad_norm=2.0628. Time cost=189.45, Throughput=3.17 samples/s ETA=8.32h
2020-07-14 16:00:38,758 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_400.params
2020-07-14 16:00:38,758 - root - INFO - Step: 400/8161, Loss span/answer/total=1.0850/0.2150/1.3000, LR=0.00000735, grad_norm=2.0075. Time cost=190.62, Throughput=3.15 samples/s ETA=8.26h
2020-07-14 16:03:48,092 - root - INFO - Step: 450/8161, Loss span/answer/total=1.0578/0.2005/1.2584, LR=0.00000827, grad_norm=1.5312. Time cost=189.33, Throughput=3.17 samples/s ETA=8.20h
2020-07-14 16:06:51,413 - root - INFO - Step: 500/8161, Loss span/answer/total=0.9698/0.1584/1.1282, LR=0.00000919, grad_norm=14.4826. Time cost=183.32, Throughput=3.27 samples/s ETA=8.11h
2020-07-14 16:10:01,519 - root - INFO - Step: 550/8161, Loss span/answer/total=0.9736/0.1793/1.1529, LR=0.00001011, grad_norm=2.6735. Time cost=190.11, Throughput=3.16 samples/s ETA=8.06h
2020-07-14 16:13:17,019 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_600.params
2020-07-14 16:13:17,019 - root - INFO - Step: 600/8161, Loss span/answer/total=0.9496/0.1810/1.1306, LR=0.00001103, grad_norm=1.9650. Time cost=195.50, Throughput=3.07 samples/s ETA=8.02h
2020-07-14 16:16:23,175 - root - INFO - Step: 650/8161, Loss span/answer/total=0.9297/0.2164/1.1460, LR=0.00001195, grad_norm=2.0462. Time cost=186.16, Throughput=3.22 samples/s ETA=7.95h
2020-07-14 16:19:28,832 - root - INFO - Step: 700/8161, Loss span/answer/total=0.9484/0.1668/1.1152, LR=0.00001287, grad_norm=1.7099. Time cost=185.66, Throughput=3.23 samples/s ETA=7.89h
2020-07-14 16:22:41,756 - root - INFO - Step: 750/8161, Loss span/answer/total=0.9283/0.1833/1.1116, LR=0.00001379, grad_norm=1.8619. Time cost=192.92, Throughput=3.11 samples/s ETA=7.84h
2020-07-14 16:25:56,779 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_800.params
2020-07-14 16:25:56,779 - root - INFO - Step: 800/8161, Loss span/answer/total=0.8120/0.1885/1.0005, LR=0.00001471, grad_norm=2.3574. Time cost=195.02, Throughput=3.08 samples/s ETA=7.80h
2020-07-14 16:29:15,093 - root - INFO - Step: 850/8161, Loss span/answer/total=0.9007/0.1828/1.0835, LR=0.00001563, grad_norm=3.1871. Time cost=198.31, Throughput=3.03 samples/s ETA=7.76h
2020-07-14 16:32:30,411 - root - INFO - Step: 900/8161, Loss span/answer/total=0.8275/0.1522/0.9796, LR=0.00001654, grad_norm=1.6902. Time cost=195.32, Throughput=3.07 samples/s ETA=7.72h
2020-07-14 16:35:43,656 - root - INFO - Step: 950/8161, Loss span/answer/total=0.9085/0.1903/1.0988, LR=0.00001746, grad_norm=9.4860. Time cost=193.24, Throughput=3.10 samples/s ETA=7.67h
2020-07-14 16:38:58,355 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_1000.params
2020-07-14 16:38:58,355 - root - INFO - Step: 1000/8161, Loss span/answer/total=0.9499/0.2015/1.1514, LR=0.00001838, grad_norm=2.2885. Time cost=194.70, Throughput=3.08 samples/s ETA=7.62h
2020-07-14 16:42:04,167 - root - INFO - Step: 1050/8161, Loss span/answer/total=0.8268/0.1694/0.9962, LR=0.00001930, grad_norm=2.0775. Time cost=185.81, Throughput=3.23 samples/s ETA=7.56h
2020-07-14 16:45:15,834 - root - INFO - Step: 1100/8161, Loss span/answer/total=0.8575/0.1950/1.0525, LR=0.00002022, grad_norm=1.4896. Time cost=191.67, Throughput=3.13 samples/s ETA=7.51h
2020-07-14 16:48:29,671 - root - INFO - Step: 1150/8161, Loss span/answer/total=0.8812/0.1893/1.0705, LR=0.00002114, grad_norm=1.5334. Time cost=193.84, Throughput=3.10 samples/s ETA=7.46h
2020-07-14 16:51:43,537 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_1200.params
2020-07-14 16:51:43,537 - root - INFO - Step: 1200/8161, Loss span/answer/total=0.8344/0.1673/1.0017, LR=0.00002206, grad_norm=2.3005. Time cost=193.87, Throughput=3.09 samples/s ETA=7.41h
2020-07-14 16:54:52,849 - root - INFO - Step: 1250/8161, Loss span/answer/total=0.8273/0.1657/0.9931, LR=0.00002298, grad_norm=2.6358. Time cost=189.31, Throughput=3.17 samples/s ETA=7.35h
2020-07-14 16:58:09,046 - root - INFO - Step: 1300/8161, Loss span/answer/total=0.8334/0.1783/1.0117, LR=0.00002390, grad_norm=2.0325. Time cost=196.20, Throughput=3.06 samples/s ETA=7.31h
2020-07-14 17:01:18,507 - root - INFO - Step: 1350/8161, Loss span/answer/total=0.8737/0.1763/1.0500, LR=0.00002482, grad_norm=1.6385. Time cost=189.46, Throughput=3.17 samples/s ETA=7.25h
2020-07-14 17:04:39,333 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_1400.params
2020-07-14 17:04:39,333 - root - INFO - Step: 1400/8161, Loss span/answer/total=0.7660/0.1648/0.9308, LR=0.00002574, grad_norm=1.3930. Time cost=200.83, Throughput=2.99 samples/s ETA=7.21h
2020-07-14 17:07:47,821 - root - INFO - Step: 1450/8161, Loss span/answer/total=0.8390/0.1736/1.0126, LR=0.00002665, grad_norm=1.5518. Time cost=188.49, Throughput=3.18 samples/s ETA=7.15h
2020-07-14 17:11:00,857 - root - INFO - Step: 1500/8161, Loss span/answer/total=0.8812/0.1966/1.0778, LR=0.00002757, grad_norm=3.7684. Time cost=193.04, Throughput=3.11 samples/s ETA=7.10h
2020-07-14 17:14:15,423 - root - INFO - Step: 1550/8161, Loss span/answer/total=0.7900/0.1759/0.9659, LR=0.00002849, grad_norm=2.1649. Time cost=194.57, Throughput=3.08 samples/s ETA=7.05h
2020-07-14 17:17:32,327 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_1600.params
2020-07-14 17:17:32,327 - root - INFO - Step: 1600/8161, Loss span/answer/total=0.7428/0.1530/0.8959, LR=0.00002941, grad_norm=1.8114. Time cost=196.90, Throughput=3.05 samples/s ETA=7.00h
2020-07-14 17:20:38,112 - root - INFO - Step: 1650/8161, Loss span/answer/total=0.7954/0.1645/0.9599, LR=0.00002992, grad_norm=1.5165. Time cost=185.78, Throughput=3.23 samples/s ETA=6.94h
2020-07-14 17:23:52,630 - root - INFO - Step: 1700/8161, Loss span/answer/total=0.7653/0.1553/0.9206, LR=0.00002969, grad_norm=1.9788. Time cost=194.52, Throughput=3.08 samples/s ETA=6.89h
2020-07-14 17:27:10,097 - root - INFO - Step: 1750/8161, Loss span/answer/total=0.8212/0.1642/0.9854, LR=0.00002946, grad_norm=2.0279. Time cost=197.47, Throughput=3.04 samples/s ETA=6.84h
2020-07-14 17:30:23,722 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_1800.params
2020-07-14 17:30:23,722 - root - INFO - Step: 1800/8161, Loss span/answer/total=0.8525/0.1771/1.0295, LR=0.00002923, grad_norm=1.6133. Time cost=193.62, Throughput=3.10 samples/s ETA=6.79h
2020-07-14 17:33:42,373 - root - INFO - Step: 1850/8161, Loss span/answer/total=0.8413/0.1776/1.0190, LR=0.00002900, grad_norm=1.4633. Time cost=198.65, Throughput=3.02 samples/s ETA=6.74h
2020-07-14 17:36:53,698 - root - INFO - Step: 1900/8161, Loss span/answer/total=0.7954/0.1642/0.9596, LR=0.00002877, grad_norm=1.3822. Time cost=191.32, Throughput=3.14 samples/s ETA=6.69h
2020-07-14 17:40:03,430 - root - INFO - Step: 1950/8161, Loss span/answer/total=0.7761/0.1468/0.9229, LR=0.00002854, grad_norm=1.9558. Time cost=189.73, Throughput=3.16 samples/s ETA=6.63h
2020-07-14 17:43:22,785 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_2000.params
2020-07-14 17:43:22,785 - root - INFO - Step: 2000/8161, Loss span/answer/total=0.7829/0.1690/0.9519, LR=0.00002831, grad_norm=1.7791. Time cost=199.35, Throughput=3.01 samples/s ETA=6.59h
2020-07-14 17:46:30,740 - root - INFO - Step: 2050/8161, Loss span/answer/total=0.7582/0.1827/0.9409, LR=0.00002808, grad_norm=1.2299. Time cost=187.96, Throughput=3.19 samples/s ETA=6.53h
2020-07-14 17:49:46,439 - root - INFO - Step: 2100/8161, Loss span/answer/total=0.7431/0.1347/0.8778, LR=0.00002785, grad_norm=1.3342. Time cost=195.70, Throughput=3.07 samples/s ETA=6.48h
2020-07-14 17:53:02,863 - root - INFO - Step: 2150/8161, Loss span/answer/total=0.9334/0.1548/1.0882, LR=0.00002762, grad_norm=1.5187. Time cost=196.42, Throughput=3.05 samples/s ETA=6.43h
2020-07-14 17:56:25,867 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_2200.params
2020-07-14 17:56:25,867 - root - INFO - Step: 2200/8161, Loss span/answer/total=0.7940/0.1577/0.9516, LR=0.00002739, grad_norm=1.2215. Time cost=203.00, Throughput=2.96 samples/s ETA=6.38h
2020-07-14 17:59:38,356 - root - INFO - Step: 2250/8161, Loss span/answer/total=0.7854/0.1825/0.9679, LR=0.00002716, grad_norm=1.3242. Time cost=192.49, Throughput=3.12 samples/s ETA=6.33h
2020-07-14 18:02:48,119 - root - INFO - Step: 2300/8161, Loss span/answer/total=0.7437/0.1325/0.8762, LR=0.00002693, grad_norm=1.1554. Time cost=189.76, Throughput=3.16 samples/s ETA=6.27h
2020-07-14 18:05:59,453 - root - INFO - Step: 2350/8161, Loss span/answer/total=0.7186/0.1501/0.8687, LR=0.00002670, grad_norm=1.6597. Time cost=191.33, Throughput=3.14 samples/s ETA=6.22h
2020-07-14 18:09:09,587 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_2400.params
2020-07-14 18:09:09,587 - root - INFO - Step: 2400/8161, Loss span/answer/total=0.7207/0.1471/0.8677, LR=0.00002647, grad_norm=1.6368. Time cost=190.13, Throughput=3.16 samples/s ETA=6.16h
2020-07-14 18:12:24,348 - root - INFO - Step: 2450/8161, Loss span/answer/total=0.7684/0.1547/0.9231, LR=0.00002624, grad_norm=1.9939. Time cost=194.76, Throughput=3.08 samples/s ETA=6.11h
2020-07-14 18:15:35,223 - root - INFO - Step: 2500/8161, Loss span/answer/total=0.7108/0.1386/0.8494, LR=0.00002601, grad_norm=1.6097. Time cost=190.87, Throughput=3.14 samples/s ETA=6.06h
2020-07-14 18:18:55,555 - root - INFO - Step: 2550/8161, Loss span/answer/total=0.7025/0.1647/0.8672, LR=0.00002578, grad_norm=1.1942. Time cost=200.33, Throughput=3.00 samples/s ETA=6.01h
2020-07-14 18:22:03,399 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_2600.params
2020-07-14 18:22:03,399 - root - INFO - Step: 2600/8161, Loss span/answer/total=0.6790/0.1576/0.8366, LR=0.00002555, grad_norm=1.4794. Time cost=187.84, Throughput=3.19 samples/s ETA=5.95h
2020-07-14 18:25:18,675 - root - INFO - Step: 2650/8161, Loss span/answer/total=0.7288/0.1643/0.8931, LR=0.00002532, grad_norm=2.1849. Time cost=195.28, Throughput=3.07 samples/s ETA=5.90h
2020-07-14 18:28:34,113 - root - INFO - Step: 2700/8161, Loss span/answer/total=0.7185/0.1293/0.8477, LR=0.00002509, grad_norm=1.6405. Time cost=195.44, Throughput=3.07 samples/s ETA=5.85h
2020-07-14 18:31:50,425 - root - INFO - Step: 2750/8161, Loss span/answer/total=0.6064/0.1159/0.7223, LR=0.00002486, grad_norm=2.7561. Time cost=196.31, Throughput=3.05 samples/s ETA=5.80h
2020-07-14 18:35:05,860 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_2800.params
2020-07-14 18:35:05,861 - root - INFO - Step: 2800/8161, Loss span/answer/total=0.6658/0.1403/0.8061, LR=0.00002463, grad_norm=1.1489. Time cost=195.43, Throughput=3.07 samples/s ETA=5.74h
2020-07-14 18:38:14,197 - root - INFO - Step: 2850/8161, Loss span/answer/total=0.6983/0.1294/0.8278, LR=0.00002440, grad_norm=1.4723. Time cost=188.34, Throughput=3.19 samples/s ETA=5.69h
2020-07-14 18:41:26,990 - root - INFO - Step: 2900/8161, Loss span/answer/total=0.6289/0.1120/0.7409, LR=0.00002417, grad_norm=1.9911. Time cost=192.79, Throughput=3.11 samples/s ETA=5.63h
2020-07-14 18:44:38,514 - root - INFO - Step: 2950/8161, Loss span/answer/total=0.6349/0.1189/0.7538, LR=0.00002394, grad_norm=1.2215. Time cost=191.52, Throughput=3.13 samples/s ETA=5.58h
2020-07-14 18:47:46,802 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_3000.params
2020-07-14 18:47:46,802 - root - INFO - Step: 3000/8161, Loss span/answer/total=0.6051/0.1230/0.7281, LR=0.00002371, grad_norm=1.9404. Time cost=188.29, Throughput=3.19 samples/s ETA=5.52h
2020-07-14 18:50:58,991 - root - INFO - Step: 3050/8161, Loss span/answer/total=0.6007/0.1108/0.7115, LR=0.00002348, grad_norm=1.5713. Time cost=192.19, Throughput=3.12 samples/s ETA=5.47h
2020-07-14 18:54:03,981 - root - INFO - Step: 3100/8161, Loss span/answer/total=0.6201/0.1198/0.7399, LR=0.00002325, grad_norm=15.4591. Time cost=184.99, Throughput=3.24 samples/s ETA=5.41h
2020-07-14 18:57:24,423 - root - INFO - Step: 3150/8161, Loss span/answer/total=0.5833/0.1168/0.7001, LR=0.00002302, grad_norm=1.2660. Time cost=200.44, Throughput=2.99 samples/s ETA=5.36h
2020-07-14 19:00:38,314 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_3200.params
2020-07-14 19:00:38,314 - root - INFO - Step: 3200/8161, Loss span/answer/total=0.5754/0.1392/0.7147, LR=0.00002280, grad_norm=1.7093. Time cost=193.89, Throughput=3.09 samples/s ETA=5.31h
2020-07-14 19:03:46,482 - root - INFO - Step: 3250/8161, Loss span/answer/total=0.5912/0.1243/0.7155, LR=0.00002257, grad_norm=1.6054. Time cost=188.17, Throughput=3.19 samples/s ETA=5.26h
2020-07-14 19:06:57,204 - root - INFO - Step: 3300/8161, Loss span/answer/total=0.5784/0.1375/0.7159, LR=0.00002234, grad_norm=1.7708. Time cost=190.72, Throughput=3.15 samples/s ETA=5.20h
2020-07-14 19:10:08,916 - root - INFO - Step: 3350/8161, Loss span/answer/total=0.6490/0.1394/0.7884, LR=0.00002211, grad_norm=0.9929. Time cost=191.71, Throughput=3.13 samples/s ETA=5.15h
2020-07-14 19:13:30,356 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_3400.params
2020-07-14 19:13:30,356 - root - INFO - Step: 3400/8161, Loss span/answer/total=0.5344/0.1239/0.6584, LR=0.00002188, grad_norm=1.0861. Time cost=201.44, Throughput=2.98 samples/s ETA=5.10h
2020-07-14 19:16:46,255 - root - INFO - Step: 3450/8161, Loss span/answer/total=0.6563/0.1212/0.7775, LR=0.00002165, grad_norm=1.6354. Time cost=195.90, Throughput=3.06 samples/s ETA=5.05h
2020-07-14 19:20:08,618 - root - INFO - Step: 3500/8161, Loss span/answer/total=0.6558/0.1089/0.7648, LR=0.00002142, grad_norm=1.7389. Time cost=202.36, Throughput=2.96 samples/s ETA=5.00h
2020-07-14 19:23:20,439 - root - INFO - Step: 3550/8161, Loss span/answer/total=0.6205/0.1071/0.7276, LR=0.00002119, grad_norm=1.1037. Time cost=191.82, Throughput=3.13 samples/s ETA=4.94h
2020-07-14 19:26:39,713 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_3600.params
2020-07-14 19:26:39,714 - root - INFO - Step: 3600/8161, Loss span/answer/total=0.6081/0.1148/0.7230, LR=0.00002096, grad_norm=1.3200. Time cost=199.27, Throughput=3.01 samples/s ETA=4.89h
2020-07-14 19:29:45,317 - root - INFO - Step: 3650/8161, Loss span/answer/total=0.5934/0.1191/0.7126, LR=0.00002073, grad_norm=1.8144. Time cost=185.60, Throughput=3.23 samples/s ETA=4.83h
2020-07-14 19:32:56,956 - root - INFO - Step: 3700/8161, Loss span/answer/total=0.5790/0.1221/0.7011, LR=0.00002050, grad_norm=1.2696. Time cost=191.64, Throughput=3.13 samples/s ETA=4.78h
2020-07-14 19:36:12,898 - root - INFO - Step: 3750/8161, Loss span/answer/total=0.5821/0.1224/0.7045, LR=0.00002027, grad_norm=1.7185. Time cost=195.94, Throughput=3.06 samples/s ETA=4.73h
2020-07-14 19:39:32,290 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_3800.params
2020-07-14 19:39:32,290 - root - INFO - Step: 3800/8161, Loss span/answer/total=0.5515/0.0821/0.6336, LR=0.00002004, grad_norm=2.3873. Time cost=199.39, Throughput=3.01 samples/s ETA=4.68h
2020-07-14 19:42:42,255 - root - INFO - Step: 3850/8161, Loss span/answer/total=0.5287/0.0982/0.6269, LR=0.00001981, grad_norm=1.6992. Time cost=189.97, Throughput=3.16 samples/s ETA=4.62h
2020-07-14 19:45:55,782 - root - INFO - Step: 3900/8161, Loss span/answer/total=0.5403/0.0919/0.6322, LR=0.00001958, grad_norm=1.2932. Time cost=193.53, Throughput=3.10 samples/s ETA=4.57h
2020-07-14 19:49:13,083 - root - INFO - Step: 3950/8161, Loss span/answer/total=0.4978/0.1099/0.6077, LR=0.00001935, grad_norm=1.2998. Time cost=197.30, Throughput=3.04 samples/s ETA=4.52h
2020-07-14 19:52:27,483 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_4000.params
2020-07-14 19:52:27,483 - root - INFO - Step: 4000/8161, Loss span/answer/total=0.5036/0.1098/0.6134, LR=0.00001912, grad_norm=1.1717. Time cost=194.40, Throughput=3.09 samples/s ETA=4.46h
2020-07-14 19:55:44,433 - root - INFO - Step: 4050/8161, Loss span/answer/total=0.5325/0.0789/0.6114, LR=0.00001889, grad_norm=1.2614. Time cost=196.95, Throughput=3.05 samples/s ETA=4.41h
2020-07-14 19:59:04,390 - root - INFO - Step: 4100/8161, Loss span/answer/total=0.5558/0.0984/0.6542, LR=0.00001866, grad_norm=2.0315. Time cost=199.96, Throughput=3.00 samples/s ETA=4.36h
2020-07-14 20:02:13,006 - root - INFO - Step: 4150/8161, Loss span/answer/total=0.5618/0.1028/0.6645, LR=0.00001843, grad_norm=1.5764. Time cost=188.62, Throughput=3.18 samples/s ETA=4.30h
2020-07-14 20:05:27,300 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_4200.params
2020-07-14 20:05:27,301 - root - INFO - Step: 4200/8161, Loss span/answer/total=0.5585/0.1095/0.6680, LR=0.00001820, grad_norm=1.5410. Time cost=194.29, Throughput=3.09 samples/s ETA=4.25h
2020-07-14 20:08:35,748 - root - INFO - Step: 4250/8161, Loss span/answer/total=0.5177/0.0987/0.6164, LR=0.00001797, grad_norm=1.0852. Time cost=188.45, Throughput=3.18 samples/s ETA=4.19h
2020-07-14 20:11:39,837 - root - INFO - Step: 4300/8161, Loss span/answer/total=0.5651/0.0906/0.6557, LR=0.00001774, grad_norm=1.4587. Time cost=184.09, Throughput=3.26 samples/s ETA=4.14h
2020-07-14 20:14:54,241 - root - INFO - Step: 4350/8161, Loss span/answer/total=0.5509/0.1009/0.6519, LR=0.00001751, grad_norm=6.5584. Time cost=194.40, Throughput=3.09 samples/s ETA=4.09h
2020-07-14 20:18:08,281 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_4400.params
2020-07-14 20:18:08,281 - root - INFO - Step: 4400/8161, Loss span/answer/total=0.5717/0.1150/0.6867, LR=0.00001728, grad_norm=1.2226. Time cost=194.04, Throughput=3.09 samples/s ETA=4.03h
2020-07-14 20:21:20,248 - root - INFO - Step: 4450/8161, Loss span/answer/total=0.5255/0.0910/0.6166, LR=0.00001705, grad_norm=1.2802. Time cost=191.97, Throughput=3.13 samples/s ETA=3.98h
2020-07-14 20:24:35,265 - root - INFO - Step: 4500/8161, Loss span/answer/total=0.5217/0.1029/0.6246, LR=0.00001682, grad_norm=1.4404. Time cost=195.02, Throughput=3.08 samples/s ETA=3.93h
2020-07-14 20:27:46,910 - root - INFO - Step: 4550/8161, Loss span/answer/total=0.5017/0.1006/0.6024, LR=0.00001659, grad_norm=2.1950. Time cost=191.64, Throughput=3.13 samples/s ETA=3.87h
2020-07-14 20:31:06,490 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_4600.params
2020-07-14 20:31:06,490 - root - INFO - Step: 4600/8161, Loss span/answer/total=0.5070/0.1011/0.6081, LR=0.00001636, grad_norm=1.7031. Time cost=199.58, Throughput=3.01 samples/s ETA=3.82h
2020-07-14 20:34:20,369 - root - INFO - Step: 4650/8161, Loss span/answer/total=0.4975/0.0881/0.5856, LR=0.00001613, grad_norm=1.5950. Time cost=193.88, Throughput=3.09 samples/s ETA=3.77h
2020-07-14 20:37:35,882 - root - INFO - Step: 4700/8161, Loss span/answer/total=0.5709/0.0977/0.6686, LR=0.00001590, grad_norm=2.0156. Time cost=195.51, Throughput=3.07 samples/s ETA=3.71h
2020-07-14 20:40:49,278 - root - INFO - Step: 4750/8161, Loss span/answer/total=0.5213/0.0948/0.6161, LR=0.00001567, grad_norm=1.8299. Time cost=193.40, Throughput=3.10 samples/s ETA=3.66h
2020-07-14 20:44:06,587 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_4800.params
2020-07-14 20:44:06,587 - root - INFO - Step: 4800/8161, Loss span/answer/total=0.5009/0.1144/0.6153, LR=0.00001544, grad_norm=1.5089. Time cost=197.31, Throughput=3.04 samples/s ETA=3.61h
2020-07-14 20:47:16,889 - root - INFO - Step: 4850/8161, Loss span/answer/total=0.4927/0.0957/0.5884, LR=0.00001521, grad_norm=1.3726. Time cost=190.30, Throughput=3.15 samples/s ETA=3.55h
2020-07-14 20:50:27,539 - root - INFO - Step: 4900/8161, Loss span/answer/total=0.6038/0.0994/0.7032, LR=0.00001498, grad_norm=1.5305. Time cost=190.65, Throughput=3.15 samples/s ETA=3.50h
2020-07-14 20:53:35,318 - root - INFO - Step: 4950/8161, Loss span/answer/total=0.5941/0.1062/0.7003, LR=0.00001475, grad_norm=5.3665. Time cost=187.78, Throughput=3.20 samples/s ETA=3.44h
2020-07-14 20:56:53,535 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_5000.params
2020-07-14 20:56:53,535 - root - INFO - Step: 5000/8161, Loss span/answer/total=0.5886/0.0964/0.6851, LR=0.00001452, grad_norm=1.9062. Time cost=198.22, Throughput=3.03 samples/s ETA=3.39h
2020-07-14 21:00:06,070 - root - INFO - Step: 5050/8161, Loss span/answer/total=0.5621/0.0939/0.6560, LR=0.00001429, grad_norm=1.8509. Time cost=192.53, Throughput=3.12 samples/s ETA=3.34h
2020-07-14 21:03:15,464 - root - INFO - Step: 5100/8161, Loss span/answer/total=0.4969/0.0967/0.5936, LR=0.00001406, grad_norm=1.7891. Time cost=189.39, Throughput=3.17 samples/s ETA=3.28h
2020-07-14 21:06:32,697 - root - INFO - Step: 5150/8161, Loss span/answer/total=0.5231/0.0906/0.6137, LR=0.00001384, grad_norm=1.6945. Time cost=197.23, Throughput=3.04 samples/s ETA=3.23h
2020-07-14 21:09:53,806 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_5200.params
2020-07-14 21:09:53,806 - root - INFO - Step: 5200/8161, Loss span/answer/total=0.6016/0.1198/0.7214, LR=0.00001361, grad_norm=1.8797. Time cost=201.11, Throughput=2.98 samples/s ETA=3.18h
2020-07-14 21:13:04,751 - root - INFO - Step: 5250/8161, Loss span/answer/total=0.4959/0.0973/0.5931, LR=0.00001338, grad_norm=1.7449. Time cost=190.94, Throughput=3.14 samples/s ETA=3.12h
2020-07-14 21:16:22,823 - root - INFO - Step: 5300/8161, Loss span/answer/total=0.5179/0.0829/0.6008, LR=0.00001315, grad_norm=2.0806. Time cost=198.07, Throughput=3.03 samples/s ETA=3.07h
2020-07-14 21:19:27,094 - root - INFO - Step: 5350/8161, Loss span/answer/total=0.5125/0.0961/0.6085, LR=0.00001292, grad_norm=1.8463. Time cost=184.27, Throughput=3.26 samples/s ETA=3.02h
2020-07-14 21:22:45,648 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_5400.params
2020-07-14 21:22:45,648 - root - INFO - Step: 5400/8161, Loss span/answer/total=0.5466/0.0818/0.6284, LR=0.00001269, grad_norm=1.5377. Time cost=198.55, Throughput=3.02 samples/s ETA=2.96h
2020-07-14 21:26:01,508 - root - INFO - Step: 5450/8161, Loss span/answer/total=0.5089/0.1053/0.6142, LR=0.00001246, grad_norm=1.8162. Time cost=195.86, Throughput=3.06 samples/s ETA=2.91h
2020-07-14 21:29:18,997 - root - INFO - Step: 5500/8161, Loss span/answer/total=0.4228/0.0791/0.5018, LR=0.00001223, grad_norm=3.5414. Time cost=197.49, Throughput=3.04 samples/s ETA=2.86h
2020-07-14 21:32:39,895 - root - INFO - Step: 5550/8161, Loss span/answer/total=0.4608/0.0821/0.5430, LR=0.00001200, grad_norm=1.7035. Time cost=200.90, Throughput=2.99 samples/s ETA=2.80h
2020-07-14 21:35:53,498 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_5600.params
2020-07-14 21:35:53,498 - root - INFO - Step: 5600/8161, Loss span/answer/total=0.3661/0.0575/0.4236, LR=0.00001177, grad_norm=1.7812. Time cost=193.60, Throughput=3.10 samples/s ETA=2.75h
2020-07-14 21:39:07,211 - root - INFO - Step: 5650/8161, Loss span/answer/total=0.4743/0.0522/0.5264, LR=0.00001154, grad_norm=1.2576. Time cost=193.71, Throughput=3.10 samples/s ETA=2.70h
2020-07-14 21:42:22,387 - root - INFO - Step: 5700/8161, Loss span/answer/total=0.3931/0.0696/0.4628, LR=0.00001131, grad_norm=1.3404. Time cost=195.18, Throughput=3.07 samples/s ETA=2.64h
2020-07-14 21:45:34,159 - root - INFO - Step: 5750/8161, Loss span/answer/total=0.4859/0.0606/0.5466, LR=0.00001108, grad_norm=1.0631. Time cost=191.77, Throughput=3.13 samples/s ETA=2.59h
2020-07-14 21:48:47,578 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_5800.params
2020-07-14 21:48:47,579 - root - INFO - Step: 5800/8161, Loss span/answer/total=0.4083/0.0600/0.4682, LR=0.00001085, grad_norm=2.0642. Time cost=193.42, Throughput=3.10 samples/s ETA=2.54h
2020-07-14 21:52:01,216 - root - INFO - Step: 5850/8161, Loss span/answer/total=0.3876/0.0574/0.4450, LR=0.00001062, grad_norm=1.1036. Time cost=193.64, Throughput=3.10 samples/s ETA=2.48h
2020-07-14 21:55:11,557 - root - INFO - Step: 5900/8161, Loss span/answer/total=0.4148/0.0775/0.4923, LR=0.00001039, grad_norm=2.2148. Time cost=190.34, Throughput=3.15 samples/s ETA=2.43h
2020-07-14 21:58:29,072 - root - INFO - Step: 5950/8161, Loss span/answer/total=0.4107/0.0566/0.4672, LR=0.00001016, grad_norm=1.2852. Time cost=197.51, Throughput=3.04 samples/s ETA=2.37h
2020-07-14 22:01:41,868 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_6000.params
2020-07-14 22:01:41,869 - root - INFO - Step: 6000/8161, Loss span/answer/total=0.4159/0.0675/0.4834, LR=0.00000993, grad_norm=1.3154. Time cost=192.80, Throughput=3.11 samples/s ETA=2.32h
2020-07-14 22:04:53,352 - root - INFO - Step: 6050/8161, Loss span/answer/total=0.4306/0.0664/0.4969, LR=0.00000970, grad_norm=5.1835. Time cost=191.48, Throughput=3.13 samples/s ETA=2.27h
2020-07-14 22:08:09,185 - root - INFO - Step: 6100/8161, Loss span/answer/total=0.4099/0.0597/0.4696, LR=0.00000947, grad_norm=7.6002. Time cost=195.83, Throughput=3.06 samples/s ETA=2.21h
2020-07-14 22:11:17,269 - root - INFO - Step: 6150/8161, Loss span/answer/total=0.4242/0.0839/0.5081, LR=0.00000924, grad_norm=1.1508. Time cost=188.08, Throughput=3.19 samples/s ETA=2.16h
2020-07-14 22:14:29,056 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_6200.params
2020-07-14 22:14:29,056 - root - INFO - Step: 6200/8161, Loss span/answer/total=0.4776/0.0728/0.5504, LR=0.00000901, grad_norm=1.9973. Time cost=191.79, Throughput=3.13 samples/s ETA=2.11h
2020-07-14 22:17:39,767 - root - INFO - Step: 6250/8161, Loss span/answer/total=0.3884/0.0599/0.4483, LR=0.00000878, grad_norm=2.3867. Time cost=190.71, Throughput=3.15 samples/s ETA=2.05h
2020-07-14 22:20:49,213 - root - INFO - Step: 6300/8161, Loss span/answer/total=0.4076/0.0672/0.4748, LR=0.00000855, grad_norm=1.6369. Time cost=189.45, Throughput=3.17 samples/s ETA=2.00h
2020-07-14 22:24:01,249 - root - INFO - Step: 6350/8161, Loss span/answer/total=0.4142/0.0682/0.4824, LR=0.00000832, grad_norm=1.6283. Time cost=192.04, Throughput=3.12 samples/s ETA=1.94h
2020-07-14 22:27:13,813 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_6400.params
2020-07-14 22:27:13,814 - root - INFO - Step: 6400/8161, Loss span/answer/total=0.4449/0.0575/0.5024, LR=0.00000809, grad_norm=1.5339. Time cost=192.56, Throughput=3.12 samples/s ETA=1.89h
2020-07-14 22:30:25,934 - root - INFO - Step: 6450/8161, Loss span/answer/total=0.3957/0.0432/0.4389, LR=0.00000786, grad_norm=1.1557. Time cost=192.12, Throughput=3.12 samples/s ETA=1.84h
2020-07-14 22:33:34,587 - root - INFO - Step: 6500/8161, Loss span/answer/total=0.3982/0.0674/0.4657, LR=0.00000763, grad_norm=1.1749. Time cost=188.65, Throughput=3.18 samples/s ETA=1.78h
2020-07-14 22:36:45,707 - root - INFO - Step: 6550/8161, Loss span/answer/total=0.4521/0.0805/0.5326, LR=0.00000740, grad_norm=0.9745. Time cost=191.12, Throughput=3.14 samples/s ETA=1.73h
2020-07-14 22:39:57,453 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_6600.params
2020-07-14 22:39:57,454 - root - INFO - Step: 6600/8161, Loss span/answer/total=0.5043/0.0921/0.5964, LR=0.00000717, grad_norm=1.6489. Time cost=191.75, Throughput=3.13 samples/s ETA=1.67h
2020-07-14 22:43:05,283 - root - INFO - Step: 6650/8161, Loss span/answer/total=0.4128/0.0532/0.4661, LR=0.00000694, grad_norm=1.1854. Time cost=187.83, Throughput=3.19 samples/s ETA=1.62h
2020-07-14 22:46:11,795 - root - INFO - Step: 6700/8161, Loss span/answer/total=0.3934/0.0498/0.4433, LR=0.00000671, grad_norm=1.5059. Time cost=186.51, Throughput=3.22 samples/s ETA=1.57h
2020-07-14 22:49:26,882 - root - INFO - Step: 6750/8161, Loss span/answer/total=0.4352/0.0500/0.4853, LR=0.00000648, grad_norm=1.4848. Time cost=195.09, Throughput=3.08 samples/s ETA=1.51h
2020-07-14 22:52:45,660 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_6800.params
2020-07-14 22:52:45,661 - root - INFO - Step: 6800/8161, Loss span/answer/total=0.4016/0.0566/0.4583, LR=0.00000625, grad_norm=1.5634. Time cost=198.78, Throughput=3.02 samples/s ETA=1.46h
2020-07-14 22:55:53,687 - root - INFO - Step: 6850/8161, Loss span/answer/total=0.4129/0.0549/0.4677, LR=0.00000602, grad_norm=1.7989. Time cost=188.03, Throughput=3.19 samples/s ETA=1.41h
2020-07-14 22:59:07,587 - root - INFO - Step: 6900/8161, Loss span/answer/total=0.4209/0.0564/0.4773, LR=0.00000579, grad_norm=1.4212. Time cost=193.90, Throughput=3.09 samples/s ETA=1.35h
2020-07-14 23:02:20,821 - root - INFO - Step: 6950/8161, Loss span/answer/total=0.4443/0.0777/0.5220, LR=0.00000556, grad_norm=5.6529. Time cost=193.23, Throughput=3.11 samples/s ETA=1.30h
2020-07-14 23:05:35,147 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_7000.params
2020-07-14 23:05:35,147 - root - INFO - Step: 7000/8161, Loss span/answer/total=0.4073/0.0603/0.4676, LR=0.00000533, grad_norm=1.5356. Time cost=194.32, Throughput=3.09 samples/s ETA=1.25h
2020-07-14 23:08:46,964 - root - INFO - Step: 7050/8161, Loss span/answer/total=0.4259/0.0756/0.5016, LR=0.00000510, grad_norm=2.0354. Time cost=191.82, Throughput=3.13 samples/s ETA=1.19h
2020-07-14 23:12:08,230 - root - INFO - Step: 7100/8161, Loss span/answer/total=0.4025/0.0516/0.4541, LR=0.00000488, grad_norm=1.6929. Time cost=201.26, Throughput=2.98 samples/s ETA=1.14h
2020-07-14 23:15:19,463 - root - INFO - Step: 7150/8161, Loss span/answer/total=0.4131/0.0626/0.4757, LR=0.00000465, grad_norm=1.0583. Time cost=191.23, Throughput=3.14 samples/s ETA=1.08h
2020-07-14 23:18:29,325 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_7200.params
2020-07-14 23:18:29,326 - root - INFO - Step: 7200/8161, Loss span/answer/total=0.3899/0.0588/0.4487, LR=0.00000442, grad_norm=1.0074. Time cost=189.86, Throughput=3.16 samples/s ETA=1.03h
2020-07-14 23:21:47,642 - root - INFO - Step: 7250/8161, Loss span/answer/total=0.4479/0.0600/0.5080, LR=0.00000419, grad_norm=2.7129. Time cost=198.32, Throughput=3.03 samples/s ETA=0.98h
2020-07-14 23:25:09,485 - root - INFO - Step: 7300/8161, Loss span/answer/total=0.3741/0.0544/0.4284, LR=0.00000396, grad_norm=1.6513. Time cost=201.84, Throughput=2.97 samples/s ETA=0.92h
2020-07-14 23:28:21,112 - root - INFO - Step: 7350/8161, Loss span/answer/total=0.4304/0.0623/0.4927, LR=0.00000373, grad_norm=1.1648. Time cost=191.63, Throughput=3.13 samples/s ETA=0.87h
2020-07-14 23:31:42,116 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_7400.params
2020-07-14 23:31:42,116 - root - INFO - Step: 7400/8161, Loss span/answer/total=0.4392/0.0491/0.4883, LR=0.00000350, grad_norm=1.5138. Time cost=201.00, Throughput=2.99 samples/s ETA=0.82h
2020-07-14 23:34:56,042 - root - INFO - Step: 7450/8161, Loss span/answer/total=0.4220/0.0668/0.4887, LR=0.00000327, grad_norm=1.7039. Time cost=193.93, Throughput=3.09 samples/s ETA=0.76h
2020-07-14 23:38:12,700 - root - INFO - Step: 7500/8161, Loss span/answer/total=0.3895/0.0651/0.4546, LR=0.00000304, grad_norm=1.4577. Time cost=196.66, Throughput=3.05 samples/s ETA=0.71h
2020-07-14 23:41:23,998 - root - INFO - Step: 7550/8161, Loss span/answer/total=0.4179/0.0453/0.4632, LR=0.00000281, grad_norm=1.5650. Time cost=191.30, Throughput=3.14 samples/s ETA=0.66h
2020-07-14 23:44:55,111 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_7600.params
2020-07-14 23:44:55,111 - root - INFO - Step: 7600/8161, Loss span/answer/total=0.4465/0.0511/0.4975, LR=0.00000258, grad_norm=1.3331. Time cost=211.11, Throughput=2.84 samples/s ETA=0.60h
2020-07-14 23:47:58,099 - root - INFO - Step: 7650/8161, Loss span/answer/total=0.3817/0.0608/0.4425, LR=0.00000235, grad_norm=1.0029. Time cost=182.99, Throughput=3.28 samples/s ETA=0.55h
2020-07-14 23:51:12,415 - root - INFO - Step: 7700/8161, Loss span/answer/total=0.3911/0.0538/0.4449, LR=0.00000212, grad_norm=4.2882. Time cost=194.32, Throughput=3.09 samples/s ETA=0.50h
2020-07-14 23:54:20,809 - root - INFO - Step: 7750/8161, Loss span/answer/total=0.4156/0.0548/0.4704, LR=0.00000189, grad_norm=1.4461. Time cost=188.39, Throughput=3.18 samples/s ETA=0.44h
2020-07-14 23:57:37,034 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_7800.params
2020-07-14 23:57:37,034 - root - INFO - Step: 7800/8161, Loss span/answer/total=0.3519/0.0528/0.4047, LR=0.00000166, grad_norm=1.4581. Time cost=196.22, Throughput=3.06 samples/s ETA=0.39h
2020-07-15 00:00:50,075 - root - INFO - Step: 7850/8161, Loss span/answer/total=0.4029/0.0635/0.4664, LR=0.00000143, grad_norm=0.9792. Time cost=193.04, Throughput=3.11 samples/s ETA=0.33h
2020-07-15 00:03:59,176 - root - INFO - Step: 7900/8161, Loss span/answer/total=0.4431/0.0649/0.5080, LR=0.00000120, grad_norm=1.0435. Time cost=189.10, Throughput=3.17 samples/s ETA=0.28h
2020-07-15 00:07:12,219 - root - INFO - Step: 7950/8161, Loss span/answer/total=0.4379/0.0521/0.4900, LR=0.00000097, grad_norm=1.5092. Time cost=193.04, Throughput=3.11 samples/s ETA=0.23h
2020-07-15 00:10:26,027 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_8000.params
2020-07-15 00:10:26,027 - root - INFO - Step: 8000/8161, Loss span/answer/total=0.4136/0.0473/0.4609, LR=0.00000074, grad_norm=1.1394. Time cost=193.81, Throughput=3.10 samples/s ETA=0.17h
2020-07-15 00:13:38,697 - root - INFO - Step: 8050/8161, Loss span/answer/total=0.3939/0.0683/0.4621, LR=0.00000051, grad_norm=12.0716. Time cost=192.67, Throughput=3.11 samples/s ETA=0.12h
2020-07-15 00:16:48,837 - root - INFO - Step: 8100/8161, Loss span/answer/total=0.3792/0.0419/0.4211, LR=0.00000028, grad_norm=2.0027. Time cost=190.14, Throughput=3.16 samples/s ETA=0.07h
2020-07-15 00:20:03,947 - root - INFO - Step: 8150/8161, Loss span/answer/total=0.3541/0.0522/0.4064, LR=0.00000005, grad_norm=1.3096. Time cost=195.11, Throughput=3.08 samples/s ETA=0.01h
2020-07-15 00:20:53,389 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_8161.params
2020-07-15 00:20:53,389 - root - INFO - Finish training step: 8161 within 8.763373736739158 hours
2020-07-15 00:20:53,395 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_8161.params
2020-07-15 00:20:53,395 - root - INFO - Finish training step: 8161 within 8.763424505790075 hours
2020-07-15 00:20:53,419 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_8161.params
2020-07-15 00:20:53,420 - root - INFO - Finish training step: 8161 within 8.763374406364228 hours
2020-07-15 00:20:53,535 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_8161.params
2020-07-15 00:20:53,535 - root - INFO - Finish training step: 8161 within 8.763468142085605 hours
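The per-window timings above are consistent with the final figure: each logged window covers 50 steps at roughly 185-210 s. A back-of-envelope check (the 193.5 s mean is eyeballed from the "Time cost" fields above, not taken from the script):

```python
# Rough wall-clock reconstruction for the gluonnlp + horovod run.
mean_window_cost_s = 193.5   # eyeballed average "Time cost" per 50-step window
steps_total = 8161
est_hours = steps_total / 50 * mean_window_cost_s / 3600
print(f"~{est_hours:.2f} h")  # ~8.77 h, vs. the logged 8.763 h
```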
2020-07-15 03:39:13,789 - root - INFO - GPU communication supported by KVStore
2020-07-15 03:39:13,789 - root - INFO - Starting inference without horovod on the first node on devices [gpu(0), gpu(1), gpu(2), gpu(3)]
2020-07-15 03:39:31,627 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixed parameters=354307072/0
2020-07-15 03:39:31,648 - root - INFO - Prepare dev data
2020-07-15 03:39:31,750 - root - INFO - Tokenize Data:
2020-07-15 03:39:41,793 - root - INFO - Done! Time spent: 10.04 seconds
2020-07-15 03:39:43,327 - root - INFO - Starting to evaluate the checkpoint fairseq_roberta_large_squad2.0_8161.params
2020-07-15 03:41:44,712 - root - INFO - [batch 10], Time cost=120.41, Throughput=15.95 samples/s, ETA=0.17h
2020-07-15 03:44:17,184 - root - INFO - [batch 20], Time cost=152.47, Throughput=12.59 samples/s, ETA=0.16h
2020-07-15 03:46:38,659 - root - INFO - [batch 30], Time cost=141.47, Throughput=13.57 samples/s, ETA=0.12h
2020-07-15 03:48:49,060 - root - INFO - [batch 40], Time cost=130.40, Throughput=14.72 samples/s, ETA=0.08h
2020-07-15 03:51:03,864 - root - INFO - [batch 50], Time cost=134.80, Throughput=14.24 samples/s, ETA=0.04h
2020-07-15 03:53:11,686 - root - INFO - [batch 60], Time cost=127.82, Throughput=15.02 samples/s, ETA=0.01h
2020-07-15 03:53:47,278 - root - INFO - Time cost=842.975556 s, Throughput=14.24 samples/s
2020-07-15 03:53:52,697 - root - INFO - The evaluated results are {"exact": 44.251663438052724, "f1": 47.47999540626087, "total": 11873, "HasAns_exact": 87.36504723346829, "HasAns_f1": 93.83096920690542, "HasAns_total": 5928, "NoAns_exact": 1.2615643397813288, "NoAns_f1": 1.2615643397813288, "NoAns_total": 5945, "best_exact": 85.88393834751116, "best_exact_thresh": -1.9132816791534424, "best_f1": 88.73247007018989, "best_f1_thresh": -1.6530208587646484}
2020-07-15 03:53:52,697 - root - INFO - The evaluated files are saved in roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28
2020-07-15 03:53:53,513 - root - INFO - The best evaluated results are {"exact": 44.251663438052724, "f1": 47.47999540626087, "total": 11873, "HasAns_exact": 87.36504723346829, "HasAns_f1": 93.83096920690542, "HasAns_total": 5928, "NoAns_exact": 1.2615643397813288, "NoAns_f1": 1.2615643397813288, "NoAns_total": 5945, "best_exact": 85.88393834751116, "best_exact_thresh": -1.9132816791534424, "best_f1": 88.73247007018989, "best_f1_thresh": -1.6530208587646484, "best_ckpt": "fairseq_roberta_large_squad2.0_8161.params"}
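The gap between the raw scores (exact=44.25) and the best_* fields (best_exact=85.88) comes from the SQuAD 2.0 null-answer threshold: with the default threshold of 0 nearly every question receives a span prediction, so almost all 5945 no-answer questions are scored wrong (NoAns_exact≈1.26), while best_exact/best_f1 re-score after sweeping a threshold over the null-score difference. A simplified sketch of that sweep, condensed from the official SQuAD 2.0 evaluate script (names and structure assumed, not the exact code):

```python
# Simplified threshold sweep behind "best_exact"/"best_exact_thresh".
# Condensed from the official SQuAD 2.0 evaluate logic; assumes every question
# admitted past the threshold receives a non-empty span prediction.
def find_best_thresh(scores, na_probs, qid_to_has_ans):
    """scores: qid -> 0/1 EM of the predicted span; na_probs: qid -> null-score diff."""
    # At threshold -inf everything is predicted "no answer": all no-answer qids correct.
    cur = best = sum(1 for q, has in qid_to_has_ans.items() if not has)
    best_thresh = 0.0
    for qid in sorted(na_probs, key=na_probs.get):  # admit span predictions one qid at a time
        cur += scores[qid] if qid_to_has_ans[qid] else -1  # gain span credit, lose null credit
        if cur > best:
            best, best_thresh = cur, na_probs[qid]
    return 100.0 * best / len(scores), best_thresh
```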
07/16/2020 05:44:07 - WARNING - __main__ - Process rank: 2, device: cuda:2, n_gpu: 1, distributed training: True, 16-bits training: False
07/16/2020 05:44:07 - WARNING - __main__ - Process rank: 3, device: cuda:3, n_gpu: 1, distributed training: True, 16-bits training: False
07/16/2020 05:44:07 - WARNING - __main__ - Process rank: 1, device: cuda:1, n_gpu: 1, distributed training: True, 16-bits training: False
07/16/2020 05:44:07 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, 16-bits training: False
07/16/2020 05:44:07 - INFO - transformers.configuration_utils - loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json from cache at /home/ubuntu/.cache/torch/transformers/c22e0b5bbb7c0cb93a87a2ae01263ae715b4c18d692b1740ce72cacaa99ad184.2d28da311092e99a05f9ee17520204614d60b0bfdb32f8a75644df7737b6a748
07/16/2020 05:44:07 - INFO - transformers.configuration_utils - Model config RobertaConfig {
"architectures": [
"RobertaForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"bos_token_id": 0,
"eos_token_id": 2,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 514,
"model_type": "roberta",
"num_attention_heads": 16,
"num_hidden_layers": 24,
"pad_token_id": 1,
"type_vocab_size": 1,
"vocab_size": 50265
}
07/16/2020 05:44:07 - INFO - transformers.configuration_utils - loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json from cache at /home/ubuntu/.cache/torch/transformers/c22e0b5bbb7c0cb93a87a2ae01263ae715b4c18d692b1740ce72cacaa99ad184.2d28da311092e99a05f9ee17520204614d60b0bfdb32f8a75644df7737b6a748
07/16/2020 05:44:07 - INFO - transformers.configuration_utils - Model config RobertaConfig {
"architectures": [
"RobertaForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"bos_token_id": 0,
"eos_token_id": 2,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 514,
"model_type": "roberta",
"num_attention_heads": 16,
"num_hidden_layers": 24,
"pad_token_id": 1,
"type_vocab_size": 1,
"vocab_size": 50265
}
07/16/2020 05:44:07 - INFO - transformers.tokenization_utils_base - loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json from cache at /home/ubuntu/.cache/torch/transformers/1ae1f5b6e2b22b25ccc04c000bb79ca847aa226d0761536b011cf7e5868f0655.ef00af9e673c7160b4d41cfda1f48c5f4cba57d5142754525572a846a1ab1b9b
07/16/2020 05:44:07 - INFO - transformers.tokenization_utils_base - loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt from cache at /home/ubuntu/.cache/torch/transformers/f8f83199a6270d582d6245dc100e99c4155de81c9745c6248077018fe01abcfb.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
07/16/2020 05:44:07 - INFO - transformers.modeling_utils - loading weights file https://cdn.huggingface.co/roberta-large-pytorch_model.bin from cache at /home/ubuntu/.cache/torch/transformers/2339ac1858323405dffff5156947669fed6f63a0c34cfab35bda4f78791893d2.fc7abf72755ecc4a75d0d336a93c1c63358d2334f5998ed326f3b0da380bf536
07/16/2020 05:44:20 - WARNING - transformers.modeling_utils - Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForQuestionAnswering: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
07/16/2020 05:44:20 - WARNING - transformers.modeling_utils - Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-large and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
07/16/2020 05:44:23 - INFO - __main__ - Training/evaluation parameters Namespace(adam_epsilon=1e-06, cache_dir='', config_name='', data_dir=None, device=device(type='cuda', index=0), do_eval=True, do_lower_case=False, do_train=True, doc_stride=128, eval_all_checkpoints=False, evaluate_during_training=False, fp16=False, fp16_opt_level='O1', gradient_accumulation_steps=6, lang_id=0, learning_rate=3e-05, local_rank=0, logging_steps=50, max_answer_length=30, max_grad_norm=1.0, max_query_length=64, max_seq_length=512, max_steps=-1, model_name_or_path='roberta-large', model_type='roberta', n_best_size=20, n_gpu=1, no_cuda=False, null_score_diff_threshold=0.0, num_train_epochs=3.0, output_dir='./examples/models/test/', overwrite_cache=False, overwrite_output_dir=False, per_gpu_eval_batch_size=12, per_gpu_train_batch_size=2, predict_file='/home/ubuntu/SQuAD_data/dev-v2.0.json', save_steps=2000, seed=42, server_ip='', server_port='', threads=20, tokenizer_name='', train_file='/home/ubuntu/SQuAD_data/train-v2.0.json', verbose_logging=False, version_2_with_negative=True, warmup_steps=1642, weight_decay=0.01)
07/16/2020 05:44:23 - INFO - __main__ - Loading features from cached file ./cached_train_roberta-large_512
07/16/2020 05:44:37 - WARNING - transformers.modeling_utils - Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForQuestionAnswering: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
07/16/2020 05:44:37 - WARNING - transformers.modeling_utils - Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-large and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
07/16/2020 05:44:37 - WARNING - transformers.modeling_utils - Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForQuestionAnswering: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
07/16/2020 05:44:37 - WARNING - transformers.modeling_utils - Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-large and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
07/16/2020 05:44:37 - WARNING - transformers.modeling_utils - Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForQuestionAnswering: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
07/16/2020 05:44:37 - WARNING - transformers.modeling_utils - Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-large and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
07/16/2020 05:44:54 - INFO - __main__ - Starting training
07/16/2020 05:44:54 - INFO - __main__ - ***** Running training *****
07/16/2020 05:44:54 - INFO - __main__ - Num examples = 131232
07/16/2020 05:44:54 - INFO - __main__ - Num Epochs = 3
07/16/2020 05:44:54 - INFO - __main__ - Instantaneous batch size per GPU = 2
07/16/2020 05:44:54 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 48
07/16/2020 05:44:54 - INFO - __main__ - Gradient Accumulation steps = 6
07/16/2020 05:44:54 - INFO - __main__ - Total optimization steps = 8202
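The batch arithmetic in this block is consistent with the Namespace logged above; a quick sanity check:

```python
# How the logged figures fit together for the huggingface + torch.distributed run.
num_examples = 131232
per_gpu_batch = 2    # per_gpu_train_batch_size
world_size = 4       # one process per GPU (ranks 0-3 above)
grad_accum = 6       # gradient_accumulation_steps
epochs = 3

iters_per_epoch = num_examples // (per_gpu_batch * world_size)  # 16404
global_batch = per_gpu_batch * world_size * grad_accum          # 48
opt_steps = iters_per_epoch // grad_accum * epochs              # 8202
print(iters_per_epoch, global_batch, opt_steps)
```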
Epoch: 0%| | 0/3 [00:00<?, ?it/s]
Iteration: 0%| | 0/16404 [00:00<?, ?it/s]
Iteration: 1%| | 92/16404 [02:00<5:55:35, 1.31s/it]
Iteration: 1%|▏ | 212/16404 [04:00<5:28:17, 1.22s/it]
/home/ubuntu/.local/lib/python3.6/site-packages/torch/optim/lr_scheduler.py:231: UserWarning: To get the last learning rate computed by the scheduler, please use `get_last_lr()`.
warnings.warn("To get the last learning rate computed by the scheduler, "
Iteration: 2%|▏ | 332/16404 [06:01<5:08:43, 1.15s/it]
Iteration: 3%|▎ | 452/16404 [08:01<4:54:27, 1.11s/it]
Iteration: 3%|▎ | 572/16404 [10:01<4:43:56, 1.08s/it]
Iteration: 4%|▍ | 692/16404 [12:02<4:36:00, 1.05s/it]
Iteration: 5%|▍ | 812/16404 [14:02<4:29:52, 1.04s/it]
Iteration: 6%|▌ | 932/16404 [16:02<4:25:00, 1.03s/it]
Iteration: 6%|▋ | 1053/16404 [18:03<4:20:40, 1.02s/it]
Iteration: 7%|▋ | 1175/16404 [20:04<4:16:20, 1.01s/it]
Iteration: 8%|▊ | 1297/16404 [22:04<4:12:44, 1.00s/it]
Iteration: 9%|▊ | 1419/16404 [24:05<4:09:30, 1.00it/s]
Iteration: 9%|▉ | 1541/16404 [26:05<4:06:44, 1.00it/s]
Iteration: 10%|█ | 1663/16404 [28:06<4:04:14, 1.01it/s]
Iteration: 11%|█ | 1785/16404 [30:07<4:01:51, 1.01it/s]
Iteration: 12%|█▏ | 1907/16404 [32:08<3:59:33, 1.01it/s]
Iteration: 12%|█▏ | 2029/16404 [34:08<3:57:25, 1.01it/s]
Iteration: 13%|█▎ | 2151/16404 [36:09<3:55:15, 1.01it/s]
Iteration: 14%|█▍ | 2273/16404 [38:10<3:53:07, 1.01it/s]
Iteration: 15%|█▍ | 2395/16404 [40:10<3:51:06, 1.01it/s]
Iteration: 15%|█▌ | 2517/16404 [42:11<3:48:55, 1.01it/s]
Iteration: 16%|█▌ | 2639/16404 [44:11<3:46:50, 1.01it/s]
Iteration: 17%|█▋ | 2761/16404 [46:12<3:44:46, 1.01it/s]
Iteration: 18%|█▊ | 2883/16404 [48:12<3:42:40, 1.01it/s]
Iteration: 18%|█▊ | 3005/16404 [50:14<3:41:28, 1.01it/s]
Iteration: 19%|█▉ | 3126/16404 [52:15<3:39:42, 1.01it/s]
Iteration: 20%|█▉ | 3247/16404 [54:15<3:37:40, 1.01it/s]
Iteration: 21%|██ | 3369/16404 [56:16<3:35:30, 1.01it/s]
Iteration: 21%|██▏ | 3491/16404 [58:16<3:33:19, 1.01it/s]
Iteration: 22%|██▏ | 3613/16404 [1:00:17<3:31:19, 1.01it/s]
Iteration: 23%|██▎ | 3735/16404 [1:02:18<3:29:13, 1.01it/s]
Iteration: 24%|██▎ | 3857/16404 [1:04:19<3:27:10, 1.01it/s]
Iteration: 24%|██▍ | 3979/16404 [1:06:20<3:25:11, 1.01it/s]
Iteration: 25%|██▌ | 4101/16404 [1:08:21<3:23:12, 1.01it/s]
Iteration: 26%|██▌ | 4223/16404 [1:10:22<3:21:15, 1.01it/s]
Iteration: 26%|██▋ | 4345/16404 [1:12:23<3:19:21, 1.01it/s]
Iteration: 27%|██▋ | 4466/16404 [1:14:23<3:17:23, 1.01it/s]
Iteration: 28%|██▊ | 4587/16404 [1:16:23<3:15:22, 1.01it/s]
Iteration: 29%|██▊ | 4708/16404 [1:18:23<3:13:24, 1.01it/s]
Iteration: 29%|██▉ | 4829/16404 [1:20:23<3:11:24, 1.01it/s]
Iteration: 30%|███ | 4950/16404 [1:22:23<3:09:29, 1.01it/s]
Iteration: 31%|███ | 5071/16404 [1:24:24<3:07:30, 1.01it/s]
Iteration: 32%|███▏ | 5192/16404 [1:26:24<3:05:33, 1.01it/s]
Iteration: 32%|███▏ | 5313/16404 [1:28:24<3:03:32, 1.01it/s]
Iteration: 33%|███▎ | 5434/16404 [1:30:24<3:01:28, 1.01it/s]
Iteration: 34%|███▍ | 5555/16404 [1:32:24<2:59:27, 1.01it/s]
Iteration: 35%|███▍ | 5676/16404 [1:34:24<2:57:28, 1.01it/s]
Iteration: 35%|███▌ | 5797/16404 [1:36:24<2:55:30, 1.01it/s]
Iteration: 36%|███▌ | 5919/16404 [1:38:25<2:53:24, 1.01it/s]
Iteration: 37%|███▋ | 6040/16404 [1:40:25<2:51:24, 1.01it/s]
Iteration: 38%|███▊ | 6161/16404 [1:42:26<2:49:26, 1.01it/s]
Iteration: 38%|███▊ | 6282/16404 [1:44:26<2:47:34, 1.01it/s]
Iteration: 39%|███▉ | 6403/16404 [1:46:26<2:45:36, 1.01it/s]
Iteration: 40%|███▉ | 6524/16404 [1:48:26<2:43:37, 1.01it/s]
Iteration: 41%|████ | 6645/16404 [1:50:27<2:41:36, 1.01it/s]
Iteration: 41%|████ | 6766/16404 [1:52:27<2:39:38, 1.01it/s]
Iteration: 42%|████▏ | 6887/16404 [1:54:27<2:37:35, 1.01it/s]
Iteration: 43%|████▎ | 7008/16404 [1:56:27<2:35:37, 1.01it/s]
Iteration: 43%|████▎ | 7129/16404 [1:58:28<2:33:38, 1.01it/s]
Iteration: 44%|████▍ | 7250/16404 [2:00:28<2:31:38, 1.01it/s]
Iteration: 45%|████▍ | 7371/16404 [2:02:28<2:29:37, 1.01it/s]
Iteration: 46%|████▌ | 7492/16404 [2:04:29<2:27:37, 1.01it/s]
Iteration: 46%|████▋ | 7613/16404 [2:06:29<2:25:38, 1.01it/s]
Iteration: 47%|████▋ | 7734/16404 [2:08:29<2:23:40, 1.01it/s]
Iteration: 48%|████▊ | 7855/16404 [2:10:29<2:21:37, 1.01it/s]
Iteration: 49%|████▊ | 7976/16404 [2:12:30<2:19:35, 1.01it/s]
Iteration: 49%|████▉ | 8097/16404 [2:14:30<2:17:34, 1.01it/s]
Iteration: 50%|█████ | 8218/16404 [2:16:30<2:15:36, 1.01it/s]
Iteration: 51%|█████ | 8339/16404 [2:18:30<2:13:35, 1.01it/s]
Iteration: 52%|█████▏ | 8460/16404 [2:20:31<2:11:37, 1.01it/s]
Iteration: 52%|█████▏ | 8581/16404 [2:22:31<2:09:34, 1.01it/s]
Iteration: 53%|█████▎ | 8702/16404 [2:24:31<2:07:36, 1.01it/s]
Iteration: 54%|█████▍ | 8823/16404 [2:26:32<2:05:35, 1.01it/s]
Iteration: 55%|█████▍ | 8944/16404 [2:28:32<2:03:33, 1.01it/s]
Iteration: 55%|█████▌ | 9065/16404 [2:30:32<2:01:36, 1.01it/s]
Iteration: 56%|█████▌ | 9186/16404 [2:32:32<1:59:36, 1.01it/s]
Iteration: 57%|█████▋ | 9307/16404 [2:34:33<1:57:34, 1.01it/s]
Iteration: 57%|█████▋ | 9428/16404 [2:36:33<1:55:32, 1.01it/s]
Iteration: 58%|█████▊ | 9549/16404 [2:38:33<1:53:31, 1.01it/s]
Iteration: 59%|█████▉ | 9670/16404 [2:40:33<1:51:31, 1.01it/s]
Iteration: 60%|█████▉ | 9791/16404 [2:42:33<1:49:29, 1.01it/s]
Iteration: 60%|██████ | 9912/16404 [2:44:34<1:47:31, 1.01it/s]
Iteration: 61%|██████ | 10033/16404 [2:46:34<1:45:28, 1.01it/s]
Iteration: 62%|██████▏ | 10155/16404 [2:48:35<1:43:22, 1.01it/s]
Iteration: 63%|██████▎ | 10276/16404 [2:50:35<1:41:21, 1.01it/s]
Iteration: 63%|██████▎ | 10397/16404 [2:52:35<1:39:20, 1.01it/s]
Iteration: 64%|██████▍ | 10518/16404 [2:54:35<1:37:22, 1.01it/s]
Iteration: 65%|██████▍ | 10640/16404 [2:56:36<1:35:19, 1.01it/s]
Iteration: 66%|██████▌ | 10761/16404 [2:58:36<1:33:18, 1.01it/s]
Iteration: 66%|██████▋ | 10882/16404 [3:00:36<1:31:18, 1.01it/s]
Iteration: 67%|██████▋ | 11003/16404 [3:02:36<1:29:19, 1.01it/s]
Iteration: 68%|██████▊ | 11124/16404 [3:04:36<1:27:20, 1.01it/s]
Iteration: 69%|██████▊ | 11245/16404 [3:06:36<1:25:19, 1.01it/s]
Iteration: 69%|██████▉ | 11366/16404 [3:08:36<1:23:19, 1.01it/s]
Iteration: 70%|███████ | 11487/16404 [3:10:36<1:21:19, 1.01it/s]
Iteration: 71%|███████ | 11608/16404 [3:12:37<1:19:19, 1.01it/s]
Iteration: 72%|███████▏ | 11729/16404 [3:14:37<1:17:21, 1.01it/s]
Iteration: 72%|███████▏ | 11850/16404 [3:16:37<1:15:23, 1.01it/s]
Iteration: 73%|███████▎ | 11971/16404 [3:18:37<1:13:24, 1.01it/s]
07/16/2020 09:03:54 - INFO - transformers.configuration_utils - Configuration saved in ./examples/models/test/checkpoint-2000/config.json
07/16/2020 09:03:56 - INFO - transformers.modeling_utils - Model weights saved in ./examples/models/test/checkpoint-2000/pytorch_model.bin
07/16/2020 09:03:56 - INFO - __main__ - Saving model checkpoint to ./examples/models/test/checkpoint-2000
/home/ubuntu/.local/lib/python3.6/site-packages/torch/optim/lr_scheduler.py:200: UserWarning: Please also save or load the state of the optimizer when saving or loading the scheduler.
warnings.warn(SAVE_STATE_WARNING, UserWarning)
07/16/2020 09:03:58 - INFO - __main__ - Saving optimizer and scheduler states to ./examples/models/test/checkpoint-2000
Iteration: 74%|███████▎ | 12092/16404 [3:20:42<1:12:12, 1.00s/it]
Iteration: 74%|███████▍ | 12213/16404 [3:22:43<1:09:59, 1.00s/it]
Iteration: 75%|███████▌ | 12333/16404 [3:24:43<1:07:58, 1.00s/it]
Iteration: 76%|███████▌ | 12454/16404 [3:26:43<1:05:49, 1.00it/s]
Iteration: 77%|███████▋ | 12575/16404 [3:28:43<1:03:38, 1.00it/s]
Iteration: 77%|███████▋ | 12696/16404 [3:30:44<1:01:41, 1.00it/s]
Iteration: 78%|███████▊ | 12817/16404 [3:32:45<59:41, 1.00it/s] 
Iteration: 79%|███████▉ | 12938/16404 [3:34:46<57:41, 1.00it/s]
Iteration: 80%|███████▉ | 13059/16404 [3:36:47<55:41, 1.00it/s]
Iteration: 80%|████████ | 13180/16404 [3:38:48<53:40, 1.00it/s]
Iteration: 81%|████████ | 13301/16404 [3:40:49<51:38, 1.00it/s]
Iteration: 82%|████████▏ | 13422/16404 [3:42:50<49:39, 1.00it/s]
Iteration: 83%|████████▎ | 13543/16404 [3:44:50<47:38, 1.00it/s]
Iteration: 83%|████████▎ | 13664/16404 [3:46:51<45:36, 1.00it/s]
Iteration: 84%|████████▍ | 13785/16404 [3:48:52<43:36, 1.00it/s]
Iteration: 85%|████████▍ | 13906/16404 [3:50:53<41:35, 1.00it/s]
Iteration: 86%|████████▌ | 14027/16404 [3:52:54<39:35, 1.00it/s]
Iteration: 86%|████████▌ | 14148/16404 [3:54:55<37:35, 1.00it/s]
Iteration: 87%|████████▋ | 14269/16404 [3:56:56<35:34, 1.00it/s]
Iteration: 88%|████████▊ | 14390/16404 [3:58:57<33:33, 1.00it/s]
Iteration: 88%|████████▊ | 14511/16404 [4:00:58<31:32, 1.00it/s]
Iteration: 89%|████████▉ | 14632/16404 [4:02:59<29:31, 1.00it/s]
Iteration: 90%|████████▉ | 14753/16404 [4:05:00<27:30, 1.00it/s]
Iteration: 91%|█████████ | 14874/16404 [4:07:01<25:30, 1.00s/it]
Iteration: 91%|█████████▏| 14995/16404 [4:09:02<23:28, 1.00it/s]
Iteration: 92%|█████████▏| 15116/16404 [4:11:03<21:27, 1.00it/s]
Iteration: 93%|█████████▎| 15237/16404 [4:13:04<19:26, 1.00it/s]
Iteration: 94%|█████████▎| 15358/16404 [4:15:05<17:25, 1.00it/s]
Iteration: 94%|█████████▍| 15479/16404 [4:17:06<15:24, 1.00it/s]
Iteration: 95%|█████████▌| 15600/16404 [4:19:07<13:23, 1.00it/s]
Iteration: 96%|█████████▌| 15721/16404 [4:21:07<11:22, 1.00it/s]
Iteration: 97%|█████████▋| 15842/16404 [4:23:08<09:21, 1.00it/s]
Iteration: 97%|█████████▋| 15963/16404 [4:25:09<07:20, 1.00it/s]
Iteration: 98%|█████████▊| 16084/16404 [4:27:09<05:19, 1.00it/s]
Iteration: 99%|█████████▉| 16205/16404 [4:29:10<03:18, 1.00it/s]
Iteration: 100%|█████████▉| 16326/16404 [4:31:11<01:17, 1.00it/s]
Iteration: 100%|██████████| 16404/16404 [4:32:29<00:00, 1.00it/s]
Epoch: 33%|███▎ | 1/3 [4:32:29<9:04:58, 16349.13s/it]
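Epoch 1 completes in 4:32:29 at a steady ~1.00 it/s, and tqdm projects another 9:04:58 for the remaining two epochs. Extrapolating linearly (checkpoint saves add a little on top):

```python
# Rough total wall clock for the huggingface run, from the epoch-1 bar above.
epoch_1_s = 4 * 3600 + 32 * 60 + 29     # 4:32:29 (tqdm also shows 16349.13 s/it per epoch)
print(f"~{3 * epoch_1_s / 3600:.1f} h") # ~13.6 h for 3 epochs
```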
Iteration: 0%| | 0/16404 [00:00<?, ?it/s]
Iteration: 1%| | 121/16404 [02:00<4:30:50, 1.00it/s]
Iteration: 1%|▏ | 242/16404 [04:01<4:28:50, 1.00it/s]
Iteration: 2%|▏ | 363/16404 [06:02<4:26:49, 1.00it/s]
Iteration: 3%|▎ | 484/16404 [08:03<4:24:54, 1.00it/s]
Iteration: 4%|▎ | 605/16404 [10:04<4:22:57, 1.00it/s]
Iteration: 4%|▍ | 726/16404 [12:05<4:21:04, 1.00it/s]
Iteration: 5%|▌ | 847/16404 [14:06<4:19:03, 1.00it/s]
Iteration: 6%|▌ | 968/16404 [16:06<4:17:03, 1.00it/s]
Iteration: 7%|▋ | 1089/16404 [18:07<4:15:04, 1.00it/s]
Iteration: 7%|▋ | 1209/16404 [20:07<4:13:07, 1.00it/s]
Iteration: 8%|▊ | 1330/16404 [22:08<4:11:06, 1.00it/s]
Iteration: 9%|▉ | 1451/16404 [24:09<4:09:06, 1.00it/s]
Iteration: 10%|▉ | 1571/16404 [26:09<4:07:08, 1.00it/s]
Iteration: 10%|█ | 1692/16404 [28:10<4:05:11, 1.00it/s]
Iteration: 11%|█ | 1812/16404 [30:10<4:03:12, 1.00s/it]
Iteration: 12%|█▏ | 1933/16404 [32:11<4:01:10, 1.00it/s]
Iteration: 13%|█▎ | 2053/16404 [34:11<3:59:10, 1.00it/s]
Iteration: 13%|█▎ | 2174/16404 [36:12<3:57:07, 1.00it/s]
Iteration: 14%|█▍ | 2295/16404 [38:13<3:55:03, 1.00it/s]
Iteration: 15%|█▍ | 2416/16404 [40:14<3:53:00, 1.00it/s]
Iteration: 15%|█▌ | 2537/16404 [42:15<3:50:58, 1.00it/s]
Iteration: 16%|█▌ | 2658/16404 [44:16<3:49:02, 1.00it/s]
Iteration: 17%|█▋ | 2778/16404 [46:16<3:47:05, 1.00it/s]
Iteration: 18%|█▊ | 2899/16404 [48:17<3:45:02, 1.00it/s]
Iteration: 18%|█▊ | 3019/16404 [50:17<3:43:05, 1.00s/it]
Iteration: 19%|█▉ | 3140/16404 [52:17<3:40:33, 1.00it/s]
Iteration: 20%|█▉ | 3261/16404 [54:18<3:38:47, 1.00it/s]
Iteration: 21%|██ | 3382/16404 [56:19<3:36:46, 1.00it/s]
Iteration: 21%|██▏ | 3503/16404 [58:20<3:34:45, 1.00it/s]
Iteration: 22%|██▏ | 3624/16404 [1:00:21<3:32:53, 1.00it/s]
Iteration: 23%|██▎ | 3745/16404 [1:02:22<3:30:53, 1.00it/s]
Iteration: 24%|██▎ | 3866/16404 [1:04:23<3:28:52, 1.00it/s]
Iteration: 24%|██▍ | 3987/16404 [1:06:24<3:26:54, 1.00it/s]
Iteration: 25%|██▌ | 4108/16404 [1:08:25<3:24:52, 1.00it/s]
Iteration: 26%|██▌ | 4229/16404 [1:10:26<3:22:52, 1.00it/s]
Iteration: 27%|██▋ | 4350/16404 [1:12:27<3:20:54, 1.00s/it]
Iteration: 27%|██▋ | 4470/16404 [1:14:27<3:18:55, 1.00s/it]
Iteration: 28%|██▊ | 4591/16404 [1:16:28<3:16:51, 1.00it/s]
Iteration: 29%|██▊ | 4712/16404 [1:18:29<3:14:49, 1.00it/s]
Iteration: 29%|██▉ | 4833/16404 [1:20:30<3:12:48, 1.00it/s]
Iteration: 30%|███ | 4954/16404 [1:22:31<3:10:46, 1.00it/s]
Iteration: 31%|███ | 5075/16404 [1:24:32<3:08:44, 1.00it/s]
Iteration: 32%|███▏ | 5196/16404 [1:26:33<3:06:49, 1.00s/it]
Iteration: 32%|███▏ | 5317/16404 [1:28:34<3:04:47, 1.00s/it]
Iteration: 33%|███▎ | 5438/16404 [1:30:35<3:02:37, 1.00it/s]
Iteration: 34%|███▍ | 5560/16404 [1:32:35<2:59:59, 1.00it/s]
Iteration: 35%|███▍ | 5682/16404 [1:34:36<2:57:45, 1.01it/s]
Iteration: 35%|███▌ | 5803/16404 [1:36:37<2:55:59, 1.00it/s]
Iteration: 36%|███▌ | 5924/16404 [1:38:38<2:53:56, 1.00it/s]
Iteration: 37%|███▋ | 6045/16404 [1:40:38<2:51:52, 1.00it/s]
Iteration: 38%|███▊ | 6166/16404 [1:42:38<2:49:49, 1.00it/s]
Iteration: 38%|███▊ | 6287/16404 [1:44:39<2:47:46, 1.00it/s]
Iteration: 39%|███▉ | 6408/16404 [1:46:39<2:45:49, 1.00it/s]
Iteration: 40%|███▉ | 6529/16404 [1:48:39<2:43:43, 1.01it/s]
Iteration: 41%|████ | 6650/16404 [1:50:40<2:41:39, 1.01it/s]
Iteration: 41%|████▏ | 6771/16404 [1:52:40<2:39:34, 1.01it/s]
Iteration: 42%|████▏ | 6892/16404 [1:54:40<2:37:33, 1.01it/s]
Iteration: 43%|████▎ | 7013/16404 [1:56:40<2:35:31, 1.01it/s]
Iteration: 43%|████▎ | 7134/16404 [1:58:41<2:33:35, 1.01it/s]
Iteration: 44%|████▍ | 7255/16404 [2:00:41<2:31:36, 1.01it/s]
Iteration: 45%|████▍ | 7376/16404 [2:02:41<2:29:33, 1.01it/s]
Iteration: 46%|████▌ | 7497/16404 [2:04:41<2:27:32, 1.01it/s]
07/16/2020 12:23:37 - INFO - transformers.configuration_utils - Configuration saved in ./examples/models/test/checkpoint-4000/config.json
07/16/2020 12:23:38 - INFO - transformers.modeling_utils - Model weights saved in ./examples/models/test/checkpoint-4000/pytorch_model.bin
07/16/2020 12:23:38 - INFO - __main__ - Saving model checkpoint to ./examples/models/test/checkpoint-4000
07/16/2020 12:23:41 - INFO - __main__ - Saving optimizer and scheduler states to ./examples/models/test/checkpoint-4000
Iteration: 46%|████▋ | 7618/16404 [2:06:45<2:26:53, 1.00s/it]
Iteration: 47%|████▋ | 7739/16404 [2:08:46<2:24:37, 1.00s/it]
Iteration: 48%|████▊ | 7860/16404 [2:10:47<2:22:32, 1.00s/it]
Iteration: 49%|████▊ | 7981/16404 [2:12:48<2:20:26, 1.00s/it]
Iteration: 49%|████▉ | 8102/16404 [2:14:49<2:18:22, 1.00s/it]
Iteration: 50%|█████ | 8223/16404 [2:16:50<2:16:14, 1.00it/s]
Iteration: 51%|█████ | 8344/16404 [2:18:50<2:14:11, 1.00it/s]
Iteration: 52%|█████▏ | 8465/16404 [2:20:51<2:12:08, 1.00it/s]
Iteration: 52%|█████▏ | 8586/16404 [2:22:52<2:10:10, 1.00it/s]
Iteration: 53%|█████▎ | 8707/16404 [2:24:53<2:08:07, 1.00it/s]
Iteration: 54%|█████▍ | 8828/16404 [2:26:54<2:06:07, 1.00it/s]
Iteration: 55%|█████▍ | 8949/16404 [2:28:55<2:04:06, 1.00it/s]
Iteration: 55%|█████▌ | 9070/16404 [2:30:55<2:02:03, 1.00it/s]
Iteration: 56%|█████▌ | 9191/16404 [2:32:56<2:00:01, 1.00it/s]
Iteration: 57%|█████▋ | 9312/16404 [2:34:57<1:58:02, 1.00it/s]
Iteration: 58%|█████▊ | 9433/16404 [2:36:58<1:55:59, 1.00it/s]
Iteration: 58%|█████▊ | 9554/16404 [2:38:58<1:53:56, 1.00it/s]
Iteration: 59%|█████▉ | 9675/16404 [2:40:59<1:51:57, 1.00it/s]
Iteration: 60%|█████▉ | 9796/16404 [2:43:00<1:49:57, 1.00it/s]
Iteration: 60%|██████ | 9917/16404 [2:45:01<1:47:57, 1.00it/s]
Iteration: 61%|██████ | 10038/16404 [2:47:02<1:46:00, 1.00it/s]
Iteration: 62%|██████▏ | 10159/16404 [2:49:03<1:43:59, 1.00it/s]
Iteration: 63%|██████▎ | 10280/16404 [2:51:04<1:41:58, 1.00it/s]
Iteration: 63%|██████▎ | 10401/16404 [2:53:05<1:39:58, 1.00it/s]
Iteration: 64%|██████▍ | 10522/16404 [2:55:06<1:37:56, 1.00it/s]
Iteration: 65%|██████▍ | 10643/16404 [2:57:06<1:35:52, 1.00it/s]
Iteration: 66%|██████▌ | 10764/16404 [2:59:07<1:33:50, 1.00it/s]
Iteration: 66%|██████▋ | 10885/16404 [3:01:08<1:31:49, 1.00it/s]
Iteration: 67%|██████▋ | 11006/16404 [3:03:09<1:29:48, 1.00it/s]
Iteration: 68%|██████▊ | 11127/16404 [3:05:09<1:27:48, 1.00it/s]
Iteration: 69%|██████▊ | 11248/16404 [3:07:10<1:25:47, 1.00it/s]
Iteration: 69%|██████▉ | 11369/16404 [3:09:11<1:23:47, 1.00it/s]
Iteration: 70%|███████ | 11490/16404 [3:11:12<1:21:44, 1.00it/s]
Iteration: 71%|███████ | 11611/16404 [3:13:12<1:19:41, 1.00it/s]
Iteration: 72%|███████▏ | 11732/16404 [3:15:13<1:17:38, 1.00it/s]
Iteration: 72%|███████▏ | 11853/16404 [3:17:14<1:15:40, 1.00it/s]
Iteration: 73%|███████▎ | 11974/16404 [3:19:15<1:13:41, 1.00it/s]
Iteration: 74%|███████▎ | 12095/16404 [3:21:15<1:11:41, 1.00it/s]
Iteration: 74%|███████▍ | 12216/16404 [3:23:16<1:09:41, 1.00it/s]
Iteration: 75%|███████▌ | 12337/16404 [3:25:17<1:07:39, 1.00it/s]
Iteration: 76%|███████▌ | 12458/16404 [3:27:18<1:05:38, 1.00it/s]
Iteration: 77%|███████▋ | 12579/16404 [3:29:18<1:03:36, 1.00it/s]
Iteration: 77%|███████▋ | 12700/16404 [3:31:19<1:01:35, 1.00it/s]
Iteration: 78%|███████▊ | 12821/16404 [3:33:20<59:36, 1.00it/s] 
Iteration: 79%|███████▉ | 12942/16404 [3:35:21<57:37, 1.00it/s]
Iteration: 80%|███████▉ | 13063/16404 [3:37:22<55:35, 1.00it/s]
Iteration: 80%|████████ | 13184/16404 [3:39:22<53:35, 1.00it/s]
Iteration: 81%|████████ | 13305/16404 [3:41:23<51:33, 1.00it/s]
Iteration: 82%|████████▏ | 13426/16404 [3:43:24<49:31, 1.00it/s]
Iteration: 83%|████████▎ | 13547/16404 [3:45:24<47:28, 1.00it/s]
Iteration: 83%|████████▎ | 13668/16404 [3:47:25<45:28, 1.00it/s]
Iteration: 84%|████████▍ | 13789/16404 [3:49:26<43:27, 1.00it/s]
Iteration: 85%|████████▍ | 13910/16404 [3:51:26<41:26, 1.00it/s]
Iteration: 86%|████████▌ | 14031/16404 [3:53:27<39:27, 1.00it/s]
Iteration: 86%|████████▋ | 14152/16404 [3:55:28<37:27, 1.00it/s]
Iteration: 87%|████████▋ | 14273/16404 [3:57:29<35:26, 1.00it/s]
Iteration: 88%|████████▊ | 14394/16404 [3:59:29<33:26, 1.00it/s]
Iteration: 88%|████████▊ | 14515/16404 [4:01:30<31:25, 1.00it/s]
Iteration: 89%|████████▉ | 14636/16404 [4:03:31<29:24, 1.00it/s]
Iteration: 90%|████████▉ | 14757/16404 [4:05:32<27:24, 1.00it/s]
Iteration: 91%|█████████ | 14878/16404 [4:07:33<25:23, 1.00it/s]
Iteration: 91%|█████████▏| 14999/16404 [4:09:34<23:23, 1.00it/s]
Iteration: 92%|█████████▏| 15120/16404 [4:11:35<21:22, 1.00it/s]
Iteration: 93%|█████████▎| 15241/16404 [4:13:36<19:21, 1.00it/s]
Iteration: 94%|█████████▎| 15362/16404 [4:15:37<17:21, 1.00it/s]
Iteration: 94%|█████████▍| 15483/16404 [4:17:37<15:20, 1.00it/s]
Iteration: 95%|█████████▌| 15604/16404 [4:19:38<13:19, 1.00it/s]
Iteration: 96%|█████████▌| 15725/16404 [4:21:39<11:18, 1.00it/s]
Iteration: 97%|█████████▋| 15846/16404 [4:23:41<09:17, 1.00it/s]
Iteration: 97%|█████████▋| 15967/16404 [4:25:42<07:17, 1.00s/it]
Iteration: 98%|█████████▊| 16087/16404 [4:27:42<05:17, 1.00s/it]
Iteration: 99%|█████████▉| 16208/16404 [4:29:43<03:15, 1.00it/s]
Iteration: 100%|█████████▉| 16329/16404 [4:31:43<01:14, 1.00it/s]
Iteration: 100%|██████████| 16404/16404 [4:32:58<00:00, 1.00it/s]
Epoch: 67%|██████▋ | 2/3 [9:05:28<4:32:38, 16358.08s/it]
Iteration: 0%| | 0/16404 [00:00<?, ?it/s]
Iteration: 1%| | 121/16404 [02:00<4:31:09, 1.00it/s]
Iteration: 1%|▏ | 242/16404 [04:01<4:29:10, 1.00it/s]
Iteration: 2%|▏ | 363/16404 [06:02<4:27:08, 1.00it/s]
Iteration: 3%|▎ | 484/16404 [08:03<4:25:09, 1.00it/s]
Iteration: 4%|▎ | 605/16404 [10:04<4:23:09, 1.00it/s]
Iteration: 4%|▍ | 726/16404 [12:05<4:21:15, 1.00it/s]
Iteration: 5%|▌ | 846/16404 [14:05<4:19:16, 1.00it/s]
Iteration: 6%|▌ | 967/16404 [16:06<4:17:14, 1.00it/s]
Iteration: 7%|▋ | 1088/16404 [18:07<4:15:10, 1.00it/s]
Iteration: 7%|▋ | 1209/16404 [20:08<4:13:09, 1.00it/s]
Iteration: 8%|▊ | 1330/16404 [22:09<4:11:04, 1.00it/s]
Iteration: 9%|▉ | 1451/16404 [24:10<4:09:02, 1.00it/s]
Iteration: 10%|▉ | 1572/16404 [26:11<4:07:02, 1.00it/s]
Iteration: 10%|█ | 1693/16404 [28:12<4:05:01, 1.00it/s]
Iteration: 11%|█ | 1814/16404 [30:13<4:03:00, 1.00it/s]
Iteration: 12%|█▏ | 1935/16404 [32:13<4:00:57, 1.00it/s]
Iteration: 13%|█▎ | 2056/16404 [34:14<3:58:56, 1.00it/s]
Iteration: 13%|█▎ | 2177/16404 [36:15<3:56:52, 1.00it/s]
Iteration: 14%|█▍ | 2298/16404 [38:16<3:54:56, 1.00it/s]
Iteration: 15%|█▍ | 2419/16404 [40:17<3:52:50, 1.00it/s]
Iteration: 15%|█▌ | 2540/16404 [42:18<3:50:47, 1.00it/s]
Iteration: 16%|█▌ | 2661/16404 [44:19<3:48:48, 1.00it/s]
Iteration: 17%|█▋ | 2782/16404 [46:20<3:46:52, 1.00it/s]
Iteration: 18%|█▊ | 2903/16404 [48:21<3:44:51, 1.00it/s]
Iteration: 18%|█▊ | 3024/16404 [50:22<3:42:54, 1.00it/s]
Iteration: 19%|█▉ | 3145/16404 [52:23<3:40:53, 1.00it/s]
07/16/2020 15:43:26 - INFO - transformers.configuration_utils - Configuration saved in ./examples/models/test/checkpoint-6000/config.json
07/16/2020 15:43:27 - INFO - transformers.modeling_utils - Model weights saved in ./examples/models/test/checkpoint-6000/pytorch_model.bin
07/16/2020 15:43:27 - INFO - __main__ - Saving model checkpoint to ./examples/models/test/checkpoint-6000
07/16/2020 15:43:30 - INFO - __main__ - Saving optimizer and scheduler states to ./examples/models/test/checkpoint-6000
Iteration: 20%|█▉ | 3266/16404 [54:27<3:40:56, 1.01s/it]
Iteration: 21%|██ | 3387/16404 [56:28<3:38:16, 1.01s/it]
Iteration: 21%|██▏ | 3508/16404 [58:29<3:35:47, 1.00s/it]
Iteration: 22%|██▏ | 3628/16404 [1:00:29<3:33:33, 1.00s/it]
Iteration: 23%|██▎ | 3748/16404 [1:02:29<3:31:24, 1.00s/it]
Iteration: 24%|██▎ | 3869/16404 [1:04:30<3:29:11, 1.00s/it]
Iteration: 24%|██▍ | 3990/16404 [1:06:31<3:27:03, 1.00s/it]
Iteration: 25%|██▌ | 4111/16404 [1:08:32<3:24:51, 1.00it/s]
Iteration: 26%|██▌ | 4232/16404 [1:10:33<3:22:47, 1.00it/s]
Iteration: 27%|██▋ | 4353/16404 [1:12:33<3:20:40, 1.00it/s]
Iteration: 27%|██▋ | 4474/16404 [1:14:34<3:18:39, 1.00it/s]
Iteration: 28%|██▊ | 4595/16404 [1:16:35<3:16:37, 1.00it/s]
Iteration: 29%|██▊ | 4716/16404 [1:18:36<3:14:42, 1.00it/s]
Iteration: 29%|██▉ | 4837/16404 [1:20:37<3:12:42, 1.00it/s]
Iteration: 30%|███ | 4958/16404 [1:22:38<3:10:43, 1.00it/s]
Iteration: 31%|███ | 5079/16404 [1:24:39<3:08:42, 1.00it/s]
Iteration: 32%|███▏ | 5200/16404 [1:26:40<3:06:41, 1.00it/s]
Iteration: 32%|███▏ | 5321/16404 [1:28:41<3:04:38, 1.00it/s]
Iteration: 33%|███▎ | 5442/16404 [1:30:42<3:02:40, 1.00it/s]
Iteration: 34%|███▍ | 5563/16404 [1:32:43<3:00:33, 1.00it/s]
Iteration: 35%|███▍ | 5684/16404 [1:34:44<2:58:26, 1.00it/s]
Iteration: 35%|███▌ | 5805/16404 [1:36:44<2:56:23, 1.00it/s]
Iteration: 36%|███▌ | 5926/16404 [1:38:45<2:54:22, 1.00it/s]
Iteration: 37%|███▋ | 6047/16404 [1:40:46<2:52:21, 1.00it/s]
Iteration: 38%|███▊ | 6168/16404 [1:42:47<2:50:26, 1.00it/s]
Iteration: 38%|███▊ | 6289/16404 [1:44:48<2:48:27, 1.00it/s]
Iteration: 39%|███▉ | 6410/16404 [1:46:49<2:46:26, 1.00it/s]
Iteration: 40%|███▉ | 6531/16404 [1:48:50<2:44:20, 1.00it/s]
Iteration: 41%|████ | 6652/16404 [1:50:51<2:42:20, 1.00it/s]
Iteration: 41%|████▏ | 6773/16404 [1:52:51<2:40:20, 1.00it/s]
Iteration: 42%|████▏ | 6894/16404 [1:54:52<2:38:22, 1.00it/s]
Iteration: 43%|████▎ | 7015/16404 [1:56:53<2:36:22, 1.00it/s]
Iteration: 44%|████▎ | 7136/16404 [1:58:54<2:34:18, 1.00it/s]
Iteration: 44%|████▍ | 7257/16404 [2:00:55<2:32:14, 1.00it/s]
Iteration: 45%|████▍ | 7378/16404 [2:02:56<2:30:13, 1.00it/s]
Iteration: 46%|████▌ | 7499/16404 [2:04:57<2:28:11, 1.00it/s]
Iteration: 46%|████▋ | 7620/16404 [2:06:57<2:26:14, 1.00it/s]
Iteration: 47%|████▋ | 7741/16404 [2:08:58<2:24:10, 1.00it/s]
Iteration: 48%|████▊ | 7862/16404 [2:10:59<2:22:09, 1.00it/s]
Iteration: 49%|████▊ | 7983/16404 [2:13:00<2:20:09, 1.00it/s]
Iteration: 49%|████▉ | 8104/16404 [2:15:01<2:18:12, 1.00it/s]
Iteration: 50%|█████ | 8225/16404 [2:17:02<2:16:10, 1.00it/s]
Iteration: 51%|█████ | 8346/16404 [2:19:03<2:14:12, 1.00it/s]
Iteration: 52%|█████▏ | 8467/16404 [2:21:04<2:12:09, 1.00it/s]
Iteration: 52%|█████▏ | 8588/16404 [2:23:04<2:10:06, 1.00it/s]
Iteration: 53%|█████▎ | 8709/16404 [2:25:05<2:08:05, 1.00it/s]
Iteration: 54%|█████▍ | 8830/16404 [2:27:06<2:06:04, 1.00it/s]
Iteration: 55%|█████▍ | 8951/16404 [2:29:07<2:04:04, 1.00it/s]
Iteration: 55%|█████▌ | 9072/16404 [2:31:08<2:02:05, 1.00it/s]
Iteration: 56%|█████▌ | 9193/16404 [2:33:09<2:00:04, 1.00it/s]
Iteration: 57%|█████▋ | 9314/16404 [2:35:10<1:58:03, 1.00it/s]
Iteration: 58%|█████▊ | 9435/16404 [2:37:10<1:56:00, 1.00it/s]
Iteration: 58%|█████▊ | 9556/16404 [2:39:11<1:53:58, 1.00it/s]
Iteration: 59%|█████▉ | 9677/16404 [2:41:12<1:51:57, 1.00it/s]
Iteration: 60%|█████▉ | 9798/16404 [2:43:13<1:49:59, 1.00it/s]
Iteration: 60%|██████ | 9919/16404 [2:45:14<1:47:59, 1.00it/s]
Iteration: 61%|██████ | 10040/16404 [2:47:15<1:45:56, 1.00it/s]
Iteration: 62%|██████▏ | 10161/16404 [2:49:16<1:43:54, 1.00it/s]
Iteration: 63%|██████▎ | 10282/16404 [2:51:16<1:41:54, 1.00it/s]
Iteration: 63%|██████▎ | 10403/16404 [2:53:17<1:39:54, 1.00it/s]
Iteration: 64%|██████▍ | 10525/16404 [2:55:18<1:37:39, 1.00it/s]
Iteration: 65%|██████▍ | 10646/16404 [2:57:19<1:35:38, 1.00it/s]
Iteration: 66%|██████▌ | 10767/16404 [2:59:19<1:33:37, 1.00it/s]
Iteration: 66%|██████▋ | 10888/16404 [3:01:20<1:31:36, 1.00it/s]
Iteration: 67%|██████▋ | 11009/16404 [3:03:20<1:29:34, 1.00it/s]
Iteration: 68%|██████▊ | 11130/16404 [3:05:21<1:27:36, 1.00it/s]
Iteration: 69%|██████▊ | 11251/16404 [3:07:22<1:25:33, 1.00it/s]
Iteration: 69%|██████▉ | 11372/16404 [3:09:22<1:23:32, 1.00it/s]
Iteration: 70%|███████ | 11493/16404 [3:11:23<1:21:31, 1.00it/s]
Iteration: 71%|███████ | 11614/16404 [3:13:23<1:19:29, 1.00it/s]
Iteration: 72%|███████▏ | 11735/16404 [3:15:24<1:17:29, 1.00it/s]
Iteration: 72%|███████▏ | 11856/16404 [3:17:24<1:15:30, 1.00it/s]
Iteration: 73%|███████▎ | 11977/16404 [3:19:25<1:13:28, 1.00it/s]
Iteration: 74%|███████▍ | 12098/16404 [3:21:25<1:11:26, 1.00it/s]
Iteration: 74%|███████▍ | 12219/16404 [3:23:25<1:09:25, 1.00it/s]
Iteration: 75%|███████▌ | 12340/16404 [3:25:26<1:07:26, 1.00it/s]
Iteration: 76%|███████▌ | 12461/16404 [3:27:26<1:05:25, 1.00it/s]
Iteration: 77%|███████▋ | 12582/16404 [3:29:27<1:03:26, 1.00it/s]
Iteration: 77%|███████▋ | 12703/16404 [3:31:27<1:01:26, 1.00it/s]
Iteration: 78%|███████▊ | 12824/16404 [3:33:28<59:25, 1.00it/s] 
Iteration: 79%|███████▉ | 12945/16404 [3:35:29<57:25, 1.00it/s]
Iteration: 80%|███████▉ | 13066/16404 [3:37:29<55:24, 1.00it/s]
Iteration: 80%|████████ | 13187/16404 [3:39:30<53:24, 1.00it/s]
Iteration: 81%|████████ | 13308/16404 [3:41:30<51:24, 1.00it/s]
Iteration: 82%|████████▏ | 13429/16404 [3:43:31<49:24, 1.00it/s]
Iteration: 83%|████████▎ | 13550/16404 [3:45:31<47:24, 1.00it/s]
Iteration: 83%|████████▎ | 13671/16404 [3:47:32<45:24, 1.00it/s]
Iteration: 84%|████████▍ | 13792/16404 [3:49:33<43:23, 1.00it/s]
Iteration: 85%|████████▍ | 13913/16404 [3:51:33<41:21, 1.00it/s]
Iteration: 86%|████████▌ | 14034/16404 [3:53:34<39:21, 1.00it/s]
Iteration: 86%|████████▋ | 14155/16404 [3:55:34<37:19, 1.00it/s]
Iteration: 87%|████████▋ | 14276/16404 [3:57:35<35:19, 1.00it/s]
Iteration: 88%|████████▊ | 14397/16404 [3:59:35<33:17, 1.00it/s]
Iteration: 89%|████████▊ | 14518/16404 [4:01:35<31:16, 1.00it/s]
Iteration: 89%|████████▉ | 14639/16404 [4:03:36<29:16, 1.01it/s]
Iteration: 90%|████████▉ | 14760/16404 [4:05:36<27:16, 1.00it/s]
Iteration: 91%|█████████ | 14881/16404 [4:07:37<25:16, 1.00it/s]
Iteration: 91%|█████████▏| 15002/16404 [4:09:37<23:15, 1.00it/s]
Iteration: 92%|█████████▏| 15123/16404 [4:11:38<21:16, 1.00it/s]
07/16/2020 19:03:03 - INFO - transformers.configuration_utils - Configuration saved in ./examples/models/test/checkpoint-8000/config.json
07/16/2020 19:03:04 - INFO - transformers.modeling_utils - Model weights saved in ./examples/models/test/checkpoint-8000/pytorch_model.bin
07/16/2020 19:03:04 - INFO - __main__ - Saving model checkpoint to ./examples/models/test/checkpoint-8000
07/16/2020 19:03:07 - INFO - __main__ - Saving optimizer and scheduler states to ./examples/models/test/checkpoint-8000
Iteration: 93%|█████████▎| 15244/16404 [4:13:43<19:27, 1.01s/it]
Iteration: 94%|█████████▎| 15365/16404 [4:15:43<17:22, 1.00s/it]
Iteration: 94%|█████████▍| 15486/16404 [4:17:43<15:18, 1.00s/it]
Iteration: 95%|█████████▌| 15607/16404 [4:19:43<13:15, 1.00it/s]
Iteration: 96%|█████████▌| 15728/16404 [4:21:43<11:13, 1.00it/s]
Iteration: 97%|█████████▋| 15849/16404 [4:23:44<09:12, 1.00it/s]
Iteration: 97%|█████████▋| 15970/16404 [4:25:44<07:11, 1.01it/s]
Iteration: 98%|█████████▊| 16091/16404 [4:27:44<05:11, 1.01it/s]
Iteration: 99%|█████████▉| 16212/16404 [4:29:44<03:10, 1.01it/s]
Iteration: 100%|█████████▉| 16333/16404 [4:31:44<01:10, 1.01it/s]
Iteration: 100%|██████████| 16404/16404 [4:32:55<00:00, 1.00it/s]
Epoch: 100%|██████████| 3/3 [13:38:23<00:00, 16363.28s/it]
Epoch: 100%|██████████| 3/3 [13:38:23<00:00, 16367.83s/it]
07/16/2020 19:23:17 - INFO - __main__ - Training done in total 13.639871 hours
07/16/2020 19:23:17 - INFO - __main__ - global_step = 8203, average loss = 0.8324675223705326
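A quick back-of-envelope check of the logged step count, using only numbers that appear in this log (this sketch is not part of the original run):

# Hedged sanity check: derive the expected optimizer step count from the
# run's own numbers (16404 iterations/epoch, 3 epochs, accumulation of 6).
iters_per_epoch = 16404
epochs = 3
accumulation = 6  # matches --gradient_accumulation_steps in the launch command below
steps = (iters_per_epoch // accumulation) * epochs
print(steps)                   # 8202 -- within one of the logged global_step = 8203
print(16363 * epochs / 3600)   # ~13.64 h from ~16,363 s/epoch, consistent with "13.639871 hours"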
07/16/2020 19:23:17 - INFO - __main__ - Saving model checkpoint to ./examples/models/test/
07/16/2020 19:23:17 - INFO - transformers.configuration_utils - Configuration saved in ./examples/models/test/config.json
07/16/2020 19:23:18 - INFO - transformers.modeling_utils - Model weights saved in ./examples/models/test/pytorch_model.bin
07/16/2020 19:23:19 - INFO - transformers.configuration_utils - loading configuration file ./examples/models/test/config.json
07/16/2020 19:23:19 - INFO - transformers.configuration_utils - Model config RobertaConfig {
"architectures": [
"RobertaForQuestionAnswering"
],
"attention_probs_dropout_prob": 0.1,
"bos_token_id": 0,
"eos_token_id": 2,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 514,
"model_type": "roberta",
"num_attention_heads": 16,
"num_hidden_layers": 24,
"pad_token_id": 1,
"type_vocab_size": 1,
"vocab_size": 50265
}
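The dumped config is enough to estimate the model's size. Below is a rough, hedged parameter count; it ignores biases, LayerNorm weights, and token-type embeddings, so it slightly undercounts:

# Back-of-envelope parameter count from the RobertaConfig above.
vocab, hidden, layers, inter, max_pos = 50265, 1024, 24, 4096, 514
embeddings = vocab * hidden + max_pos * hidden         # ~52.0M
per_layer = 4 * hidden * hidden + 2 * hidden * inter   # attention + FFN, ~12.6M
total = embeddings + layers * per_layer
print(f"{total / 1e6:.0f}M parameters")                # ~354M, the expected scale for roberta-large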
07/16/2020 19:23:19 - INFO - transformers.modeling_utils - loading weights file ./examples/models/test/pytorch_model.bin
07/16/2020 19:23:30 - INFO - transformers.modeling_utils - All model checkpoint weights were used when initializing RobertaForQuestionAnswering.
07/16/2020 19:23:30 - INFO - transformers.modeling_utils - All the weights of RobertaForQuestionAnswering were initialized from the model checkpoint at ./examples/models/test/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForQuestionAnswering for predictions without further training.
07/16/2020 19:23:30 - INFO - transformers.configuration_utils - loading configuration file ./examples/models/test/config.json
07/16/2020 19:23:30 - INFO - transformers.configuration_utils - Model config RobertaConfig { ... } (identical to the config dump above)
07/16/2020 19:23:30 - INFO - transformers.tokenization_utils_base - Model name './examples/models/test/' not found in model shortcut name list (roberta-base, roberta-large, roberta-large-mnli, distilroberta-base, roberta-base-openai-detector, roberta-large-openai-detector). Assuming './examples/models/test/' is a path, a model identifier, or url to a directory containing tokenizer files.
07/16/2020 19:23:30 - INFO - transformers.tokenization_utils_base - Didn't find file ./examples/models/test/added_tokens.json. We won't load it.
07/16/2020 19:23:30 - INFO - transformers.tokenization_utils_base - Didn't find file ./examples/models/test/tokenizer.json. We won't load it.
07/16/2020 19:23:30 - INFO - transformers.tokenization_utils_base - loading file ./examples/models/test/vocab.json
07/16/2020 19:23:30 - INFO - transformers.tokenization_utils_base - loading file ./examples/models/test/merges.txt
07/16/2020 19:23:30 - INFO - transformers.tokenization_utils_base - loading file None
07/16/2020 19:23:30 - INFO - transformers.tokenization_utils_base - loading file ./examples/models/test/special_tokens_map.json
07/16/2020 19:23:30 - INFO - transformers.tokenization_utils_base - loading file ./examples/models/test/tokenizer_config.json
07/16/2020 19:23:30 - INFO - transformers.tokenization_utils_base - loading file None
07/16/2020 19:23:31 - INFO - __main__ - Loading checkpoints saved during training for evaluation
07/16/2020 19:23:31 - INFO - __main__ - Evaluate the following checkpoints: ['./examples/models/test/']
07/16/2020 19:23:31 - INFO - transformers.configuration_utils - loading configuration file ./examples/models/test/config.json
07/16/2020 19:23:31 - INFO - transformers.configuration_utils - Model config RobertaConfig { ... } (identical to the config dump above)
07/16/2020 19:23:31 - INFO - transformers.modeling_utils - loading weights file ./examples/models/test/pytorch_model.bin
07/16/2020 19:23:42 - INFO - transformers.modeling_utils - All model checkpoint weights were used when initializing RobertaForQuestionAnswering.
07/16/2020 19:23:42 - INFO - transformers.modeling_utils - All the weights of RobertaForQuestionAnswering were initialized from the model checkpoint at ./examples/models/test/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForQuestionAnswering for predictions without further training.
07/16/2020 19:23:43 - INFO - __main__ - Creating features from dataset file at .
0%| | 0/35 [00:00<?, ?it/s]
6%|▌ | 2/35 [00:00<00:02, 15.59it/s]
11%|█▏ | 4/35 [00:00<00:01, 16.37it/s]
17%|█▋ | 6/35 [00:00<00:01, 14.56it/s]
20%|██ | 7/35 [00:00<00:02, 12.32it/s]
26%|██▌ | 9/35 [00:00<00:02, 12.15it/s]
31%|███▏ | 11/35 [00:00<00:02, 9.65it/s]
34%|███▍ | 12/35 [00:01<00:02, 9.71it/s]
40%|████ | 14/35 [00:01<00:02, 10.16it/s]
46%|████▌ | 16/35 [00:01<00:01, 10.60it/s]
51%|█████▏ | 18/35 [00:01<00:01, 11.03it/s]
57%|█████▋ | 20/35 [00:01<00:01, 12.33it/s]
63%|██████▎ | 22/35 [00:01<00:00, 13.02it/s]
69%|██████▊ | 24/35 [00:02<00:01, 10.97it/s]
74%|███████▍ | 26/35 [00:02<00:00, 9.41it/s]
80%|████████ | 28/35 [00:02<00:00, 10.58it/s]
86%|████████▌ | 30/35 [00:02<00:00, 10.29it/s]
91%|█████████▏| 32/35 [00:03<00:00, 7.96it/s]
94%|█████████▍| 33/35 [00:03<00:00, 7.96it/s]
97%|█████████▋| 34/35 [00:03<00:00, 8.32it/s]
100%|██████████| 35/35 [00:03<00:00, 8.53it/s]
100%|██████████| 35/35 [00:03<00:00, 10.12it/s]
convert squad examples to features: 0%| | 0/11873 [00:00<?, ?it/s]
convert squad examples to features: 0%| | 1/11873 [00:00<1:22:46, 2.39it/s]
convert squad examples to features: 6%|▌ | 705/11873 [00:00<54:30, 3.41it/s]
convert squad examples to features: 9%|▉ | 1089/11873 [00:00<36:51, 4.88it/s]
convert squad examples to features: 11%|█ | 1313/11873 [00:00<25:17, 6.96it/s]
convert squad examples to features: 13%|█▎ | 1601/11873 [00:00<17:15, 9.92it/s]
convert squad examples to features: 17%|█▋ | 2017/11873 [00:01<11:36, 14.16it/s]
convert squad examples to features: 19%|█▉ | 2247/11873 [00:01<07:57, 20.15it/s]
convert squad examples to features: 21%|██ | 2462/11873 [00:01<05:28, 28.68it/s]
convert squad examples to features: 23%|██▎ | 2753/11873 [00:01<03:43, 40.73it/s]
convert squad examples to features: 25%|██▌ | 2977/11873 [00:01<02:36, 57.01it/s]
convert squad examples to features: 27%|██▋ | 3152/11873 [00:02<01:52, 77.38it/s]
convert squad examples to features: 28%|██▊ | 3289/11873 [00:02<01:26, 99.75it/s]
convert squad examples to features: 37%|███▋ | 4385/11873 [00:02<00:52, 141.94it/s]
convert squad examples to features: 40%|████ | 4782/11873 [00:02<00:35, 199.46it/s]
convert squad examples to features: 44%|████▍ | 5249/11873 [00:02<00:24, 272.69it/s]
convert squad examples to features: 48%|████▊ | 5665/11873 [00:03<00:16, 372.15it/s]
convert squad examples to features: 51%|█████▏ | 6113/11873 [00:03<00:11, 509.78it/s]
convert squad examples to features: 54%|█████▍ | 6438/11873 [00:03<00:08, 651.40it/s]
convert squad examples to features: 57%|█████▋ | 6722/11873 [00:03<00:07, 720.07it/s]
convert squad examples to features: 61%|██████▏ | 7297/11873 [00:03<00:04, 971.27it/s]
convert squad examples to features: 64%|██████▍ | 7613/11873 [00:04<00:03, 1161.24it/s]
convert squad examples to features: 67%|██████▋ | 7898/11873 [00:04<00:03, 1253.93it/s]
convert squad examples to features: 69%|██████▉ | 8193/11873 [00:04<00:02, 1458.06it/s]
convert squad examples to features: 71%|███████ | 8436/11873 [00:04<00:02, 1484.41it/s]
convert squad examples to features: 74%|███████▍ | 8801/11873 [00:04<00:01, 1762.85it/s]
convert squad examples to features: 76%|███████▋ | 9057/11873 [00:04<00:01, 1894.91it/s]
convert squad examples to features: 78%|███████▊ | 9313/11873 [00:04<00:01, 2035.52it/s]
convert squad examples to features: 81%|████████ | 9569/11873 [00:04<00:01, 1938.89it/s]
convert squad examples to features: 82%|████████▏ | 9791/11873 [00:05<00:01, 1991.99it/s]
convert squad examples to features: 85%|████████▌ | 10145/11873 [00:05<00:00, 2238.72it/s]
convert squad examples to features: 88%|████████▊ | 10393/11873 [00:05<00:00, 1644.06it/s]
convert squad examples to features: 91%|█████████ | 10817/11873 [00:05<00:00, 1212.40it/s]
convert squad examples to features: 97%|█████████▋| 11521/11873 [00:06<00:00, 1369.69it/s]
convert squad examples to features: 100%|██████████| 11873/11873 [00:06<00:00, 1872.36it/s]
add example index and unique id: 0%| | 0/11873 [00:00<?, ?it/s]
add example index and unique id: 100%|██████████| 11873/11873 [00:00<00:00, 884748.81it/s]
07/16/2020 19:23:54 - INFO - __main__ - Saving features into cached file ./cached_dev_roberta-large_512
07/16/2020 19:24:11 - INFO - __main__ - ***** Running evaluation *****
07/16/2020 19:24:11 - INFO - __main__ - Num examples = 12161
07/16/2020 19:24:11 - INFO - __main__ - Batch size = 12
Evaluating: 0%| | 0/1014 [00:00<?, ?it/s]
Evaluating: 8%|▊ | 86/1014 [02:00<21:44, 1.41s/it]
Evaluating: 17%|█▋ | 170/1014 [04:00<19:52, 1.41s/it]
Evaluating: 25%|██▌ | 255/1014 [06:02<17:55, 1.42s/it]
Evaluating: 33%|███▎ | 337/1014 [08:02<16:09, 1.43s/it]
Evaluating: 42%|████▏ | 422/1014 [10:03<14:06, 1.43s/it]
Evaluating: 50%|█████ | 507/1014 [12:03<12:02, 1.43s/it]
Evaluating: 58%|█████▊ | 592/1014 [14:03<09:59, 1.42s/it]
Evaluating: 67%|██████▋ | 677/1014 [16:04<07:59, 1.42s/it]
Evaluating: 75%|███████▌ | 762/1014 [18:04<05:57, 1.42s/it]
Evaluating: 84%|████████▎ | 848/1014 [20:05<03:54, 1.42s/it]
Evaluating: 92%|█████████▏| 933/1014 [22:06<01:54, 1.41s/it]
Evaluating: 100%|██████████| 1014/1014 [23:59<00:00, 1.42s/it]
07/16/2020 19:48:11 - INFO - __main__ - Evaluation done in total 1439.415438 secs (0.118363 sec per example)
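The per-example figure is just the total time divided by the feature count logged above; a quick check against the numbers in this log:

import math
num_examples = 12161    # "Num examples" in the log above
batch_size = 12         # "Batch size" in the log above
eval_secs = 1439.415438
print(math.ceil(num_examples / batch_size))  # 1014, the length of the Evaluating bar
print(eval_secs / num_examples)              # ~0.118363 s per example, as logged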
07/16/2020 19:48:11 - INFO - transformers.data.metrics.squad_metrics - Writing predictions to: ./examples/models/test/predictions_.json
07/16/2020 19:48:11 - INFO - transformers.data.metrics.squad_metrics - Writing nbest to: ./examples/models/test/nbest_predictions_.json
07/16/2020 19:48:11 - INFO - transformers.data.metrics.squad_metrics - Writing null_log_odds to: ./examples/models/test/null_odds_.json
07/16/2020 19:48:38 - INFO - __main__ - Results: {'exact': 84.88166428029984, 'f1': 88.08101045556123, 'total': 11873, 'HasAns_exact': 81.5114709851552, 'HasAns_f1': 87.91933824879894, 'HasAns_total': 5928, 'NoAns_exact': 88.24222035323801, 'NoAns_f1': 88.24222035323801, 'NoAns_total': 5945, 'best_exact': 84.88166428029984, 'best_exact_thresh': 0.0, 'best_f1': 88.08101045556107, 'best_f1_thresh': 0.0}
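The overall 'exact' score in this dict is the example-weighted mean of the HasAns and NoAns splits; a small check using only the reported numbers:

has_exact, has_n = 81.5114709851552, 5928    # HasAns_exact, HasAns_total
no_exact, no_n = 88.24222035323801, 5945     # NoAns_exact, NoAns_total
print((has_exact * has_n + no_exact * no_n) / (has_n + no_n))  # 84.8816642803..., the reported 'exact'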
*****************************************
Setting the OMP_NUM_THREADS environment variable to 1 by default for each process to avoid overloading your system; tune this variable further for optimal performance in your application as needed.
*****************************************
#!/bin/bash
# GluonNLP + Horovod fine-tuning script for roberta-large on SQuAD 2.0
set -e
set -x
export TASK=SQUAD
export SQUAD_VERSION=2.0
export MODEL_NAME=large
export SQUAD_DATA=/home/ubuntu/SQuAD_data
export BS=2
export ACCUMULATE=6
GBS=$(($BS * $ACCUMULATE)) # per-worker effective batch (12), used only in the output dir name; global batch across 4 workers = 48
export LR=3e-5
export MSL=512
export WD=0.01
export EP=3
export SEED=28
export MGN=0.1
export WUR=0.2
export OUTPUT_DIR=roberta/${TASK}${SQUAD_VERSION}_${MODEL_NAME}_${GBS}_${LR}_${WD}_${EP}_${MGN}_${WUR}_${SEED}
pip3 install numpy
set +x
mpirun -np 4 -H localhost:4 -bind-to none -map-by slot python3 -m run_squad \
--model_name=fairseq_roberta_${MODEL_NAME} \
--do_eval \
--do_train \
--data_dir=${SQUAD_DATA} \
--output_dir=${OUTPUT_DIR} \
--gpus=0,1,2,3 \
--num_accumulate=${ACCUMULATE} \
--version=${SQUAD_VERSION} \
--batch_size=${BS} \
--lr=${LR} \
--wd=${WD} \
--seed=${SEED} \
--max_seq_length=${MSL} \
--eval_batch_size=48 \
--save_interval=200 \
--log_interval=50 \
--max_saved_ckpt=6 \
--epochs=${EP} \
--warmup_ratio=${WUR} \
--max_grad_norm=${MGN} \
--comm_backend horovod
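For reference, the effective global batch size of the mpirun command above works out to 48, matching the hyper-parameter summary; a hedged reading of the flags (note the GBS variable in the script captures only the per-worker value of 12):

per_gpu_bs = 2   # --batch_size
accumulate = 6   # --num_accumulate
workers = 4      # mpirun -np 4, one Horovod worker per GPU (--gpus=0,1,2,3)
print(per_gpu_bs * accumulate * workers)  # 48 -- the global batch size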
# HuggingFace Transformers + torch.distributed fine-tuning script for roberta-large on SQuAD 2.0
export SQUAD_DIR=/home/ubuntu/SQuAD_data
python3 -m torch.distributed.launch --nproc_per_node=4 ./examples/question-answering/run_squad.py \
--model_type roberta \
--model_name_or_path roberta-large \
--do_train \
--do_eval \
--version_2_with_negative \
--train_file $SQUAD_DIR/train-v2.0.json \
--predict_file $SQUAD_DIR/dev-v2.0.json \
--learning_rate 3e-5 \
--weight_decay 0.01 \
--num_train_epochs 3 \
--warmup_steps 1642 \
--adam_epsilon 1e-6 \
--max_seq_length 512 \
--doc_stride 128 \
--output_dir ./examples/models/test/ \
--per_gpu_train_batch_size=2 \
--per_gpu_eval_batch_size=12 \
--gradient_accumulation_steps=6 \
--threads 20 \
--logging_steps 50 \
--save_steps 2000 \
--overwrite_cache
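The torch.distributed.launch command mirrors the same effective batch size, and its fixed warmup_steps corresponds to the Horovod script's warmup_ratio of 0.2; a hedged cross-check using the logged step total:

per_gpu_bs, accumulation, procs = 2, 6, 4  # --per_gpu_train_batch_size, --gradient_accumulation_steps, --nproc_per_node
print(per_gpu_bs * accumulation * procs)   # 48, same global batch as the Horovod run
print(1642 / 8203)                         # ~0.20: --warmup_steps 1642 over global_step 8203 ≈ warmup_ratio 0.2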