Last active
July 17, 2020 01:18
-
-
Save zheyuye/5fbf24a99ce57f1c10fbca69ba8fa5b0 to your computer and use it in GitHub Desktop.
Speed comparison: huggingface + torch.distributed (13.6 hours) vs. gluonnlp + horovod (8.76 hours). Resources: AWS g4.12xlarge, CUDA 10.1 (V10.1.243). Model: RoBERTa-large. Hyper-parameters: global batch size = 48. Results — gluon: EM/F1 = 85.88/88.73; huggingface: EM/F1 = 84.88/88.08.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2020-07-14 08:24:58,197 - root - INFO - GPU communication supported by horovod | |
2020-07-14 08:24:58,197 - root - INFO - GPU communication supported by horovod | |
2020-07-14 08:24:58,197 - root - INFO - GPU communication supported by horovod | |
2020-07-14 08:24:58,197 - root - INFO - GPU communication supported by horovod | |
2020-07-14 08:25:06,274 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0 | |
2020-07-14 08:25:06,286 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0 | |
2020-07-14 08:25:06,298 - root - INFO - Prepare training data | |
2020-07-14 08:25:06,317 - root - INFO - Prepare training data | |
2020-07-14 08:25:06,340 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0 | |
2020-07-14 08:25:06,381 - root - INFO - Prepare training data | |
2020-07-14 08:25:06,413 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0 | |
2020-07-14 08:25:06,500 - root - INFO - Prepare training data | |
2020-07-14 08:25:08,757 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson | |
2020-07-14 08:25:08,758 - root - INFO - Processing the Training data: | |
2020-07-14 08:25:08,771 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson | |
2020-07-14 08:25:08,772 - root - INFO - Processing the Training data: | |
2020-07-14 08:25:08,793 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson | |
2020-07-14 08:25:08,794 - root - INFO - Processing the Training data: | |
2020-07-14 08:25:08,810 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson | |
2020-07-14 08:25:08,811 - root - INFO - Processing the Training data: | |
2020-07-14 08:25:09,365 - root - INFO - Done! #Unreliable Span=0 / #Mismatched Answer=0 / #Total=11873 | |
2020-07-14 08:25:09,367 - root - INFO - Before Chunking, #Train/Is Impossible = 11873/5945 | |
2020-07-14 08:25:09,367 - root - INFO - After Chunking, #Train Sample/Is Impossible = 12006/12006 | |
2020-07-14 08:25:09,367 - root - INFO - Creating distributed trainer... | |
2020-07-14 08:25:09,371 - root - INFO - Done! #Unreliable Span=0 / #Mismatched Answer=0 / #Total=11873 | |
2020-07-14 08:25:09,373 - root - INFO - Before Chunking, #Train/Is Impossible = 11873/5945 | |
2020-07-14 08:25:09,373 - root - INFO - After Chunking, #Train Sample/Is Impossible = 12006/12006 | |
2020-07-14 08:25:09,374 - root - INFO - Creating distributed trainer... | |
2020-07-14 08:25:09,380 - root - INFO - Using gradient accumulation. Effective global batch size = 48 | |
2020-07-14 08:25:09,386 - root - INFO - Using gradient accumulation. Effective global batch size = 48 | |
2020-07-14 08:25:09,400 - root - INFO - Done! #Unreliable Span=0 / #Mismatched Answer=0 / #Total=11873 | |
2020-07-14 08:25:09,402 - root - INFO - Before Chunking, #Train/Is Impossible = 11873/5945 | |
2020-07-14 08:25:09,403 - root - INFO - After Chunking, #Train Sample/Is Impossible = 12006/12006 | |
2020-07-14 08:25:09,403 - root - INFO - Creating distributed trainer... | |
2020-07-14 08:25:09,416 - root - INFO - Using gradient accumulation. Effective global batch size = 48 | |
2020-07-14 08:25:09,418 - root - INFO - Done! #Unreliable Span=0 / #Mismatched Answer=0 / #Total=11873 | |
2020-07-14 08:25:09,420 - root - INFO - Before Chunking, #Train/Is Impossible = 11873/5945 | |
2020-07-14 08:25:09,420 - root - INFO - After Chunking, #Train Sample/Is Impossible = 12006/12006 | |
2020-07-14 08:25:09,420 - root - INFO - Creating distributed trainer... | |
2020-07-14 08:25:09,422 - root - INFO - #Total Training Steps=750, Warmup=150, Save Interval=200 | |
2020-07-14 08:25:09,430 - root - INFO - #Total Training Steps=750, Warmup=150, Save Interval=200 | |
2020-07-14 08:25:09,433 - root - INFO - Using gradient accumulation. Effective global batch size = 48 | |
2020-07-14 08:25:09,458 - root - INFO - #Total Training Steps=750, Warmup=150, Save Interval=200 | |
2020-07-14 08:25:09,476 - root - INFO - #Total Training Steps=750, Warmup=150, Save Interval=200 | |
2020-07-14 08:28:37,071 - root - INFO - Step: 50/750, Loss span/answer/total=1.5229/0.0954/1.6184, LR=0.00001000, grad_norm=0.0102. Time cost=207.61, Throughput=2.89 samples/s ETA=0.81h | |
2020-07-14 08:32:13,367 - root - INFO - Step: 100/750, Loss span/answer/total=0.1254/0.0058/0.1312, LR=0.00002000, grad_norm=0.0170. Time cost=216.30, Throughput=2.77 samples/s ETA=0.77h | |
2020-07-14 08:35:48,985 - root - INFO - Step: 150/750, Loss span/answer/total=0.0079/0.0005/0.0083, LR=0.00003000, grad_norm=0.0235. Time cost=215.62, Throughput=2.78 samples/s ETA=0.71h | |
2020-07-14 08:42:47,430 - root - INFO - GPU communication supported by horovod | |
2020-07-14 08:42:47,430 - root - INFO - GPU communication supported by horovod | |
2020-07-14 08:42:47,430 - root - INFO - GPU communication supported by horovod | |
2020-07-14 08:42:47,431 - root - INFO - GPU communication supported by horovod | |
2020-07-14 08:42:55,415 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0 | |
2020-07-14 08:42:55,433 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0 | |
2020-07-14 08:42:55,441 - root - INFO - Prepare training data | |
2020-07-14 08:42:55,458 - root - INFO - Prepare training data | |
2020-07-14 08:42:55,572 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0 | |
2020-07-14 08:42:55,618 - root - INFO - Prepare training data | |
2020-07-14 08:42:55,631 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0 | |
2020-07-14 08:42:55,670 - root - INFO - Prepare training data | |
2020-07-14 08:43:22,759 - root - INFO - GPU communication supported by horovod | |
2020-07-14 08:43:22,759 - root - INFO - GPU communication supported by horovod | |
2020-07-14 08:43:22,759 - root - INFO - GPU communication supported by horovod | |
2020-07-14 08:43:22,759 - root - INFO - GPU communication supported by horovod | |
2020-07-14 08:43:30,723 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0 | |
2020-07-14 08:43:30,748 - root - INFO - Prepare training data | |
2020-07-14 08:43:30,796 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0 | |
2020-07-14 08:43:30,824 - root - INFO - Prepare training data | |
2020-07-14 08:43:30,832 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0 | |
2020-07-14 08:43:30,870 - root - INFO - Prepare training data | |
2020-07-14 08:43:30,872 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0 | |
2020-07-14 08:43:30,927 - root - INFO - Prepare training data | |
2020-07-14 08:43:51,700 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson | |
2020-07-14 08:43:51,701 - root - INFO - Processing the Training data: | |
2020-07-14 08:43:51,870 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson | |
2020-07-14 08:43:51,870 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson | |
2020-07-14 08:43:51,871 - root - INFO - Processing the Training data: | |
2020-07-14 08:43:51,871 - root - INFO - Processing the Training data: | |
2020-07-14 08:43:51,934 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson | |
2020-07-14 08:43:51,935 - root - INFO - Processing the Training data: | |
2020-07-14 08:43:57,968 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319 | |
2020-07-14 08:43:57,989 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498 | |
2020-07-14 08:43:57,990 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697 | |
2020-07-14 08:43:57,990 - root - INFO - Creating distributed trainer... | |
2020-07-14 08:43:58,002 - root - INFO - Using gradient accumulation. Effective global batch size = 48 | |
2020-07-14 08:43:58,044 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200 | |
2020-07-14 08:43:58,094 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319 | |
2020-07-14 08:43:58,116 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498 | |
2020-07-14 08:43:58,116 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697 | |
2020-07-14 08:43:58,116 - root - INFO - Creating distributed trainer... | |
2020-07-14 08:43:58,129 - root - INFO - Using gradient accumulation. Effective global batch size = 48 | |
2020-07-14 08:43:58,140 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319 | |
2020-07-14 08:43:58,161 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498 | |
2020-07-14 08:43:58,161 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697 | |
2020-07-14 08:43:58,161 - root - INFO - Creating distributed trainer... | |
2020-07-14 08:43:58,163 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319 | |
2020-07-14 08:43:58,171 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200 | |
2020-07-14 08:43:58,174 - root - INFO - Using gradient accumulation. Effective global batch size = 48 | |
2020-07-14 08:43:58,185 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498 | |
2020-07-14 08:43:58,185 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697 | |
2020-07-14 08:43:58,185 - root - INFO - Creating distributed trainer... | |
2020-07-14 08:43:58,198 - root - INFO - Using gradient accumulation. Effective global batch size = 48 | |
2020-07-14 08:43:58,218 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200 | |
2020-07-14 08:43:58,242 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200 | |
2020-07-14 08:47:11,580 - root - INFO - Step: 50/8161, Loss span/answer/total=4.5936/0.3121/4.9057, LR=0.00000092, grad_norm=0.8805. Time cost=193.50, Throughput=3.10 samples/s ETA=8.72h | |
2020-07-14 08:50:21,126 - root - INFO - Step: 100/8161, Loss span/answer/total=3.9519/0.2963/4.2482, LR=0.00000184, grad_norm=0.8129. Time cost=189.55, Throughput=3.17 samples/s ETA=8.58h | |
2020-07-14 08:53:40,545 - root - INFO - Step: 150/8161, Loss span/answer/total=3.5912/0.2979/3.8891, LR=0.00000276, grad_norm=1.1070. Time cost=199.42, Throughput=3.01 samples/s ETA=8.64h | |
2020-07-14 08:56:52,530 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_200.params | |
2020-07-14 08:56:52,530 - root - INFO - Step: 200/8161, Loss span/answer/total=2.5279/0.2770/2.8049, LR=0.00000368, grad_norm=1.7652. Time cost=191.98, Throughput=3.13 samples/s ETA=8.56h | |
2020-07-14 09:00:01,731 - root - INFO - Step: 250/8161, Loss span/answer/total=1.6767/0.2792/1.9560, LR=0.00000460, grad_norm=2.1414. Time cost=189.20, Throughput=3.17 samples/s ETA=8.47h | |
2020-07-14 09:03:10,564 - root - INFO - Step: 300/8161, Loss span/answer/total=1.3609/0.2229/1.5838, LR=0.00000551, grad_norm=2.2598. Time cost=188.83, Throughput=3.18 samples/s ETA=8.39h | |
2020-07-14 09:06:21,258 - root - INFO - Step: 350/8161, Loss span/answer/total=1.1037/0.2142/1.3180, LR=0.00000643, grad_norm=1.6036. Time cost=190.69, Throughput=3.15 samples/s ETA=8.33h | |
2020-07-14 09:09:32,098 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_400.params | |
2020-07-14 09:09:32,099 - root - INFO - Step: 400/8161, Loss span/answer/total=1.0808/0.2131/1.2938, LR=0.00000735, grad_norm=2.0335. Time cost=190.84, Throughput=3.14 samples/s ETA=8.27h | |
2020-07-14 09:12:43,010 - root - INFO - Step: 450/8161, Loss span/answer/total=1.0440/0.1907/1.2347, LR=0.00000827, grad_norm=1.5600. Time cost=190.91, Throughput=3.14 samples/s ETA=8.21h | |
2020-07-14 09:15:47,598 - root - INFO - Step: 500/8161, Loss span/answer/total=0.9771/0.1689/1.1460, LR=0.00000919, grad_norm=3.2921. Time cost=184.59, Throughput=3.25 samples/s ETA=8.13h | |
2020-07-14 09:18:58,111 - root - INFO - Step: 550/8161, Loss span/answer/total=0.9907/0.1919/1.1826, LR=0.00001011, grad_norm=1.9369. Time cost=190.51, Throughput=3.15 samples/s ETA=8.07h | |
2020-07-14 09:22:12,371 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_600.params | |
2020-07-14 09:22:12,371 - root - INFO - Step: 600/8161, Loss span/answer/total=0.9575/0.1871/1.1446, LR=0.00001103, grad_norm=2.0379. Time cost=194.26, Throughput=3.09 samples/s ETA=8.03h | |
2020-07-14 09:25:16,471 - root - INFO - Step: 650/8161, Loss span/answer/total=0.9153/0.2056/1.1209, LR=0.00001195, grad_norm=2.2053. Time cost=184.10, Throughput=3.26 samples/s ETA=7.96h | |
2020-07-14 09:28:21,716 - root - INFO - Step: 700/8161, Loss span/answer/total=0.9022/0.1800/1.0822, LR=0.00001287, grad_norm=2.0880. Time cost=185.24, Throughput=3.24 samples/s ETA=7.89h | |
2020-07-14 09:31:34,459 - root - INFO - Step: 750/8161, Loss span/answer/total=0.8562/0.1595/1.0157, LR=0.00001379, grad_norm=2.0007. Time cost=192.74, Throughput=3.11 samples/s ETA=7.84h | |
2020-07-14 09:34:47,966 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_800.params | |
2020-07-14 09:34:47,966 - root - INFO - Step: 800/8161, Loss span/answer/total=0.8080/0.1869/0.9949, LR=0.00001471, grad_norm=2.1405. Time cost=193.51, Throughput=3.10 samples/s ETA=7.80h | |
2020-07-14 09:38:06,716 - root - INFO - Step: 850/8161, Loss span/answer/total=0.8854/0.1687/1.0541, LR=0.00001563, grad_norm=2.4760. Time cost=198.75, Throughput=3.02 samples/s ETA=7.76h | |
2020-07-14 09:41:22,141 - root - INFO - Step: 900/8161, Loss span/answer/total=0.8633/0.1698/1.0331, LR=0.00001654, grad_norm=2.0626. Time cost=195.42, Throughput=3.07 samples/s ETA=7.72h | |
2020-07-14 09:44:35,609 - root - INFO - Step: 950/8161, Loss span/answer/total=0.8498/0.1884/1.0382, LR=0.00001746, grad_norm=6.7273. Time cost=193.47, Throughput=3.10 samples/s ETA=7.67h | |
2020-07-14 09:47:50,034 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_1000.params | |
2020-07-14 09:47:50,034 - root - INFO - Step: 1000/8161, Loss span/answer/total=0.8966/0.1889/1.0856, LR=0.00001838, grad_norm=1.4678. Time cost=194.42, Throughput=3.09 samples/s ETA=7.62h | |
2020-07-14 09:50:57,237 - root - INFO - Step: 1050/8161, Loss span/answer/total=0.8836/0.1864/1.0700, LR=0.00001930, grad_norm=2.0181. Time cost=187.20, Throughput=3.21 samples/s ETA=7.56h | |
2020-07-14 09:54:07,456 - root - INFO - Step: 1100/8161, Loss span/answer/total=0.8858/0.1835/1.0693, LR=0.00002022, grad_norm=1.9293. Time cost=190.22, Throughput=3.15 samples/s ETA=7.51h | |
2020-07-14 09:57:22,497 - root - INFO - Step: 1150/8161, Loss span/answer/total=0.8753/0.1828/1.0581, LR=0.00002114, grad_norm=2.4784. Time cost=195.04, Throughput=3.08 samples/s ETA=7.46h | |
2020-07-14 10:00:35,263 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_1200.params | |
2020-07-14 10:00:35,263 - root - INFO - Step: 1200/8161, Loss span/answer/total=0.8590/0.1843/1.0433, LR=0.00002206, grad_norm=2.2267. Time cost=192.77, Throughput=3.11 samples/s ETA=7.41h | |
2020-07-14 10:03:46,197 - root - INFO - Step: 1250/8161, Loss span/answer/total=0.8105/0.1847/0.9952, LR=0.00002298, grad_norm=9.9131. Time cost=190.93, Throughput=3.14 samples/s ETA=7.35h | |
2020-07-14 10:07:01,651 - root - INFO - Step: 1300/8161, Loss span/answer/total=0.7724/0.1644/0.9368, LR=0.00002390, grad_norm=2.0911. Time cost=195.45, Throughput=3.07 samples/s ETA=7.31h | |
2020-07-14 10:10:10,936 - root - INFO - Step: 1350/8161, Loss span/answer/total=0.9344/0.1920/1.1264, LR=0.00002482, grad_norm=2.0462. Time cost=189.28, Throughput=3.17 samples/s ETA=7.25h | |
2020-07-14 10:13:30,035 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_1400.params | |
2020-07-14 10:13:30,035 - root - INFO - Step: 1400/8161, Loss span/answer/total=0.7847/0.1689/0.9536, LR=0.00002574, grad_norm=1.6321. Time cost=199.10, Throughput=3.01 samples/s ETA=7.21h | |
2020-07-14 10:16:37,514 - root - INFO - Step: 1450/8161, Loss span/answer/total=0.7959/0.1377/0.9336, LR=0.00002665, grad_norm=11.1343. Time cost=187.48, Throughput=3.20 samples/s ETA=7.15h | |
2020-07-14 10:19:49,740 - root - INFO - Step: 1500/8161, Loss span/answer/total=0.8426/0.1708/1.0134, LR=0.00002757, grad_norm=2.4158. Time cost=192.23, Throughput=3.12 samples/s ETA=7.09h | |
2020-07-14 10:23:04,349 - root - INFO - Step: 1550/8161, Loss span/answer/total=0.7925/0.1872/0.9797, LR=0.00002849, grad_norm=2.5468. Time cost=194.61, Throughput=3.08 samples/s ETA=7.04h | |
2020-07-14 10:26:21,144 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_1600.params | |
2020-07-14 10:26:21,144 - root - INFO - Step: 1600/8161, Loss span/answer/total=0.7817/0.1799/0.9616, LR=0.00002941, grad_norm=1.3321. Time cost=196.79, Throughput=3.05 samples/s ETA=7.00h | |
2020-07-14 10:29:26,828 - root - INFO - Step: 1650/8161, Loss span/answer/total=0.7816/0.1583/0.9399, LR=0.00002992, grad_norm=2.3034. Time cost=185.68, Throughput=3.23 samples/s ETA=6.94h | |
2020-07-14 10:32:40,330 - root - INFO - Step: 1700/8161, Loss span/answer/total=0.7563/0.1498/0.9061, LR=0.00002969, grad_norm=1.5616. Time cost=193.50, Throughput=3.10 samples/s ETA=6.89h | |
2020-07-14 10:35:56,491 - root - INFO - Step: 1750/8161, Loss span/answer/total=0.8360/0.1616/0.9976, LR=0.00002946, grad_norm=1.8526. Time cost=196.16, Throughput=3.06 samples/s ETA=6.84h | |
2020-07-14 10:39:10,844 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_1800.params | |
2020-07-14 10:39:10,844 - root - INFO - Step: 1800/8161, Loss span/answer/total=0.8494/0.1844/1.0338, LR=0.00002923, grad_norm=2.0884. Time cost=194.35, Throughput=3.09 samples/s ETA=6.79h | |
2020-07-14 10:42:28,789 - root - INFO - Step: 1850/8161, Loss span/answer/total=0.8517/0.1800/1.0317, LR=0.00002900, grad_norm=2.0119. Time cost=197.95, Throughput=3.03 samples/s ETA=6.74h | |
2020-07-14 10:45:40,738 - root - INFO - Step: 1900/8161, Loss span/answer/total=0.7495/0.1561/0.9056, LR=0.00002877, grad_norm=1.3511. Time cost=191.95, Throughput=3.13 samples/s ETA=6.68h | |
2020-07-14 10:48:50,613 - root - INFO - Step: 1950/8161, Loss span/answer/total=0.7947/0.1719/0.9665, LR=0.00002854, grad_norm=2.0416. Time cost=189.87, Throughput=3.16 samples/s ETA=6.63h | |
2020-07-14 10:52:09,451 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_2000.params | |
2020-07-14 10:52:09,451 - root - INFO - Step: 2000/8161, Loss span/answer/total=0.7748/0.1672/0.9420, LR=0.00002831, grad_norm=1.7656. Time cost=198.84, Throughput=3.02 samples/s ETA=6.58h | |
2020-07-14 10:55:18,951 - root - INFO - Step: 2050/8161, Loss span/answer/total=0.7177/0.1886/0.9063, LR=0.00002808, grad_norm=1.1931. Time cost=189.50, Throughput=3.17 samples/s ETA=6.53h | |
2020-07-14 10:58:32,587 - root - INFO - Step: 2100/8161, Loss span/answer/total=0.6977/0.1282/0.8259, LR=0.00002785, grad_norm=1.8929. Time cost=193.64, Throughput=3.10 samples/s ETA=6.47h | |
2020-07-14 11:01:47,010 - root - INFO - Step: 2150/8161, Loss span/answer/total=0.6713/0.1432/0.8145, LR=0.00002762, grad_norm=1.4084. Time cost=194.42, Throughput=3.09 samples/s ETA=6.42h | |
2020-07-14 11:05:09,671 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_2200.params | |
2020-07-14 11:05:09,671 - root - INFO - Step: 2200/8161, Loss span/answer/total=0.7879/0.1481/0.9360, LR=0.00002739, grad_norm=1.8508. Time cost=202.66, Throughput=2.96 samples/s ETA=6.38h | |
2020-07-14 11:08:22,018 - root - INFO - Step: 2250/8161, Loss span/answer/total=0.8174/0.1796/0.9970, LR=0.00002716, grad_norm=1.8646. Time cost=192.35, Throughput=3.12 samples/s ETA=6.32h | |
2020-07-14 11:11:32,462 - root - INFO - Step: 2300/8161, Loss span/answer/total=0.7551/0.1536/0.9087, LR=0.00002693, grad_norm=0.7570. Time cost=190.44, Throughput=3.15 samples/s ETA=6.27h | |
2020-07-14 11:14:44,478 - root - INFO - Step: 2350/8161, Loss span/answer/total=0.7227/0.1467/0.8693, LR=0.00002670, grad_norm=1.6918. Time cost=192.02, Throughput=3.12 samples/s ETA=6.21h | |
2020-07-14 11:17:54,186 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_2400.params | |
2020-07-14 11:17:54,186 - root - INFO - Step: 2400/8161, Loss span/answer/total=0.7196/0.1537/0.8733, LR=0.00002647, grad_norm=1.4427. Time cost=189.71, Throughput=3.16 samples/s ETA=6.16h | |
2020-07-14 11:21:09,349 - root - INFO - Step: 2450/8161, Loss span/answer/total=0.7637/0.1652/0.9289, LR=0.00002624, grad_norm=2.0733. Time cost=195.16, Throughput=3.07 samples/s ETA=6.11h | |
2020-07-14 11:24:19,783 - root - INFO - Step: 2500/8161, Loss span/answer/total=0.7494/0.1332/0.8825, LR=0.00002601, grad_norm=1.4492. Time cost=190.43, Throughput=3.15 samples/s ETA=6.05h | |
2020-07-14 11:27:40,839 - root - INFO - Step: 2550/8161, Loss span/answer/total=0.6923/0.1732/0.8655, LR=0.00002578, grad_norm=1.6334. Time cost=201.06, Throughput=2.98 samples/s ETA=6.00h | |
2020-07-14 11:30:47,481 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_2600.params | |
2020-07-14 11:30:47,481 - root - INFO - Step: 2600/8161, Loss span/answer/total=0.6443/0.1479/0.7922, LR=0.00002555, grad_norm=1.6361. Time cost=186.64, Throughput=3.21 samples/s ETA=5.95h | |
2020-07-14 11:34:03,299 - root - INFO - Step: 2650/8161, Loss span/answer/total=0.7569/0.1562/0.9131, LR=0.00002532, grad_norm=1.7191. Time cost=195.82, Throughput=3.06 samples/s ETA=5.90h | |
2020-07-14 11:37:19,334 - root - INFO - Step: 2700/8161, Loss span/answer/total=0.7257/0.1302/0.8559, LR=0.00002509, grad_norm=1.3729. Time cost=196.03, Throughput=3.06 samples/s ETA=5.84h | |
2020-07-14 11:40:34,398 - root - INFO - Step: 2750/8161, Loss span/answer/total=0.5917/0.1189/0.7106, LR=0.00002486, grad_norm=1.6643. Time cost=195.06, Throughput=3.07 samples/s ETA=5.79h | |
2020-07-14 11:43:50,551 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_2800.params | |
2020-07-14 11:43:50,552 - root - INFO - Step: 2800/8161, Loss span/answer/total=0.6178/0.1140/0.7318, LR=0.00002463, grad_norm=1.8395. Time cost=196.15, Throughput=3.06 samples/s ETA=5.74h | |
2020-07-14 11:46:59,723 - root - INFO - Step: 2850/8161, Loss span/answer/total=0.6697/0.1092/0.7789, LR=0.00002440, grad_norm=1.7596. Time cost=189.17, Throughput=3.17 samples/s ETA=5.68h | |
2020-07-14 11:50:12,712 - root - INFO - Step: 2900/8161, Loss span/answer/total=0.6541/0.1132/0.7672, LR=0.00002417, grad_norm=1.9585. Time cost=192.99, Throughput=3.11 samples/s ETA=5.63h | |
2020-07-14 11:53:26,457 - root - INFO - Step: 2950/8161, Loss span/answer/total=0.6785/0.1367/0.8152, LR=0.00002394, grad_norm=0.9884. Time cost=193.74, Throughput=3.10 samples/s ETA=5.58h | |
2020-07-14 11:56:34,236 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_3000.params | |
2020-07-14 11:56:34,236 - root - INFO - Step: 3000/8161, Loss span/answer/total=0.5995/0.1241/0.7236, LR=0.00002371, grad_norm=1.6040. Time cost=187.78, Throughput=3.20 samples/s ETA=5.52h | |
2020-07-14 11:59:45,391 - root - INFO - Step: 3050/8161, Loss span/answer/total=0.5789/0.1026/0.6815, LR=0.00002348, grad_norm=1.7711. Time cost=191.15, Throughput=3.14 samples/s ETA=5.47h | |
2020-07-14 12:02:50,252 - root - INFO - Step: 3100/8161, Loss span/answer/total=0.6229/0.1321/0.7550, LR=0.00002325, grad_norm=1.6470. Time cost=184.86, Throughput=3.25 samples/s ETA=5.41h | |
2020-07-14 12:06:08,501 - root - INFO - Step: 3150/8161, Loss span/answer/total=0.5569/0.1089/0.6657, LR=0.00002302, grad_norm=1.3360. Time cost=198.25, Throughput=3.03 samples/s ETA=5.36h | |
2020-07-14 12:09:23,235 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_3200.params | |
2020-07-14 12:09:23,235 - root - INFO - Step: 3200/8161, Loss span/answer/total=0.6164/0.1195/0.7359, LR=0.00002280, grad_norm=1.3351. Time cost=194.73, Throughput=3.08 samples/s ETA=5.31h | |
2020-07-14 12:12:32,239 - root - INFO - Step: 3250/8161, Loss span/answer/total=0.6190/0.1171/0.7360, LR=0.00002257, grad_norm=1.3754. Time cost=189.00, Throughput=3.17 samples/s ETA=5.25h | |
2020-07-14 12:15:42,266 - root - INFO - Step: 3300/8161, Loss span/answer/total=0.5693/0.1307/0.6999, LR=0.00002234, grad_norm=3.7025. Time cost=190.03, Throughput=3.16 samples/s ETA=5.20h | |
2020-07-14 12:18:53,482 - root - INFO - Step: 3350/8161, Loss span/answer/total=0.6057/0.1060/0.7117, LR=0.00002211, grad_norm=0.9262. Time cost=191.22, Throughput=3.14 samples/s ETA=5.14h | |
2020-07-14 12:22:13,217 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_3400.params | |
2020-07-14 12:22:13,217 - root - INFO - Step: 3400/8161, Loss span/answer/total=0.5592/0.1263/0.6855, LR=0.00002188, grad_norm=1.5454. Time cost=199.73, Throughput=3.00 samples/s ETA=5.09h | |
2020-07-14 12:25:29,405 - root - INFO - Step: 3450/8161, Loss span/answer/total=0.7056/0.1154/0.8210, LR=0.00002165, grad_norm=1.3269. Time cost=196.19, Throughput=3.06 samples/s ETA=5.04h | |
2020-07-14 12:28:49,735 - root - INFO - Step: 3500/8161, Loss span/answer/total=0.6430/0.1189/0.7620, LR=0.00002142, grad_norm=2.3717. Time cost=200.33, Throughput=3.00 samples/s ETA=4.99h | |
2020-07-14 12:32:01,502 - root - INFO - Step: 3550/8161, Loss span/answer/total=0.6160/0.1029/0.7189, LR=0.00002119, grad_norm=2.2066. Time cost=191.77, Throughput=3.13 samples/s ETA=4.94h | |
2020-07-14 12:35:21,378 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_3600.params | |
2020-07-14 12:35:21,378 - root - INFO - Step: 3600/8161, Loss span/answer/total=0.6436/0.1139/0.7575, LR=0.00002096, grad_norm=1.7208. Time cost=199.88, Throughput=3.00 samples/s ETA=4.89h | |
2020-07-14 12:38:27,183 - root - INFO - Step: 3650/8161, Loss span/answer/total=0.5716/0.1296/0.7011, LR=0.00002073, grad_norm=2.7656. Time cost=185.80, Throughput=3.23 samples/s ETA=4.83h | |
2020-07-14 12:41:37,339 - root - INFO - Step: 3700/8161, Loss span/answer/total=0.5560/0.1315/0.6875, LR=0.00002050, grad_norm=1.5470. Time cost=190.16, Throughput=3.16 samples/s ETA=4.78h | |
2020-07-14 12:44:53,016 - root - INFO - Step: 3750/8161, Loss span/answer/total=0.6238/0.1626/0.7864, LR=0.00002027, grad_norm=1.3384. Time cost=195.68, Throughput=3.07 samples/s ETA=4.72h | |
2020-07-14 12:48:10,997 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_3800.params | |
2020-07-14 12:48:10,998 - root - INFO - Step: 3800/8161, Loss span/answer/total=0.5691/0.0919/0.6610, LR=0.00002004, grad_norm=1.3039. Time cost=197.98, Throughput=3.03 samples/s ETA=4.67h | |
2020-07-14 12:51:22,231 - root - INFO - Step: 3850/8161, Loss span/answer/total=0.5391/0.1053/0.6444, LR=0.00001981, grad_norm=1.7181. Time cost=191.23, Throughput=3.14 samples/s ETA=4.62h | |
2020-07-14 12:54:34,710 - root - INFO - Step: 3900/8161, Loss span/answer/total=0.5623/0.0929/0.6552, LR=0.00001958, grad_norm=0.9861. Time cost=192.48, Throughput=3.12 samples/s ETA=4.56h | |
2020-07-14 12:57:51,075 - root - INFO - Step: 3950/8161, Loss span/answer/total=0.5110/0.1019/0.6129, LR=0.00001935, grad_norm=1.2498. Time cost=196.37, Throughput=3.06 samples/s ETA=4.51h | |
2020-07-14 13:01:05,949 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_4000.params | |
2020-07-14 13:01:05,950 - root - INFO - Step: 4000/8161, Loss span/answer/total=0.5466/0.1039/0.6504, LR=0.00001912, grad_norm=1.2746. Time cost=194.87, Throughput=3.08 samples/s ETA=4.46h | |
2020-07-14 13:04:22,529 - root - INFO - Step: 4050/8161, Loss span/answer/total=0.5686/0.1003/0.6689, LR=0.00001889, grad_norm=1.2573. Time cost=196.58, Throughput=3.05 samples/s ETA=4.41h | |
2020-07-14 13:07:43,253 - root - INFO - Step: 4100/8161, Loss span/answer/total=0.5485/0.0978/0.6464, LR=0.00001866, grad_norm=1.8622. Time cost=200.72, Throughput=2.99 samples/s ETA=4.35h | |
2020-07-14 13:10:51,018 - root - INFO - Step: 4150/8161, Loss span/answer/total=0.5211/0.0910/0.6121, LR=0.00001843, grad_norm=1.3872. Time cost=187.76, Throughput=3.20 samples/s ETA=4.30h | |
2020-07-14 13:14:03,317 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_4200.params | |
2020-07-14 13:14:03,317 - root - INFO - Step: 4200/8161, Loss span/answer/total=0.5542/0.0973/0.6515, LR=0.00001820, grad_norm=1.5859. Time cost=192.30, Throughput=3.12 samples/s ETA=4.25h | |
2020-07-14 13:17:11,683 - root - INFO - Step: 4250/8161, Loss span/answer/total=0.5439/0.1210/0.6649, LR=0.00001797, grad_norm=1.0025. Time cost=188.37, Throughput=3.19 samples/s ETA=4.19h | |
2020-07-14 13:20:17,267 - root - INFO - Step: 4300/8161, Loss span/answer/total=0.5733/0.1004/0.6737, LR=0.00001774, grad_norm=1.1620. Time cost=185.58, Throughput=3.23 samples/s ETA=4.14h | |
2020-07-14 13:23:30,074 - root - INFO - Step: 4350/8161, Loss span/answer/total=0.5385/0.0984/0.6369, LR=0.00001751, grad_norm=1.2707. Time cost=192.81, Throughput=3.11 samples/s ETA=4.08h | |
2020-07-14 13:26:44,018 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_4400.params | |
2020-07-14 13:26:44,018 - root - INFO - Step: 4400/8161, Loss span/answer/total=0.5379/0.1110/0.6489, LR=0.00001728, grad_norm=0.8478. Time cost=193.94, Throughput=3.09 samples/s ETA=4.03h | |
2020-07-14 13:29:56,690 - root - INFO - Step: 4450/8161, Loss span/answer/total=0.5146/0.1009/0.6156, LR=0.00001705, grad_norm=1.1027. Time cost=192.67, Throughput=3.11 samples/s ETA=3.97h | |
2020-07-14 13:33:12,646 - root - INFO - Step: 4500/8161, Loss span/answer/total=0.5094/0.0938/0.6032, LR=0.00001682, grad_norm=1.5611. Time cost=195.96, Throughput=3.06 samples/s ETA=3.92h | |
2020-07-14 13:36:22,556 - root - INFO - Step: 4550/8161, Loss span/answer/total=0.5033/0.0982/0.6015, LR=0.00001659, grad_norm=1.6522. Time cost=189.91, Throughput=3.16 samples/s ETA=3.87h | |
2020-07-14 13:39:40,829 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_4600.params | |
2020-07-14 13:39:40,829 - root - INFO - Step: 4600/8161, Loss span/answer/total=0.5157/0.0860/0.6016, LR=0.00001636, grad_norm=1.8351. Time cost=198.27, Throughput=3.03 samples/s ETA=3.82h | |
2020-07-14 13:42:53,607 - root - INFO - Step: 4650/8161, Loss span/answer/total=0.5027/0.0865/0.5892, LR=0.00001613, grad_norm=1.8733. Time cost=192.78, Throughput=3.11 samples/s ETA=3.76h | |
2020-07-14 13:46:08,456 - root - INFO - Step: 4700/8161, Loss span/answer/total=0.5850/0.0978/0.6828, LR=0.00001590, grad_norm=1.7329. Time cost=194.85, Throughput=3.08 samples/s ETA=3.71h | |
2020-07-14 13:49:19,529 - root - INFO - Step: 4750/8161, Loss span/answer/total=0.5094/0.1020/0.6114, LR=0.00001567, grad_norm=1.2870. Time cost=191.07, Throughput=3.14 samples/s ETA=3.65h | |
2020-07-14 13:52:37,218 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_4800.params | |
2020-07-14 13:52:37,218 - root - INFO - Step: 4800/8161, Loss span/answer/total=0.5063/0.0959/0.6022, LR=0.00001544, grad_norm=1.8758. Time cost=197.69, Throughput=3.04 samples/s ETA=3.60h | |
2020-07-14 13:55:46,301 - root - INFO - Step: 4850/8161, Loss span/answer/total=0.5025/0.1018/0.6043, LR=0.00001521, grad_norm=2.3371. Time cost=189.08, Throughput=3.17 samples/s ETA=3.55h | |
2020-07-14 13:58:56,580 - root - INFO - Step: 4900/8161, Loss span/answer/total=0.5882/0.1120/0.7002, LR=0.00001498, grad_norm=1.2556. Time cost=190.28, Throughput=3.15 samples/s ETA=3.49h | |
2020-07-14 14:02:03,431 - root - INFO - Step: 4950/8161, Loss span/answer/total=0.5735/0.0958/0.6693, LR=0.00001475, grad_norm=1.2067. Time cost=186.85, Throughput=3.21 samples/s ETA=3.44h | |
2020-07-14 14:05:20,063 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_5000.params | |
2020-07-14 14:05:20,063 - root - INFO - Step: 5000/8161, Loss span/answer/total=0.5814/0.0990/0.6804, LR=0.00001452, grad_norm=1.4738. Time cost=196.63, Throughput=3.05 samples/s ETA=3.39h | |
2020-07-14 14:08:31,317 - root - INFO - Step: 5050/8161, Loss span/answer/total=0.5513/0.0932/0.6446, LR=0.00001429, grad_norm=1.6197. Time cost=191.25, Throughput=3.14 samples/s ETA=3.33h | |
2020-07-14 14:11:39,658 - root - INFO - Step: 5100/8161, Loss span/answer/total=0.4253/0.0981/0.5234, LR=0.00001406, grad_norm=1.7552. Time cost=188.34, Throughput=3.19 samples/s ETA=3.28h | |
2020-07-14 14:14:55,279 - root - INFO - Step: 5150/8161, Loss span/answer/total=0.4877/0.0816/0.5693, LR=0.00001384, grad_norm=1.3550. Time cost=195.62, Throughput=3.07 samples/s ETA=3.22h | |
2020-07-14 14:18:13,879 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_5200.params | |
2020-07-14 14:18:13,879 - root - INFO - Step: 5200/8161, Loss span/answer/total=0.5461/0.1109/0.6571, LR=0.00001361, grad_norm=2.4072. Time cost=198.60, Throughput=3.02 samples/s ETA=3.17h | |
2020-07-14 14:21:24,121 - root - INFO - Step: 5250/8161, Loss span/answer/total=0.5578/0.0978/0.6556, LR=0.00001338, grad_norm=1.7026. Time cost=190.24, Throughput=3.15 samples/s ETA=3.12h | |
2020-07-14 14:24:41,272 - root - INFO - Step: 5300/8161, Loss span/answer/total=0.5165/0.0762/0.5927, LR=0.00001315, grad_norm=2.2658. Time cost=197.15, Throughput=3.04 samples/s ETA=3.07h | |
2020-07-14 14:27:45,172 - root - INFO - Step: 5350/8161, Loss span/answer/total=0.5096/0.1067/0.6163, LR=0.00001292, grad_norm=1.2365. Time cost=183.90, Throughput=3.26 samples/s ETA=3.01h | |
2020-07-14 14:31:00,387 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_5400.params | |
2020-07-14 14:31:00,387 - root - INFO - Step: 5400/8161, Loss span/answer/total=0.5312/0.0831/0.6143, LR=0.00001269, grad_norm=1.4942. Time cost=195.21, Throughput=3.07 samples/s ETA=2.96h | |
2020-07-14 14:34:17,245 - root - INFO - Step: 5450/8161, Loss span/answer/total=0.4819/0.1096/0.5914, LR=0.00001246, grad_norm=1.1723. Time cost=196.86, Throughput=3.04 samples/s ETA=2.90h | |
2020-07-14 14:37:35,542 - root - INFO - Step: 5500/8161, Loss span/answer/total=0.3815/0.0786/0.4601, LR=0.00001223, grad_norm=1.7482. Time cost=198.30, Throughput=3.03 samples/s ETA=2.85h | |
2020-07-14 14:40:56,047 - root - INFO - Step: 5550/8161, Loss span/answer/total=0.4540/0.0679/0.5218, LR=0.00001200, grad_norm=1.0213. Time cost=200.50, Throughput=2.99 samples/s ETA=2.80h | |
2020-07-14 14:44:05,191 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_5600.params | |
2020-07-14 14:44:05,192 - root - INFO - Step: 5600/8161, Loss span/answer/total=0.3761/0.0647/0.4408, LR=0.00001177, grad_norm=1.6066. Time cost=189.14, Throughput=3.17 samples/s ETA=2.74h | |
2020-07-14 14:47:18,057 - root - INFO - Step: 5650/8161, Loss span/answer/total=0.4483/0.0466/0.4949, LR=0.00001154, grad_norm=1.1719. Time cost=192.87, Throughput=3.11 samples/s ETA=2.69h | |
2020-07-14 14:50:32,480 - root - INFO - Step: 5700/8161, Loss span/answer/total=0.4071/0.0735/0.4806, LR=0.00001131, grad_norm=0.9141. Time cost=194.42, Throughput=3.09 samples/s ETA=2.64h | |
2020-07-14 14:53:43,863 - root - INFO - Step: 5750/8161, Loss span/answer/total=0.4998/0.0811/0.5809, LR=0.00001108, grad_norm=1.7222. Time cost=191.38, Throughput=3.14 samples/s ETA=2.58h | |
2020-07-14 14:56:50,952 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_5800.params | |
2020-07-14 14:56:50,952 - root - INFO - Step: 5800/8161, Loss span/answer/total=0.4418/0.0593/0.5011, LR=0.00001085, grad_norm=1.5506. Time cost=187.09, Throughput=3.21 samples/s ETA=2.53h | |
2020-07-14 15:00:05,308 - root - INFO - Step: 5850/8161, Loss span/answer/total=0.3896/0.0692/0.4588, LR=0.00001062, grad_norm=1.0585. Time cost=194.36, Throughput=3.09 samples/s ETA=2.48h | |
2020-07-14 15:03:15,127 - root - INFO - Step: 5900/8161, Loss span/answer/total=0.3892/0.0616/0.4508, LR=0.00001039, grad_norm=1.7596. Time cost=189.82, Throughput=3.16 samples/s ETA=2.42h | |
2020-07-14 15:06:35,571 - root - INFO - Step: 5950/8161, Loss span/answer/total=0.4063/0.0653/0.4716, LR=0.00001016, grad_norm=1.4484. Time cost=200.44, Throughput=2.99 samples/s ETA=2.37h | |
2020-07-14 15:09:44,694 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_6000.params | |
2020-07-14 15:09:44,694 - root - INFO - Step: 6000/8161, Loss span/answer/total=0.3973/0.0681/0.4654, LR=0.00000993, grad_norm=1.6632. Time cost=189.12, Throughput=3.17 samples/s ETA=2.32h | |
2020-07-14 15:12:57,243 - root - INFO - Step: 6050/8161, Loss span/answer/total=0.4605/0.0628/0.5233, LR=0.00000970, grad_norm=1.5790. Time cost=192.55, Throughput=3.12 samples/s ETA=2.26h | |
2020-07-14 15:16:12,702 - root - INFO - Step: 6100/8161, Loss span/answer/total=0.3830/0.0520/0.4351, LR=0.00000947, grad_norm=0.9119. Time cost=195.46, Throughput=3.07 samples/s ETA=2.21h | |
2020-07-14 15:19:21,712 - root - INFO - Step: 6150/8161, Loss span/answer/total=0.3837/0.0588/0.4425, LR=0.00000924, grad_norm=0.8564. Time cost=189.01, Throughput=3.17 samples/s ETA=2.15h | |
2020-07-14 15:22:30,802 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_6200.params | |
2020-07-14 15:22:30,802 - root - INFO - Step: 6200/8161, Loss span/answer/total=0.4001/0.0646/0.4647, LR=0.00000901, grad_norm=2.1057. Time cost=189.09, Throughput=3.17 samples/s ETA=2.10h | |
2020-07-14 15:25:42,962 - root - INFO - Step: 6250/8161, Loss span/answer/total=0.3856/0.0626/0.4482, LR=0.00000878, grad_norm=1.3489. Time cost=192.16, Throughput=3.12 samples/s ETA=2.05h | |
2020-07-14 15:28:53,283 - root - INFO - Step: 6300/8161, Loss span/answer/total=0.4150/0.0671/0.4821, LR=0.00000855, grad_norm=1.3946. Time cost=190.32, Throughput=3.15 samples/s ETA=1.99h | |
2020-07-14 15:32:00,157 - root - INFO - GPU communication supported by horovod | |
2020-07-14 15:32:00,157 - root - INFO - GPU communication supported by horovod | |
2020-07-14 15:32:00,158 - root - INFO - GPU communication supported by horovod | |
2020-07-14 15:32:00,158 - root - INFO - GPU communication supported by horovod | |
2020-07-14 15:32:08,155 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0 | |
2020-07-14 15:32:08,182 - root - INFO - Prepare training data | |
2020-07-14 15:32:08,204 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0 | |
2020-07-14 15:32:08,206 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0 | |
2020-07-14 15:32:08,218 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0 | |
2020-07-14 15:32:08,229 - root - INFO - Prepare training data | |
2020-07-14 15:32:08,261 - root - INFO - Prepare training data | |
2020-07-14 15:32:08,276 - root - INFO - Prepare training data | |
2020-07-14 15:32:29,201 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson | |
2020-07-14 15:32:29,202 - root - INFO - Processing the Training data: | |
2020-07-14 15:32:29,251 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson | |
2020-07-14 15:32:29,252 - root - INFO - Processing the Training data: | |
2020-07-14 15:32:29,291 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson | |
2020-07-14 15:32:29,292 - root - INFO - Processing the Training data: | |
2020-07-14 15:32:29,489 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson | |
2020-07-14 15:32:29,489 - root - INFO - Processing the Training data: | |
2020-07-14 15:32:35,456 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319 | |
2020-07-14 15:32:35,478 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498 | |
2020-07-14 15:32:35,478 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697 | |
2020-07-14 15:32:35,478 - root - INFO - Creating distributed trainer... | |
2020-07-14 15:32:35,491 - root - INFO - Using gradient accumulation. Effective global batch size = 48 | |
2020-07-14 15:32:35,505 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319 | |
2020-07-14 15:32:35,515 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319 | |
2020-07-14 15:32:35,527 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498 | |
2020-07-14 15:32:35,527 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697 | |
2020-07-14 15:32:35,527 - root - INFO - Creating distributed trainer... | |
2020-07-14 15:32:35,533 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200 | |
2020-07-14 15:32:35,537 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498 | |
2020-07-14 15:32:35,537 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697 | |
2020-07-14 15:32:35,537 - root - INFO - Creating distributed trainer... | |
2020-07-14 15:32:35,540 - root - INFO - Using gradient accumulation. Effective global batch size = 48 | |
2020-07-14 15:32:35,550 - root - INFO - Using gradient accumulation. Effective global batch size = 48 | |
2020-07-14 15:32:35,582 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200 | |
2020-07-14 15:32:35,591 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200 | |
2020-07-14 15:32:35,977 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319 | |
2020-07-14 15:32:36,025 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498 | |
2020-07-14 15:32:36,025 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697 | |
2020-07-14 15:32:36,025 - root - INFO - Creating distributed trainer... | |
2020-07-14 15:32:36,053 - root - INFO - Using gradient accumulation. Effective global batch size = 48 | |
2020-07-14 15:32:36,156 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200 | |
2020-07-14 15:34:29,605 - root - INFO - GPU communication supported by horovod | |
2020-07-14 15:34:29,605 - root - INFO - GPU communication supported by horovod | |
2020-07-14 15:34:29,605 - root - INFO - GPU communication supported by horovod | |
2020-07-14 15:34:29,605 - root - INFO - GPU communication supported by horovod | |
2020-07-14 15:34:37,602 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0 | |
2020-07-14 15:34:37,621 - root - INFO - Prepare training data | |
2020-07-14 15:34:37,633 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0 | |
2020-07-14 15:34:37,672 - root - INFO - Prepare training data | |
2020-07-14 15:34:37,729 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0 | |
2020-07-14 15:34:37,768 - root - INFO - Prepare training data | |
2020-07-14 15:34:37,828 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0 | |
2020-07-14 15:34:37,868 - root - INFO - Prepare training data | |
2020-07-14 15:34:58,672 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson | |
2020-07-14 15:34:58,673 - root - INFO - Processing the Training data: | |
2020-07-14 15:34:58,703 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson | |
2020-07-14 15:34:58,703 - root - INFO - Processing the Training data: | |
2020-07-14 15:34:58,843 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson | |
2020-07-14 15:34:58,844 - root - INFO - Processing the Training data: | |
2020-07-14 15:34:58,845 - root - INFO - Found cached data features, load from /home/ubuntu/gluon-nlp/scripts/question_answering/cached/dev_fairseq_roberta_large_squad_2.0.ndjson | |
2020-07-14 15:34:58,846 - root - INFO - Processing the Training data: | |
2020-07-14 15:35:04,931 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319 | |
2020-07-14 15:35:04,949 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319 | |
2020-07-14 15:35:04,953 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498 | |
2020-07-14 15:35:04,953 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697 | |
2020-07-14 15:35:04,953 - root - INFO - Creating distributed trainer... | |
2020-07-14 15:35:04,966 - root - INFO - Using gradient accumulation. Effective global batch size = 48 | |
2020-07-14 15:35:04,970 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498 | |
2020-07-14 15:35:04,970 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697 | |
2020-07-14 15:35:04,970 - root - INFO - Creating distributed trainer... | |
2020-07-14 15:35:04,983 - root - INFO - Using gradient accumulation. Effective global batch size = 48 | |
2020-07-14 15:35:05,008 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200 | |
2020-07-14 15:35:05,025 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200 | |
2020-07-14 15:35:05,081 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319 | |
2020-07-14 15:35:05,082 - root - INFO - Done! #Unreliable Span=19 / #Mismatched Answer=29 / #Total=130319 | |
2020-07-14 15:35:05,103 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498 | |
2020-07-14 15:35:05,103 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697 | |
2020-07-14 15:35:05,103 - root - INFO - Creating distributed trainer... | |
2020-07-14 15:35:05,104 - root - INFO - Before Chunking, #Train/Is Impossible = 130319/43498 | |
2020-07-14 15:35:05,104 - root - INFO - After Chunking, #Train Sample/Is Impossible = 130580/43697 | |
2020-07-14 15:35:05,104 - root - INFO - Creating distributed trainer... | |
2020-07-14 15:35:05,116 - root - INFO - Using gradient accumulation. Effective global batch size = 48 | |
2020-07-14 15:35:05,117 - root - INFO - Using gradient accumulation. Effective global batch size = 48 | |
2020-07-14 15:35:05,163 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200 | |
2020-07-14 15:35:05,176 - root - INFO - #Total Training Steps=8161, Warmup=1632, Save Interval=200 | |
2020-07-14 15:38:19,569 - root - INFO - Step: 50/8161, Loss span/answer/total=4.5740/0.3139/4.8879, LR=0.00000092, grad_norm=0.9138. Time cost=194.30, Throughput=3.09 samples/s ETA=8.76h | |
2020-07-14 15:41:27,980 - root - INFO - Step: 100/8161, Loss span/answer/total=3.9481/0.2985/4.2466, LR=0.00000184, grad_norm=0.9825. Time cost=188.41, Throughput=3.18 samples/s ETA=8.57h | |
2020-07-14 15:44:46,282 - root - INFO - Step: 150/8161, Loss span/answer/total=3.5266/0.2936/3.8203, LR=0.00000276, grad_norm=1.0096. Time cost=198.30, Throughput=3.03 samples/s ETA=8.62h | |
2020-07-14 15:48:00,005 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_200.params | |
2020-07-14 15:48:00,005 - root - INFO - Step: 200/8161, Loss span/answer/total=2.4381/0.2765/2.7145, LR=0.00000368, grad_norm=2.1141. Time cost=193.72, Throughput=3.10 samples/s ETA=8.57h | |
2020-07-14 15:51:09,429 - root - INFO - Step: 250/8161, Loss span/answer/total=1.6377/0.2685/1.9062, LR=0.00000460, grad_norm=2.5878. Time cost=189.42, Throughput=3.17 samples/s ETA=8.47h | |
2020-07-14 15:54:18,691 - root - INFO - Step: 300/8161, Loss span/answer/total=1.4293/0.2237/1.6530, LR=0.00000551, grad_norm=2.2391. Time cost=189.26, Throughput=3.17 samples/s ETA=8.40h | |
2020-07-14 15:57:28,142 - root - INFO - Step: 350/8161, Loss span/answer/total=1.0939/0.1943/1.2882, LR=0.00000643, grad_norm=2.0628. Time cost=189.45, Throughput=3.17 samples/s ETA=8.32h | |
2020-07-14 16:00:38,758 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_400.params | |
2020-07-14 16:00:38,758 - root - INFO - Step: 400/8161, Loss span/answer/total=1.0850/0.2150/1.3000, LR=0.00000735, grad_norm=2.0075. Time cost=190.62, Throughput=3.15 samples/s ETA=8.26h | |
2020-07-14 16:03:48,092 - root - INFO - Step: 450/8161, Loss span/answer/total=1.0578/0.2005/1.2584, LR=0.00000827, grad_norm=1.5312. Time cost=189.33, Throughput=3.17 samples/s ETA=8.20h | |
2020-07-14 16:06:51,413 - root - INFO - Step: 500/8161, Loss span/answer/total=0.9698/0.1584/1.1282, LR=0.00000919, grad_norm=14.4826. Time cost=183.32, Throughput=3.27 samples/s ETA=8.11h | |
2020-07-14 16:10:01,519 - root - INFO - Step: 550/8161, Loss span/answer/total=0.9736/0.1793/1.1529, LR=0.00001011, grad_norm=2.6735. Time cost=190.11, Throughput=3.16 samples/s ETA=8.06h | |
2020-07-14 16:13:17,019 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_600.params | |
2020-07-14 16:13:17,019 - root - INFO - Step: 600/8161, Loss span/answer/total=0.9496/0.1810/1.1306, LR=0.00001103, grad_norm=1.9650. Time cost=195.50, Throughput=3.07 samples/s ETA=8.02h | |
2020-07-14 16:16:23,175 - root - INFO - Step: 650/8161, Loss span/answer/total=0.9297/0.2164/1.1460, LR=0.00001195, grad_norm=2.0462. Time cost=186.16, Throughput=3.22 samples/s ETA=7.95h | |
2020-07-14 16:19:28,832 - root - INFO - Step: 700/8161, Loss span/answer/total=0.9484/0.1668/1.1152, LR=0.00001287, grad_norm=1.7099. Time cost=185.66, Throughput=3.23 samples/s ETA=7.89h | |
2020-07-14 16:22:41,756 - root - INFO - Step: 750/8161, Loss span/answer/total=0.9283/0.1833/1.1116, LR=0.00001379, grad_norm=1.8619. Time cost=192.92, Throughput=3.11 samples/s ETA=7.84h | |
2020-07-14 16:25:56,779 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_800.params | |
2020-07-14 16:25:56,779 - root - INFO - Step: 800/8161, Loss span/answer/total=0.8120/0.1885/1.0005, LR=0.00001471, grad_norm=2.3574. Time cost=195.02, Throughput=3.08 samples/s ETA=7.80h | |
2020-07-14 16:29:15,093 - root - INFO - Step: 850/8161, Loss span/answer/total=0.9007/0.1828/1.0835, LR=0.00001563, grad_norm=3.1871. Time cost=198.31, Throughput=3.03 samples/s ETA=7.76h | |
2020-07-14 16:32:30,411 - root - INFO - Step: 900/8161, Loss span/answer/total=0.8275/0.1522/0.9796, LR=0.00001654, grad_norm=1.6902. Time cost=195.32, Throughput=3.07 samples/s ETA=7.72h | |
2020-07-14 16:35:43,656 - root - INFO - Step: 950/8161, Loss span/answer/total=0.9085/0.1903/1.0988, LR=0.00001746, grad_norm=9.4860. Time cost=193.24, Throughput=3.10 samples/s ETA=7.67h | |
2020-07-14 16:38:58,355 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_1000.params | |
2020-07-14 16:38:58,355 - root - INFO - Step: 1000/8161, Loss span/answer/total=0.9499/0.2015/1.1514, LR=0.00001838, grad_norm=2.2885. Time cost=194.70, Throughput=3.08 samples/s ETA=7.62h | |
2020-07-14 16:42:04,167 - root - INFO - Step: 1050/8161, Loss span/answer/total=0.8268/0.1694/0.9962, LR=0.00001930, grad_norm=2.0775. Time cost=185.81, Throughput=3.23 samples/s ETA=7.56h | |
2020-07-14 16:45:15,834 - root - INFO - Step: 1100/8161, Loss span/answer/total=0.8575/0.1950/1.0525, LR=0.00002022, grad_norm=1.4896. Time cost=191.67, Throughput=3.13 samples/s ETA=7.51h | |
2020-07-14 16:48:29,671 - root - INFO - Step: 1150/8161, Loss span/answer/total=0.8812/0.1893/1.0705, LR=0.00002114, grad_norm=1.5334. Time cost=193.84, Throughput=3.10 samples/s ETA=7.46h | |
2020-07-14 16:51:43,537 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_1200.params | |
2020-07-14 16:51:43,537 - root - INFO - Step: 1200/8161, Loss span/answer/total=0.8344/0.1673/1.0017, LR=0.00002206, grad_norm=2.3005. Time cost=193.87, Throughput=3.09 samples/s ETA=7.41h | |
2020-07-14 16:54:52,849 - root - INFO - Step: 1250/8161, Loss span/answer/total=0.8273/0.1657/0.9931, LR=0.00002298, grad_norm=2.6358. Time cost=189.31, Throughput=3.17 samples/s ETA=7.35h | |
2020-07-14 16:58:09,046 - root - INFO - Step: 1300/8161, Loss span/answer/total=0.8334/0.1783/1.0117, LR=0.00002390, grad_norm=2.0325. Time cost=196.20, Throughput=3.06 samples/s ETA=7.31h | |
2020-07-14 17:01:18,507 - root - INFO - Step: 1350/8161, Loss span/answer/total=0.8737/0.1763/1.0500, LR=0.00002482, grad_norm=1.6385. Time cost=189.46, Throughput=3.17 samples/s ETA=7.25h | |
2020-07-14 17:04:39,333 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_1400.params | |
2020-07-14 17:04:39,333 - root - INFO - Step: 1400/8161, Loss span/answer/total=0.7660/0.1648/0.9308, LR=0.00002574, grad_norm=1.3930. Time cost=200.83, Throughput=2.99 samples/s ETA=7.21h | |
2020-07-14 17:07:47,821 - root - INFO - Step: 1450/8161, Loss span/answer/total=0.8390/0.1736/1.0126, LR=0.00002665, grad_norm=1.5518. Time cost=188.49, Throughput=3.18 samples/s ETA=7.15h | |
2020-07-14 17:11:00,857 - root - INFO - Step: 1500/8161, Loss span/answer/total=0.8812/0.1966/1.0778, LR=0.00002757, grad_norm=3.7684. Time cost=193.04, Throughput=3.11 samples/s ETA=7.10h | |
2020-07-14 17:14:15,423 - root - INFO - Step: 1550/8161, Loss span/answer/total=0.7900/0.1759/0.9659, LR=0.00002849, grad_norm=2.1649. Time cost=194.57, Throughput=3.08 samples/s ETA=7.05h | |
2020-07-14 17:17:32,327 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_1600.params | |
2020-07-14 17:17:32,327 - root - INFO - Step: 1600/8161, Loss span/answer/total=0.7428/0.1530/0.8959, LR=0.00002941, grad_norm=1.8114. Time cost=196.90, Throughput=3.05 samples/s ETA=7.00h | |
2020-07-14 17:20:38,112 - root - INFO - Step: 1650/8161, Loss span/answer/total=0.7954/0.1645/0.9599, LR=0.00002992, grad_norm=1.5165. Time cost=185.78, Throughput=3.23 samples/s ETA=6.94h | |
2020-07-14 17:23:52,630 - root - INFO - Step: 1700/8161, Loss span/answer/total=0.7653/0.1553/0.9206, LR=0.00002969, grad_norm=1.9788. Time cost=194.52, Throughput=3.08 samples/s ETA=6.89h | |
2020-07-14 17:27:10,097 - root - INFO - Step: 1750/8161, Loss span/answer/total=0.8212/0.1642/0.9854, LR=0.00002946, grad_norm=2.0279. Time cost=197.47, Throughput=3.04 samples/s ETA=6.84h | |
2020-07-14 17:30:23,722 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_1800.params | |
2020-07-14 17:30:23,722 - root - INFO - Step: 1800/8161, Loss span/answer/total=0.8525/0.1771/1.0295, LR=0.00002923, grad_norm=1.6133. Time cost=193.62, Throughput=3.10 samples/s ETA=6.79h | |
2020-07-14 17:33:42,373 - root - INFO - Step: 1850/8161, Loss span/answer/total=0.8413/0.1776/1.0190, LR=0.00002900, grad_norm=1.4633. Time cost=198.65, Throughput=3.02 samples/s ETA=6.74h | |
2020-07-14 17:36:53,698 - root - INFO - Step: 1900/8161, Loss span/answer/total=0.7954/0.1642/0.9596, LR=0.00002877, grad_norm=1.3822. Time cost=191.32, Throughput=3.14 samples/s ETA=6.69h | |
2020-07-14 17:40:03,430 - root - INFO - Step: 1950/8161, Loss span/answer/total=0.7761/0.1468/0.9229, LR=0.00002854, grad_norm=1.9558. Time cost=189.73, Throughput=3.16 samples/s ETA=6.63h | |
2020-07-14 17:43:22,785 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_2000.params | |
2020-07-14 17:43:22,785 - root - INFO - Step: 2000/8161, Loss span/answer/total=0.7829/0.1690/0.9519, LR=0.00002831, grad_norm=1.7791. Time cost=199.35, Throughput=3.01 samples/s ETA=6.59h | |
2020-07-14 17:46:30,740 - root - INFO - Step: 2050/8161, Loss span/answer/total=0.7582/0.1827/0.9409, LR=0.00002808, grad_norm=1.2299. Time cost=187.96, Throughput=3.19 samples/s ETA=6.53h | |
2020-07-14 17:49:46,439 - root - INFO - Step: 2100/8161, Loss span/answer/total=0.7431/0.1347/0.8778, LR=0.00002785, grad_norm=1.3342. Time cost=195.70, Throughput=3.07 samples/s ETA=6.48h | |
2020-07-14 17:53:02,863 - root - INFO - Step: 2150/8161, Loss span/answer/total=0.9334/0.1548/1.0882, LR=0.00002762, grad_norm=1.5187. Time cost=196.42, Throughput=3.05 samples/s ETA=6.43h | |
2020-07-14 17:56:25,867 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_2200.params | |
2020-07-14 17:56:25,867 - root - INFO - Step: 2200/8161, Loss span/answer/total=0.7940/0.1577/0.9516, LR=0.00002739, grad_norm=1.2215. Time cost=203.00, Throughput=2.96 samples/s ETA=6.38h | |
2020-07-14 17:59:38,356 - root - INFO - Step: 2250/8161, Loss span/answer/total=0.7854/0.1825/0.9679, LR=0.00002716, grad_norm=1.3242. Time cost=192.49, Throughput=3.12 samples/s ETA=6.33h | |
2020-07-14 18:02:48,119 - root - INFO - Step: 2300/8161, Loss span/answer/total=0.7437/0.1325/0.8762, LR=0.00002693, grad_norm=1.1554. Time cost=189.76, Throughput=3.16 samples/s ETA=6.27h | |
2020-07-14 18:05:59,453 - root - INFO - Step: 2350/8161, Loss span/answer/total=0.7186/0.1501/0.8687, LR=0.00002670, grad_norm=1.6597. Time cost=191.33, Throughput=3.14 samples/s ETA=6.22h | |
2020-07-14 18:09:09,587 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_2400.params | |
2020-07-14 18:09:09,587 - root - INFO - Step: 2400/8161, Loss span/answer/total=0.7207/0.1471/0.8677, LR=0.00002647, grad_norm=1.6368. Time cost=190.13, Throughput=3.16 samples/s ETA=6.16h | |
2020-07-14 18:12:24,348 - root - INFO - Step: 2450/8161, Loss span/answer/total=0.7684/0.1547/0.9231, LR=0.00002624, grad_norm=1.9939. Time cost=194.76, Throughput=3.08 samples/s ETA=6.11h | |
2020-07-14 18:15:35,223 - root - INFO - Step: 2500/8161, Loss span/answer/total=0.7108/0.1386/0.8494, LR=0.00002601, grad_norm=1.6097. Time cost=190.87, Throughput=3.14 samples/s ETA=6.06h | |
2020-07-14 18:18:55,555 - root - INFO - Step: 2550/8161, Loss span/answer/total=0.7025/0.1647/0.8672, LR=0.00002578, grad_norm=1.1942. Time cost=200.33, Throughput=3.00 samples/s ETA=6.01h | |
2020-07-14 18:22:03,399 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_2600.params | |
2020-07-14 18:22:03,399 - root - INFO - Step: 2600/8161, Loss span/answer/total=0.6790/0.1576/0.8366, LR=0.00002555, grad_norm=1.4794. Time cost=187.84, Throughput=3.19 samples/s ETA=5.95h | |
2020-07-14 18:25:18,675 - root - INFO - Step: 2650/8161, Loss span/answer/total=0.7288/0.1643/0.8931, LR=0.00002532, grad_norm=2.1849. Time cost=195.28, Throughput=3.07 samples/s ETA=5.90h | |
2020-07-14 18:28:34,113 - root - INFO - Step: 2700/8161, Loss span/answer/total=0.7185/0.1293/0.8477, LR=0.00002509, grad_norm=1.6405. Time cost=195.44, Throughput=3.07 samples/s ETA=5.85h | |
2020-07-14 18:31:50,425 - root - INFO - Step: 2750/8161, Loss span/answer/total=0.6064/0.1159/0.7223, LR=0.00002486, grad_norm=2.7561. Time cost=196.31, Throughput=3.05 samples/s ETA=5.80h | |
2020-07-14 18:35:05,860 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_2800.params | |
2020-07-14 18:35:05,861 - root - INFO - Step: 2800/8161, Loss span/answer/total=0.6658/0.1403/0.8061, LR=0.00002463, grad_norm=1.1489. Time cost=195.43, Throughput=3.07 samples/s ETA=5.74h | |
2020-07-14 18:38:14,197 - root - INFO - Step: 2850/8161, Loss span/answer/total=0.6983/0.1294/0.8278, LR=0.00002440, grad_norm=1.4723. Time cost=188.34, Throughput=3.19 samples/s ETA=5.69h | |
2020-07-14 18:41:26,990 - root - INFO - Step: 2900/8161, Loss span/answer/total=0.6289/0.1120/0.7409, LR=0.00002417, grad_norm=1.9911. Time cost=192.79, Throughput=3.11 samples/s ETA=5.63h | |
2020-07-14 18:44:38,514 - root - INFO - Step: 2950/8161, Loss span/answer/total=0.6349/0.1189/0.7538, LR=0.00002394, grad_norm=1.2215. Time cost=191.52, Throughput=3.13 samples/s ETA=5.58h | |
2020-07-14 18:47:46,802 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_3000.params | |
2020-07-14 18:47:46,802 - root - INFO - Step: 3000/8161, Loss span/answer/total=0.6051/0.1230/0.7281, LR=0.00002371, grad_norm=1.9404. Time cost=188.29, Throughput=3.19 samples/s ETA=5.52h | |
2020-07-14 18:50:58,991 - root - INFO - Step: 3050/8161, Loss span/answer/total=0.6007/0.1108/0.7115, LR=0.00002348, grad_norm=1.5713. Time cost=192.19, Throughput=3.12 samples/s ETA=5.47h | |
2020-07-14 18:54:03,981 - root - INFO - Step: 3100/8161, Loss span/answer/total=0.6201/0.1198/0.7399, LR=0.00002325, grad_norm=15.4591. Time cost=184.99, Throughput=3.24 samples/s ETA=5.41h | |
2020-07-14 18:57:24,423 - root - INFO - Step: 3150/8161, Loss span/answer/total=0.5833/0.1168/0.7001, LR=0.00002302, grad_norm=1.2660. Time cost=200.44, Throughput=2.99 samples/s ETA=5.36h | |
2020-07-14 19:00:38,314 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_3200.params | |
2020-07-14 19:00:38,314 - root - INFO - Step: 3200/8161, Loss span/answer/total=0.5754/0.1392/0.7147, LR=0.00002280, grad_norm=1.7093. Time cost=193.89, Throughput=3.09 samples/s ETA=5.31h | |
2020-07-14 19:03:46,482 - root - INFO - Step: 3250/8161, Loss span/answer/total=0.5912/0.1243/0.7155, LR=0.00002257, grad_norm=1.6054. Time cost=188.17, Throughput=3.19 samples/s ETA=5.26h | |
2020-07-14 19:06:57,204 - root - INFO - Step: 3300/8161, Loss span/answer/total=0.5784/0.1375/0.7159, LR=0.00002234, grad_norm=1.7708. Time cost=190.72, Throughput=3.15 samples/s ETA=5.20h | |
2020-07-14 19:10:08,916 - root - INFO - Step: 3350/8161, Loss span/answer/total=0.6490/0.1394/0.7884, LR=0.00002211, grad_norm=0.9929. Time cost=191.71, Throughput=3.13 samples/s ETA=5.15h | |
2020-07-14 19:13:30,356 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_3400.params | |
2020-07-14 19:13:30,356 - root - INFO - Step: 3400/8161, Loss span/answer/total=0.5344/0.1239/0.6584, LR=0.00002188, grad_norm=1.0861. Time cost=201.44, Throughput=2.98 samples/s ETA=5.10h | |
2020-07-14 19:16:46,255 - root - INFO - Step: 3450/8161, Loss span/answer/total=0.6563/0.1212/0.7775, LR=0.00002165, grad_norm=1.6354. Time cost=195.90, Throughput=3.06 samples/s ETA=5.05h | |
2020-07-14 19:20:08,618 - root - INFO - Step: 3500/8161, Loss span/answer/total=0.6558/0.1089/0.7648, LR=0.00002142, grad_norm=1.7389. Time cost=202.36, Throughput=2.96 samples/s ETA=5.00h | |
2020-07-14 19:23:20,439 - root - INFO - Step: 3550/8161, Loss span/answer/total=0.6205/0.1071/0.7276, LR=0.00002119, grad_norm=1.1037. Time cost=191.82, Throughput=3.13 samples/s ETA=4.94h | |
2020-07-14 19:26:39,713 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_3600.params | |
2020-07-14 19:26:39,714 - root - INFO - Step: 3600/8161, Loss span/answer/total=0.6081/0.1148/0.7230, LR=0.00002096, grad_norm=1.3200. Time cost=199.27, Throughput=3.01 samples/s ETA=4.89h | |
2020-07-14 19:29:45,317 - root - INFO - Step: 3650/8161, Loss span/answer/total=0.5934/0.1191/0.7126, LR=0.00002073, grad_norm=1.8144. Time cost=185.60, Throughput=3.23 samples/s ETA=4.83h | |
2020-07-14 19:32:56,956 - root - INFO - Step: 3700/8161, Loss span/answer/total=0.5790/0.1221/0.7011, LR=0.00002050, grad_norm=1.2696. Time cost=191.64, Throughput=3.13 samples/s ETA=4.78h | |
2020-07-14 19:36:12,898 - root - INFO - Step: 3750/8161, Loss span/answer/total=0.5821/0.1224/0.7045, LR=0.00002027, grad_norm=1.7185. Time cost=195.94, Throughput=3.06 samples/s ETA=4.73h | |
2020-07-14 19:39:32,290 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_3800.params | |
2020-07-14 19:39:32,290 - root - INFO - Step: 3800/8161, Loss span/answer/total=0.5515/0.0821/0.6336, LR=0.00002004, grad_norm=2.3873. Time cost=199.39, Throughput=3.01 samples/s ETA=4.68h | |
2020-07-14 19:42:42,255 - root - INFO - Step: 3850/8161, Loss span/answer/total=0.5287/0.0982/0.6269, LR=0.00001981, grad_norm=1.6992. Time cost=189.97, Throughput=3.16 samples/s ETA=4.62h | |
2020-07-14 19:45:55,782 - root - INFO - Step: 3900/8161, Loss span/answer/total=0.5403/0.0919/0.6322, LR=0.00001958, grad_norm=1.2932. Time cost=193.53, Throughput=3.10 samples/s ETA=4.57h | |
2020-07-14 19:49:13,083 - root - INFO - Step: 3950/8161, Loss span/answer/total=0.4978/0.1099/0.6077, LR=0.00001935, grad_norm=1.2998. Time cost=197.30, Throughput=3.04 samples/s ETA=4.52h | |
2020-07-14 19:52:27,483 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_4000.params | |
2020-07-14 19:52:27,483 - root - INFO - Step: 4000/8161, Loss span/answer/total=0.5036/0.1098/0.6134, LR=0.00001912, grad_norm=1.1717. Time cost=194.40, Throughput=3.09 samples/s ETA=4.46h | |
2020-07-14 19:55:44,433 - root - INFO - Step: 4050/8161, Loss span/answer/total=0.5325/0.0789/0.6114, LR=0.00001889, grad_norm=1.2614. Time cost=196.95, Throughput=3.05 samples/s ETA=4.41h | |
2020-07-14 19:59:04,390 - root - INFO - Step: 4100/8161, Loss span/answer/total=0.5558/0.0984/0.6542, LR=0.00001866, grad_norm=2.0315. Time cost=199.96, Throughput=3.00 samples/s ETA=4.36h | |
2020-07-14 20:02:13,006 - root - INFO - Step: 4150/8161, Loss span/answer/total=0.5618/0.1028/0.6645, LR=0.00001843, grad_norm=1.5764. Time cost=188.62, Throughput=3.18 samples/s ETA=4.30h | |
2020-07-14 20:05:27,300 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_4200.params | |
2020-07-14 20:05:27,301 - root - INFO - Step: 4200/8161, Loss span/answer/total=0.5585/0.1095/0.6680, LR=0.00001820, grad_norm=1.5410. Time cost=194.29, Throughput=3.09 samples/s ETA=4.25h | |
2020-07-14 20:08:35,748 - root - INFO - Step: 4250/8161, Loss span/answer/total=0.5177/0.0987/0.6164, LR=0.00001797, grad_norm=1.0852. Time cost=188.45, Throughput=3.18 samples/s ETA=4.19h | |
2020-07-14 20:11:39,837 - root - INFO - Step: 4300/8161, Loss span/answer/total=0.5651/0.0906/0.6557, LR=0.00001774, grad_norm=1.4587. Time cost=184.09, Throughput=3.26 samples/s ETA=4.14h | |
2020-07-14 20:14:54,241 - root - INFO - Step: 4350/8161, Loss span/answer/total=0.5509/0.1009/0.6519, LR=0.00001751, grad_norm=6.5584. Time cost=194.40, Throughput=3.09 samples/s ETA=4.09h | |
2020-07-14 20:18:08,281 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_4400.params | |
2020-07-14 20:18:08,281 - root - INFO - Step: 4400/8161, Loss span/answer/total=0.5717/0.1150/0.6867, LR=0.00001728, grad_norm=1.2226. Time cost=194.04, Throughput=3.09 samples/s ETA=4.03h | |
2020-07-14 20:21:20,248 - root - INFO - Step: 4450/8161, Loss span/answer/total=0.5255/0.0910/0.6166, LR=0.00001705, grad_norm=1.2802. Time cost=191.97, Throughput=3.13 samples/s ETA=3.98h | |
2020-07-14 20:24:35,265 - root - INFO - Step: 4500/8161, Loss span/answer/total=0.5217/0.1029/0.6246, LR=0.00001682, grad_norm=1.4404. Time cost=195.02, Throughput=3.08 samples/s ETA=3.93h | |
2020-07-14 20:27:46,910 - root - INFO - Step: 4550/8161, Loss span/answer/total=0.5017/0.1006/0.6024, LR=0.00001659, grad_norm=2.1950. Time cost=191.64, Throughput=3.13 samples/s ETA=3.87h | |
2020-07-14 20:31:06,490 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_4600.params | |
2020-07-14 20:31:06,490 - root - INFO - Step: 4600/8161, Loss span/answer/total=0.5070/0.1011/0.6081, LR=0.00001636, grad_norm=1.7031. Time cost=199.58, Throughput=3.01 samples/s ETA=3.82h | |
2020-07-14 20:34:20,369 - root - INFO - Step: 4650/8161, Loss span/answer/total=0.4975/0.0881/0.5856, LR=0.00001613, grad_norm=1.5950. Time cost=193.88, Throughput=3.09 samples/s ETA=3.77h | |
2020-07-14 20:37:35,882 - root - INFO - Step: 4700/8161, Loss span/answer/total=0.5709/0.0977/0.6686, LR=0.00001590, grad_norm=2.0156. Time cost=195.51, Throughput=3.07 samples/s ETA=3.71h | |
2020-07-14 20:40:49,278 - root - INFO - Step: 4750/8161, Loss span/answer/total=0.5213/0.0948/0.6161, LR=0.00001567, grad_norm=1.8299. Time cost=193.40, Throughput=3.10 samples/s ETA=3.66h | |
2020-07-14 20:44:06,587 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_4800.params | |
2020-07-14 20:44:06,587 - root - INFO - Step: 4800/8161, Loss span/answer/total=0.5009/0.1144/0.6153, LR=0.00001544, grad_norm=1.5089. Time cost=197.31, Throughput=3.04 samples/s ETA=3.61h | |
2020-07-14 20:47:16,889 - root - INFO - Step: 4850/8161, Loss span/answer/total=0.4927/0.0957/0.5884, LR=0.00001521, grad_norm=1.3726. Time cost=190.30, Throughput=3.15 samples/s ETA=3.55h | |
2020-07-14 20:50:27,539 - root - INFO - Step: 4900/8161, Loss span/answer/total=0.6038/0.0994/0.7032, LR=0.00001498, grad_norm=1.5305. Time cost=190.65, Throughput=3.15 samples/s ETA=3.50h | |
2020-07-14 20:53:35,318 - root - INFO - Step: 4950/8161, Loss span/answer/total=0.5941/0.1062/0.7003, LR=0.00001475, grad_norm=5.3665. Time cost=187.78, Throughput=3.20 samples/s ETA=3.44h | |
2020-07-14 20:56:53,535 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_5000.params | |
2020-07-14 20:56:53,535 - root - INFO - Step: 5000/8161, Loss span/answer/total=0.5886/0.0964/0.6851, LR=0.00001452, grad_norm=1.9062. Time cost=198.22, Throughput=3.03 samples/s ETA=3.39h | |
2020-07-14 21:00:06,070 - root - INFO - Step: 5050/8161, Loss span/answer/total=0.5621/0.0939/0.6560, LR=0.00001429, grad_norm=1.8509. Time cost=192.53, Throughput=3.12 samples/s ETA=3.34h | |
2020-07-14 21:03:15,464 - root - INFO - Step: 5100/8161, Loss span/answer/total=0.4969/0.0967/0.5936, LR=0.00001406, grad_norm=1.7891. Time cost=189.39, Throughput=3.17 samples/s ETA=3.28h | |
2020-07-14 21:06:32,697 - root - INFO - Step: 5150/8161, Loss span/answer/total=0.5231/0.0906/0.6137, LR=0.00001384, grad_norm=1.6945. Time cost=197.23, Throughput=3.04 samples/s ETA=3.23h | |
2020-07-14 21:09:53,806 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_5200.params | |
2020-07-14 21:09:53,806 - root - INFO - Step: 5200/8161, Loss span/answer/total=0.6016/0.1198/0.7214, LR=0.00001361, grad_norm=1.8797. Time cost=201.11, Throughput=2.98 samples/s ETA=3.18h | |
2020-07-14 21:13:04,751 - root - INFO - Step: 5250/8161, Loss span/answer/total=0.4959/0.0973/0.5931, LR=0.00001338, grad_norm=1.7449. Time cost=190.94, Throughput=3.14 samples/s ETA=3.12h | |
2020-07-14 21:16:22,823 - root - INFO - Step: 5300/8161, Loss span/answer/total=0.5179/0.0829/0.6008, LR=0.00001315, grad_norm=2.0806. Time cost=198.07, Throughput=3.03 samples/s ETA=3.07h | |
2020-07-14 21:19:27,094 - root - INFO - Step: 5350/8161, Loss span/answer/total=0.5125/0.0961/0.6085, LR=0.00001292, grad_norm=1.8463. Time cost=184.27, Throughput=3.26 samples/s ETA=3.02h | |
2020-07-14 21:22:45,648 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_5400.params | |
2020-07-14 21:22:45,648 - root - INFO - Step: 5400/8161, Loss span/answer/total=0.5466/0.0818/0.6284, LR=0.00001269, grad_norm=1.5377. Time cost=198.55, Throughput=3.02 samples/s ETA=2.96h | |
2020-07-14 21:26:01,508 - root - INFO - Step: 5450/8161, Loss span/answer/total=0.5089/0.1053/0.6142, LR=0.00001246, grad_norm=1.8162. Time cost=195.86, Throughput=3.06 samples/s ETA=2.91h | |
2020-07-14 21:29:18,997 - root - INFO - Step: 5500/8161, Loss span/answer/total=0.4228/0.0791/0.5018, LR=0.00001223, grad_norm=3.5414. Time cost=197.49, Throughput=3.04 samples/s ETA=2.86h | |
2020-07-14 21:32:39,895 - root - INFO - Step: 5550/8161, Loss span/answer/total=0.4608/0.0821/0.5430, LR=0.00001200, grad_norm=1.7035. Time cost=200.90, Throughput=2.99 samples/s ETA=2.80h | |
2020-07-14 21:35:53,498 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_5600.params | |
2020-07-14 21:35:53,498 - root - INFO - Step: 5600/8161, Loss span/answer/total=0.3661/0.0575/0.4236, LR=0.00001177, grad_norm=1.7812. Time cost=193.60, Throughput=3.10 samples/s ETA=2.75h | |
2020-07-14 21:39:07,211 - root - INFO - Step: 5650/8161, Loss span/answer/total=0.4743/0.0522/0.5264, LR=0.00001154, grad_norm=1.2576. Time cost=193.71, Throughput=3.10 samples/s ETA=2.70h | |
2020-07-14 21:42:22,387 - root - INFO - Step: 5700/8161, Loss span/answer/total=0.3931/0.0696/0.4628, LR=0.00001131, grad_norm=1.3404. Time cost=195.18, Throughput=3.07 samples/s ETA=2.64h | |
2020-07-14 21:45:34,159 - root - INFO - Step: 5750/8161, Loss span/answer/total=0.4859/0.0606/0.5466, LR=0.00001108, grad_norm=1.0631. Time cost=191.77, Throughput=3.13 samples/s ETA=2.59h | |
2020-07-14 21:48:47,578 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_5800.params | |
2020-07-14 21:48:47,579 - root - INFO - Step: 5800/8161, Loss span/answer/total=0.4083/0.0600/0.4682, LR=0.00001085, grad_norm=2.0642. Time cost=193.42, Throughput=3.10 samples/s ETA=2.54h | |
2020-07-14 21:52:01,216 - root - INFO - Step: 5850/8161, Loss span/answer/total=0.3876/0.0574/0.4450, LR=0.00001062, grad_norm=1.1036. Time cost=193.64, Throughput=3.10 samples/s ETA=2.48h | |
2020-07-14 21:55:11,557 - root - INFO - Step: 5900/8161, Loss span/answer/total=0.4148/0.0775/0.4923, LR=0.00001039, grad_norm=2.2148. Time cost=190.34, Throughput=3.15 samples/s ETA=2.43h | |
2020-07-14 21:58:29,072 - root - INFO - Step: 5950/8161, Loss span/answer/total=0.4107/0.0566/0.4672, LR=0.00001016, grad_norm=1.2852. Time cost=197.51, Throughput=3.04 samples/s ETA=2.37h | |
2020-07-14 22:01:41,868 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_6000.params | |
2020-07-14 22:01:41,869 - root - INFO - Step: 6000/8161, Loss span/answer/total=0.4159/0.0675/0.4834, LR=0.00000993, grad_norm=1.3154. Time cost=192.80, Throughput=3.11 samples/s ETA=2.32h | |
2020-07-14 22:04:53,352 - root - INFO - Step: 6050/8161, Loss span/answer/total=0.4306/0.0664/0.4969, LR=0.00000970, grad_norm=5.1835. Time cost=191.48, Throughput=3.13 samples/s ETA=2.27h | |
2020-07-14 22:08:09,185 - root - INFO - Step: 6100/8161, Loss span/answer/total=0.4099/0.0597/0.4696, LR=0.00000947, grad_norm=7.6002. Time cost=195.83, Throughput=3.06 samples/s ETA=2.21h | |
2020-07-14 22:11:17,269 - root - INFO - Step: 6150/8161, Loss span/answer/total=0.4242/0.0839/0.5081, LR=0.00000924, grad_norm=1.1508. Time cost=188.08, Throughput=3.19 samples/s ETA=2.16h | |
2020-07-14 22:14:29,056 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_6200.params | |
2020-07-14 22:14:29,056 - root - INFO - Step: 6200/8161, Loss span/answer/total=0.4776/0.0728/0.5504, LR=0.00000901, grad_norm=1.9973. Time cost=191.79, Throughput=3.13 samples/s ETA=2.11h | |
2020-07-14 22:17:39,767 - root - INFO - Step: 6250/8161, Loss span/answer/total=0.3884/0.0599/0.4483, LR=0.00000878, grad_norm=2.3867. Time cost=190.71, Throughput=3.15 samples/s ETA=2.05h | |
2020-07-14 22:20:49,213 - root - INFO - Step: 6300/8161, Loss span/answer/total=0.4076/0.0672/0.4748, LR=0.00000855, grad_norm=1.6369. Time cost=189.45, Throughput=3.17 samples/s ETA=2.00h | |
2020-07-14 22:24:01,249 - root - INFO - Step: 6350/8161, Loss span/answer/total=0.4142/0.0682/0.4824, LR=0.00000832, grad_norm=1.6283. Time cost=192.04, Throughput=3.12 samples/s ETA=1.94h | |
2020-07-14 22:27:13,813 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_6400.params | |
2020-07-14 22:27:13,814 - root - INFO - Step: 6400/8161, Loss span/answer/total=0.4449/0.0575/0.5024, LR=0.00000809, grad_norm=1.5339. Time cost=192.56, Throughput=3.12 samples/s ETA=1.89h | |
2020-07-14 22:30:25,934 - root - INFO - Step: 6450/8161, Loss span/answer/total=0.3957/0.0432/0.4389, LR=0.00000786, grad_norm=1.1557. Time cost=192.12, Throughput=3.12 samples/s ETA=1.84h | |
2020-07-14 22:33:34,587 - root - INFO - Step: 6500/8161, Loss span/answer/total=0.3982/0.0674/0.4657, LR=0.00000763, grad_norm=1.1749. Time cost=188.65, Throughput=3.18 samples/s ETA=1.78h | |
2020-07-14 22:36:45,707 - root - INFO - Step: 6550/8161, Loss span/answer/total=0.4521/0.0805/0.5326, LR=0.00000740, grad_norm=0.9745. Time cost=191.12, Throughput=3.14 samples/s ETA=1.73h | |
2020-07-14 22:39:57,453 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_6600.params | |
2020-07-14 22:39:57,454 - root - INFO - Step: 6600/8161, Loss span/answer/total=0.5043/0.0921/0.5964, LR=0.00000717, grad_norm=1.6489. Time cost=191.75, Throughput=3.13 samples/s ETA=1.67h | |
2020-07-14 22:43:05,283 - root - INFO - Step: 6650/8161, Loss span/answer/total=0.4128/0.0532/0.4661, LR=0.00000694, grad_norm=1.1854. Time cost=187.83, Throughput=3.19 samples/s ETA=1.62h | |
2020-07-14 22:46:11,795 - root - INFO - Step: 6700/8161, Loss span/answer/total=0.3934/0.0498/0.4433, LR=0.00000671, grad_norm=1.5059. Time cost=186.51, Throughput=3.22 samples/s ETA=1.57h | |
2020-07-14 22:49:26,882 - root - INFO - Step: 6750/8161, Loss span/answer/total=0.4352/0.0500/0.4853, LR=0.00000648, grad_norm=1.4848. Time cost=195.09, Throughput=3.08 samples/s ETA=1.51h | |
2020-07-14 22:52:45,660 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_6800.params | |
2020-07-14 22:52:45,661 - root - INFO - Step: 6800/8161, Loss span/answer/total=0.4016/0.0566/0.4583, LR=0.00000625, grad_norm=1.5634. Time cost=198.78, Throughput=3.02 samples/s ETA=1.46h | |
2020-07-14 22:55:53,687 - root - INFO - Step: 6850/8161, Loss span/answer/total=0.4129/0.0549/0.4677, LR=0.00000602, grad_norm=1.7989. Time cost=188.03, Throughput=3.19 samples/s ETA=1.41h | |
2020-07-14 22:59:07,587 - root - INFO - Step: 6900/8161, Loss span/answer/total=0.4209/0.0564/0.4773, LR=0.00000579, grad_norm=1.4212. Time cost=193.90, Throughput=3.09 samples/s ETA=1.35h | |
2020-07-14 23:02:20,821 - root - INFO - Step: 6950/8161, Loss span/answer/total=0.4443/0.0777/0.5220, LR=0.00000556, grad_norm=5.6529. Time cost=193.23, Throughput=3.11 samples/s ETA=1.30h | |
2020-07-14 23:05:35,147 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_7000.params | |
2020-07-14 23:05:35,147 - root - INFO - Step: 7000/8161, Loss span/answer/total=0.4073/0.0603/0.4676, LR=0.00000533, grad_norm=1.5356. Time cost=194.32, Throughput=3.09 samples/s ETA=1.25h | |
2020-07-14 23:08:46,964 - root - INFO - Step: 7050/8161, Loss span/answer/total=0.4259/0.0756/0.5016, LR=0.00000510, grad_norm=2.0354. Time cost=191.82, Throughput=3.13 samples/s ETA=1.19h | |
2020-07-14 23:12:08,230 - root - INFO - Step: 7100/8161, Loss span/answer/total=0.4025/0.0516/0.4541, LR=0.00000488, grad_norm=1.6929. Time cost=201.26, Throughput=2.98 samples/s ETA=1.14h | |
2020-07-14 23:15:19,463 - root - INFO - Step: 7150/8161, Loss span/answer/total=0.4131/0.0626/0.4757, LR=0.00000465, grad_norm=1.0583. Time cost=191.23, Throughput=3.14 samples/s ETA=1.08h | |
2020-07-14 23:18:29,325 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_7200.params | |
2020-07-14 23:18:29,326 - root - INFO - Step: 7200/8161, Loss span/answer/total=0.3899/0.0588/0.4487, LR=0.00000442, grad_norm=1.0074. Time cost=189.86, Throughput=3.16 samples/s ETA=1.03h | |
2020-07-14 23:21:47,642 - root - INFO - Step: 7250/8161, Loss span/answer/total=0.4479/0.0600/0.5080, LR=0.00000419, grad_norm=2.7129. Time cost=198.32, Throughput=3.03 samples/s ETA=0.98h | |
2020-07-14 23:25:09,485 - root - INFO - Step: 7300/8161, Loss span/answer/total=0.3741/0.0544/0.4284, LR=0.00000396, grad_norm=1.6513. Time cost=201.84, Throughput=2.97 samples/s ETA=0.92h | |
2020-07-14 23:28:21,112 - root - INFO - Step: 7350/8161, Loss span/answer/total=0.4304/0.0623/0.4927, LR=0.00000373, grad_norm=1.1648. Time cost=191.63, Throughput=3.13 samples/s ETA=0.87h | |
2020-07-14 23:31:42,116 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_7400.params | |
2020-07-14 23:31:42,116 - root - INFO - Step: 7400/8161, Loss span/answer/total=0.4392/0.0491/0.4883, LR=0.00000350, grad_norm=1.5138. Time cost=201.00, Throughput=2.99 samples/s ETA=0.82h | |
2020-07-14 23:34:56,042 - root - INFO - Step: 7450/8161, Loss span/answer/total=0.4220/0.0668/0.4887, LR=0.00000327, grad_norm=1.7039. Time cost=193.93, Throughput=3.09 samples/s ETA=0.76h | |
2020-07-14 23:38:12,700 - root - INFO - Step: 7500/8161, Loss span/answer/total=0.3895/0.0651/0.4546, LR=0.00000304, grad_norm=1.4577. Time cost=196.66, Throughput=3.05 samples/s ETA=0.71h | |
2020-07-14 23:41:23,998 - root - INFO - Step: 7550/8161, Loss span/answer/total=0.4179/0.0453/0.4632, LR=0.00000281, grad_norm=1.5650. Time cost=191.30, Throughput=3.14 samples/s ETA=0.66h | |
2020-07-14 23:44:55,111 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_7600.params | |
2020-07-14 23:44:55,111 - root - INFO - Step: 7600/8161, Loss span/answer/total=0.4465/0.0511/0.4975, LR=0.00000258, grad_norm=1.3331. Time cost=211.11, Throughput=2.84 samples/s ETA=0.60h | |
2020-07-14 23:47:58,099 - root - INFO - Step: 7650/8161, Loss span/answer/total=0.3817/0.0608/0.4425, LR=0.00000235, grad_norm=1.0029. Time cost=182.99, Throughput=3.28 samples/s ETA=0.55h | |
2020-07-14 23:51:12,415 - root - INFO - Step: 7700/8161, Loss span/answer/total=0.3911/0.0538/0.4449, LR=0.00000212, grad_norm=4.2882. Time cost=194.32, Throughput=3.09 samples/s ETA=0.50h | |
2020-07-14 23:54:20,809 - root - INFO - Step: 7750/8161, Loss span/answer/total=0.4156/0.0548/0.4704, LR=0.00000189, grad_norm=1.4461. Time cost=188.39, Throughput=3.18 samples/s ETA=0.44h | |
2020-07-14 23:57:37,034 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_7800.params | |
2020-07-14 23:57:37,034 - root - INFO - Step: 7800/8161, Loss span/answer/total=0.3519/0.0528/0.4047, LR=0.00000166, grad_norm=1.4581. Time cost=196.22, Throughput=3.06 samples/s ETA=0.39h | |
2020-07-15 00:00:50,075 - root - INFO - Step: 7850/8161, Loss span/answer/total=0.4029/0.0635/0.4664, LR=0.00000143, grad_norm=0.9792. Time cost=193.04, Throughput=3.11 samples/s ETA=0.33h | |
2020-07-15 00:03:59,176 - root - INFO - Step: 7900/8161, Loss span/answer/total=0.4431/0.0649/0.5080, LR=0.00000120, grad_norm=1.0435. Time cost=189.10, Throughput=3.17 samples/s ETA=0.28h | |
2020-07-15 00:07:12,219 - root - INFO - Step: 7950/8161, Loss span/answer/total=0.4379/0.0521/0.4900, LR=0.00000097, grad_norm=1.5092. Time cost=193.04, Throughput=3.11 samples/s ETA=0.23h | |
2020-07-15 00:10:26,027 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_8000.params | |
2020-07-15 00:10:26,027 - root - INFO - Step: 8000/8161, Loss span/answer/total=0.4136/0.0473/0.4609, LR=0.00000074, grad_norm=1.1394. Time cost=193.81, Throughput=3.10 samples/s ETA=0.17h | |
2020-07-15 00:13:38,697 - root - INFO - Step: 8050/8161, Loss span/answer/total=0.3939/0.0683/0.4621, LR=0.00000051, grad_norm=12.0716. Time cost=192.67, Throughput=3.11 samples/s ETA=0.12h | |
2020-07-15 00:16:48,837 - root - INFO - Step: 8100/8161, Loss span/answer/total=0.3792/0.0419/0.4211, LR=0.00000028, grad_norm=2.0027. Time cost=190.14, Throughput=3.16 samples/s ETA=0.07h | |
2020-07-15 00:20:03,947 - root - INFO - Step: 8150/8161, Loss span/answer/total=0.3541/0.0522/0.4064, LR=0.00000005, grad_norm=1.3096. Time cost=195.11, Throughput=3.08 samples/s ETA=0.01h | |
2020-07-15 00:20:53,389 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_8161.params | |
2020-07-15 00:20:53,389 - root - INFO - Finish training step: 8161 within 8.763373736739158 hours | |
2020-07-15 00:20:53,395 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_8161.params | |
2020-07-15 00:20:53,395 - root - INFO - Finish training step: 8161 within 8.763424505790075 hours | |
2020-07-15 00:20:53,419 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_8161.params | |
2020-07-15 00:20:53,420 - root - INFO - Finish training step: 8161 within 8.763374406364228 hours | |
2020-07-15 00:20:53,535 - root - INFO - Params saved in: roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28/fairseq_roberta_large_squad2.0_8161.params | |
2020-07-15 00:20:53,535 - root - INFO - Finish training step: 8161 within 8.763468142085605 hours | |
2020-07-15 03:39:13,789 - root - INFO - GPU communication supported by KVStore | |
2020-07-15 03:39:13,789 - root - INFO - Starting inference without horovod on the first node on device [gpu(0), gpu(1), gpu(2), gpu(3)] | |
2020-07-15 03:39:31,627 - root - INFO - Loading Backbone Model from /home/ubuntu/.mxnet/models/nlp/fairseq_roberta_large/model-6b043b91.params, with total/fixd parameters=354307072/0 | |
2020-07-15 03:39:31,648 - root - INFO - Prepare dev data | |
2020-07-15 03:39:31,750 - root - INFO - Tokenize Data: | |
2020-07-15 03:39:41,793 - root - INFO - Done! Time spent:10.04 seconds | |
2020-07-15 03:39:43,327 - root - INFO - Starting evaluate the checkpoint fairseq_roberta_large_squad2.0_8161.params | |
2020-07-15 03:41:44,712 - root - INFO - [batch 10], Time cost=120.41, Throughput=15.95 samples/s, ETA=0.17h | |
2020-07-15 03:44:17,184 - root - INFO - [batch 20], Time cost=152.47, Throughput=12.59 samples/s, ETA=0.16h | |
2020-07-15 03:46:38,659 - root - INFO - [batch 30], Time cost=141.47, Throughput=13.57 samples/s, ETA=0.12h | |
2020-07-15 03:48:49,060 - root - INFO - [batch 40], Time cost=130.40, Throughput=14.72 samples/s, ETA=0.08h | |
2020-07-15 03:51:03,864 - root - INFO - [batch 50], Time cost=134.80, Throughput=14.24 samples/s, ETA=0.04h | |
2020-07-15 03:53:11,686 - root - INFO - [batch 60], Time cost=127.82, Throughput=15.02 samples/s, ETA=0.01h | |
2020-07-15 03:53:47,278 - root - INFO - Time cost=842.975556 s, Throughput=14.24 samples/s | |
2020-07-15 03:53:52,697 - root - INFO - The evaluated results are {"exact": 44.251663438052724, "f1": 47.47999540626087, "total": 11873, "HasAns_exact": 87.36504723346829, "HasAns_f1": 93.83096920690542, "HasAns_total": 5928, "NoAns_exact": 1.2615643397813288, "NoAns_f1": 1.2615643397813288, "NoAns_total": 5945, "best_exact": 85.88393834751116, "best_exact_thresh": -1.9132816791534424, "best_f1": 88.73247007018989, "best_f1_thresh": -1.6530208587646484} | |
2020-07-15 03:53:52,697 - root - INFO - The evaluated files are saved in roberta/SQUAD2.0_large_12_3e-5_0.01_3_1.0_0.2_28 | |
2020-07-15 03:53:53,513 - root - INFO - The best evaluated results are {"exact": 44.251663438052724, "f1": 47.47999540626087, "total": 11873, "HasAns_exact": 87.36504723346829, "HasAns_f1": 93.83096920690542, "HasAns_total": 5928, "NoAns_exact": 1.2615643397813288, "NoAns_f1": 1.2615643397813288, "NoAns_total": 5945, "best_exact": 85.88393834751116, "best_exact_thresh": -1.9132816791534424, "best_f1": 88.73247007018989, "best_f1_thresh": -1.6530208587646484, "best_ckpt": "fairseq_roberta_large_squad2.0_8161.params"} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
07/16/2020 05:44:07 - WARNING - __main__ - Process rank: 2, device: cuda:2, n_gpu: 1, distributed training: True, 16-bits training: False | |
07/16/2020 05:44:07 - WARNING - __main__ - Process rank: 3, device: cuda:3, n_gpu: 1, distributed training: True, 16-bits training: False | |
07/16/2020 05:44:07 - WARNING - __main__ - Process rank: 1, device: cuda:1, n_gpu: 1, distributed training: True, 16-bits training: False | |
07/16/2020 05:44:07 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, 16-bits training: False | |
07/16/2020 05:44:07 - INFO - transformers.configuration_utils - loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json from cache at /home/ubuntu/.cache/torch/transformers/c22e0b5bbb7c0cb93a87a2ae01263ae715b4c18d692b1740ce72cacaa99ad184.2d28da311092e99a05f9ee17520204614d60b0bfdb32f8a75644df7737b6a748 | |
07/16/2020 05:44:07 - INFO - transformers.configuration_utils - Model config RobertaConfig { | |
"architectures": [ | |
"RobertaForMaskedLM" | |
], | |
"attention_probs_dropout_prob": 0.1, | |
"bos_token_id": 0, | |
"eos_token_id": 2, | |
"gradient_checkpointing": false, | |
"hidden_act": "gelu", | |
"hidden_dropout_prob": 0.1, | |
"hidden_size": 1024, | |
"initializer_range": 0.02, | |
"intermediate_size": 4096, | |
"layer_norm_eps": 1e-05, | |
"max_position_embeddings": 514, | |
"model_type": "roberta", | |
"num_attention_heads": 16, | |
"num_hidden_layers": 24, | |
"pad_token_id": 1, | |
"type_vocab_size": 1, | |
"vocab_size": 50265 | |
} | |
07/16/2020 05:44:07 - INFO - transformers.configuration_utils - loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json from cache at /home/ubuntu/.cache/torch/transformers/c22e0b5bbb7c0cb93a87a2ae01263ae715b4c18d692b1740ce72cacaa99ad184.2d28da311092e99a05f9ee17520204614d60b0bfdb32f8a75644df7737b6a748 | |
07/16/2020 05:44:07 - INFO - transformers.configuration_utils - Model config RobertaConfig { | |
"architectures": [ | |
"RobertaForMaskedLM" | |
], | |
"attention_probs_dropout_prob": 0.1, | |
"bos_token_id": 0, | |
"eos_token_id": 2, | |
"gradient_checkpointing": false, | |
"hidden_act": "gelu", | |
"hidden_dropout_prob": 0.1, | |
"hidden_size": 1024, | |
"initializer_range": 0.02, | |
"intermediate_size": 4096, | |
"layer_norm_eps": 1e-05, | |
"max_position_embeddings": 514, | |
"model_type": "roberta", | |
"num_attention_heads": 16, | |
"num_hidden_layers": 24, | |
"pad_token_id": 1, | |
"type_vocab_size": 1, | |
"vocab_size": 50265 | |
} | |
07/16/2020 05:44:07 - INFO - transformers.tokenization_utils_base - loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json from cache at /home/ubuntu/.cache/torch/transformers/1ae1f5b6e2b22b25ccc04c000bb79ca847aa226d0761536b011cf7e5868f0655.ef00af9e673c7160b4d41cfda1f48c5f4cba57d5142754525572a846a1ab1b9b | |
07/16/2020 05:44:07 - INFO - transformers.tokenization_utils_base - loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt from cache at /home/ubuntu/.cache/torch/transformers/f8f83199a6270d582d6245dc100e99c4155de81c9745c6248077018fe01abcfb.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda | |
07/16/2020 05:44:07 - INFO - transformers.modeling_utils - loading weights file https://cdn.huggingface.co/roberta-large-pytorch_model.bin from cache at /home/ubuntu/.cache/torch/transformers/2339ac1858323405dffff5156947669fed6f63a0c34cfab35bda4f78791893d2.fc7abf72755ecc4a75d0d336a93c1c63358d2334f5998ed326f3b0da380bf536 | |
07/16/2020 05:44:20 - WARNING - transformers.modeling_utils - Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForQuestionAnswering: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight'] | |
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model). | |
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). | |
07/16/2020 05:44:20 - WARNING - transformers.modeling_utils - Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-large and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias'] | |
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. | |
07/16/2020 05:44:23 - INFO - __main__ - Training/evaluation parameters Namespace(adam_epsilon=1e-06, cache_dir='', config_name='', data_dir=None, device=device(type='cuda', index=0), do_eval=True, do_lower_case=False, do_train=True, doc_stride=128, eval_all_checkpoints=False, evaluate_during_training=False, fp16=False, fp16_opt_level='O1', gradient_accumulation_steps=6, lang_id=0, learning_rate=3e-05, local_rank=0, logging_steps=50, max_answer_length=30, max_grad_norm=1.0, max_query_length=64, max_seq_length=512, max_steps=-1, model_name_or_path='roberta-large', model_type='roberta', n_best_size=20, n_gpu=1, no_cuda=False, null_score_diff_threshold=0.0, num_train_epochs=3.0, output_dir='./examples/models/test/', overwrite_cache=False, overwrite_output_dir=False, per_gpu_eval_batch_size=12, per_gpu_train_batch_size=2, predict_file='/home/ubuntu/SQuAD_data/dev-v2.0.json', save_steps=2000, seed=42, server_ip='', server_port='', threads=20, tokenizer_name='', train_file='/home/ubuntu/SQuAD_data/train-v2.0.json', verbose_logging=False, version_2_with_negative=True, warmup_steps=1642, weight_decay=0.01) | |
07/16/2020 05:44:23 - INFO - __main__ - Loading features from cached file ./cached_train_roberta-large_512 | |
07/16/2020 05:44:37 - WARNING - transformers.modeling_utils - Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForQuestionAnswering: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight'] | |
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model). | |
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). | |
07/16/2020 05:44:37 - WARNING - transformers.modeling_utils - Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-large and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias'] | |
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. | |
07/16/2020 05:44:37 - WARNING - transformers.modeling_utils - Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForQuestionAnswering: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight'] | |
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model). | |
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). | |
07/16/2020 05:44:37 - WARNING - transformers.modeling_utils - Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-large and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias'] | |
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. | |
07/16/2020 05:44:37 - WARNING - transformers.modeling_utils - Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForQuestionAnswering: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight'] | |
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model). | |
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). | |
07/16/2020 05:44:37 - WARNING - transformers.modeling_utils - Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-large and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias'] | |
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. | |
07/16/2020 05:44:54 - INFO - __main__ - Starting training | |
07/16/2020 05:44:54 - INFO - __main__ - ***** Running training ***** | |
07/16/2020 05:44:54 - INFO - __main__ - Num examples = 131232 | |
07/16/2020 05:44:54 - INFO - __main__ - Num Epochs = 3 | |
07/16/2020 05:44:54 - INFO - __main__ - Instantaneous batch size per GPU = 2 | |
07/16/2020 05:44:54 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 48 | |
07/16/2020 05:44:54 - INFO - __main__ - Gradient Accumulation steps = 6 | |
07/16/2020 05:44:54 - INFO - __main__ - Total optimization steps = 8202 | |
Epoch: 0%| | 0/3 [00:00<?, ?it/s] | |
Iteration: 0%| | 0/16404 [00:00<?, ?it/s][A | |
Iteration: 1%| | 92/16404 [02:00<5:55:35, 1.31s/it][A | |
Iteration: 1%|▏ | 212/16404 [04:00<5:28:17, 1.22s/it][A/home/ubuntu/.local/lib/python3.6/site-packages/torch/optim/lr_scheduler.py:231: UserWarning: To get the last learning rate computed by the scheduler, please use `get_last_lr()`. | |
warnings.warn("To get the last learning rate computed by the scheduler, " | |
Iteration: 2%|▏ | 332/16404 [06:01<5:08:43, 1.15s/it][A | |
Iteration: 3%|▎ | 452/16404 [08:01<4:54:27, 1.11s/it][A | |
Iteration: 3%|▎ | 572/16404 [10:01<4:43:56, 1.08s/it][A | |
Iteration: 4%|▍ | 692/16404 [12:02<4:36:00, 1.05s/it][A | |
Iteration: 5%|▍ | 812/16404 [14:02<4:29:52, 1.04s/it][A | |
Iteration: 6%|▌ | 932/16404 [16:02<4:25:00, 1.03s/it][A | |
Iteration: 6%|▋ | 1053/16404 [18:03<4:20:40, 1.02s/it][A | |
Iteration: 7%|▋ | 1175/16404 [20:04<4:16:20, 1.01s/it][A | |
Iteration: 8%|▊ | 1297/16404 [22:04<4:12:44, 1.00s/it][A | |
Iteration: 9%|▊ | 1419/16404 [24:05<4:09:30, 1.00it/s][A | |
Iteration: 9%|▉ | 1541/16404 [26:05<4:06:44, 1.00it/s][A | |
Iteration: 10%|█ | 1663/16404 [28:06<4:04:14, 1.01it/s][A | |
Iteration: 11%|█ | 1785/16404 [30:07<4:01:51, 1.01it/s][A | |
Iteration: 12%|█▏ | 1907/16404 [32:08<3:59:33, 1.01it/s][A | |
Iteration: 12%|█▏ | 2029/16404 [34:08<3:57:25, 1.01it/s][A | |
Iteration: 13%|█▎ | 2151/16404 [36:09<3:55:15, 1.01it/s][A | |
Iteration: 14%|█▍ | 2273/16404 [38:10<3:53:07, 1.01it/s][A | |
Iteration: 15%|█▍ | 2395/16404 [40:10<3:51:06, 1.01it/s][A | |
Iteration: 15%|█▌ | 2517/16404 [42:11<3:48:55, 1.01it/s][A | |
Iteration: 16%|█▌ | 2639/16404 [44:11<3:46:50, 1.01it/s][A | |
Iteration: 17%|█▋ | 2761/16404 [46:12<3:44:46, 1.01it/s][A | |
Iteration: 18%|█▊ | 2883/16404 [48:12<3:42:40, 1.01it/s][A | |
Iteration: 18%|█▊ | 3005/16404 [50:14<3:41:28, 1.01it/s][A | |
Iteration: 19%|█▉ | 3126/16404 [52:15<3:39:42, 1.01it/s][A | |
Iteration: 20%|█▉ | 3247/16404 [54:15<3:37:40, 1.01it/s][A | |
Iteration: 21%|██ | 3369/16404 [56:16<3:35:30, 1.01it/s][A | |
Iteration: 21%|██▏ | 3491/16404 [58:16<3:33:19, 1.01it/s][A | |
Iteration: 22%|██▏ | 3613/16404 [1:00:17<3:31:19, 1.01it/s][A | |
Iteration: 23%|██▎ | 3735/16404 [1:02:18<3:29:13, 1.01it/s][A | |
Iteration: 24%|██▎ | 3857/16404 [1:04:19<3:27:10, 1.01it/s][A | |
Iteration: 24%|██▍ | 3979/16404 [1:06:20<3:25:11, 1.01it/s][A | |
Iteration: 25%|██▌ | 4101/16404 [1:08:21<3:23:12, 1.01it/s][A | |
Iteration: 26%|██▌ | 4223/16404 [1:10:22<3:21:15, 1.01it/s][A | |
Iteration: 26%|██▋ | 4345/16404 [1:12:23<3:19:21, 1.01it/s][A | |
Iteration: 27%|██▋ | 4466/16404 [1:14:23<3:17:23, 1.01it/s][A | |
Iteration: 28%|██▊ | 4587/16404 [1:16:23<3:15:22, 1.01it/s][A | |
Iteration: 29%|██▊ | 4708/16404 [1:18:23<3:13:24, 1.01it/s][A | |
Iteration: 29%|██▉ | 4829/16404 [1:20:23<3:11:24, 1.01it/s][A | |
Iteration: 30%|███ | 4950/16404 [1:22:23<3:09:29, 1.01it/s][A | |
Iteration: 31%|███ | 5071/16404 [1:24:24<3:07:30, 1.01it/s][A | |
Iteration: 32%|███▏ | 5192/16404 [1:26:24<3:05:33, 1.01it/s][A | |
Iteration: 32%|███▏ | 5313/16404 [1:28:24<3:03:32, 1.01it/s][A | |
Iteration: 33%|███▎ | 5434/16404 [1:30:24<3:01:28, 1.01it/s][A | |
Iteration: 34%|███▍ | 5555/16404 [1:32:24<2:59:27, 1.01it/s][A | |
Iteration: 35%|███▍ | 5676/16404 [1:34:24<2:57:28, 1.01it/s][A | |
Iteration: 35%|███▌ | 5797/16404 [1:36:24<2:55:30, 1.01it/s][A | |
Iteration: 36%|███▌ | 5919/16404 [1:38:25<2:53:24, 1.01it/s][A | |
Iteration: 37%|███▋ | 6040/16404 [1:40:25<2:51:24, 1.01it/s][A | |
Iteration: 38%|███▊ | 6161/16404 [1:42:26<2:49:26, 1.01it/s][A | |
Iteration: 38%|███▊ | 6282/16404 [1:44:26<2:47:34, 1.01it/s][A | |
Iteration: 39%|███▉ | 6403/16404 [1:46:26<2:45:36, 1.01it/s][A | |
Iteration: 40%|███▉ | 6524/16404 [1:48:26<2:43:37, 1.01it/s][A | |
Iteration: 41%|████ | 6645/16404 [1:50:27<2:41:36, 1.01it/s][A | |
Iteration: 41%|████ | 6766/16404 [1:52:27<2:39:38, 1.01it/s][A | |
Iteration: 42%|████▏ | 6887/16404 [1:54:27<2:37:35, 1.01it/s][A | |
Iteration: 43%|████▎ | 7008/16404 [1:56:27<2:35:37, 1.01it/s][A | |
Iteration: 43%|████▎ | 7129/16404 [1:58:28<2:33:38, 1.01it/s][A | |
Iteration: 44%|████▍ | 7250/16404 [2:00:28<2:31:38, 1.01it/s][A | |
Iteration: 45%|████▍ | 7371/16404 [2:02:28<2:29:37, 1.01it/s][A | |
Iteration: 46%|████▌ | 7492/16404 [2:04:29<2:27:37, 1.01it/s][A | |
Iteration: 46%|████▋ | 7613/16404 [2:06:29<2:25:38, 1.01it/s][A | |
Iteration: 47%|████▋ | 7734/16404 [2:08:29<2:23:40, 1.01it/s][A | |
Iteration: 48%|████▊ | 7855/16404 [2:10:29<2:21:37, 1.01it/s][A | |
Iteration: 49%|████▊ | 7976/16404 [2:12:30<2:19:35, 1.01it/s][A | |
Iteration: 49%|████▉ | 8097/16404 [2:14:30<2:17:34, 1.01it/s][A | |
Iteration: 50%|█████ | 8218/16404 [2:16:30<2:15:36, 1.01it/s][A | |
Iteration: 51%|█████ | 8339/16404 [2:18:30<2:13:35, 1.01it/s][A | |
Iteration: 52%|█████▏ | 8460/16404 [2:20:31<2:11:37, 1.01it/s][A | |
Iteration: 52%|█████▏ | 8581/16404 [2:22:31<2:09:34, 1.01it/s][A | |
Iteration: 53%|█████▎ | 8702/16404 [2:24:31<2:07:36, 1.01it/s][A | |
Iteration: 54%|█████▍ | 8823/16404 [2:26:32<2:05:35, 1.01it/s][A | |
Iteration: 55%|█████▍ | 8944/16404 [2:28:32<2:03:33, 1.01it/s][A | |
Iteration: 55%|█████▌ | 9065/16404 [2:30:32<2:01:36, 1.01it/s][A | |
Iteration: 56%|█████▌ | 9186/16404 [2:32:32<1:59:36, 1.01it/s][A | |
Iteration: 57%|█████▋ | 9307/16404 [2:34:33<1:57:34, 1.01it/s][A | |
Iteration: 57%|█████▋ | 9428/16404 [2:36:33<1:55:32, 1.01it/s][A | |
Iteration: 58%|█████▊ | 9549/16404 [2:38:33<1:53:31, 1.01it/s][A | |
Iteration: 59%|█████▉ | 9670/16404 [2:40:33<1:51:31, 1.01it/s][A | |
Iteration: 60%|█████▉ | 9791/16404 [2:42:33<1:49:29, 1.01it/s][A | |
Iteration: 60%|██████ | 9912/16404 [2:44:34<1:47:31, 1.01it/s][A | |
Iteration: 61%|██████ | 10033/16404 [2:46:34<1:45:28, 1.01it/s][A | |
Iteration: 62%|██████▏ | 10155/16404 [2:48:35<1:43:22, 1.01it/s][A | |
Iteration: 63%|██████▎ | 10276/16404 [2:50:35<1:41:21, 1.01it/s][A | |
Iteration: 63%|██████▎ | 10397/16404 [2:52:35<1:39:20, 1.01it/s][A | |
Iteration: 64%|██████▍ | 10518/16404 [2:54:35<1:37:22, 1.01it/s][A | |
Iteration: 65%|██████▍ | 10640/16404 [2:56:36<1:35:19, 1.01it/s][A | |
Iteration: 66%|██████▌ | 10761/16404 [2:58:36<1:33:18, 1.01it/s][A | |
Iteration: 66%|██████▋ | 10882/16404 [3:00:36<1:31:18, 1.01it/s][A | |
Iteration: 67%|██████▋ | 11003/16404 [3:02:36<1:29:19, 1.01it/s][A | |
Iteration: 68%|██████▊ | 11124/16404 [3:04:36<1:27:20, 1.01it/s][A | |
Iteration: 69%|██████▊ | 11245/16404 [3:06:36<1:25:19, 1.01it/s][A | |
Iteration: 69%|██████▉ | 11366/16404 [3:08:36<1:23:19, 1.01it/s][A | |
Iteration: 70%|███████ | 11487/16404 [3:10:36<1:21:19, 1.01it/s][A | |
Iteration: 71%|███████ | 11608/16404 [3:12:37<1:19:19, 1.01it/s][A | |
Iteration: 72%|███████▏ | 11729/16404 [3:14:37<1:17:21, 1.01it/s][A | |
Iteration: 72%|███████▏ | 11850/16404 [3:16:37<1:15:23, 1.01it/s][A | |
Iteration: 73%|███████▎ | 11971/16404 [3:18:37<1:13:24, 1.01it/s][A07/16/2020 09:03:54 - INFO - transformers.configuration_utils - Configuration saved in ./examples/models/test/checkpoint-2000/config.json | |
07/16/2020 09:03:56 - INFO - transformers.modeling_utils - Model weights saved in ./examples/models/test/checkpoint-2000/pytorch_model.bin | |
07/16/2020 09:03:56 - INFO - __main__ - Saving model checkpoint to ./examples/models/test/checkpoint-2000 | |
/home/ubuntu/.local/lib/python3.6/site-packages/torch/optim/lr_scheduler.py:200: UserWarning: Please also save or load the state of the optimzer when saving or loading the scheduler. | |
warnings.warn(SAVE_STATE_WARNING, UserWarning) | |
07/16/2020 09:03:58 - INFO - __main__ - Saving optimizer and scheduler states to ./examples/models/test/checkpoint-2000 | |
Iteration: 74%|███████▎ | 12092/16404 [3:20:42<1:12:12, 1.00s/it][A | |
Iteration: 74%|███████▍ | 12213/16404 [3:22:43<1:09:59, 1.00s/it][A | |
Iteration: 75%|███████▌ | 12333/16404 [3:24:43<1:07:58, 1.00s/it][A | |
Iteration: 76%|███████▌ | 12454/16404 [3:26:43<1:05:49, 1.00it/s][A | |
Iteration: 77%|███████▋ | 12575/16404 [3:28:43<1:03:38, 1.00it/s][A | |
Iteration: 77%|███████▋ | 12696/16404 [3:30:44<1:01:41, 1.00it/s][A | |
Iteration: 78%|███████▊ | 12817/16404 [3:32:45<59:41, 1.00it/s] [A | |
Iteration: 79%|███████▉ | 12938/16404 [3:34:46<57:41, 1.00it/s][A | |
Iteration: 80%|███████▉ | 13059/16404 [3:36:47<55:41, 1.00it/s][A | |
Iteration: 80%|████████ | 13180/16404 [3:38:48<53:40, 1.00it/s][A | |
Iteration: 81%|████████ | 13301/16404 [3:40:49<51:38, 1.00it/s][A | |
Iteration: 82%|████████▏ | 13422/16404 [3:42:50<49:39, 1.00it/s][A | |
Iteration: 83%|████████▎ | 13543/16404 [3:44:50<47:38, 1.00it/s][A | |
Iteration: 83%|████████▎ | 13664/16404 [3:46:51<45:36, 1.00it/s][A | |
Iteration: 84%|████████▍ | 13785/16404 [3:48:52<43:36, 1.00it/s][A | |
Iteration: 85%|████████▍ | 13906/16404 [3:50:53<41:35, 1.00it/s][A | |
Iteration: 86%|████████▌ | 14027/16404 [3:52:54<39:35, 1.00it/s][A | |
Iteration: 86%|████████▌ | 14148/16404 [3:54:55<37:35, 1.00it/s][A | |
Iteration: 87%|████████▋ | 14269/16404 [3:56:56<35:34, 1.00it/s][A | |
Iteration: 88%|████████▊ | 14390/16404 [3:58:57<33:33, 1.00it/s][A | |
Iteration: 88%|████████▊ | 14511/16404 [4:00:58<31:32, 1.00it/s][A | |
Iteration: 89%|████████▉ | 14632/16404 [4:02:59<29:31, 1.00it/s][A | |
Iteration: 90%|████████▉ | 14753/16404 [4:05:00<27:30, 1.00it/s][A | |
Iteration: 91%|█████████ | 14874/16404 [4:07:01<25:30, 1.00s/it][A | |
Iteration: 91%|█████████▏| 14995/16404 [4:09:02<23:28, 1.00it/s][A | |
Iteration: 92%|█████████▏| 15116/16404 [4:11:03<21:27, 1.00it/s][A | |
Iteration: 93%|█████████▎| 15237/16404 [4:13:04<19:26, 1.00it/s][A | |
Iteration: 94%|█████████▎| 15358/16404 [4:15:05<17:25, 1.00it/s][A | |
Iteration: 94%|█████████▍| 15479/16404 [4:17:06<15:24, 1.00it/s][A | |
Iteration: 95%|█████████▌| 15600/16404 [4:19:07<13:23, 1.00it/s][A | |
Iteration: 96%|█████████▌| 15721/16404 [4:21:07<11:22, 1.00it/s][A | |
Iteration: 97%|█████████▋| 15842/16404 [4:23:08<09:21, 1.00it/s][A | |
Iteration: 97%|█████████▋| 15963/16404 [4:25:09<07:20, 1.00it/s][A | |
Iteration: 98%|█████████▊| 16084/16404 [4:27:09<05:19, 1.00it/s][A | |
Iteration: 99%|█████████▉| 16205/16404 [4:29:10<03:18, 1.00it/s][A | |
Iteration: 100%|█████████▉| 16326/16404 [4:31:11<01:17, 1.00it/s][A | |
Iteration: 100%|██████████| 16404/16404 [4:32:29<00:00, 1.00it/s] | |
Epoch: 33%|███▎ | 1/3 [4:32:29<9:04:58, 16349.13s/it] | |
Iteration: 0%| | 0/16404 [00:00<?, ?it/s][A | |
Iteration: 1%| | 121/16404 [02:00<4:30:50, 1.00it/s][A | |
Iteration: 1%|▏ | 242/16404 [04:01<4:28:50, 1.00it/s][A | |
Iteration: 2%|▏ | 363/16404 [06:02<4:26:49, 1.00it/s][A | |
Iteration: 3%|▎ | 484/16404 [08:03<4:24:54, 1.00it/s][A | |
Iteration: 4%|▎ | 605/16404 [10:04<4:22:57, 1.00it/s][A | |
Iteration: 4%|▍ | 726/16404 [12:05<4:21:04, 1.00it/s][A | |
Iteration: 5%|▌ | 847/16404 [14:06<4:19:03, 1.00it/s][A | |
Iteration: 6%|▌ | 968/16404 [16:06<4:17:03, 1.00it/s][A | |
Iteration: 7%|▋ | 1089/16404 [18:07<4:15:04, 1.00it/s][A | |
Iteration: 7%|▋ | 1209/16404 [20:07<4:13:07, 1.00it/s][A | |
Iteration: 8%|▊ | 1330/16404 [22:08<4:11:06, 1.00it/s][A | |
Iteration: 9%|▉ | 1451/16404 [24:09<4:09:06, 1.00it/s][A | |
Iteration: 10%|▉ | 1571/16404 [26:09<4:07:08, 1.00it/s][A | |
Iteration: 10%|█ | 1692/16404 [28:10<4:05:11, 1.00it/s][A | |
Iteration: 11%|█ | 1812/16404 [30:10<4:03:12, 1.00s/it][A | |
Iteration: 12%|█▏ | 1933/16404 [32:11<4:01:10, 1.00it/s][A | |
Iteration: 13%|█▎ | 2053/16404 [34:11<3:59:10, 1.00it/s][A | |
Iteration: 13%|█▎ | 2174/16404 [36:12<3:57:07, 1.00it/s][A | |
Iteration: 14%|█▍ | 2295/16404 [38:13<3:55:03, 1.00it/s][A | |
Iteration: 15%|█▍ | 2416/16404 [40:14<3:53:00, 1.00it/s][A | |
Iteration: 15%|█▌ | 2537/16404 [42:15<3:50:58, 1.00it/s][A | |
Iteration: 16%|█▌ | 2658/16404 [44:16<3:49:02, 1.00it/s][A | |
Iteration: 17%|█▋ | 2778/16404 [46:16<3:47:05, 1.00it/s][A | |
Iteration: 18%|█▊ | 2899/16404 [48:17<3:45:02, 1.00it/s][A | |
Iteration: 18%|█▊ | 3019/16404 [50:17<3:43:05, 1.00s/it][A | |
Iteration: 19%|█▉ | 3140/16404 [52:17<3:40:33, 1.00it/s][A | |
Iteration: 20%|█▉ | 3261/16404 [54:18<3:38:47, 1.00it/s][A | |
Iteration: 21%|██ | 3382/16404 [56:19<3:36:46, 1.00it/s][A | |
Iteration: 21%|██▏ | 3503/16404 [58:20<3:34:45, 1.00it/s][A | |
Iteration: 22%|██▏ | 3624/16404 [1:00:21<3:32:53, 1.00it/s][A | |
Iteration: 23%|██▎ | 3745/16404 [1:02:22<3:30:53, 1.00it/s][A | |
Iteration: 24%|██▎ | 3866/16404 [1:04:23<3:28:52, 1.00it/s][A | |
Iteration: 24%|██▍ | 3987/16404 [1:06:24<3:26:54, 1.00it/s][A | |
Iteration: 25%|██▌ | 4108/16404 [1:08:25<3:24:52, 1.00it/s][A | |
Iteration: 26%|██▌ | 4229/16404 [1:10:26<3:22:52, 1.00it/s][A | |
Iteration: 27%|██▋ | 4350/16404 [1:12:27<3:20:54, 1.00s/it][A | |
Iteration: 27%|██▋ | 4470/16404 [1:14:27<3:18:55, 1.00s/it][A | |
Iteration: 28%|██▊ | 4591/16404 [1:16:28<3:16:51, 1.00it/s][A | |
Iteration: 29%|██▊ | 4712/16404 [1:18:29<3:14:49, 1.00it/s][A | |
Iteration: 29%|██▉ | 4833/16404 [1:20:30<3:12:48, 1.00it/s][A | |
Iteration: 30%|███ | 4954/16404 [1:22:31<3:10:46, 1.00it/s][A | |
Iteration: 31%|███ | 5075/16404 [1:24:32<3:08:44, 1.00it/s][A | |
Iteration: 32%|███▏ | 5196/16404 [1:26:33<3:06:49, 1.00s/it][A | |
Iteration: 32%|███▏ | 5317/16404 [1:28:34<3:04:47, 1.00s/it][A | |
Iteration: 33%|███▎ | 5438/16404 [1:30:35<3:02:37, 1.00it/s][A | |
Iteration: 34%|███▍ | 5560/16404 [1:32:35<2:59:59, 1.00it/s][A | |
Iteration: 35%|███▍ | 5682/16404 [1:34:36<2:57:45, 1.01it/s][A | |
Iteration: 35%|███▌ | 5803/16404 [1:36:37<2:55:59, 1.00it/s][A | |
Iteration: 36%|███▌ | 5924/16404 [1:38:38<2:53:56, 1.00it/s][A | |
Iteration: 37%|███▋ | 6045/16404 [1:40:38<2:51:52, 1.00it/s][A | |
Iteration: 38%|███▊ | 6166/16404 [1:42:38<2:49:49, 1.00it/s][A | |
Iteration: 38%|███▊ | 6287/16404 [1:44:39<2:47:46, 1.00it/s][A | |
Iteration: 39%|███▉ | 6408/16404 [1:46:39<2:45:49, 1.00it/s][A | |
Iteration: 40%|███▉ | 6529/16404 [1:48:39<2:43:43, 1.01it/s][A | |
Iteration: 41%|████ | 6650/16404 [1:50:40<2:41:39, 1.01it/s][A | |
Iteration: 41%|████▏ | 6771/16404 [1:52:40<2:39:34, 1.01it/s][A | |
Iteration: 42%|████▏ | 6892/16404 [1:54:40<2:37:33, 1.01it/s][A | |
Iteration: 43%|████▎ | 7013/16404 [1:56:40<2:35:31, 1.01it/s][A | |
Iteration: 43%|████▎ | 7134/16404 [1:58:41<2:33:35, 1.01it/s][A | |
Iteration: 44%|████▍ | 7255/16404 [2:00:41<2:31:36, 1.01it/s][A | |
Iteration: 45%|████▍ | 7376/16404 [2:02:41<2:29:33, 1.01it/s][A | |
Iteration: 46%|████▌ | 7497/16404 [2:04:41<2:27:32, 1.01it/s][A07/16/2020 12:23:37 - INFO - transformers.configuration_utils - Configuration saved in ./examples/models/test/checkpoint-4000/config.json | |
07/16/2020 12:23:38 - INFO - transformers.modeling_utils - Model weights saved in ./examples/models/test/checkpoint-4000/pytorch_model.bin | |
07/16/2020 12:23:38 - INFO - __main__ - Saving model checkpoint to ./examples/models/test/checkpoint-4000 | |
07/16/2020 12:23:41 - INFO - __main__ - Saving optimizer and scheduler states to ./examples/models/test/checkpoint-4000 | |
Iteration: 46%|████▋ | 7618/16404 [2:06:45<2:26:53, 1.00s/it][A | |
Iteration: 47%|████▋ | 7739/16404 [2:08:46<2:24:37, 1.00s/it][A | |
Iteration: 48%|████▊ | 7860/16404 [2:10:47<2:22:32, 1.00s/it][A | |
Iteration: 49%|████▊ | 7981/16404 [2:12:48<2:20:26, 1.00s/it][A | |
Iteration: 49%|████▉ | 8102/16404 [2:14:49<2:18:22, 1.00s/it][A | |
Iteration: 50%|█████ | 8223/16404 [2:16:50<2:16:14, 1.00it/s][A | |
Iteration: 51%|█████ | 8344/16404 [2:18:50<2:14:11, 1.00it/s][A | |
Iteration: 52%|█████▏ | 8465/16404 [2:20:51<2:12:08, 1.00it/s][A | |
Iteration: 52%|█████▏ | 8586/16404 [2:22:52<2:10:10, 1.00it/s][A | |
Iteration: 53%|█████▎ | 8707/16404 [2:24:53<2:08:07, 1.00it/s][A | |
Iteration: 54%|█████▍ | 8828/16404 [2:26:54<2:06:07, 1.00it/s][A | |
Iteration: 55%|█████▍ | 8949/16404 [2:28:55<2:04:06, 1.00it/s][A | |
Iteration: 55%|█████▌ | 9070/16404 [2:30:55<2:02:03, 1.00it/s][A | |
Iteration: 56%|█████▌ | 9191/16404 [2:32:56<2:00:01, 1.00it/s][A | |
Iteration: 57%|█████▋ | 9312/16404 [2:34:57<1:58:02, 1.00it/s][A | |
Iteration: 58%|█████▊ | 9433/16404 [2:36:58<1:55:59, 1.00it/s][A | |
Iteration: 58%|█████▊ | 9554/16404 [2:38:58<1:53:56, 1.00it/s][A | |
Iteration: 59%|█████▉ | 9675/16404 [2:40:59<1:51:57, 1.00it/s][A | |
Iteration: 60%|█████▉ | 9796/16404 [2:43:00<1:49:57, 1.00it/s][A | |
Iteration: 60%|██████ | 9917/16404 [2:45:01<1:47:57, 1.00it/s][A | |
Iteration: 61%|██████ | 10038/16404 [2:47:02<1:46:00, 1.00it/s][A | |
Iteration: 62%|██████▏ | 10159/16404 [2:49:03<1:43:59, 1.00it/s][A | |
Iteration: 63%|██████▎ | 10280/16404 [2:51:04<1:41:58, 1.00it/s][A | |
Iteration: 63%|██████▎ | 10401/16404 [2:53:05<1:39:58, 1.00it/s][A | |
Iteration: 64%|██████▍ | 10522/16404 [2:55:06<1:37:56, 1.00it/s][A | |
Iteration: 65%|██████▍ | 10643/16404 [2:57:06<1:35:52, 1.00it/s][A | |
Iteration: 66%|██████▌ | 10764/16404 [2:59:07<1:33:50, 1.00it/s][A | |
Iteration: 66%|██████▋ | 10885/16404 [3:01:08<1:31:49, 1.00it/s][A | |
Iteration: 67%|██████▋ | 11006/16404 [3:03:09<1:29:48, 1.00it/s][A | |
Iteration: 68%|██████▊ | 11127/16404 [3:05:09<1:27:48, 1.00it/s][A | |
Iteration: 69%|██████▊ | 11248/16404 [3:07:10<1:25:47, 1.00it/s][A | |
Iteration: 69%|██████▉ | 11369/16404 [3:09:11<1:23:47, 1.00it/s][A | |
Iteration: 70%|███████ | 11490/16404 [3:11:12<1:21:44, 1.00it/s][A | |
Iteration: 71%|███████ | 11611/16404 [3:13:12<1:19:41, 1.00it/s][A | |
Iteration: 72%|███████▏ | 11732/16404 [3:15:13<1:17:38, 1.00it/s][A | |
Iteration: 72%|███████▏ | 11853/16404 [3:17:14<1:15:40, 1.00it/s][A | |
Iteration: 73%|███████▎ | 11974/16404 [3:19:15<1:13:41, 1.00it/s][A | |
Iteration: 74%|███████▎ | 12095/16404 [3:21:15<1:11:41, 1.00it/s][A | |
Iteration: 74%|███████▍ | 12216/16404 [3:23:16<1:09:41, 1.00it/s][A | |
Iteration: 75%|███████▌ | 12337/16404 [3:25:17<1:07:39, 1.00it/s][A | |
Iteration: 76%|███████▌ | 12458/16404 [3:27:18<1:05:38, 1.00it/s][A | |
Iteration: 77%|███████▋ | 12579/16404 [3:29:18<1:03:36, 1.00it/s][A | |
Iteration: 77%|███████▋ | 12700/16404 [3:31:19<1:01:35, 1.00it/s][A | |
Iteration: 78%|███████▊ | 12821/16404 [3:33:20<59:36, 1.00it/s] [A | |
Iteration: 79%|███████▉ | 12942/16404 [3:35:21<57:37, 1.00it/s][A | |
Iteration: 80%|███████▉ | 13063/16404 [3:37:22<55:35, 1.00it/s][A | |
Iteration: 80%|████████ | 13184/16404 [3:39:22<53:35, 1.00it/s][A | |
Iteration: 81%|████████ | 13305/16404 [3:41:23<51:33, 1.00it/s][A | |
Iteration: 82%|████████▏ | 13426/16404 [3:43:24<49:31, 1.00it/s][A | |
Iteration: 83%|████████▎ | 13547/16404 [3:45:24<47:28, 1.00it/s][A | |
Iteration: 83%|████████▎ | 13668/16404 [3:47:25<45:28, 1.00it/s][A | |
Iteration: 84%|████████▍ | 13789/16404 [3:49:26<43:27, 1.00it/s][A | |
Iteration: 85%|████████▍ | 13910/16404 [3:51:26<41:26, 1.00it/s][A | |
Iteration: 86%|████████▌ | 14031/16404 [3:53:27<39:27, 1.00it/s][A | |
Iteration: 86%|████████▋ | 14152/16404 [3:55:28<37:27, 1.00it/s][A | |
Iteration: 87%|████████▋ | 14273/16404 [3:57:29<35:26, 1.00it/s][A | |
Iteration: 88%|████████▊ | 14394/16404 [3:59:29<33:26, 1.00it/s][A | |
Iteration: 88%|████████▊ | 14515/16404 [4:01:30<31:25, 1.00it/s][A | |
Iteration: 89%|████████▉ | 14636/16404 [4:03:31<29:24, 1.00it/s][A | |
Iteration: 90%|████████▉ | 14757/16404 [4:05:32<27:24, 1.00it/s][A | |
Iteration: 91%|█████████ | 14878/16404 [4:07:33<25:23, 1.00it/s][A | |
Iteration: 91%|█████████▏| 14999/16404 [4:09:34<23:23, 1.00it/s][A | |
Iteration: 92%|█████████▏| 15120/16404 [4:11:35<21:22, 1.00it/s][A | |
Iteration: 93%|█████████▎| 15241/16404 [4:13:36<19:21, 1.00it/s][A | |
Iteration: 94%|█████████▎| 15362/16404 [4:15:37<17:21, 1.00it/s][A | |
Iteration: 94%|█████████▍| 15483/16404 [4:17:37<15:20, 1.00it/s][A | |
Iteration: 95%|█████████▌| 15604/16404 [4:19:38<13:19, 1.00it/s][A | |
Iteration: 96%|█████████▌| 15725/16404 [4:21:39<11:18, 1.00it/s][A | |
Iteration: 97%|█████████▋| 15846/16404 [4:23:41<09:17, 1.00it/s][A | |
Iteration: 97%|█████████▋| 15967/16404 [4:25:42<07:17, 1.00s/it][A | |
Iteration: 98%|█████████▊| 16087/16404 [4:27:42<05:17, 1.00s/it][A | |
Iteration: 99%|█████████▉| 16208/16404 [4:29:43<03:15, 1.00it/s][A | |
Iteration: 100%|█████████▉| 16329/16404 [4:31:43<01:14, 1.00it/s][A | |
Iteration: 100%|██████████| 16404/16404 [4:32:58<00:00, 1.00it/s] | |
Epoch: 67%|██████▋ | 2/3 [9:05:28<4:32:38, 16358.08s/it] | |
Iteration: 0%| | 0/16404 [00:00<?, ?it/s][A | |
Iteration: 1%| | 121/16404 [02:00<4:31:09, 1.00it/s][A | |
Iteration: 1%|▏ | 242/16404 [04:01<4:29:10, 1.00it/s][A | |
Iteration: 2%|▏ | 363/16404 [06:02<4:27:08, 1.00it/s][A | |
Iteration: 3%|▎ | 484/16404 [08:03<4:25:09, 1.00it/s][A | |
Iteration: 4%|▎ | 605/16404 [10:04<4:23:09, 1.00it/s][A | |
Iteration: 4%|▍ | 726/16404 [12:05<4:21:15, 1.00it/s][A | |
Iteration: 5%|▌ | 846/16404 [14:05<4:19:16, 1.00it/s][A | |
Iteration: 6%|▌ | 967/16404 [16:06<4:17:14, 1.00it/s][A | |
Iteration: 7%|▋ | 1088/16404 [18:07<4:15:10, 1.00it/s][A | |
Iteration: 7%|▋ | 1209/16404 [20:08<4:13:09, 1.00it/s][A | |
Iteration: 8%|▊ | 1330/16404 [22:09<4:11:04, 1.00it/s][A | |
Iteration: 9%|▉ | 1451/16404 [24:10<4:09:02, 1.00it/s][A | |
Iteration: 10%|▉ | 1572/16404 [26:11<4:07:02, 1.00it/s][A | |
Iteration: 10%|█ | 1693/16404 [28:12<4:05:01, 1.00it/s][A | |
Iteration: 11%|█ | 1814/16404 [30:13<4:03:00, 1.00it/s][A | |
Iteration: 12%|█▏ | 1935/16404 [32:13<4:00:57, 1.00it/s][A | |
Iteration: 13%|█▎ | 2056/16404 [34:14<3:58:56, 1.00it/s][A | |
Iteration: 13%|█▎ | 2177/16404 [36:15<3:56:52, 1.00it/s][A | |
Iteration: 14%|█▍ | 2298/16404 [38:16<3:54:56, 1.00it/s][A | |
Iteration: 15%|█▍ | 2419/16404 [40:17<3:52:50, 1.00it/s][A | |
Iteration: 15%|█▌ | 2540/16404 [42:18<3:50:47, 1.00it/s][A | |
Iteration: 16%|█▌ | 2661/16404 [44:19<3:48:48, 1.00it/s][A | |
Iteration: 17%|█▋ | 2782/16404 [46:20<3:46:52, 1.00it/s][A | |
Iteration: 18%|█▊ | 2903/16404 [48:21<3:44:51, 1.00it/s][A | |
Iteration: 18%|█▊ | 3024/16404 [50:22<3:42:54, 1.00it/s][A | |
Iteration: 19%|█▉ | 3145/16404 [52:23<3:40:53, 1.00it/s][A07/16/2020 15:43:26 - INFO - transformers.configuration_utils - Configuration saved in ./examples/models/test/checkpoint-6000/config.json | |
07/16/2020 15:43:27 - INFO - transformers.modeling_utils - Model weights saved in ./examples/models/test/checkpoint-6000/pytorch_model.bin | |
07/16/2020 15:43:27 - INFO - __main__ - Saving model checkpoint to ./examples/models/test/checkpoint-6000 | |
07/16/2020 15:43:30 - INFO - __main__ - Saving optimizer and scheduler states to ./examples/models/test/checkpoint-6000 | |
Iteration: 20%|█▉ | 3266/16404 [54:27<3:40:56, 1.01s/it][A | |
Iteration: 21%|██ | 3387/16404 [56:28<3:38:16, 1.01s/it][A | |
Iteration: 21%|██▏ | 3508/16404 [58:29<3:35:47, 1.00s/it][A | |
Iteration: 22%|██▏ | 3628/16404 [1:00:29<3:33:33, 1.00s/it][A | |
Iteration: 23%|██▎ | 3748/16404 [1:02:29<3:31:24, 1.00s/it][A | |
Iteration: 24%|██▎ | 3869/16404 [1:04:30<3:29:11, 1.00s/it][A | |
Iteration: 24%|██▍ | 3990/16404 [1:06:31<3:27:03, 1.00s/it][A | |
Iteration: 25%|██▌ | 4111/16404 [1:08:32<3:24:51, 1.00it/s][A | |
Iteration: 26%|██▌ | 4232/16404 [1:10:33<3:22:47, 1.00it/s][A | |
Iteration: 27%|██▋ | 4353/16404 [1:12:33<3:20:40, 1.00it/s][A | |
Iteration: 27%|██▋ | 4474/16404 [1:14:34<3:18:39, 1.00it/s][A | |
Iteration: 28%|██▊ | 4595/16404 [1:16:35<3:16:37, 1.00it/s][A | |
Iteration: 29%|██▊ | 4716/16404 [1:18:36<3:14:42, 1.00it/s][A | |
Iteration: 29%|██▉ | 4837/16404 [1:20:37<3:12:42, 1.00it/s][A | |
Iteration: 30%|███ | 4958/16404 [1:22:38<3:10:43, 1.00it/s][A | |
Iteration: 31%|███ | 5079/16404 [1:24:39<3:08:42, 1.00it/s][A | |
Iteration: 32%|███▏ | 5200/16404 [1:26:40<3:06:41, 1.00it/s][A | |
Iteration: 32%|███▏ | 5321/16404 [1:28:41<3:04:38, 1.00it/s][A | |
Iteration: 33%|███▎ | 5442/16404 [1:30:42<3:02:40, 1.00it/s][A | |
Iteration: 34%|███▍ | 5563/16404 [1:32:43<3:00:33, 1.00it/s][A | |
Iteration: 35%|███▍ | 5684/16404 [1:34:44<2:58:26, 1.00it/s][A | |
Iteration: 35%|███▌ | 5805/16404 [1:36:44<2:56:23, 1.00it/s][A | |
Iteration: 36%|███▌ | 5926/16404 [1:38:45<2:54:22, 1.00it/s][A | |
Iteration: 37%|███▋ | 6047/16404 [1:40:46<2:52:21, 1.00it/s][A | |
Iteration: 38%|███▊ | 6168/16404 [1:42:47<2:50:26, 1.00it/s][A | |
Iteration: 38%|███▊ | 6289/16404 [1:44:48<2:48:27, 1.00it/s][A | |
Iteration: 39%|███▉ | 6410/16404 [1:46:49<2:46:26, 1.00it/s][A | |
Iteration: 40%|███▉ | 6531/16404 [1:48:50<2:44:20, 1.00it/s][A | |
Iteration: 41%|████ | 6652/16404 [1:50:51<2:42:20, 1.00it/s][A | |
Iteration: 41%|████▏ | 6773/16404 [1:52:51<2:40:20, 1.00it/s][A | |
Iteration: 42%|████▏ | 6894/16404 [1:54:52<2:38:22, 1.00it/s][A | |
Iteration: 43%|████▎ | 7015/16404 [1:56:53<2:36:22, 1.00it/s][A | |
Iteration: 44%|████▎ | 7136/16404 [1:58:54<2:34:18, 1.00it/s][A | |
Iteration: 44%|████▍ | 7257/16404 [2:00:55<2:32:14, 1.00it/s][A | |
Iteration: 45%|████▍ | 7378/16404 [2:02:56<2:30:13, 1.00it/s][A | |
Iteration: 46%|████▌ | 7499/16404 [2:04:57<2:28:11, 1.00it/s][A | |
Iteration: 46%|████▋ | 7620/16404 [2:06:57<2:26:14, 1.00it/s][A | |
Iteration: 47%|████▋ | 7741/16404 [2:08:58<2:24:10, 1.00it/s][A | |
Iteration: 48%|████▊ | 7862/16404 [2:10:59<2:22:09, 1.00it/s][A | |
Iteration: 49%|████▊ | 7983/16404 [2:13:00<2:20:09, 1.00it/s][A | |
Iteration: 49%|████▉ | 8104/16404 [2:15:01<2:18:12, 1.00it/s][A | |
Iteration: 50%|█████ | 8225/16404 [2:17:02<2:16:10, 1.00it/s][A | |
Iteration: 51%|█████ | 8346/16404 [2:19:03<2:14:12, 1.00it/s][A | |
Iteration: 52%|█████▏ | 8467/16404 [2:21:04<2:12:09, 1.00it/s][A | |
Iteration: 52%|█████▏ | 8588/16404 [2:23:04<2:10:06, 1.00it/s][A | |
Iteration: 53%|█████▎ | 8709/16404 [2:25:05<2:08:05, 1.00it/s][A | |
Iteration: 54%|█████▍ | 8830/16404 [2:27:06<2:06:04, 1.00it/s][A | |
Iteration: 55%|█████▍ | 8951/16404 [2:29:07<2:04:04, 1.00it/s][A | |
Iteration: 55%|█████▌ | 9072/16404 [2:31:08<2:02:05, 1.00it/s][A | |
Iteration: 56%|█████▌ | 9193/16404 [2:33:09<2:00:04, 1.00it/s][A | |
Iteration: 57%|█████▋ | 9314/16404 [2:35:10<1:58:03, 1.00it/s][A | |
Iteration: 58%|█████▊ | 9435/16404 [2:37:10<1:56:00, 1.00it/s][A | |
Iteration: 58%|█████▊ | 9556/16404 [2:39:11<1:53:58, 1.00it/s][A | |
Iteration: 59%|█████▉ | 9677/16404 [2:41:12<1:51:57, 1.00it/s][A | |
Iteration: 60%|█████▉ | 9798/16404 [2:43:13<1:49:59, 1.00it/s][A | |
Iteration: 60%|██████ | 9919/16404 [2:45:14<1:47:59, 1.00it/s][A | |
Iteration: 61%|██████ | 10040/16404 [2:47:15<1:45:56, 1.00it/s][A | |
Iteration: 62%|██████▏ | 10161/16404 [2:49:16<1:43:54, 1.00it/s][A | |
Iteration: 63%|██████▎ | 10282/16404 [2:51:16<1:41:54, 1.00it/s][A | |
Iteration: 63%|██████▎ | 10403/16404 [2:53:17<1:39:54, 1.00it/s][A | |
Iteration: 64%|██████▍ | 10525/16404 [2:55:18<1:37:39, 1.00it/s][A | |
Iteration: 65%|██████▍ | 10646/16404 [2:57:19<1:35:38, 1.00it/s][A | |
Iteration: 66%|██████▌ | 10767/16404 [2:59:19<1:33:37, 1.00it/s][A | |
Iteration: 66%|██████▋ | 10888/16404 [3:01:20<1:31:36, 1.00it/s][A | |
Iteration: 67%|██████▋ | 11009/16404 [3:03:20<1:29:34, 1.00it/s][A | |
Iteration: 68%|██████▊ | 11130/16404 [3:05:21<1:27:36, 1.00it/s][A | |
Iteration: 69%|██████▊ | 11251/16404 [3:07:22<1:25:33, 1.00it/s][A | |
Iteration: 69%|██████▉ | 11372/16404 [3:09:22<1:23:32, 1.00it/s][A | |
Iteration: 70%|███████ | 11493/16404 [3:11:23<1:21:31, 1.00it/s][A | |
Iteration: 71%|███████ | 11614/16404 [3:13:23<1:19:29, 1.00it/s][A | |
Iteration: 72%|███████▏ | 11735/16404 [3:15:24<1:17:29, 1.00it/s][A | |
Iteration: 72%|███████▏ | 11856/16404 [3:17:24<1:15:30, 1.00it/s][A | |
Iteration: 73%|███████▎ | 11977/16404 [3:19:25<1:13:28, 1.00it/s][A | |
Iteration: 74%|███████▍ | 12098/16404 [3:21:25<1:11:26, 1.00it/s][A | |
Iteration: 74%|███████▍ | 12219/16404 [3:23:25<1:09:25, 1.00it/s][A | |
Iteration: 75%|███████▌ | 12340/16404 [3:25:26<1:07:26, 1.00it/s][A | |
Iteration: 76%|███████▌ | 12461/16404 [3:27:26<1:05:25, 1.00it/s][A | |
Iteration: 77%|███████▋ | 12582/16404 [3:29:27<1:03:26, 1.00it/s][A | |
Iteration: 77%|███████▋ | 12703/16404 [3:31:27<1:01:26, 1.00it/s][A | |
Iteration: 78%|███████▊ | 12824/16404 [3:33:28<59:25, 1.00it/s] [A | |
Iteration: 79%|███████▉ | 12945/16404 [3:35:29<57:25, 1.00it/s][A | |
Iteration: 80%|███████▉ | 13066/16404 [3:37:29<55:24, 1.00it/s][A | |
Iteration: 80%|████████ | 13187/16404 [3:39:30<53:24, 1.00it/s][A | |
Iteration: 81%|████████ | 13308/16404 [3:41:30<51:24, 1.00it/s][A | |
Iteration: 82%|████████▏ | 13429/16404 [3:43:31<49:24, 1.00it/s][A | |
Iteration: 83%|████████▎ | 13550/16404 [3:45:31<47:24, 1.00it/s][A | |
Iteration: 83%|████████▎ | 13671/16404 [3:47:32<45:24, 1.00it/s][A | |
Iteration: 84%|████████▍ | 13792/16404 [3:49:33<43:23, 1.00it/s][A | |
Iteration: 85%|████████▍ | 13913/16404 [3:51:33<41:21, 1.00it/s][A | |
Iteration: 86%|████████▌ | 14034/16404 [3:53:34<39:21, 1.00it/s][A | |
Iteration: 86%|████████▋ | 14155/16404 [3:55:34<37:19, 1.00it/s][A | |
Iteration: 87%|████████▋ | 14276/16404 [3:57:35<35:19, 1.00it/s][A | |
Iteration: 88%|████████▊ | 14397/16404 [3:59:35<33:17, 1.00it/s][A | |
Iteration: 89%|████████▊ | 14518/16404 [4:01:35<31:16, 1.00it/s][A | |
Iteration: 89%|████████▉ | 14639/16404 [4:03:36<29:16, 1.01it/s][A | |
Iteration: 90%|████████▉ | 14760/16404 [4:05:36<27:16, 1.00it/s][A | |
Iteration: 91%|█████████ | 14881/16404 [4:07:37<25:16, 1.00it/s][A | |
Iteration: 91%|█████████▏| 15002/16404 [4:09:37<23:15, 1.00it/s][A | |
Iteration: 92%|█████████▏| 15123/16404 [4:11:38<21:16, 1.00it/s][A07/16/2020 19:03:03 - INFO - transformers.configuration_utils - Configuration saved in ./examples/models/test/checkpoint-8000/config.json | |
07/16/2020 19:03:04 - INFO - transformers.modeling_utils - Model weights saved in ./examples/models/test/checkpoint-8000/pytorch_model.bin | |
07/16/2020 19:03:04 - INFO - __main__ - Saving model checkpoint to ./examples/models/test/checkpoint-8000 | |
07/16/2020 19:03:07 - INFO - __main__ - Saving optimizer and scheduler states to ./examples/models/test/checkpoint-8000 | |
Iteration: 93%|█████████▎| 15244/16404 [4:13:43<19:27, 1.01s/it][A | |
Iteration: 94%|█████████▎| 15365/16404 [4:15:43<17:22, 1.00s/it][A | |
Iteration: 94%|█████████▍| 15486/16404 [4:17:43<15:18, 1.00s/it][A | |
Iteration: 95%|█████████▌| 15607/16404 [4:19:43<13:15, 1.00it/s][A | |
Iteration: 96%|█████████▌| 15728/16404 [4:21:43<11:13, 1.00it/s][A | |
Iteration: 97%|█████████▋| 15849/16404 [4:23:44<09:12, 1.00it/s][A | |
Iteration: 97%|█████████▋| 15970/16404 [4:25:44<07:11, 1.01it/s][A | |
Iteration: 98%|█████████▊| 16091/16404 [4:27:44<05:11, 1.01it/s][A | |
Iteration: 99%|█████████▉| 16212/16404 [4:29:44<03:10, 1.01it/s][A | |
Iteration: 100%|█████████▉| 16333/16404 [4:31:44<01:10, 1.01it/s][A | |
Iteration: 100%|██████████| 16404/16404 [4:32:55<00:00, 1.00it/s] | |
Epoch: 100%|██████████| 3/3 [13:38:23<00:00, 16363.28s/it] | |
Epoch: 100%|██████████| 3/3 [13:38:23<00:00, 16367.83s/it] | |
07/16/2020 19:23:17 - INFO - __main__ - Training done in total 13.639871 hours | |
07/16/2020 19:23:17 - INFO - __main__ - global_step = 8203, average loss = 0.8324675223705326 | |
07/16/2020 19:23:17 - INFO - __main__ - Saving model checkpoint to ./examples/models/test/ | |
07/16/2020 19:23:17 - INFO - transformers.configuration_utils - Configuration saved in ./examples/models/test/config.json | |
07/16/2020 19:23:18 - INFO - transformers.modeling_utils - Model weights saved in ./examples/models/test/pytorch_model.bin | |
07/16/2020 19:23:19 - INFO - transformers.configuration_utils - loading configuration file ./examples/models/test/config.json | |
07/16/2020 19:23:19 - INFO - transformers.configuration_utils - Model config RobertaConfig { | |
"architectures": [ | |
"RobertaForQuestionAnswering" | |
], | |
"attention_probs_dropout_prob": 0.1, | |
"bos_token_id": 0, | |
"eos_token_id": 2, | |
"gradient_checkpointing": false, | |
"hidden_act": "gelu", | |
"hidden_dropout_prob": 0.1, | |
"hidden_size": 1024, | |
"initializer_range": 0.02, | |
"intermediate_size": 4096, | |
"layer_norm_eps": 1e-05, | |
"max_position_embeddings": 514, | |
"model_type": "roberta", | |
"num_attention_heads": 16, | |
"num_hidden_layers": 24, | |
"pad_token_id": 1, | |
"type_vocab_size": 1, | |
"vocab_size": 50265 | |
} | |
07/16/2020 19:23:19 - INFO - transformers.modeling_utils - loading weights file ./examples/models/test/pytorch_model.bin | |
07/16/2020 19:23:30 - INFO - transformers.modeling_utils - All model checkpoint weights were used when initializing RobertaForQuestionAnswering. | |
07/16/2020 19:23:30 - INFO - transformers.modeling_utils - All the weights of RobertaForQuestionAnswering were initialized from the model checkpoint at ./examples/models/test/. | |
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use RobertaForQuestionAnswering for predictions without further training. | |
07/16/2020 19:23:30 - INFO - transformers.configuration_utils - loading configuration file ./examples/models/test/config.json | |
07/16/2020 19:23:30 - INFO - transformers.configuration_utils - Model config RobertaConfig { | |
"architectures": [ | |
"RobertaForQuestionAnswering" | |
], | |
"attention_probs_dropout_prob": 0.1, | |
"bos_token_id": 0, | |
"eos_token_id": 2, | |
"gradient_checkpointing": false, | |
"hidden_act": "gelu", | |
"hidden_dropout_prob": 0.1, | |
"hidden_size": 1024, | |
"initializer_range": 0.02, | |
"intermediate_size": 4096, | |
"layer_norm_eps": 1e-05, | |
"max_position_embeddings": 514, | |
"model_type": "roberta", | |
"num_attention_heads": 16, | |
"num_hidden_layers": 24, | |
"pad_token_id": 1, | |
"type_vocab_size": 1, | |
"vocab_size": 50265 | |
} | |
07/16/2020 19:23:30 - INFO - transformers.tokenization_utils_base - Model name './examples/models/test/' not found in model shortcut name list (roberta-base, roberta-large, roberta-large-mnli, distilroberta-base, roberta-base-openai-detector, roberta-large-openai-detector). Assuming './examples/models/test/' is a path, a model identifier, or url to a directory containing tokenizer files. | |
07/16/2020 19:23:30 - INFO - transformers.tokenization_utils_base - Didn't find file ./examples/models/test/added_tokens.json. We won't load it. | |
07/16/2020 19:23:30 - INFO - transformers.tokenization_utils_base - Didn't find file ./examples/models/test/tokenizer.json. We won't load it. | |
07/16/2020 19:23:30 - INFO - transformers.tokenization_utils_base - loading file ./examples/models/test/vocab.json | |
07/16/2020 19:23:30 - INFO - transformers.tokenization_utils_base - loading file ./examples/models/test/merges.txt | |
07/16/2020 19:23:30 - INFO - transformers.tokenization_utils_base - loading file None | |
07/16/2020 19:23:30 - INFO - transformers.tokenization_utils_base - loading file ./examples/models/test/special_tokens_map.json | |
07/16/2020 19:23:30 - INFO - transformers.tokenization_utils_base - loading file ./examples/models/test/tokenizer_config.json | |
07/16/2020 19:23:30 - INFO - transformers.tokenization_utils_base - loading file None | |
07/16/2020 19:23:31 - INFO - __main__ - Loading checkpoints saved during training for evaluation | |
07/16/2020 19:23:31 - INFO - __main__ - Evaluate the following checkpoints: ['./examples/models/test/'] | |
07/16/2020 19:23:31 - INFO - transformers.configuration_utils - loading configuration file ./examples/models/test/config.json | |
07/16/2020 19:23:31 - INFO - transformers.configuration_utils - Model config RobertaConfig { | |
"architectures": [ | |
"RobertaForQuestionAnswering" | |
], | |
"attention_probs_dropout_prob": 0.1, | |
"bos_token_id": 0, | |
"eos_token_id": 2, | |
"gradient_checkpointing": false, | |
"hidden_act": "gelu", | |
"hidden_dropout_prob": 0.1, | |
"hidden_size": 1024, | |
"initializer_range": 0.02, | |
"intermediate_size": 4096, | |
"layer_norm_eps": 1e-05, | |
"max_position_embeddings": 514, | |
"model_type": "roberta", | |
"num_attention_heads": 16, | |
"num_hidden_layers": 24, | |
"pad_token_id": 1, | |
"type_vocab_size": 1, | |
"vocab_size": 50265 | |
} | |
07/16/2020 19:23:31 - INFO - transformers.modeling_utils - loading weights file ./examples/models/test/pytorch_model.bin | |
07/16/2020 19:23:42 - INFO - transformers.modeling_utils - All model checkpoint weights were used when initializing RobertaForQuestionAnswering. | |
07/16/2020 19:23:42 - INFO - transformers.modeling_utils - All the weights of RobertaForQuestionAnswering were initialized from the model checkpoint at ./examples/models/test/. | |
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use RobertaForQuestionAnswering for predictions without further training. | |
07/16/2020 19:23:43 - INFO - __main__ - Creating features from dataset file at . | |
0%| | 0/35 [00:00<?, ?it/s] | |
6%|▌ | 2/35 [00:00<00:02, 15.59it/s] | |
11%|█▏ | 4/35 [00:00<00:01, 16.37it/s] | |
17%|█▋ | 6/35 [00:00<00:01, 14.56it/s] | |
20%|██ | 7/35 [00:00<00:02, 12.32it/s] | |
26%|██▌ | 9/35 [00:00<00:02, 12.15it/s] | |
31%|███▏ | 11/35 [00:00<00:02, 9.65it/s] | |
34%|███▍ | 12/35 [00:01<00:02, 9.71it/s] | |
40%|████ | 14/35 [00:01<00:02, 10.16it/s] | |
46%|████▌ | 16/35 [00:01<00:01, 10.60it/s] | |
51%|█████▏ | 18/35 [00:01<00:01, 11.03it/s] | |
57%|█████▋ | 20/35 [00:01<00:01, 12.33it/s] | |
63%|██████▎ | 22/35 [00:01<00:00, 13.02it/s] | |
69%|██████▊ | 24/35 [00:02<00:01, 10.97it/s] | |
74%|███████▍ | 26/35 [00:02<00:00, 9.41it/s] | |
80%|████████ | 28/35 [00:02<00:00, 10.58it/s] | |
86%|████████▌ | 30/35 [00:02<00:00, 10.29it/s] | |
91%|█████████▏| 32/35 [00:03<00:00, 7.96it/s] | |
94%|█████████▍| 33/35 [00:03<00:00, 7.96it/s] | |
97%|█████████▋| 34/35 [00:03<00:00, 8.32it/s] | |
100%|██████████| 35/35 [00:03<00:00, 8.53it/s] | |
100%|██████████| 35/35 [00:03<00:00, 10.12it/s] | |
convert squad examples to features: 0%| | 0/11873 [00:00<?, ?it/s] | |
convert squad examples to features: 0%| | 1/11873 [00:00<1:22:46, 2.39it/s] | |
convert squad examples to features: 6%|▌ | 705/11873 [00:00<54:30, 3.41it/s] | |
convert squad examples to features: 9%|▉ | 1089/11873 [00:00<36:51, 4.88it/s] | |
convert squad examples to features: 11%|█ | 1313/11873 [00:00<25:17, 6.96it/s] | |
convert squad examples to features: 13%|█▎ | 1601/11873 [00:00<17:15, 9.92it/s] | |
convert squad examples to features: 17%|█▋ | 2017/11873 [00:01<11:36, 14.16it/s] | |
convert squad examples to features: 19%|█▉ | 2247/11873 [00:01<07:57, 20.15it/s] | |
convert squad examples to features: 21%|██ | 2462/11873 [00:01<05:28, 28.68it/s] | |
convert squad examples to features: 23%|██▎ | 2753/11873 [00:01<03:43, 40.73it/s] | |
convert squad examples to features: 25%|██▌ | 2977/11873 [00:01<02:36, 57.01it/s] | |
convert squad examples to features: 27%|██▋ | 3152/11873 [00:02<01:52, 77.38it/s] | |
convert squad examples to features: 28%|██▊ | 3289/11873 [00:02<01:26, 99.75it/s] | |
convert squad examples to features: 37%|███▋ | 4385/11873 [00:02<00:52, 141.94it/s] | |
convert squad examples to features: 40%|████ | 4782/11873 [00:02<00:35, 199.46it/s] | |
convert squad examples to features: 44%|████▍ | 5249/11873 [00:02<00:24, 272.69it/s] | |
convert squad examples to features: 48%|████▊ | 5665/11873 [00:03<00:16, 372.15it/s] | |
convert squad examples to features: 51%|█████▏ | 6113/11873 [00:03<00:11, 509.78it/s] | |
convert squad examples to features: 54%|█████▍ | 6438/11873 [00:03<00:08, 651.40it/s] | |
convert squad examples to features: 57%|█████▋ | 6722/11873 [00:03<00:07, 720.07it/s] | |
convert squad examples to features: 61%|██████▏ | 7297/11873 [00:03<00:04, 971.27it/s] | |
convert squad examples to features: 64%|██████▍ | 7613/11873 [00:04<00:03, 1161.24it/s] | |
convert squad examples to features: 67%|██████▋ | 7898/11873 [00:04<00:03, 1253.93it/s] | |
convert squad examples to features: 69%|██████▉ | 8193/11873 [00:04<00:02, 1458.06it/s] | |
convert squad examples to features: 71%|███████ | 8436/11873 [00:04<00:02, 1484.41it/s] | |
convert squad examples to features: 74%|███████▍ | 8801/11873 [00:04<00:01, 1762.85it/s] | |
convert squad examples to features: 76%|███████▋ | 9057/11873 [00:04<00:01, 1894.91it/s] | |
convert squad examples to features: 78%|███████▊ | 9313/11873 [00:04<00:01, 2035.52it/s] | |
convert squad examples to features: 81%|████████ | 9569/11873 [00:04<00:01, 1938.89it/s] | |
convert squad examples to features: 82%|████████▏ | 9791/11873 [00:05<00:01, 1991.99it/s] | |
convert squad examples to features: 85%|████████▌ | 10145/11873 [00:05<00:00, 2238.72it/s] | |
convert squad examples to features: 88%|████████▊ | 10393/11873 [00:05<00:00, 1644.06it/s] | |
convert squad examples to features: 91%|█████████ | 10817/11873 [00:05<00:00, 1212.40it/s] | |
convert squad examples to features: 97%|█████████▋| 11521/11873 [00:06<00:00, 1369.69it/s] | |
convert squad examples to features: 100%|██████████| 11873/11873 [00:06<00:00, 1872.36it/s] | |
add example index and unique id: 0%| | 0/11873 [00:00<?, ?it/s] | |
add example index and unique id: 100%|██████████| 11873/11873 [00:00<00:00, 884748.81it/s] | |
07/16/2020 19:23:54 - INFO - __main__ - Saving features into cached file ./cached_dev_roberta-large_512 | |
07/16/2020 19:24:11 - INFO - __main__ - ***** Running evaluation ***** | |
07/16/2020 19:24:11 - INFO - __main__ - Num examples = 12161 | |
07/16/2020 19:24:11 - INFO - __main__ - Batch size = 12 | |
Evaluating: 0%| | 0/1014 [00:00<?, ?it/s] | |
Evaluating: 8%|▊ | 86/1014 [02:00<21:44, 1.41s/it] | |
Evaluating: 17%|█▋ | 170/1014 [04:00<19:52, 1.41s/it] | |
Evaluating: 25%|██▌ | 255/1014 [06:02<17:55, 1.42s/it] | |
Evaluating: 33%|███▎ | 337/1014 [08:02<16:09, 1.43s/it] | |
Evaluating: 42%|████▏ | 422/1014 [10:03<14:06, 1.43s/it] | |
Evaluating: 50%|█████ | 507/1014 [12:03<12:02, 1.43s/it] | |
Evaluating: 58%|█████▊ | 592/1014 [14:03<09:59, 1.42s/it] | |
Evaluating: 67%|██████▋ | 677/1014 [16:04<07:59, 1.42s/it] | |
Evaluating: 75%|███████▌ | 762/1014 [18:04<05:57, 1.42s/it] | |
Evaluating: 84%|████████▎ | 848/1014 [20:05<03:54, 1.42s/it] | |
Evaluating: 92%|█████████▏| 933/1014 [22:06<01:54, 1.41s/it] | |
Evaluating: 100%|██████████| 1014/1014 [23:59<00:00, 1.42s/it] | |
07/16/2020 19:48:11 - INFO - __main__ - Evaluation done in total 1439.415438 secs (0.118363 sec per example) | |
07/16/2020 19:48:11 - INFO - transformers.data.metrics.squad_metrics - Writing predictions to: ./examples/models/test/predictions_.json | |
07/16/2020 19:48:11 - INFO - transformers.data.metrics.squad_metrics - Writing nbest to: ./examples/models/test/nbest_predictions_.json | |
07/16/2020 19:48:11 - INFO - transformers.data.metrics.squad_metrics - Writing null_log_odds to: ./examples/models/test/null_odds_.json | |
07/16/2020 19:48:38 - INFO - __main__ - Results: {'exact': 84.88166428029984, 'f1': 88.08101045556123, 'total': 11873, 'HasAns_exact': 81.5114709851552, 'HasAns_f1': 87.91933824879894, 'HasAns_total': 5928, 'NoAns_exact': 88.24222035323801, 'NoAns_f1': 88.24222035323801, 'NoAns_total': 5945, 'best_exact': 84.88166428029984, 'best_exact_thresh': 0.0, 'best_f1': 88.08101045556107, 'best_f1_thresh': 0.0} | |
***************************************** | |
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. | |
***************************************** |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Fine-tune fairseq RoBERTa-large (GluonNLP) on SQuAD 2.0 across 4 local GPUs
# using Horovod over MPI. Expects gluon-nlp's run_squad module importable and
# SQuAD data under SQUAD_DATA.
set -e
set -x

export TASK=SQUAD
export SQUAD_VERSION=2.0
export MODEL_NAME=large
export SQUAD_DATA=/home/ubuntu/SQuAD_data
export BS=2              # per-GPU batch size
export ACCUMULATE=6      # gradient accumulation steps
GBS=$((BS * ACCUMULATE)) # effective (global) batch size; only used to name OUTPUT_DIR
export LR=3e-5           # learning rate
export MSL=512           # max sequence length
export WD=0.01           # weight decay
export EP=3              # epochs
export SEED=28
export MGN=0.1           # max gradient norm
export WUR=0.2           # warmup ratio
export OUTPUT_DIR="roberta/${TASK}${SQUAD_VERSION}_${MODEL_NAME}_${GBS}_${LR}_${WD}_${EP}_${MGN}_${WUR}_${SEED}"

pip3 install numpy
set +x # stop tracing before the long mpirun command line

# NOTE(review): fixed a dangling trailing "\" after --comm_backend horovod that
# would have consumed whatever line followed the script.
mpirun -np 4 -H localhost:4 -bind-to none -map-by slot python3 -m run_squad \
    --model_name="fairseq_roberta_${MODEL_NAME}" \
    --do_eval \
    --do_train \
    --data_dir="${SQUAD_DATA}" \
    --output_dir="${OUTPUT_DIR}" \
    --gpus=0,1,2,3 \
    --num_accumulate="${ACCUMULATE}" \
    --version="${SQUAD_VERSION}" \
    --batch_size="${BS}" \
    --lr="${LR}" \
    --wd="${WD}" \
    --seed="${SEED}" \
    --max_seq_length="${MSL}" \
    --eval_batch_size=48 \
    --save_interval=200 \
    --log_interval=50 \
    --max_saved_ckpt=6 \
    --epochs="${EP}" \
    --warmup_ratio="${WUR}" \
    --max_grad_norm="${MGN}" \
    --comm_backend horovod
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Fine-tune roberta-large (huggingface transformers) on SQuAD 2.0 across
# 4 local GPUs with torch.distributed. Baseline for the gluonnlp+horovod run.
export SQUAD_DIR=/home/ubuntu/SQuAD_data

# NOTE(review): removed a duplicate --per_gpu_eval_batch_size=8 that was
# silently overridden by the later =12 (argparse keeps the last occurrence;
# the training log confirms eval batch size 12), and dropped the dangling
# trailing "\" after --overwrite_cache.
python3 -m torch.distributed.launch --nproc_per_node=4 ./examples/question-answering/run_squad.py \
    --model_type roberta \
    --model_name_or_path roberta-large \
    --do_train \
    --do_eval \
    --version_2_with_negative \
    --train_file "${SQUAD_DIR}/train-v2.0.json" \
    --predict_file "${SQUAD_DIR}/dev-v2.0.json" \
    --learning_rate 3e-5 \
    --weight_decay 0.01 \
    --num_train_epochs 3 \
    --warmup_steps 1642 \
    --adam_epsilon 1e-6 \
    --max_seq_length 512 \
    --doc_stride 128 \
    --output_dir ./examples/models/test/ \
    --per_gpu_train_batch_size=2 \
    --per_gpu_eval_batch_size=12 \
    --gradient_accumulation_steps=6 \
    --threads 20 \
    --logging_steps 50 \
    --save_steps 2000 \
    --overwrite_cache
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment