Taylan Bilal taylanbil

## fullchanges.diff
$ git diff 621e8341 > ~/debug/20201005-wav2vec2/latestdiff.diff
(torch-xla-1.7)
taylanbil at beefy-pytorch-xla-eu in ~/debug/20201005-wav2vec2/fairseq (w2v2●●)
$ cat ~/debug/20201005-wav2vec2/latestdiff.diff
diff --git a/fairseq/criterions/wav2vec_criterion.py b/fairseq/criterions/wav2vec_criterion.py
index 019db622..c316147f 100644
--- a/fairseq/criterions/wav2vec_criterion.py
+++ b/fairseq/criterions/wav2vec_criterion.py
@@ -10,6 +10,7 @@ import torch.nn.functional as F

## samplenegs.diff
diff --git a/fairseq/data/audio/raw_audio_dataset.py b/fairseq/data/audio/raw_audio_dataset.py
index baafe0f9..2114c93a 100644
--- a/fairseq/data/audio/raw_audio_dataset.py
+++ b/fairseq/data/audio/raw_audio_dataset.py
@@ -185,6 +185,7 @@ class RawAudioDataset(FairseqDataset):
                 (B, T, self._C), padding_mask_reshaped,
             )
             input["mask_indices"] = mask_indices
+            input['padding_counts'] = input['mask_indices'].sum(-1).tolist()
             input["mask_channel_indices"] = mask_channel_indices

## gist:4322a4cf315a7b546bf16b90d58e3fa3
python \
 $p/fairseq/train.py \
 $HOME/data/w2v/manifest \
--tpu \
--distributed-world-size 8 \
--max-update 500000 \
--max-epoch 5 \
--num-batch-buckets 1 \
 --required-batch-size-multiple 4 \
 --max-sentences 4 \

## dlrm-criteo-common-stack-trace.txt
mensions={0,1}, to_apply=%AddComputation.1114
2020-07-24 05:41:09.472955: E tensorflow/compiler/xla/xla_client/xla_util.cc:76] *** Begin stack trace ***
2020-07-24 05:41:09.472964: E tensorflow/compiler/xla/xla_client/xla_util.cc:76]
2020-07-24 05:41:09.472969: E tensorflow/compiler/xla/xla_client/xla_util.cc:76]   %compare.1123 = pred[] compare(s32[] %constant.1120, s32[] %constant.1121), direction=NE
2020-07-24 05:41:09.472992: E tensorflow/compiler/xla/xla_client/xla_util.cc:76] }
2020-07-24 05:41:09.473015: E tensorflow/compiler/xla/xla_client/xla_util.cc:76]   %constant.1122 = f32[] constant(1)
2020-07-24 05:41:09.473003: E tensorflow/compiler/xla/xla_client/xla_util.cc:76]   %constant.1119 = s32[] constant(16)
2020-07-24 05:41:09.473007: E tensorflow/compiler/xla/xla_client/xla_util.cc:76]        tensorflow::CurrentStackTrace[abi:cxx11]()
2020-07-24 05:41:09.473010: E tensorflow/compiler/xla/xla_client/xla_util.cc:76]
2020-07-24 05:41:09.473021: E tensorflow/compiler/xla/xla_client/xla_util.cc:76]

## gist:e09bc472f717a1833aa676981e03f88b
taylanbil@dlrm-gpu-8:~/kkissmart-fairseq/mbart$ paste <( grep valid fulldata-gpu.txt | grep loss | cut -d '|' -f3,6,7) tpu.loss.schedule.txt
 fairseq_cli.train       fairseq_cli.train
 valid | loss 5.001 | ppl 32.02          valid | loss 5.003 | ppl 32.08
 valid_EN | loss 8.41 | ppl 340.13       valid_EN | loss 8.411 | ppl 340.26
 valid_IMG | loss 4.621 | ppl 24.61      valid_IMG | loss 4.624 | ppl 24.65
 valid | loss 4.767 | ppl 27.22          valid | loss 4.771 | ppl 27.31
 valid_EN | loss 7.853 | ppl 231.15      valid_EN | loss 7.853 | ppl 231.17
 valid_IMG | loss 4.423 | ppl 21.45      valid_IMG | loss 4.427 | ppl 21.51
 valid | loss 4.609 | ppl 24.41          valid | loss 4.61 | ppl 24.42
 valid_EN | loss 7.347 | ppl 162.79      valid_EN | loss 7.354 | ppl 163.54

## comparison.ml
On commit 1f8ccaaf71b15f22e447866233e7d5e395928cab

# GPU COMMAND - 8 gpus

```bash
python /home/taylanbil/kkissmart-fairseq/tpu_fairseq/train.py $FULLDATAPATH --encoder-normalize-before --decoder-normalize-before --arch mbart_base --layernorm-embedding --task multilingual_denoising --criterion cross_entropy --dataset-impl mmap --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' --lr-scheduler polynomial_decay --lr 1e-04 --min-lr -1 --warmup-updates 0 --total-num-update 500000 --dropout 0.0 --attention-dropout 0.0 --weight-decay 0.0 --max-tokens 4104 --seed 2 --log-format simple --log-interval 100 --add-lang-token --no-whole-word-mask-langs IMG --mask 0.35 --permute-sentences 1.0 --mask-length span-poisson --replace-length 1 --rotate 0.0 --max-source-positions 1026 --max-target-positions 1026 --tokens-per-sample 1026 --sample-break-mode complete --save-interval-updates 500 --skip-invalid-size-inputs-valid-test --langs EN,IMG --no-bos --no-input-eos --multilang-sampling-alpha 0.5 --max-sentences 4 --n

## gist:3dd399753e594825910184a784d0b786

(torch-xla-nightly) pytorch-xla-europe➜  mbart  ᐅ  paste <( grep valid initial-repro.txt| grep -v fairseq | grep loss | cut -d '|' -f 3,6 ) <( grep valid gpulosssched.txt| grep loss | cut -d '|' -f 3,6 )
 valid | loss 4.51       valid | loss 4.507
 valid_EN | loss 8.996   valid_EN | loss 9.003
 valid_IMG | loss 4.417          valid_IMG | loss 4.414
 valid | loss 4.317      valid | loss 4.317
 valid_EN | loss 8.497   valid_EN | loss 8.481
 valid_IMG | loss 4.231          valid_IMG | loss 4.231
 valid | loss 4.236      valid | loss 4.235
 valid_EN | loss 8.129   valid_EN | loss 8.135

## fairseq-trainslation-8gpus-1bucket-loss.txt
RAWLOSS @ 100 tensor(1759.2184, device='cuda:7')
RAWLOSS @ 100 tensor(2361.1760, device='cuda:4')
RAWLOSS @ 100 tensor(3937.2319, device='cuda:2')
RAWLOSS @ 100 tensor(2799.5732, device='cuda:0')
RAWLOSS @ 100 tensor(1954.5380, device='cuda:5')
RAWLOSS @ 100 tensor(2972.5251, device='cuda:6')
RAWLOSS @ 100 tensor(2046.4896, device='cuda:1')
RAWLOSS @ 100 tensor(3103.4412, device='cuda:3')
2020-07-20 20:33:19 | INFO | train_inner | epoch 001:    100 / 81036 loss=14.994, ppl=32621.7, wps=11385.2, ups=5.98, wpb=1903.2, bsz=64, num_updates=100, lr=1e-06, gnorm=6.203, loss_scale=128, train_wall=18, wall=64
RAWLOSS @ 200 tensor(2411.5215, device='cuda:7')

## fairseq-trainslation-8tpus-1bucket-loss.txt
RAWLOSS @ 100 tensor(2970.8003, device='xla:0')
RAWLOSS @ 100 tensor(1757.3165, device='xla:0')
RAWLOSS @ 100 tensor(1953.1198, device='xla:0')
RAWLOSS @ 100 tensor(3101.5469, device='xla:0')
RAWLOSS @ 100 tensor(3934.1355, device='xla:0')
RAWLOSS @ 100 tensor(2359.2961, device='xla:0')
RAWLOSS @ 100 tensor(2797.4104, device='xla:1')
RAWLOSS @ 100 tensor(2044.7153, device='xla:0')
2020-07-20 20:33:53 | INFO | root | NOTE: XLA compilation detected; too many of these can lead to slow training, but we expect a few in the beginning
2020-07-20 20:33:53 | INFO | train_inner | epoch 001:    100 / 81036 loss=13.799, ppl=14254.9, wps=0, ups=0, wpb=2187, bsz=64, num_updates=100, lr=1e-06, gnorm=3.694, train_wall=14, wall=95

## gist:806e3594d0f25ac70b6aa03d294db1ec
taylanbil@dlrm-gpu-8:~/kkissmart-fairseq$ paste <( grep wps gpu-repro.txt | sed 's/.*loss=//' | cut -d',' -f1 ) <(  grep wps tpulog.txt | sed 's/.*loss=//' | cut -d',' -f1  )
15.175  14.211
13.532  12.824
12.885  12.463
12.564  11.9
12.171  11.938
11.87   11.621
11.598  11.113
11.283  10.451
11.052  10.907
	$ git diff 621e8341 > ~/debug/20201005-wav2vec2/latestdiff.diff
	(torch-xla-1.7)
	taylanbil at beefy-pytorch-xla-eu in ~/debug/20201005-wav2vec2/fairseq (w2v2●●)
	$ cat ~/debug/20201005-wav2vec2/latestdiff.diff
	diff --git a/fairseq/criterions/wav2vec_criterion.py b/fairseq/criterions/wav2vec_criterion.py
	index 019db622..c316147f 100644
	--- a/fairseq/criterions/wav2vec_criterion.py
	+++ b/fairseq/criterions/wav2vec_criterion.py
	@@ -10,6 +10,7 @@ import torch.nn.functional as F
	diff --git a/fairseq/data/audio/raw_audio_dataset.py b/fairseq/data/audio/raw_audio_dataset.py
	index baafe0f9..2114c93a 100644
	--- a/fairseq/data/audio/raw_audio_dataset.py
	+++ b/fairseq/data/audio/raw_audio_dataset.py
	@@ -185,6 +185,7 @@ class RawAudioDataset(FairseqDataset):
	(B, T, self._C), padding_mask_reshaped,
	)
	input["mask_indices"] = mask_indices
	+ input['padding_counts'] = input['mask_indices'].sum(-1).tolist()
	input["mask_channel_indices"] = mask_channel_indices
	python \
	$p/fairseq/train.py \
	$HOME/data/w2v/manifest \
	--tpu \
	--distributed-world-size 8 \
	--max-update 500000 \
	--max-epoch 5 \
	--num-batch-buckets 1 \
	--required-batch-size-multiple 4 \
	--max-sentences 4 \
	mensions={0,1}, to_apply=%AddComputation.1114
	2020-07-24 05:41:09.472955: E tensorflow/compiler/xla/xla_client/xla_util.cc:76] *** Begin stack trace ***
	2020-07-24 05:41:09.472964: E tensorflow/compiler/xla/xla_client/xla_util.cc:76]
	2020-07-24 05:41:09.472969: E tensorflow/compiler/xla/xla_client/xla_util.cc:76] %compare.1123 = pred[] compare(s32[] %constant.1120, s32[] %constant.1121), direction=NE
	2020-07-24 05:41:09.472992: E tensorflow/compiler/xla/xla_client/xla_util.cc:76] }
	2020-07-24 05:41:09.473015: E tensorflow/compiler/xla/xla_client/xla_util.cc:76] %constant.1122 = f32[] constant(1)
	2020-07-24 05:41:09.473003: E tensorflow/compiler/xla/xla_client/xla_util.cc:76] %constant.1119 = s32[] constant(16)
	2020-07-24 05:41:09.473007: E tensorflow/compiler/xla/xla_client/xla_util.cc:76] tensorflow::CurrentStackTrace[abi:cxx11]()
	2020-07-24 05:41:09.473010: E tensorflow/compiler/xla/xla_client/xla_util.cc:76]
	2020-07-24 05:41:09.473021: E tensorflow/compiler/xla/xla_client/xla_util.cc:76]
	taylanbil@dlrm-gpu-8:~/kkissmart-fairseq/mbart$ paste <( grep valid fulldata-gpu.txt | grep loss | cut -d '|' -f3,6,7) tpu.loss.schedule.txt
	fairseq_cli.train fairseq_cli.train
	valid | loss 5.001 | ppl 32.02 valid | loss 5.003 | ppl 32.08
	valid_EN | loss 8.41 | ppl 340.13 valid_EN | loss 8.411 | ppl 340.26
	valid_IMG | loss 4.621 | ppl 24.61 valid_IMG | loss 4.624 | ppl 24.65
	valid | loss 4.767 | ppl 27.22 valid | loss 4.771 | ppl 27.31
	valid_EN | loss 7.853 | ppl 231.15 valid_EN | loss 7.853 | ppl 231.17
	valid_IMG | loss 4.423 | ppl 21.45 valid_IMG | loss 4.427 | ppl 21.51
	valid | loss 4.609 | ppl 24.41 valid | loss 4.61 | ppl 24.42
	valid_EN | loss 7.347 | ppl 162.79 valid_EN | loss 7.354 | ppl 163.54
	On commit 1f8ccaaf71b15f22e447866233e7d5e395928cab

	# GPU COMMAND - 8 gpus

	```bash
	python /home/taylanbil/kkissmart-fairseq/tpu_fairseq/train.py $FULLDATAPATH --encoder-normalize-before --decoder-normalize-before --arch mbart_base --layernorm-embedding --task multilingual_denoising --criterion cross_entropy --dataset-impl mmap --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' --lr-scheduler polynomial_decay --lr 1e-04 --min-lr -1 --warmup-updates 0 --total-num-update 500000 --dropout 0.0 --attention-dropout 0.0 --weight-decay 0.0 --max-tokens 4104 --seed 2 --log-format simple --log-interval 100 --add-lang-token --no-whole-word-mask-langs IMG --mask 0.35 --permute-sentences 1.0 --mask-length span-poisson --replace-length 1 --rotate 0.0 --max-source-positions 1026 --max-target-positions 1026 --tokens-per-sample 1026 --sample-break-mode complete --save-interval-updates 500 --skip-invalid-size-inputs-valid-test --langs EN,IMG --no-bos --no-input-eos --multilang-sampling-alpha 0.5 --max-sentences 4 --n

	(torch-xla-nightly) pytorch-xla-europe➜ mbart ᐅ paste <( grep valid initial-repro.txt| grep -v fairseq | grep loss | cut -d '|' -f 3,6 ) <( grep valid gpulosssched.txt| grep loss | cut -d '|' -f 3,6 )
	valid | loss 4.51 valid | loss 4.507
	valid_EN | loss 8.996 valid_EN | loss 9.003
	valid_IMG | loss 4.417 valid_IMG | loss 4.414
	valid | loss 4.317 valid | loss 4.317
	valid_EN | loss 8.497 valid_EN | loss 8.481
	valid_IMG | loss 4.231 valid_IMG | loss 4.231
	valid | loss 4.236 valid | loss 4.235
	valid_EN | loss 8.129 valid_EN | loss 8.135
	RAWLOSS @ 100 tensor(1759.2184, device='cuda:7')
	RAWLOSS @ 100 tensor(2361.1760, device='cuda:4')
	RAWLOSS @ 100 tensor(3937.2319, device='cuda:2')
	RAWLOSS @ 100 tensor(2799.5732, device='cuda:0')
	RAWLOSS @ 100 tensor(1954.5380, device='cuda:5')
	RAWLOSS @ 100 tensor(2972.5251, device='cuda:6')
	RAWLOSS @ 100 tensor(2046.4896, device='cuda:1')
	RAWLOSS @ 100 tensor(3103.4412, device='cuda:3')
	2020-07-20 20:33:19 | INFO | train_inner | epoch 001: 100 / 81036 loss=14.994, ppl=32621.7, wps=11385.2, ups=5.98, wpb=1903.2, bsz=64, num_updates=100, lr=1e-06, gnorm=6.203, loss_scale=128, train_wall=18, wall=64
	RAWLOSS @ 200 tensor(2411.5215, device='cuda:7')
	RAWLOSS @ 100 tensor(2970.8003, device='xla:0')
	RAWLOSS @ 100 tensor(1757.3165, device='xla:0')
	RAWLOSS @ 100 tensor(1953.1198, device='xla:0')
	RAWLOSS @ 100 tensor(3101.5469, device='xla:0')
	RAWLOSS @ 100 tensor(3934.1355, device='xla:0')
	RAWLOSS @ 100 tensor(2359.2961, device='xla:0')
	RAWLOSS @ 100 tensor(2797.4104, device='xla:1')
	RAWLOSS @ 100 tensor(2044.7153, device='xla:0')
	2020-07-20 20:33:53 | INFO | root | NOTE: XLA compilation detected; too many of these can lead to slow training, but we expect a few in the beginning
	2020-07-20 20:33:53 | INFO | train_inner | epoch 001: 100 / 81036 loss=13.799, ppl=14254.9, wps=0, ups=0, wpb=2187, bsz=64, num_updates=100, lr=1e-06, gnorm=3.694, train_wall=14, wall=95
	taylanbil@dlrm-gpu-8:~/kkissmart-fairseq$ paste <( grep wps gpu-repro.txt | sed 's/.*loss=//' | cut -d',' -f1 ) <( grep wps tpulog.txt | sed 's/.*loss=//' | cut -d',' -f1 )
	15.175 14.211
	13.532 12.824
	12.885 12.463
	12.564 11.9
	12.171 11.938
	11.87 11.621
	11.598 11.113
	11.283 10.451
	11.052 10.907