Output from attempting to train led-large-16384 with a per-device batch size of 2 and 4 gradient accumulation steps; the run dies with a CUDA out-of-memory error on a 16 GB GPU.
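For context, here is a minimal sketch of the kind of setup that produces a log like the one below. Only the model name, the per-device batch size of 2, the accumulation of 4, one epoch, 250 examples, AMP (visible in the `use_amp`/`autocast` frames), and the use of a global attention mask (the traceback goes through `_compute_attn_output_with_global_indices`) are grounded in this gist; the dataset construction, column names, sequence lengths, and output path are illustrative assumptions.

```python
from datasets import Dataset
from transformers import (
    LEDForConditionalGeneration,
    LEDTokenizerFast,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

model_name = "allenai/led-large-16384"
tokenizer = LEDTokenizerFast.from_pretrained(model_name)
model = LEDForConditionalGeneration.from_pretrained(model_name)

# Hypothetical stand-in for the 250-example dataset the log reports.
raw = Dataset.from_dict({"document": ["..."] * 250, "summary": ["..."] * 250})

def tokenize(batch):
    # 4096/512 are assumed lengths (the gist does not say); LED accepts
    # encoder inputs up to 16384 tokens, which is even more memory-hungry.
    inputs = tokenizer(
        batch["document"], max_length=4096, truncation=True, padding="max_length"
    )
    labels = tokenizer(
        batch["summary"], max_length=512, truncation=True, padding="max_length"
    )
    # Mask pad tokens so they are ignored by the loss.
    inputs["labels"] = [
        [tok if tok != tokenizer.pad_token_id else -100 for tok in seq]
        for seq in labels["input_ids"]
    ]
    # The traceback takes the global-attention branch, so the original run
    # set a global_attention_mask; standard LED practice is global attention
    # on the first token.
    inputs["global_attention_mask"] = [
        [1] + [0] * (len(ids) - 1) for ids in inputs["input_ids"]
    ]
    return inputs

train_dataset = raw.map(tokenize, batched=True, remove_columns=["document", "summary"])

training_args = Seq2SeqTrainingArguments(
    output_dir="led-large-16384-finetuned",  # hypothetical path
    num_train_epochs=1,                      # matches "Num Epochs = 1" below
    per_device_train_batch_size=2,           # matches the log below
    gradient_accumulation_steps=4,           # matches the log below
    fp16=True,                               # AMP is visible in the traceback
)

trainer = Seq2SeqTrainer(model=model, args=training_args, train_dataset=train_dataset)
trainer.train()
```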
***** Running training *****
  Num examples = 250
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 4
  Total optimization steps = 31
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-24-3435b262f1ae> in <module>()
----> 1 trainer.train()

18 frames

/usr/local/lib/python3.7/dist-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1310 tr_loss_step = self.training_step(model, inputs)
   1311 else:
-> 1312 tr_loss_step = self.training_step(model, inputs)
   1313
   1314 if args.logging_nan_inf_filter and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)):

/usr/local/lib/python3.7/dist-packages/transformers/trainer.py in training_step(self, model, inputs)
   1837 if self.use_amp:
   1838 with autocast():
-> 1839 loss = self.compute_loss(model, inputs)
   1840 else:
   1841 loss = self.compute_loss(model, inputs)

/usr/local/lib/python3.7/dist-packages/transformers/trainer.py in compute_loss(self, model, inputs, return_outputs)
   1871 else:
   1872 labels = None
-> 1873 outputs = model(**inputs)
   1874 # Save past state if it exists
   1875 # TODO: this needs to be fixed and made cleaner later.

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
   1052 # Do not call functions when jit is used
   1053 full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.7/dist-packages/transformers/models/led/modeling_led.py in forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, head_mask, decoder_head_mask, cross_attn_head_mask, encoder_outputs, global_attention_mask, past_key_values, inputs_embeds, decoder_inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
   2374 output_attentions=output_attentions,
   2375 output_hidden_states=output_hidden_states,
-> 2376 return_dict=return_dict,
   2377 )
   2378 lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
   1052 # Do not call functions when jit is used
   1053 full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.7/dist-packages/transformers/models/led/modeling_led.py in forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, head_mask, decoder_head_mask, cross_attn_head_mask, encoder_outputs, global_attention_mask, past_key_values, inputs_embeds, decoder_inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
   2218 output_attentions=output_attentions,
   2219 output_hidden_states=output_hidden_states,
-> 2220 return_dict=return_dict,
   2221 )
   2222 # If the user passed a tuple for encoder_outputs, we wrap it in a LEDEncoderBaseModelOutput when return_dict=False

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
   1052 # Do not call functions when jit is used
   1053 full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.7/dist-packages/transformers/models/led/modeling_led.py in forward(self, input_ids, attention_mask, global_attention_mask, head_mask, inputs_embeds, output_attentions, output_hidden_states, return_dict)
   1840 is_index_global_attn=is_index_global_attn,
   1841 is_global_attn=is_global_attn,
-> 1842 output_attentions=output_attentions,
   1843 )
   1844 hidden_states = layer_outputs[0]

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
   1052 # Do not call functions when jit is used
   1053 full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.7/dist-packages/transformers/models/led/modeling_led.py in forward(self, hidden_states, attention_mask, layer_head_mask, is_index_masked, is_index_global_attn, is_global_attn, output_attentions)
   913 is_index_global_attn=is_index_global_attn,
   914 is_global_attn=is_global_attn,
--> 915 output_attentions=output_attentions,
   916 )
   917 hidden_states = attn_outputs[0]

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
   1052 # Do not call functions when jit is used
   1053 full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.7/dist-packages/transformers/models/led/modeling_led.py in forward(self, hidden_states, attention_mask, layer_head_mask, is_index_masked, is_index_global_attn, is_global_attn, output_attentions)
   724 is_index_global_attn=is_index_global_attn,
   725 is_global_attn=is_global_attn,
--> 726 output_attentions=output_attentions,
   727 )
   728

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
   1052 # Do not call functions when jit is used
   1053 full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.7/dist-packages/transformers/models/led/modeling_led.py in forward(self, hidden_states, attention_mask, layer_head_mask, is_index_masked, is_index_global_attn, is_global_attn, output_attentions)
   280 max_num_global_attn_indices=max_num_global_attn_indices,
   281 is_index_global_attn_nonzero=is_index_global_attn_nonzero,
--> 282 is_local_index_global_attn_nonzero=is_local_index_global_attn_nonzero,
   283 )
   284 else:

/usr/local/lib/python3.7/dist-packages/transformers/models/led/modeling_led.py in _compute_attn_output_with_global_indices(self, value_vectors, attn_probs, max_num_global_attn_indices, is_index_global_attn_nonzero, is_local_index_global_attn_nonzero)
   597 # compute attn output with global
   598 attn_output_without_global = self._sliding_chunks_matmul_attn_probs_value(
--> 599 attn_probs_without_global, value_vectors, self.one_sided_attn_window_size
   600 )
   601 return attn_output_only_global + attn_output_without_global

/usr/local/lib/python3.7/dist-packages/transformers/models/led/modeling_led.py in _sliding_chunks_matmul_attn_probs_value(self, attn_probs, value, window_overlap)
   504 chunked_value = padded_value.as_strided(size=chunked_value_size, stride=chunked_value_stride)
   505
-> 506 chunked_attn_probs = self._pad_and_diagonalize(chunked_attn_probs)
   507
   508 context = torch.einsum("bcwd,bcdh->bcwh", (chunked_attn_probs, chunked_value))

/usr/local/lib/python3.7/dist-packages/transformers/models/led/modeling_led.py in _pad_and_diagonalize(chunked_hidden_states)
   356 total_num_heads, num_chunks, window_overlap, hidden_dim = chunked_hidden_states.size()
   357 chunked_hidden_states = nn.functional.pad(
--> 358 chunked_hidden_states, (0, window_overlap + 1)
   359 ) # total_num_heads x num_chunks x window_overlap x (hidden_dim+window_overlap+1). Padding value is not important because it'll be overwritten
   360 chunked_hidden_states = chunked_hidden_states.view(

/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in _pad(input, pad, mode, value)
   4151 assert len(pad) // 2 <= input.dim(), "Padding length too large"
   4152 if mode == "constant":
-> 4153 return _VF.constant_pad_nd(input, pad, value)
   4154 else:
   4155 assert value == 0, 'Padding mode "{}"" doesn\'t take in value argument'.format(mode)

RuntimeError: CUDA out of memory. Tried to allocate 578.00 MiB (GPU 0; 15.90 GiB total capacity; 13.93 GiB already allocated; 175.75 MiB free; 14.83 GiB reserved in total by PyTorch)
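The allocation fails inside LED's sliding-chunks attention, where `_pad_and_diagonalize` materializes padded attention-probability chunks; with long padded inputs, these intermediates dominate memory even at batch size 2. None of the following fixes appear in the gist, but they are the standard levers for this situation (gradient checkpointing via this method requires transformers v4.11+); all values below are illustrative:

```python
# 1. Recompute activations in the backward pass instead of storing them.
model.gradient_checkpointing_enable()
model.config.use_cache = False  # the decoder cache is incompatible with checkpointing

# 2. Halve the per-device batch and double accumulation: 1 x 8 keeps the
#    same effective batch of 8 that the log above reports as 2 x 4.
training_args = Seq2SeqTrainingArguments(
    output_dir="led-large-16384-finetuned",  # hypothetical path
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    fp16=True,
)

# 3. Shorten the encoder input at tokenization time; activation memory in
#    the sliding-window attention grows with sequence length, e.g.
#    tokenizer(batch["document"], max_length=2048, truncation=True)
```

With checkpointing enabled, much of the forward pass is repeated during backward, so steps are slower, but peak memory drops enough that runs like this one typically fit on a 16 GB card.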