@odellus
Created September 30, 2021 11:14
Output from attempting to train led-large-16384 with a per-device batch size of 2 and 4 gradient accumulation steps (ends in a CUDA out-of-memory error).
***** Running training *****
Num examples = 250
Num Epochs = 1
Instantaneous batch size per device = 2
Total train batch size (w. parallel, distributed & accumulation) = 8
Gradient Accumulation steps = 4
Total optimization steps = 31
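
For context, the log above corresponds to a Trainer run over 250 examples for one epoch, with per-device batch size 2 and 4 gradient accumulation steps (effective batch size 8, hence 31 optimization steps). Below is a minimal sketch of the kind of setup that produces these numbers. The checkpoint name is the public allenai/led-large-16384, but the output directory, the train_dataset variable, and the choice of Seq2SeqTrainer over the plain Trainer are assumptions, not taken from the original notebook.

# Minimal sketch of a setup matching the logged hyperparameters.
# train_dataset is a placeholder for the 250 tokenized examples; it is not shown in the gist.
from transformers import (
    LEDForConditionalGeneration,
    LEDTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

model = LEDForConditionalGeneration.from_pretrained("allenai/led-large-16384")
tokenizer = LEDTokenizer.from_pretrained("allenai/led-large-16384")

training_args = Seq2SeqTrainingArguments(
    output_dir="./led-large-16384-finetuned",  # hypothetical path
    per_device_train_batch_size=2,             # "Instantaneous batch size per device = 2"
    gradient_accumulation_steps=4,             # "Gradient Accumulation steps = 4"
    num_train_epochs=1,                        # "Num Epochs = 1"
    fp16=True,                                 # matches the autocast() branch in the traceback
    logging_steps=10,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,               # assumed: 250 preprocessed examples
    tokenizer=tokenizer,
)

trainer.train()

The fp16=True setting is inferred from the traceback itself: the loss is computed inside the self.use_amp branch under autocast(), so mixed precision was enabled in the original run.
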
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-24-3435b262f1ae> in <module>()
----> 1 trainer.train()
/usr/local/lib/python3.7/dist-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1310 tr_loss_step = self.training_step(model, inputs)
1311 else:
-> 1312 tr_loss_step = self.training_step(model, inputs)
1313
1314 if args.logging_nan_inf_filter and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)):
/usr/local/lib/python3.7/dist-packages/transformers/trainer.py in training_step(self, model, inputs)
1837 if self.use_amp:
1838 with autocast():
-> 1839 loss = self.compute_loss(model, inputs)
1840 else:
1841 loss = self.compute_loss(model, inputs)
/usr/local/lib/python3.7/dist-packages/transformers/trainer.py in compute_loss(self, model, inputs, return_outputs)
1871 else:
1872 labels = None
-> 1873 outputs = model(**inputs)
1874 # Save past state if it exists
1875 # TODO: this needs to be fixed and made cleaner later.
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/transformers/models/led/modeling_led.py in forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, head_mask, decoder_head_mask, cross_attn_head_mask, encoder_outputs, global_attention_mask, past_key_values, inputs_embeds, decoder_inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
2374 output_attentions=output_attentions,
2375 output_hidden_states=output_hidden_states,
-> 2376 return_dict=return_dict,
2377 )
2378 lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/transformers/models/led/modeling_led.py in forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, head_mask, decoder_head_mask, cross_attn_head_mask, encoder_outputs, global_attention_mask, past_key_values, inputs_embeds, decoder_inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
2218 output_attentions=output_attentions,
2219 output_hidden_states=output_hidden_states,
-> 2220 return_dict=return_dict,
2221 )
2222 # If the user passed a tuple for encoder_outputs, we wrap it in a LEDEncoderBaseModelOutput when return_dict=False
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/transformers/models/led/modeling_led.py in forward(self, input_ids, attention_mask, global_attention_mask, head_mask, inputs_embeds, output_attentions, output_hidden_states, return_dict)
1840 is_index_global_attn=is_index_global_attn,
1841 is_global_attn=is_global_attn,
-> 1842 output_attentions=output_attentions,
1843 )
1844 hidden_states = layer_outputs[0]
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/transformers/models/led/modeling_led.py in forward(self, hidden_states, attention_mask, layer_head_mask, is_index_masked, is_index_global_attn, is_global_attn, output_attentions)
913 is_index_global_attn=is_index_global_attn,
914 is_global_attn=is_global_attn,
--> 915 output_attentions=output_attentions,
916 )
917 hidden_states = attn_outputs[0]
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/transformers/models/led/modeling_led.py in forward(self, hidden_states, attention_mask, layer_head_mask, is_index_masked, is_index_global_attn, is_global_attn, output_attentions)
724 is_index_global_attn=is_index_global_attn,
725 is_global_attn=is_global_attn,
--> 726 output_attentions=output_attentions,
727 )
728
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/transformers/models/led/modeling_led.py in forward(self, hidden_states, attention_mask, layer_head_mask, is_index_masked, is_index_global_attn, is_global_attn, output_attentions)
280 max_num_global_attn_indices=max_num_global_attn_indices,
281 is_index_global_attn_nonzero=is_index_global_attn_nonzero,
--> 282 is_local_index_global_attn_nonzero=is_local_index_global_attn_nonzero,
283 )
284 else:
/usr/local/lib/python3.7/dist-packages/transformers/models/led/modeling_led.py in _compute_attn_output_with_global_indices(self, value_vectors, attn_probs, max_num_global_attn_indices, is_index_global_attn_nonzero, is_local_index_global_attn_nonzero)
597 # compute attn output with global
598 attn_output_without_global = self._sliding_chunks_matmul_attn_probs_value(
--> 599 attn_probs_without_global, value_vectors, self.one_sided_attn_window_size
600 )
601 return attn_output_only_global + attn_output_without_global
/usr/local/lib/python3.7/dist-packages/transformers/models/led/modeling_led.py in _sliding_chunks_matmul_attn_probs_value(self, attn_probs, value, window_overlap)
504 chunked_value = padded_value.as_strided(size=chunked_value_size, stride=chunked_value_stride)
505
--> 506 chunked_attn_probs = self._pad_and_diagonalize(chunked_attn_probs)
507
508 context = torch.einsum("bcwd,bcdh->bcwh", (chunked_attn_probs, chunked_value))
/usr/local/lib/python3.7/dist-packages/transformers/models/led/modeling_led.py in _pad_and_diagonalize(chunked_hidden_states)
356 total_num_heads, num_chunks, window_overlap, hidden_dim = chunked_hidden_states.size()
357 chunked_hidden_states = nn.functional.pad(
--> 358 chunked_hidden_states, (0, window_overlap + 1)
359 ) # total_num_heads x num_chunks x window_overlap x (hidden_dim+window_overlap+1). Padding value is not important because it'll be overwritten
360 chunked_hidden_states = chunked_hidden_states.view(
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in _pad(input, pad, mode, value)
4151 assert len(pad) // 2 <= input.dim(), "Padding length too large"
4152 if mode == "constant":
-> 4153 return _VF.constant_pad_nd(input, pad, value)
4154 else:
4155 assert value == 0, 'Padding mode "{}"" doesn\'t take in value argument'.format(mode)
RuntimeError: CUDA out of memory. Tried to allocate 578.00 MiB (GPU 0; 15.90 GiB total capacity; 13.93 GiB already allocated; 175.75 MiB free; 14.83 GiB reserved in total by PyTorch)
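
The allocation fails inside LED's sliding-chunks attention (_pad_and_diagonalize, called while padding the chunked attention probabilities), which suggests activation memory from the long encoder sequence, not parameter count, is what exhausts the 16 GiB GPU. The following is a hedged sketch of options that commonly reduce memory for led-large-16384, reusing the hypothetical model and Seq2SeqTrainingArguments names from the sketch above; the gist does not show whether any of these was actually applied.

# Memory-reduction options (a sketch, not the fix used in the original run).
# `model` refers to the hypothetical LEDForConditionalGeneration instance above.

# 1. Recompute activations in the backward pass instead of storing them
#    (available on recent transformers versions).
model.gradient_checkpointing_enable()

# 2. The decoder cache is not needed during training and wastes memory.
model.config.use_cache = False

# 3. Shrink the per-step footprint: batch size 1 with 8 accumulation steps
#    keeps the same effective batch size of 8.
training_args = Seq2SeqTrainingArguments(
    output_dir="./led-large-16384-finetuned",  # hypothetical path
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    fp16=True,
)

# 4. If the documents allow it, truncate encoder inputs below the 16384-token maximum
#    when tokenizing; memory in LED's windowed attention grows with sequence length.
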