-
-
Save djsaunde/0c1664c32e44a64d31b5e01b4aafe5c4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| [rank0]: Traceback (most recent call last): | |
| [rank0]: File "<frozen runpy>", line 198, in _run_module_as_main | |
| [rank0]: File "<frozen runpy>", line 88, in _run_code | |
| [rank0]: File "/workspace/axolotl/src/axolotl/cli/train.py", line 126, in <module> | |
| [rank0]: fire.Fire(do_cli) | |
| [rank0]: File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/fire/core.py", line 135, in Fire | |
| [rank0]: component_trace = _Fire(component, args, parsed_flag_args, context, name) | |
| [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| [rank0]: File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/fire/core.py", line 468, in _Fire | |
| [rank0]: component, remaining_args = _CallAndUpdateTrace( | |
| [rank0]: ^^^^^^^^^^^^^^^^^^^^ | |
| [rank0]: File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace | |
| [rank0]: component = fn(*varargs, **kwargs) | |
| [rank0]: ^^^^^^^^^^^^^^^^^^^^^^ | |
| [rank0]: File "/workspace/axolotl/src/axolotl/cli/train.py", line 93, in do_cli | |
| [rank0]: return do_train(parsed_cfg, parsed_cli_args) | |
| [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| [rank0]: File "/workspace/axolotl/src/axolotl/cli/train.py", line 49, in do_train | |
| [rank0]: model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta) | |
| [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| [rank0]: File "/workspace/axolotl/src/axolotl/train.py", line 564, in train | |
| [rank0]: execute_training(cfg, trainer, resume_from_checkpoint) | |
| [rank0]: File "/workspace/axolotl/src/axolotl/train.py", line 225, in execute_training | |
| [rank0]: trainer.train(resume_from_checkpoint=resume_from_checkpoint) | |
| [rank0]: File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/transformers/trainer.py", line 2206, in train | |
| [rank0]: return inner_training_loop( | |
| [rank0]: ^^^^^^^^^^^^^^^^^^^^ | |
| [rank0]: File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/transformers/trainer.py", line 2548, in _inner_training_loop | |
| [rank0]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch) | |
| [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| [rank0]: File "/workspace/axolotl/src/axolotl/core/trainers/mixins/activation_checkpointing.py", line 32, in training_step | |
| [rank0]: return super().training_step(*args, **kwargs) | |
| [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| [rank0]: File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/transformers/trainer.py", line 3800, in training_step | |
| [rank0]: self.accelerator.backward(loss, **kwargs) | |
| [rank0]: File "/workspace/accelerate/src/accelerate/accelerator.py", line 2578, in backward | |
| [rank0]: loss.backward(**kwargs) | |
| [rank0]: File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/_tensor.py", line 648, in backward | |
| [rank0]: torch.autograd.backward( | |
| [rank0]: File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/autograd/__init__.py", line 353, in backward | |
| [rank0]: _engine_run_backward( | |
| [rank0]: File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/autograd/graph.py", line 824, in _engine_run_backward | |
| [rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass | |
| [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| [rank0]: File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/autograd/function.py", line 307, in apply | |
| [rank0]: return user_fn(self, *args) | |
| [rank0]: ^^^^^^^^^^^^^^^^^^^^ | |
| [rank0]: File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 2049, in backward | |
| [rank0]: all_args = _backward_prologue_functional( | |
| [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| [rank0]: File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 1630, in _backward_prologue_functional | |
| [rank0]: flat_processed_tangents = list( | |
| [rank0]: ^^^^^ | |
| [rank0]: File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 1633, in <genexpr> | |
| [rank0]: AOTDispatchAutograd.process_runtime_tangent( | |
| [rank0]: File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 1808, in process_runtime_tangent | |
| [rank0]: raise RuntimeError( | |
| [rank0]: RuntimeError: | |
| [rank0]: During the backward, we encountered a tensor subclass where we guessed its | |
| [rank0]: metadata incorrectly. | |
| [rank0]: Expected metadata: {'_orig_dtype': torch.bfloat16, '_linear_mm_config': LinearMMConfig(output=ScaledMMConfig(emulate=False, use_fast_accum=True, fp8_output=False, pad_inner_dim=False), grad_input=ScaledMMConfig(emulate=False, use_fast_accum=False, fp8_output=False, pad_inner_dim=False), grad_weight=ScaledMMConfig(emulate=False, use_fast_accum=False, fp8_output=False, pad_inner_dim=False)), '_gemm_input_role': <GemmInputRole.WEIGHT: 'weight'>, '_axiswise_dim': None}, expected type: <class 'torchao.float8.float8_tensor.Float8Tensor'> | |
| [rank0]: Runtime metadata: None, runtime type: <class 'torch.Tensor'> | |
| [rank0]: shape: torch.Size([8192, 3072]) | |
| [rank0]: To fix this, your tensor subclass must implement the dunder method __force_to_same_metadata__. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment