Created
April 18, 2024 01:34
-
-
Save woshiyyya/a19489631a79c85912bd68a0036a8396 to your computer and use it in GitHub Desktop.
error-efa.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
task. | |
class_name: RayTrainWorker | |
actor_id: 9e6790a209b7c509e64301f305000000 | |
pid: 35979 | |
namespace: f205d617-4ee1-4fae-a76a-c3f2382b7527 | |
ip: 172.24.101.245 | |
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker exits unexpectedly. Worker exits with an exit code None. Traceback (most recent call last): | |
File "python/ray/_raylet.pyx", line 1883, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1984, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1889, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1830, in ray._raylet.execute_task.function_executor | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/function_manager.py", line 724, in actor_method_executor | |
return method(__ray_actor, *args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/util/tracing/tracing_helper.py", line 467, in _resume_span | |
return method(self, *_args, **_kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/worker_group.py", line 33, in __execute | |
raise skipped from exception_cause(skipped) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/utils.py", line 169, in discard_return_wrapper | |
train_func(*args, **kwargs) | |
File "/home/ray/default/train.py", line 186, in train_func | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 545, in fit | |
call._call_and_handle_interrupt( | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 43, in _call_and_handle_interrupt | |
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 102, in launch | |
return function(*args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 581, in _fit_impl | |
self._run(model, ckpt_path=ckpt_path) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 990, in _run | |
results = self._run_stage() | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1036, in _run_stage | |
self.fit_loop.run() | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/fit_loop.py", line 202, in run | |
self.advance() | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/fit_loop.py", line 359, in advance | |
self.epoch_loop.run(self._data_fetcher) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 136, in run | |
self.advance(data_fetcher) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 240, in advance | |
batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 187, in run | |
self._optimizer_step(batch_idx, closure) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 265, in _optimizer_step | |
call._call_lightning_module_hook( | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 157, in _call_lightning_module_hook | |
output = fn(*args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/module.py", line 1282, in optimizer_step | |
optimizer.step(closure=optimizer_closure) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/optimizer.py", line 151, in step | |
step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/ddp.py", line 263, in optimizer_step | |
optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/strategy.py", line 230, in optimizer_step | |
return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/amp.py", line 74, in optimizer_step | |
return super().optimizer_step(optimizer, model=model, closure=closure, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/precision_plugin.py", line 117, in optimizer_step | |
return optimizer.step(closure=closure, **kwargs) | |
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/lr_scheduler.py", line 75, in wrapper | |
return wrapped(*args, **kwargs) | |
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/optimizer.py", line 385, in wrapper | |
out = func(*args, **kwargs) | |
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/optimizer.py", line 76, in _use_grad | |
ret = func(self, *args, **kwargs) | |
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/adamw.py", line 164, in step | |
loss = closure() | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/precision_plugin.py", line 104, in _wrap_closure | |
closure_result = closure() | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 140, in __call__ | |
self._result = self.closure(*args, **kwargs) | |
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context | |
return func(*args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 135, in closure | |
self._backward_fn(step_output.closure_loss) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 236, in backward_fn | |
call._call_strategy_hook(self.trainer, "backward", loss, optimizer) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 309, in _call_strategy_hook | |
output = fn(*args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/strategy.py", line 204, in backward | |
self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, *args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/precision_plugin.py", line 69, in backward | |
model.backward(tensor, *args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/module.py", line 1069, in backward | |
loss.backward(*args, **kwargs) | |
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/_tensor.py", line 522, in backward | |
torch.autograd.backward( | |
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/autograd/__init__.py", line 266, in backward | |
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass | |
RuntimeError: GET was unable to find an engine to execute this computation | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1245, in dump | |
return super().dump(obj) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/tblib/pickling_support.py", line 46, in pickle_exception | |
rv = obj.__reduce_ex__(3) | |
RecursionError: maximum recursion depth exceeded while calling a Python object | |
The above exception was the direct cause of the following exception: | |
Traceback (most recent call last): | |
File "python/ray/_raylet.pyx", line 2281, in ray._raylet.task_execution_handler | |
File "python/ray/_raylet.pyx", line 2177, in ray._raylet.execute_task_with_cancellation_handler | |
File "python/ray/_raylet.pyx", line 1832, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1833, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 2071, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1089, in ray._raylet.store_task_errors | |
File "python/ray/_raylet.pyx", line 4575, in ray._raylet.CoreWorker.store_task_outputs | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/serialization.py", line 494, in serialize | |
return self._serialize_to_msgpack(value) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/serialization.py", line 449, in _serialize_to_msgpack | |
value = value.to_bytes() | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/exceptions.py", line 32, in to_bytes | |
serialized_exception=pickle.dumps(self), | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1479, in dumps | |
cp.dump(obj) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1249, in dump | |
raise pickle.PicklingError(msg) from e | |
_pickle.PicklingError: Could not pickle object as excessively deep recursion required. | |
An unexpected internal error occurred while the worker was executing a task. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1245, in dump | |
return super().dump(obj) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/tblib/pickling_support.py", line 46, in pickle_exception | |
rv = obj.__reduce_ex__(3) | |
RecursionError: maximum recursion depth exceeded while calling a Python object | |
The above exception was the direct cause of the following exception: | |
Traceback (most recent call last): | |
File "python/ray/_raylet.pyx", line 2281, in ray._raylet.task_execution_handler | |
File "python/ray/_raylet.pyx", line 2177, in ray._raylet.execute_task_with_cancellation_handler | |
File "python/ray/_raylet.pyx", line 1832, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1833, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 2071, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1089, in ray._raylet.store_task_errors | |
File "python/ray/_raylet.pyx", line 4575, in ray._raylet.CoreWorker.store_task_outputs | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/serialization.py", line 494, in serialize | |
return self._serialize_to_msgpack(value) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/serialization.py", line 449, in _serialize_to_msgpack | |
value = value.to_bytes() | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/exceptions.py", line 32, in to_bytes | |
serialized_exception=pickle.dumps(self), | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1479, in dumps | |
cp.dump(obj) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1249, in dump | |
raise pickle.PicklingError(msg) from e | |
_pickle.PicklingError: Could not pickle object as excessively deep recursion required. | |
An unexpected internal error occurred while the worker was executing a task. | |
Training errored after 0 iterations at 2024-04-17 18:24:17. Total running time: 1min 28s | |
Error file: /tmp/ray/session_2024-04-17_17-56-58_302463_3956/artifacts/2024-04-17_18-22-49/sd-stage_1-256/driver_artifacts/TorchTrainer_2b908_00000_0_2024-04-17_18-22-49/error.txt | |
(pid=37114, ip=172.24.101.245) Running: 33/383.0 CPU, 0/24.0 GPU, 6.0GB/372.5GB 2024-04-17 18:24:19,432 INFO tune.py:1016 -- Wrote the latest version of all result files and experiment state to 'private-vpc-20231023180430879200000005/org_7c1Kalm9WcX2bNIjW53GUT/cld_l732epdw3rcfjxnisdgint8xau/artifact_storage/yunxuan__xiao/sd-stage_1-256' in 1.4940s. | |
(pid=37114, ip=172.24.101.245) Running: 33/383.0 CPU, 0/24.0 GPU, 6.0GB/372.5GB 2024-04-17 18:24:19,437 ERROR tune.py:1044 -- Trials did not complete: [TorchTrainer_2b908_00000] | |
(ip=172.24.101.245) - MapBatches(convert_precision): 4 active, 0 queued, [cpu: | |
RayActorError: The actor died unexpectedly before finishing this task. | |
class_name: with_parameters.<locals>._Inner | |
actor_id: 25c149ef281dc60bf21b13b105000000 | |
pid: 35755 | |
namespace: f205d617-4ee1-4fae-a76a-c3f2382b7527 | |
ip: 172.24.101.245 | |
The actor is dead because its worker process has died. Worker exit type: | |
SYSTEM_ERROR Worker exit detail: Worker exits unexpectedly. Worker exits with an | |
exit code None. Traceback (most recent call last): | |
File "python/ray/_raylet.pyx", line 1883, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1984, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1889, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1830, in | |
ray._raylet.execute_task.function_executor | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/function_manager.p | |
y", line 724, in actor_method_executor | |
return method(__ray_actor, *args, **kwargs) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/util/tracing/tracing_helper | |
.py", line 467, in _resume_span | |
return method(self, *_args, **_kwargs) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/tune/trainable/trainable.py | |
", line 334, in train | |
raise skipped from exception_cause(skipped) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/air/_internal/util.py", | |
line 88, in run | |
self._ret = self._target(*self._args, **self._kwargs) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/tune/trainable/function_tra | |
inable.py", line 53, in <lambda> | |
training_func=lambda: self._trainable_func(self.config), | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/util/tracing/tracing_helper | |
.py", line 467, in _resume_span | |
return method(self, *_args, **_kwargs) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/base_trainer.py", | |
line 793, in _trainable_func | |
super()._trainable_func(self._merged_config) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/tune/trainable/function_tra | |
inable.py", line 261, in _trainable_func | |
output = fn() | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/base_trainer.py", | |
line 107, in _train_coordinator_fn | |
trainer.training_loop() | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/data_parallel_trainer | |
.py", line 461, in training_loop | |
self._run_training(training_iterator) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/data_parallel_trainer | |
.py", line 362, in _run_training | |
for training_results in training_iterator: | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/trainer.py", | |
line 123, in __next__ | |
next_results = self._run_with_error_handling(self._fetch_next_result) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/trainer.py", | |
line 89, in _run_with_error_handling | |
return func() | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/trainer.py", | |
line 153, in _fetch_next_result | |
results = self._backend_executor.get_next_results() | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/backend_exe | |
cutor.py", line 569, in get_next_results | |
results = self.get_with_failure_handling(futures) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/backend_exe | |
cutor.py", line 650, in get_with_failure_handling | |
self._increment_failures() | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/backend_exe | |
cutor.py", line 712, in _increment_failures | |
raise failure | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/utils.py", | |
line 53, in check_for_failure | |
ray.get(object_ref) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/auto_init_hook.py" | |
, line 21, in auto_init_wrapper | |
return fn(*args, **kwargs) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/client_mode_hook.p | |
y", line 103, in wrapper | |
return func(*args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/worker.py", | |
line 2667, in get | |
values, debugger_breakpoint = worker.get_objects(object_refs, | |
timeout=timeout) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/worker.py", | |
line 866, in get_objects | |
raise value | |
ray.exceptions.RayActorError: The actor died unexpectedly before finishing this | |
task. | |
class_name: RayTrainWorker | |
actor_id: 9e6790a209b7c509e64301f305000000 | |
pid: 35979 | |
namespace: f205d617-4ee1-4fae-a76a-c3f2382b7527 | |
ip: 172.24.101.245 | |
The actor is dead because its worker process has died. Worker exit type: | |
SYSTEM_ERROR Worker exit detail: Worker exits unexpectedly. Worker exits with an | |
exit code None. Traceback (most recent call last): | |
File "python/ray/_raylet.pyx", line 1883, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1984, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1889, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1830, in | |
ray._raylet.execute_task.function_executor | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/function_manager.p | |
y", line 724, in actor_method_executor | |
return method(__ray_actor, *args, **kwargs) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/util/tracing/tracing_helper | |
.py", line 467, in _resume_span | |
return method(self, *_args, **_kwargs) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/worker_grou | |
p.py", line 33, in __execute | |
raise skipped from exception_cause(skipped) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/utils.py", | |
line 169, in discard_return_wrapper | |
train_func(*args, **kwargs) | |
File "/home/ray/default/train.py", line 186, in train_func | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/train | |
er.py", line 545, in fit | |
call._call_and_handle_interrupt( | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call. | |
py", line 43, in _call_and_handle_interrupt | |
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, | |
**kwargs) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/la | |
unchers/subprocess_script.py", line 102, in launch | |
return function(*args, **kwargs) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/train | |
er.py", line 581, in _fit_impl | |
self._run(model, ckpt_path=ckpt_path) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/train | |
er.py", line 990, in _run | |
results = self._run_stage() | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/train | |
er.py", line 1036, in _run_stage | |
self.fit_loop.run() | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/fit_loo | |
p.py", line 202, in run | |
self.advance() | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/fit_loo | |
p.py", line 359, in advance | |
self.epoch_loop.run(self._data_fetcher) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/trainin | |
g_epoch_loop.py", line 136, in run | |
self.advance(data_fetcher) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/trainin | |
g_epoch_loop.py", line 240, in advance | |
batch_output = self.automatic_optimization.run(trainer.optimizers[0], | |
batch_idx, kwargs) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimiz | |
ation/automatic.py", line 187, in run | |
self._optimizer_step(batch_idx, closure) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimiz | |
ation/automatic.py", line 265, in _optimizer_step | |
call._call_lightning_module_hook( | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call. | |
py", line 157, in _call_lightning_module_hook | |
output = fn(*args, **kwargs) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/module.p | |
y", line 1282, in optimizer_step | |
optimizer.step(closure=optimizer_closure) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/optimize | |
r.py", line 151, in step | |
step_output = self._strategy.optimizer_step(self._optimizer, closure, | |
**kwargs) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/dd | |
p.py", line 263, in optimizer_step | |
optimizer_output = super().optimizer_step(optimizer, closure, model, | |
**kwargs) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/st | |
rategy.py", line 230, in optimizer_step | |
return self.precision_plugin.optimizer_step(optimizer, model=model, | |
closure=closure, **kwargs) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/preci | |
sion/amp.py", line 74, in optimizer_step | |
return super().optimizer_step(optimizer, model=model, closure=closure, | |
**kwargs) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/preci | |
sion/precision_plugin.py", line 117, in optimizer_step | |
return optimizer.step(closure=closure, **kwargs) | |
File | |
"/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/lr_scheduler. | |
py", line 75, in wrapper | |
return wrapped(*args, **kwargs) | |
File | |
"/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/optimizer.py" | |
, line 385, in wrapper | |
out = func(*args, **kwargs) | |
File | |
"/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/optimizer.py" | |
, line 76, in _use_grad | |
ret = func(self, *args, **kwargs) | |
File | |
"/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/adamw.py", | |
line 164, in step | |
loss = closure() | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/preci | |
sion/precision_plugin.py", line 104, in _wrap_closure | |
closure_result = closure() | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimiz | |
ation/automatic.py", line 140, in __call__ | |
self._result = self.closure(*args, **kwargs) | |
File | |
"/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/utils/_contextlib.p | |
y", line 115, in decorate_context | |
return func(*args, **kwargs) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimiz | |
ation/automatic.py", line 135, in closure | |
self._backward_fn(step_output.closure_loss) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimiz | |
ation/automatic.py", line 236, in backward_fn | |
call._call_strategy_hook(self.trainer, "backward", loss, optimizer) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call. | |
py", line 309, in _call_strategy_hook | |
output = fn(*args, **kwargs) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/st | |
rategy.py", line 204, in backward | |
self.precision_plugin.backward(closure_loss, self.lightning_module, | |
optimizer, *args, **kwargs) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/preci | |
sion/precision_plugin.py", line 69, in backward | |
model.backward(tensor, *args, **kwargs) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/module.p | |
y", line 1069, in backward | |
loss.backward(*args, **kwargs) | |
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/_tensor.py", | |
line 522, in backward | |
torch.autograd.backward( | |
File | |
"/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/autograd/__init__.p | |
y", line 266, in backward | |
Variable._execution_engine.run_backward( # Calls into the C++ engine to run | |
the backward pass | |
RuntimeError: GET was unable to find an engine to execute this computation | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py" | |
, line 1245, in dump | |
return super().dump(obj) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/tblib/pickling_support.py", | |
line 46, in pickle_exception | |
rv = obj.__reduce_ex__(3) | |
RecursionError: maximum recursion depth exceeded while calling a Python object | |
The above exception was the direct cause of the following exception: | |
Traceback (most recent call last): | |
File "python/ray/_raylet.pyx", line 2281, in | |
ray._raylet.task_execution_handler | |
File "python/ray/_raylet.pyx", line 2177, in | |
ray._raylet.execute_task_with_cancellation_handler | |
File "python/ray/_raylet.pyx", line 1832, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1833, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 2071, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1089, in ray._raylet.store_task_errors | |
File "python/ray/_raylet.pyx", line 4575, in | |
ray._raylet.CoreWorker.store_task_outputs | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/serialization.py", | |
line 494, in serialize | |
return self._serialize_to_msgpack(value) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/serialization.py", | |
line 449, in _serialize_to_msgpack | |
value = value.to_bytes() | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/exceptions.py", line | |
32, in to_bytes | |
serialized_exception=pickle.dumps(self), | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py" | |
, line 1479, in dumps | |
cp.dump(obj) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py" | |
, line 1249, in dump | |
raise pickle.PicklingError(msg) from e | |
_pickle.PicklingError: Could not pickle object as excessively deep recursion | |
required. | |
An unexpected internal error occurred while the worker was executing a task. | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py" | |
, line 1245, in dump | |
return super().dump(obj) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/tblib/pickling_support.py", | |
line 46, in pickle_exception | |
rv = obj.__reduce_ex__(3) | |
RecursionError: maximum recursion depth exceeded while calling a Python object | |
The above exception was the direct cause of the following exception: | |
Traceback (most recent call last): | |
File "python/ray/_raylet.pyx", line 2281, in | |
ray._raylet.task_execution_handler | |
File "python/ray/_raylet.pyx", line 2177, in | |
ray._raylet.execute_task_with_cancellation_handler | |
File "python/ray/_raylet.pyx", line 1832, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1833, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 2071, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1089, in ray._raylet.store_task_errors | |
File "python/ray/_raylet.pyx", line 4575, in | |
ray._raylet.CoreWorker.store_task_outputs | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/serialization.py", | |
line 494, in serialize | |
return self._serialize_to_msgpack(value) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/serialization.py", | |
line 449, in _serialize_to_msgpack | |
value = value.to_bytes() | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/exceptions.py", line | |
32, in to_bytes | |
serialized_exception=pickle.dumps(self), | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py" | |
, line 1479, in dumps | |
cp.dump(obj) | |
File | |
"/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py" | |
, line 1249, in dump | |
raise pickle.PicklingError(msg) from e | |
_pickle.PicklingError: Could not pickle object as excessively deep recursion | |
required. | |
An unexpected internal error occurred while the worker was executing a task. | |
The above exception was the direct cause of the following exception: | |
╭───────────────────── Traceback (most recent call last) ──────────────────────╮ | |
│ /home/ray/default/train.py:261 in <module> │ | |
│ │ | |
│ 258 │ │ │ datasets=ray_datasets, │ | |
│ 259 │ │ │ resume_from_checkpoint=checkpoint, │ | |
│ 260 │ │ ) │ | |
│ ❱ 261 │ trainer.fit() │ | |
│ 262 │ | |
│ │ | |
│ /home/ray/anaconda3/lib/python3.9/site-packages/ray/train/base_trainer.py:63 │ | |
│ 8 in fit │ | |
│ │ | |
│ 635 │ │ if result.error: │ | |
│ 636 │ │ │ # Raise trainable errors to the user with a message to res │ | |
│ 637 │ │ │ # or configure `FailureConfig` in a new run. │ | |
│ ❱ 638 │ │ │ raise TrainingFailedError( │ | |
│ 639 │ │ │ │ "\n".join([restore_msg, TrainingFailedError._FAILURE_C │ | |
│ 640 │ │ │ ) from result.error │ | |
│ 641 │ │ return result │ | |
╰──────────────────────────────────────────────────────────────────────────────╯ | |
TrainingFailedError: The Ray Train run failed. Please inspect the previous error | |
messages for a cause. After fixing the issue (assuming that the error is not | |
caused by your own application logic, but rather an error such as OOM), you can | |
restart the run from scratch or continue this run. | |
To continue this run, you can use: `trainer = | |
TorchTrainer.restore("private-vpc-20231023180430879200000005/org_7c1Kalm9WcX2bNI | |
jW53GUT/cld_l732epdw3rcfjxnisdgint8xau/artifact_storage/yunxuan__xiao/sd-stage_1 | |
-256")`. | |
To start a new run that will retry on training failures, set | |
`train.RunConfig(failure_config=train.FailureConfig(max_failures))` in the | |
Trainer's `run_config` with `max_failures > 0`, or `max_failures = -1` for | |
unlimited retries. | |
(ReadParquet->SplitBlocks(2) pid=40525, ip=172.24.101.245) Traceback (most recent call last): | |
(ReadParquet->SplitBlocks(2) pid=40525, ip=172.24.101.245) File "pyarrow/public-api.pxi", line 128, in pyarrow.lib.pyarrow_wrap_data_type | |
(ReadParquet->SplitBlocks(2) pid=40525, ip=172.24.101.245) File "pyarrow/types.pxi", line 393, in pyarrow.lib.ListType.init | |
(ReadParquet->SplitBlocks(2) pid=40525, ip=172.24.101.245) File "pyarrow/types.pxi", line 150, in pyarrow.lib.DataType.init | |
(ReadParquet->SplitBlocks(2) pid=40525, ip=172.24.101.245) File "pyarrow/types.pxi", line 76, in pyarrow.lib._datatype_to_pep3118 | |
(ReadParquet->SplitBlocks(2) pid=40525, ip=172.24.101.245) File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/air/util/tensor_extensions/arrow.py", line 116, in __arrow_ext_deserialize__ | |
(ReadParquet->SplitBlocks(2) pid=40525, ip=172.24.101.245) @classmethod | |
(ReadParquet->SplitBlocks(2) pid=40525, ip=172.24.101.245) File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/worker.py", line 873, in sigterm_handler | |
(ReadParquet->SplitBlocks(2) pid=40525, ip=172.24.101.245) raise_sys_exit_with_custom_error_message( | |
(ReadParquet->SplitBlocks(2) pid=40525, ip=172.24.101.245) File "python/ray/_raylet.pyx", line 846, in ray._raylet.raise_sys_exit_with_custom_error_message | |
(ReadParquet->SplitBlocks(2) pid=40525, ip=172.24.101.245) SystemExit: 1 | |
(RayTrainWorker pid=35975, ip=172.24.101.245) Could not load library libcudnn_cnn_train.so.8. Error: /usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8: undefined symbol: _ZN5cudnn14cublasSaxpy_v2EP13cublasContextiPKfS3_iPfi, version libcudnn_ops_infer.so.8 [repeated 216x across cluster] | |
(raylet) Traceback (most recent call last): | |
File "python/ray/_raylet.pyx", line 1883, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1984, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1889, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1830, in ray._raylet.execute_task.function_executor | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/function_manager.py", line 724, in actor_method_executor | |
return method(__ray_actor, *args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/util/tracing/tracing_helper.py", line 467, in _resume_span | |
return method(self, *_args, **_kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/worker_group.py", line 33, in __execute | |
raise skipped from exception_cause(skipped) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/utils.py", line 169, in discard_return_wrapper | |
train_func(*args, **kwargs) | |
File "/home/ray/default/train.py", line 186, in train_func | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 545, in fit | |
call._call_and_handle_interrupt( | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 43, in _call_and_handle_interrupt | |
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 102, in launch | |
return function(*args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 581, in _fit_impl | |
self._run(model, ckpt_path=ckpt_path) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 990, in _run | |
results = self._run_stage() | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1036, in _run_stage | |
self.fit_loop.run() | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/fit_loop.py", line 202, in run | |
self.advance() | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/fit_loop.py", line 359, in advance | |
self.epoch_loop.run(self._data_fetcher) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 136, in run | |
self.advance(data_fetcher) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 240, in advance | |
batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 187, in run | |
self._optimizer_step(batch_idx, closure) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 265, in _optimizer_step | |
call._call_lightning_module_hook( | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 157, in _call_lightning_module_hook | |
output = fn(*args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/module.py", line 1282, in optimizer_step | |
optimizer.step(closure=optimizer_closure) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/optimizer.py", line 151, in step | |
step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/ddp.py", line 263, in optimizer_step | |
optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/strategy.py", line 230, in optimizer_step | |
return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/amp.py", line 74, in optimizer_step | |
return super().optimizer_step(optimizer, model=model, closure=closure, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/precision_plugin.py", line 117, in optimizer_step | |
return optimizer.step(closure=closure, **kwargs) | |
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/lr_scheduler.py", line 75, in wrapper | |
return wrapped(*args, **kwargs) | |
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/optimizer.py", line 385, in wrapper | |
out = func(*args, **kwargs) | |
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/optimizer.py", line 76, in _use_grad | |
ret = func(self, *args, **kwargs) | |
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/adamw.py", line 164, in step | |
loss = closure() | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/precision_plugin.py", line 104, in _wrap_closure | |
closure_result = closure() | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 140, in __call__ | |
self._result = self.closure(*args, **kwargs) | |
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context | |
return func(*args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 135, in closure | |
self._backward_fn(step_output.closure_loss) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 236, in backward_fn | |
call._call_strategy_hook(self.trainer, "backward", loss, optimizer) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 309, in _call_strategy_hook | |
output = fn(*args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/strategy.py", line 204, in backward | |
self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, *args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/precision_plugin.py", line 69, in backward | |
model.backward(tensor, *args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/module.py", line 1069, in backward | |
loss.backward(*args, **kwargs) | |
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/_tensor.py", line 522, in backward | |
torch.autograd.backward( | |
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/autograd/__init__.py", line 266, in backward | |
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass | |
RuntimeError: GET was unable to find an engine to execute this computation | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1245, in dump | |
return super().dump(obj) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/tblib/pickling_support.py", line 46, in pickle_exception | |
rv = obj.__reduce_ex__(3) | |
RecursionError: maximum recursion depth exceeded while calling a Python object | |
The above exception was the direct cause of the following exception: | |
Traceback (most recent call last): | |
File "python/ray/_raylet.pyx", line 2281, in ray._raylet.task_execution_handler | |
File "python/ray/_raylet.pyx", line 2177, in ray._raylet.execute_task_with_cancellation_handler | |
File "python/ray/_raylet.pyx", line 1832, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1833, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 2071, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1089, in ray._raylet.store_task_errors | |
File "python/ray/_raylet.pyx", line 4575, in ray._raylet.CoreWorker.store_task_outputs | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/serialization.py", line 494, in serialize | |
return self._serialize_to_msgpack(value) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/serialization.py", line 449, in _serialize_to_msgpack | |
value = value.to_bytes() | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/exceptions.py", line 32, in to_bytes | |
serialized_exception=pickle.dumps(self), | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1479, in dumps | |
cp.dump(obj) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1249, in dump | |
raise pickle.PicklingError(msg) from e | |
_pickle.PicklingError: Could not pickle object as excessively deep recursion required. | |
An unexpected internal error occurred while the worker was executing a task. [repeated 7x across cluster] | |
(raylet) A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffff24ef2ff893ac05805b25149405000000 Worker ID: 53ae90b654dedcc8bb460db711b1265c48f963c670107d95fe5e3799 Node ID: c356b61e0a4062b8d8e57c9a2753e26ae06d607ded743a410865c5ef Worker IP address: 172.24.101.245 Worker port: 10171 Worker PID: 35975 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker exits unexpectedly. Worker exits with an exit code None. Traceback (most recent call last): | |
File "python/ray/_raylet.pyx", line 1883, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1984, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1889, in ray._raylet.execute_task | |
File "python/ray/_raylet.pyx", line 1830, in ray._raylet.execute_task.function_executor | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/function_manager.py", line 724, in actor_method_executor | |
return method(__ray_actor, *args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/util/tracing/tracing_helper.py", line 467, in _resume_span | |
return method(self, *_args, **_kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/worker_group.py", line 33, in __execute | |
raise skipped from exception_cause(skipped) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/utils.py", line 169, in discard_return_wrapper | |
train_func(*args, **kwargs) | |
File "/home/ray/default/train.py", line 186, in train_func | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 545, in fit | |
call._call_and_handle_interrupt( | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 43, in _call_and_handle_interrupt | |
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 102, in launch | |
return function(*args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 581, in _fit_impl | |
self._run(model, ckpt_path=ckpt_path) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 990, in _run | |
results = self._run_stage() | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1036, in _run_stage | |
self.fit_loop.run() | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/fit_loop.py", line 202, in run | |
self.advance() | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/fit_loop.py", line 359, in advance | |
self.epoch_loop.run(self._data_fetcher) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 136, in run | |
self.advance(data_fetcher) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 240, in advance | |
batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 187, in run | |
self._optimizer_step(batch_idx, closure) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 265, in _optimizer_step | |
call._call_lightning_module_hook( | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 157, in _call_lightning_module_hook | |
output = fn(*args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/module.py", line 1282, in optimizer_step | |
optimizer.step(closure=optimizer_closure) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/optimizer.py", line 151, in step | |
step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/ddp.py", line 263, in optimizer_step | |
optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/strategy.py", line 230, in optimizer_step | |
return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/amp.py", line 74, in optimizer_step | |
return super().optimizer_step(optimizer, model=model, closure=closure, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/precision_plugin.py", line 117, in optimizer_step | |
return optimizer.step(closure=closure, **kwargs) | |
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/lr_scheduler.py", line 75, in wrapper | |
return wrapped(*args, **kwargs) | |
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/optimizer.py", line 385, in wrapper | |
out = func(*args, **kwargs) | |
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/optimizer.py", line 76, in _use_grad | |
ret = func(self, *args, **kwargs) | |
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/adamw.py", line 164, in step | |
loss = closure() | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/precision_plugin.py", line 104, in _wrap_closure | |
closure_result = closure() | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 140, in __call__ | |
self._result = self.closure(*args, **kwargs) | |
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context | |
return func(*args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 135, in closure | |
self._backward_fn(step_output.closure_loss) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 236, in backward_fn | |
call._call_strategy_hook(self.trainer, "backward", loss, optimizer) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 309, in _call_strategy_hook | |
output = fn(*args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/strategy.py", line 204, in backward | |
self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, *args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/precision_plugin.py", line 69, in backward | |
model.backward(tensor, *args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/module.py", line 1069, in backward | |
loss.backward(*args, **kwargs) | |
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/_tensor.py", line 522, in backward | |
torch.autograd.backward( | |
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/autograd/__init__.py", line 266, in backward | |
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass | |
RuntimeError: GET was unable to find an engine to execute this computation |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment