@woshiyyya
Created April 18, 2024 01:34

error-efa.txt
ray.exceptions.RayActorError: The actor died unexpectedly before finishing this task.
class_name: RayTrainWorker
actor_id: 9e6790a209b7c509e64301f305000000
pid: 35979
namespace: f205d617-4ee1-4fae-a76a-c3f2382b7527
ip: 172.24.101.245
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker exits unexpectedly. Worker exits with an exit code None. Traceback (most recent call last):
File "python/ray/_raylet.pyx", line 1883, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1984, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1889, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1830, in ray._raylet.execute_task.function_executor
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/function_manager.py", line 724, in actor_method_executor
return method(__ray_actor, *args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/util/tracing/tracing_helper.py", line 467, in _resume_span
return method(self, *_args, **_kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/worker_group.py", line 33, in __execute
raise skipped from exception_cause(skipped)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/utils.py", line 169, in discard_return_wrapper
train_func(*args, **kwargs)
File "/home/ray/default/train.py", line 186, in train_func
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 545, in fit
call._call_and_handle_interrupt(
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 43, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 102, in launch
return function(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 581, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 990, in _run
results = self._run_stage()
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1036, in _run_stage
self.fit_loop.run()
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/fit_loop.py", line 202, in run
self.advance()
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/fit_loop.py", line 359, in advance
self.epoch_loop.run(self._data_fetcher)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 136, in run
self.advance(data_fetcher)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 240, in advance
batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 187, in run
self._optimizer_step(batch_idx, closure)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 265, in _optimizer_step
call._call_lightning_module_hook(
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 157, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/module.py", line 1282, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/optimizer.py", line 151, in step
step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/ddp.py", line 263, in optimizer_step
optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/strategy.py", line 230, in optimizer_step
return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/amp.py", line 74, in optimizer_step
return super().optimizer_step(optimizer, model=model, closure=closure, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/precision_plugin.py", line 117, in optimizer_step
return optimizer.step(closure=closure, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/lr_scheduler.py", line 75, in wrapper
return wrapped(*args, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/optimizer.py", line 385, in wrapper
out = func(*args, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/optimizer.py", line 76, in _use_grad
ret = func(self, *args, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/adamw.py", line 164, in step
loss = closure()
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/precision_plugin.py", line 104, in _wrap_closure
closure_result = closure()
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 140, in __call__
self._result = self.closure(*args, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 135, in closure
self._backward_fn(step_output.closure_loss)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 236, in backward_fn
call._call_strategy_hook(self.trainer, "backward", loss, optimizer)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 309, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/strategy.py", line 204, in backward
self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, *args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/precision_plugin.py", line 69, in backward
model.backward(tensor, *args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/module.py", line 1069, in backward
loss.backward(*args, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/_tensor.py", line 522, in backward
torch.autograd.backward(
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/autograd/__init__.py", line 266, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: GET was unable to find an engine to execute this computation
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1245, in dump
return super().dump(obj)
File "/home/ray/anaconda3/lib/python3.9/site-packages/tblib/pickling_support.py", line 46, in pickle_exception
rv = obj.__reduce_ex__(3)
RecursionError: maximum recursion depth exceeded while calling a Python object
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "python/ray/_raylet.pyx", line 2281, in ray._raylet.task_execution_handler
File "python/ray/_raylet.pyx", line 2177, in ray._raylet.execute_task_with_cancellation_handler
File "python/ray/_raylet.pyx", line 1832, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1833, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 2071, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1089, in ray._raylet.store_task_errors
File "python/ray/_raylet.pyx", line 4575, in ray._raylet.CoreWorker.store_task_outputs
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/serialization.py", line 494, in serialize
return self._serialize_to_msgpack(value)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/serialization.py", line 449, in _serialize_to_msgpack
value = value.to_bytes()
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/exceptions.py", line 32, in to_bytes
serialized_exception=pickle.dumps(self),
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1479, in dumps
cp.dump(obj)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1249, in dump
raise pickle.PicklingError(msg) from e
_pickle.PicklingError: Could not pickle object as excessively deep recursion required.
An unexpected internal error occurred while the worker was executing a task.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1245, in dump
return super().dump(obj)
File "/home/ray/anaconda3/lib/python3.9/site-packages/tblib/pickling_support.py", line 46, in pickle_exception
rv = obj.__reduce_ex__(3)
RecursionError: maximum recursion depth exceeded while calling a Python object
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "python/ray/_raylet.pyx", line 2281, in ray._raylet.task_execution_handler
File "python/ray/_raylet.pyx", line 2177, in ray._raylet.execute_task_with_cancellation_handler
File "python/ray/_raylet.pyx", line 1832, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1833, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 2071, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1089, in ray._raylet.store_task_errors
File "python/ray/_raylet.pyx", line 4575, in ray._raylet.CoreWorker.store_task_outputs
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/serialization.py", line 494, in serialize
return self._serialize_to_msgpack(value)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/serialization.py", line 449, in _serialize_to_msgpack
value = value.to_bytes()
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/exceptions.py", line 32, in to_bytes
serialized_exception=pickle.dumps(self),
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1479, in dumps
cp.dump(obj)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1249, in dump
raise pickle.PicklingError(msg) from e
_pickle.PicklingError: Could not pickle object as excessively deep recursion required.
An unexpected internal error occurred while the worker was executing a task.
Training errored after 0 iterations at 2024-04-17 18:24:17. Total running time: 1min 28s
Error file: /tmp/ray/session_2024-04-17_17-56-58_302463_3956/artifacts/2024-04-17_18-22-49/sd-stage_1-256/driver_artifacts/TorchTrainer_2b908_00000_0_2024-04-17_18-22-49/error.txt
(pid=37114, ip=172.24.101.245) Running: 33/383.0 CPU, 0/24.0 GPU, 6.0GB/372.5GB
2024-04-17 18:24:19,432 INFO tune.py:1016 -- Wrote the latest version of all result files and experiment state to 'private-vpc-20231023180430879200000005/org_7c1Kalm9WcX2bNIjW53GUT/cld_l732epdw3rcfjxnisdgint8xau/artifact_storage/yunxuan__xiao/sd-stage_1-256' in 1.4940s.
2024-04-17 18:24:19,437 ERROR tune.py:1044 -- Trials did not complete: [TorchTrainer_2b908_00000]
RayActorError: The actor died unexpectedly before finishing this task.
class_name: with_parameters.<locals>._Inner
actor_id: 25c149ef281dc60bf21b13b105000000
pid: 35755
namespace: f205d617-4ee1-4fae-a76a-c3f2382b7527
ip: 172.24.101.245
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker exits unexpectedly. Worker exits with an exit code None. Traceback (most recent call last):
File "python/ray/_raylet.pyx", line 1883, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1984, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1889, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1830, in ray._raylet.execute_task.function_executor
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/function_manager.py", line 724, in actor_method_executor
return method(__ray_actor, *args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/util/tracing/tracing_helper.py", line 467, in _resume_span
return method(self, *_args, **_kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/tune/trainable/trainable.py", line 334, in train
raise skipped from exception_cause(skipped)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/air/_internal/util.py", line 88, in run
self._ret = self._target(*self._args, **self._kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/tune/trainable/function_trainable.py", line 53, in <lambda>
training_func=lambda: self._trainable_func(self.config),
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/util/tracing/tracing_helper.py", line 467, in _resume_span
return method(self, *_args, **_kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/base_trainer.py", line 793, in _trainable_func
super()._trainable_func(self._merged_config)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/tune/trainable/function_trainable.py", line 261, in _trainable_func
output = fn()
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/base_trainer.py", line 107, in _train_coordinator_fn
trainer.training_loop()
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/data_parallel_trainer.py", line 461, in training_loop
self._run_training(training_iterator)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/data_parallel_trainer.py", line 362, in _run_training
for training_results in training_iterator:
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/trainer.py", line 123, in __next__
next_results = self._run_with_error_handling(self._fetch_next_result)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/trainer.py", line 89, in _run_with_error_handling
return func()
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/trainer.py", line 153, in _fetch_next_result
results = self._backend_executor.get_next_results()
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/backend_executor.py", line 569, in get_next_results
results = self.get_with_failure_handling(futures)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/backend_executor.py", line 650, in get_with_failure_handling
self._increment_failures()
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/backend_executor.py", line 712, in _increment_failures
raise failure
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/utils.py", line 53, in check_for_failure
ray.get(object_ref)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
return fn(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
return func(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/worker.py", line 2667, in get
values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/worker.py", line 866, in get_objects
raise value
ray.exceptions.RayActorError: The actor died unexpectedly before finishing this task.
class_name: RayTrainWorker
actor_id: 9e6790a209b7c509e64301f305000000
pid: 35979
namespace: f205d617-4ee1-4fae-a76a-c3f2382b7527
ip: 172.24.101.245
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker exits unexpectedly. Worker exits with an exit code None. Traceback (most recent call last):
File "python/ray/_raylet.pyx", line 1883, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1984, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1889, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1830, in ray._raylet.execute_task.function_executor
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/function_manager.py", line 724, in actor_method_executor
return method(__ray_actor, *args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/util/tracing/tracing_helper.py", line 467, in _resume_span
return method(self, *_args, **_kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/worker_group.py", line 33, in __execute
raise skipped from exception_cause(skipped)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/utils.py", line 169, in discard_return_wrapper
train_func(*args, **kwargs)
File "/home/ray/default/train.py", line 186, in train_func
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 545, in fit
call._call_and_handle_interrupt(
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 43, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 102, in launch
return function(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 581, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 990, in _run
results = self._run_stage()
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1036, in _run_stage
self.fit_loop.run()
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/fit_loop.py", line 202, in run
self.advance()
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/fit_loop.py", line 359, in advance
self.epoch_loop.run(self._data_fetcher)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 136, in run
self.advance(data_fetcher)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 240, in advance
batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 187, in run
self._optimizer_step(batch_idx, closure)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 265, in _optimizer_step
call._call_lightning_module_hook(
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 157, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/module.py", line 1282, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/optimizer.py", line 151, in step
step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/ddp.py", line 263, in optimizer_step
optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/strategy.py", line 230, in optimizer_step
return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/amp.py", line 74, in optimizer_step
return super().optimizer_step(optimizer, model=model, closure=closure, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/precision_plugin.py", line 117, in optimizer_step
return optimizer.step(closure=closure, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/lr_scheduler.py", line 75, in wrapper
return wrapped(*args, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/optimizer.py", line 385, in wrapper
out = func(*args, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/optimizer.py", line 76, in _use_grad
ret = func(self, *args, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/adamw.py", line 164, in step
loss = closure()
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/precision_plugin.py", line 104, in _wrap_closure
closure_result = closure()
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 140, in __call__
self._result = self.closure(*args, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 135, in closure
self._backward_fn(step_output.closure_loss)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 236, in backward_fn
call._call_strategy_hook(self.trainer, "backward", loss, optimizer)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 309, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/strategy.py", line 204, in backward
self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, *args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/precision_plugin.py", line 69, in backward
model.backward(tensor, *args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/module.py", line 1069, in backward
loss.backward(*args, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/_tensor.py", line 522, in backward
torch.autograd.backward(
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/autograd/__init__.py", line 266, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: GET was unable to find an engine to execute this computation
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1245, in dump
return super().dump(obj)
File "/home/ray/anaconda3/lib/python3.9/site-packages/tblib/pickling_support.py", line 46, in pickle_exception
rv = obj.__reduce_ex__(3)
RecursionError: maximum recursion depth exceeded while calling a Python object
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "python/ray/_raylet.pyx", line 2281, in ray._raylet.task_execution_handler
File "python/ray/_raylet.pyx", line 2177, in ray._raylet.execute_task_with_cancellation_handler
File "python/ray/_raylet.pyx", line 1832, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1833, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 2071, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1089, in ray._raylet.store_task_errors
File "python/ray/_raylet.pyx", line 4575, in ray._raylet.CoreWorker.store_task_outputs
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/serialization.py", line 494, in serialize
return self._serialize_to_msgpack(value)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/serialization.py", line 449, in _serialize_to_msgpack
value = value.to_bytes()
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/exceptions.py", line 32, in to_bytes
serialized_exception=pickle.dumps(self),
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1479, in dumps
cp.dump(obj)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1249, in dump
raise pickle.PicklingError(msg) from e
_pickle.PicklingError: Could not pickle object as excessively deep recursion required.
An unexpected internal error occurred while the worker was executing a task.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1245, in dump
return super().dump(obj)
File "/home/ray/anaconda3/lib/python3.9/site-packages/tblib/pickling_support.py", line 46, in pickle_exception
rv = obj.__reduce_ex__(3)
RecursionError: maximum recursion depth exceeded while calling a Python object
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "python/ray/_raylet.pyx", line 2281, in ray._raylet.task_execution_handler
File "python/ray/_raylet.pyx", line 2177, in ray._raylet.execute_task_with_cancellation_handler
File "python/ray/_raylet.pyx", line 1832, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1833, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 2071, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1089, in ray._raylet.store_task_errors
File "python/ray/_raylet.pyx", line 4575, in ray._raylet.CoreWorker.store_task_outputs
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/serialization.py", line 494, in serialize
return self._serialize_to_msgpack(value)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/serialization.py", line 449, in _serialize_to_msgpack
value = value.to_bytes()
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/exceptions.py", line 32, in to_bytes
serialized_exception=pickle.dumps(self),
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1479, in dumps
cp.dump(obj)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1249, in dump
raise pickle.PicklingError(msg) from e
_pickle.PicklingError: Could not pickle object as excessively deep recursion required.
An unexpected internal error occurred while the worker was executing a task.
The above exception was the direct cause of the following exception:
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /home/ray/default/train.py:261 in <module> │
│ │
│ 258 │ │ │ datasets=ray_datasets, │
│ 259 │ │ │ resume_from_checkpoint=checkpoint, │
│ 260 │ │ ) │
│ ❱ 261 │ trainer.fit() │
│ 262 │
│ │
│ /home/ray/anaconda3/lib/python3.9/site-packages/ray/train/base_trainer.py:63 │
│ 8 in fit │
│ │
│ 635 │ │ if result.error: │
│ 636 │ │ │ # Raise trainable errors to the user with a message to res │
│ 637 │ │ │ # or configure `FailureConfig` in a new run. │
│ ❱ 638 │ │ │ raise TrainingFailedError( │
│ 639 │ │ │ │ "\n".join([restore_msg, TrainingFailedError._FAILURE_C │
│ 640 │ │ │ ) from result.error │
│ 641 │ │ return result │
╰──────────────────────────────────────────────────────────────────────────────╯
TrainingFailedError: The Ray Train run failed. Please inspect the previous error messages for a cause. After fixing the issue (assuming that the error is not caused by your own application logic, but rather an error such as OOM), you can restart the run from scratch or continue this run.
To continue this run, you can use: `trainer = TorchTrainer.restore("private-vpc-20231023180430879200000005/org_7c1Kalm9WcX2bNIjW53GUT/cld_l732epdw3rcfjxnisdgint8xau/artifact_storage/yunxuan__xiao/sd-stage_1-256")`.
To start a new run that will retry on training failures, set `train.RunConfig(failure_config=train.FailureConfig(max_failures))` in the Trainer's `run_config` with `max_failures > 0`, or `max_failures = -1` for unlimited retries.
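For reference, a minimal sketch (not part of this gist) of the two recovery paths the message above describes, assuming Ray 2.x; the stub train_func, the worker count, the run name, and max_failures=3 are placeholders, not the settings used in this run:

from ray.train import FailureConfig, RunConfig, ScalingConfig
from ray.train.torch import TorchTrainer

def train_func(config=None):
    # Placeholder for the Lightning training loop defined in /home/ray/default/train.py.
    pass

# Option 1: start a new run that retries automatically on worker failures.
trainer = TorchTrainer(
    train_func,
    scaling_config=ScalingConfig(num_workers=8, use_gpu=True),  # assumed sizing
    run_config=RunConfig(
        name="sd-stage_1-256",
        failure_config=FailureConfig(max_failures=3),  # or -1 for unlimited retries
    ),
)
result = trainer.fit()

# Option 2: resume the failed run from its experiment directory, as suggested above.
restored = TorchTrainer.restore(
    "private-vpc-20231023180430879200000005/org_7c1Kalm9WcX2bNIjW53GUT/"
    "cld_l732epdw3rcfjxnisdgint8xau/artifact_storage/yunxuan__xiao/sd-stage_1-256"
)
result = restored.fit()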
(ReadParquet->SplitBlocks(2) pid=40525, ip=172.24.101.245) Traceback (most recent call last):
(ReadParquet->SplitBlocks(2) pid=40525, ip=172.24.101.245) File "pyarrow/public-api.pxi", line 128, in pyarrow.lib.pyarrow_wrap_data_type
(ReadParquet->SplitBlocks(2) pid=40525, ip=172.24.101.245) File "pyarrow/types.pxi", line 393, in pyarrow.lib.ListType.init
(ReadParquet->SplitBlocks(2) pid=40525, ip=172.24.101.245) File "pyarrow/types.pxi", line 150, in pyarrow.lib.DataType.init
(ReadParquet->SplitBlocks(2) pid=40525, ip=172.24.101.245) File "pyarrow/types.pxi", line 76, in pyarrow.lib._datatype_to_pep3118
(ReadParquet->SplitBlocks(2) pid=40525, ip=172.24.101.245) File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/air/util/tensor_extensions/arrow.py", line 116, in __arrow_ext_deserialize__
(ReadParquet->SplitBlocks(2) pid=40525, ip=172.24.101.245) @classmethod
(ReadParquet->SplitBlocks(2) pid=40525, ip=172.24.101.245) File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/worker.py", line 873, in sigterm_handler
(ReadParquet->SplitBlocks(2) pid=40525, ip=172.24.101.245) raise_sys_exit_with_custom_error_message(
(ReadParquet->SplitBlocks(2) pid=40525, ip=172.24.101.245) File "python/ray/_raylet.pyx", line 846, in ray._raylet.raise_sys_exit_with_custom_error_message
(ReadParquet->SplitBlocks(2) pid=40525, ip=172.24.101.245) SystemExit: 1
(RayTrainWorker pid=35975, ip=172.24.101.245) Could not load library libcudnn_cnn_train.so.8. Error: /usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8: undefined symbol: _ZN5cudnn14cublasSaxpy_v2EP13cublasContextiPKfS3_iPfi, version libcudnn_ops_infer.so.8 [repeated 216x across cluster]
(raylet) Traceback (most recent call last):
File "python/ray/_raylet.pyx", line 1883, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1984, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1889, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1830, in ray._raylet.execute_task.function_executor
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/function_manager.py", line 724, in actor_method_executor
return method(__ray_actor, *args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/util/tracing/tracing_helper.py", line 467, in _resume_span
return method(self, *_args, **_kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/worker_group.py", line 33, in __execute
raise skipped from exception_cause(skipped)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/utils.py", line 169, in discard_return_wrapper
train_func(*args, **kwargs)
File "/home/ray/default/train.py", line 186, in train_func
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 545, in fit
call._call_and_handle_interrupt(
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 43, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 102, in launch
return function(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 581, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 990, in _run
results = self._run_stage()
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1036, in _run_stage
self.fit_loop.run()
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/fit_loop.py", line 202, in run
self.advance()
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/fit_loop.py", line 359, in advance
self.epoch_loop.run(self._data_fetcher)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 136, in run
self.advance(data_fetcher)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 240, in advance
batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 187, in run
self._optimizer_step(batch_idx, closure)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 265, in _optimizer_step
call._call_lightning_module_hook(
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 157, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/module.py", line 1282, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/optimizer.py", line 151, in step
step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/ddp.py", line 263, in optimizer_step
optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/strategy.py", line 230, in optimizer_step
return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/amp.py", line 74, in optimizer_step
return super().optimizer_step(optimizer, model=model, closure=closure, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/precision_plugin.py", line 117, in optimizer_step
return optimizer.step(closure=closure, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/lr_scheduler.py", line 75, in wrapper
return wrapped(*args, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/optimizer.py", line 385, in wrapper
out = func(*args, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/optimizer.py", line 76, in _use_grad
ret = func(self, *args, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/adamw.py", line 164, in step
loss = closure()
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/precision_plugin.py", line 104, in _wrap_closure
closure_result = closure()
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 140, in __call__
self._result = self.closure(*args, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 135, in closure
self._backward_fn(step_output.closure_loss)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 236, in backward_fn
call._call_strategy_hook(self.trainer, "backward", loss, optimizer)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 309, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/strategy.py", line 204, in backward
self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, *args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/precision_plugin.py", line 69, in backward
model.backward(tensor, *args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/module.py", line 1069, in backward
loss.backward(*args, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/_tensor.py", line 522, in backward
torch.autograd.backward(
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/autograd/__init__.py", line 266, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: GET was unable to find an engine to execute this computation
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1245, in dump
return super().dump(obj)
File "/home/ray/anaconda3/lib/python3.9/site-packages/tblib/pickling_support.py", line 46, in pickle_exception
rv = obj.__reduce_ex__(3)
RecursionError: maximum recursion depth exceeded while calling a Python object
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "python/ray/_raylet.pyx", line 2281, in ray._raylet.task_execution_handler
File "python/ray/_raylet.pyx", line 2177, in ray._raylet.execute_task_with_cancellation_handler
File "python/ray/_raylet.pyx", line 1832, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1833, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 2071, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1089, in ray._raylet.store_task_errors
File "python/ray/_raylet.pyx", line 4575, in ray._raylet.CoreWorker.store_task_outputs
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/serialization.py", line 494, in serialize
return self._serialize_to_msgpack(value)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/serialization.py", line 449, in _serialize_to_msgpack
value = value.to_bytes()
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/exceptions.py", line 32, in to_bytes
serialized_exception=pickle.dumps(self),
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1479, in dumps
cp.dump(obj)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle.py", line 1249, in dump
raise pickle.PicklingError(msg) from e
_pickle.PicklingError: Could not pickle object as excessively deep recursion required.
An unexpected internal error occurred while the worker was executing a task. [repeated 7x across cluster]
(raylet) A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffff24ef2ff893ac05805b25149405000000 Worker ID: 53ae90b654dedcc8bb460db711b1265c48f963c670107d95fe5e3799 Node ID: c356b61e0a4062b8d8e57c9a2753e26ae06d607ded743a410865c5ef Worker IP address: 172.24.101.245 Worker port: 10171 Worker PID: 35975 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker exits unexpectedly. Worker exits with an exit code None. Traceback (most recent call last):
File "python/ray/_raylet.pyx", line 1883, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1984, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1889, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1830, in ray._raylet.execute_task.function_executor
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/_private/function_manager.py", line 724, in actor_method_executor
return method(__ray_actor, *args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/util/tracing/tracing_helper.py", line 467, in _resume_span
return method(self, *_args, **_kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/worker_group.py", line 33, in __execute
raise skipped from exception_cause(skipped)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/utils.py", line 169, in discard_return_wrapper
train_func(*args, **kwargs)
File "/home/ray/default/train.py", line 186, in train_func
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 545, in fit
call._call_and_handle_interrupt(
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 43, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 102, in launch
return function(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 581, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 990, in _run
results = self._run_stage()
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1036, in _run_stage
self.fit_loop.run()
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/fit_loop.py", line 202, in run
self.advance()
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/fit_loop.py", line 359, in advance
self.epoch_loop.run(self._data_fetcher)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 136, in run
self.advance(data_fetcher)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 240, in advance
batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 187, in run
self._optimizer_step(batch_idx, closure)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 265, in _optimizer_step
call._call_lightning_module_hook(
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 157, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/module.py", line 1282, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/optimizer.py", line 151, in step
step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/ddp.py", line 263, in optimizer_step
optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/strategy.py", line 230, in optimizer_step
return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/amp.py", line 74, in optimizer_step
return super().optimizer_step(optimizer, model=model, closure=closure, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/precision_plugin.py", line 117, in optimizer_step
return optimizer.step(closure=closure, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/lr_scheduler.py", line 75, in wrapper
return wrapped(*args, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/optimizer.py", line 385, in wrapper
out = func(*args, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/optimizer.py", line 76, in _use_grad
ret = func(self, *args, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/optim/adamw.py", line 164, in step
loss = closure()
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/precision_plugin.py", line 104, in _wrap_closure
closure_result = closure()
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 140, in __call__
self._result = self.closure(*args, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 135, in closure
self._backward_fn(step_output.closure_loss)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 236, in backward_fn
call._call_strategy_hook(self.trainer, "backward", loss, optimizer)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 309, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/strategy.py", line 204, in backward
self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, *args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/precision_plugin.py", line 69, in backward
model.backward(tensor, *args, **kwargs)
File "/home/ray/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/module.py", line 1069, in backward
loss.backward(*args, **kwargs)
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/_tensor.py", line 522, in backward
torch.autograd.backward(
File "/mnt/cluster_storage/pypi/lib/python3.9/site-packages/torch/autograd/__init__.py", line 266, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: GET was unable to find an engine to execute this computation
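The root failure above ("GET was unable to find an engine to execute this computation" during backward, together with the libcudnn_cnn_train.so.8 undefined-symbol error) points at the cuDNN libraries visible to the workers. A minimal diagnostic sketch, not part of this gist, that could be run on one GPU worker node; it only assumes a standard PyTorch CUDA install:

import torch

# Report the versions PyTorch actually loaded on this node.
print("torch:", torch.__version__)
print("cuda:", torch.version.cuda, "available:", torch.cuda.is_available())
print("cudnn:", torch.backends.cudnn.version(), "available:", torch.backends.cudnn.is_available())

# A tiny convolution forward/backward; this is the kind of call that raises
# "unable to find an engine" when the cuDNN training libraries fail to load.
x = torch.randn(2, 3, 32, 32, device="cuda", requires_grad=True)
conv = torch.nn.Conv2d(3, 8, kernel_size=3).cuda()
loss = conv(x).sum()
loss.backward()
print("conv backward OK")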