Last active
February 3, 2022 07:47
-
-
Save ananthsub/45c154145d0f852503c6a547f59e91f0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ChildFailedError: | |
============================================================ | |
main FAILED | |
------------------------------------------------------------ | |
Failures: | |
<NO_OTHER_FAILURES> | |
------------------------------------------------------------ | |
Root Cause (first observed failure): | |
[0]: | |
time : 2021-12-22_23:03:15 | |
host : | |
rank : 0 (local_rank: 0) | |
exitcode : 1 (pid: 13047) | |
error_file: /tmp/torchelastic_1b21p41k/316076834_45036004508157160_3_vaync48i/attempt_3/0/error.json | |
traceback : Traceback (most recent call last): | |
File "/mnt/xarfuse/uid-192950/7d1ee9d6-seed-af6ca79f-2868-4ec0-8a3d-a4632fa07b85-ns-4026534276/torch/distributed/elastic/multiprocessing/errors | |
/__init__.py", line 345, in wrapper | |
return f(*args, **kwargs) | |
File "/mnt/xarfuse/uid-192950/7d1ee9d6-seed-af6ca79f-2868-4ec0-8a3d-a4632fa07b85-ns-4026534276/pytorch_lightning/trainer/trainer.py", line 1992, in | |
save_checkpoint | |
self.checkpoint_connector.save_checkpoint(filepath, weights_only) | |
File "/mnt/xarfuse/uid-192950/7d1ee9d6-seed-af6ca79f-2868-4ec0-8a3d-a4632fa07b85-ns-4026534276/pytorch_lightning/trainer/connectors | |
/checkpoint_connector.py", line 460, in save_checkpoint | |
_checkpoint = self.dump_checkpoint(weights_only) | |
File "/mnt/xarfuse/uid-192950/7d1ee9d6-seed-af6ca79f-2868-4ec0-8a3d-a4632fa07b85-ns-4026534276/pytorch_lightning/trainer/connectors | |
/checkpoint_connector.py", line 375, in dump_checkpoint | |
"loops": self._get_loops_state_dict(), | |
File "/mnt/xarfuse/uid-192950/7d1ee9d6-seed-af6ca79f-2868-4ec0-8a3d-a4632fa07b85-ns-4026534276/pytorch_lightning/trainer/connectors | |
/checkpoint_connector.py", line 485, in _get_loops_state_dict | |
"fit_loop": self.trainer.fit_loop.state_dict(), | |
File "/mnt/xarfuse/uid-192950/7d1ee9d6-seed-af6ca79f-2868-4ec0-8a3d-a4632fa07b85-ns-4026534276/pytorch_lightning/loops/base.py", line 284, in | |
state_dict | |
v.state_dict(destination, key + ".") | |
File "/mnt/xarfuse/uid-192950/7d1ee9d6-seed-af6ca79f-2868-4ec0-8a3d-a4632fa07b85-ns-4026534276/pytorch_lightning/loops/base.py", line 287, in | |
state_dict | |
v.sync() | |
File "/mnt/xarfuse/uid-192950/7d1ee9d6-seed-af6ca79f-2868-4ec0-8a3d-a4632fa07b85-ns-4026534276/pytorch_lightning/trainer/connectors | |
/logger_connector/result.py", line 621, in sync | |
fn() | |
File "/mnt/xarfuse/uid-192950/7d1ee9d6-seed-af6ca79f-2868-4ec0-8a3d-a4632fa07b85-ns-4026534276/torchmetrics/metric.py", line 289, in sync | |
self._sync_dist(dist_sync_fn, process_group=process_group) | |
File "/mnt/xarfuse/uid-192950/7d1ee9d6-seed-af6ca79f-2868-4ec0-8a3d-a4632fa07b85-ns-4026534276/torchmetrics/metric.py", line 225, in _sync_dist | |
output_dict = apply_to_collection( | |
File "/mnt/xarfuse/uid-192950/7d1ee9d6-seed-af6ca79f-2868-4ec0-8a3d-a4632fa07b85-ns-4026534276/torchmetrics/utilities/data.py", line 195, in | |
apply_to_collection | |
return elem_type({k: apply_to_collection(v, dtype, function, *args, **kwargs) for k, v in data.items()}) | |
File "/mnt/xarfuse/uid-192950/7d1ee9d6-seed-af6ca79f-2868-4ec0-8a3d-a4632fa07b85-ns-4026534276/torchmetrics/utilities/data.py", line 195, in | |
<dictcomp> | |
return elem_type({k: apply_to_collection(v, dtype, function, *args, **kwargs) for k, v in data.items()}) | |
File "/mnt/xarfuse/uid-192950/7d1ee9d6-seed-af6ca79f-2868-4ec0-8a3d-a4632fa07b85-ns-4026534276/torchmetrics/utilities/data.py", line 191, in | |
apply_to_collection | |
return function(data, *args, **kwargs) | |
File "/mnt/xarfuse/uid-192950/7d1ee9d6-seed-af6ca79f-2868-4ec0-8a3d-a4632fa07b85-ns-4026534276/torchmetrics/utilities/distributed.py", line 124, in | |
gather_all_tensors | |
return _simple_gather_all_tensors(result, group, world_size) | |
File "/mnt/xarfuse/uid-192950/7d1ee9d6-seed-af6ca79f-2868-4ec0-8a3d-a4632fa07b85-ns-4026534276/torchmetrics/utilities/distributed.py", line 94, in | |
_simple_gather_all_tensors | |
torch.distributed.all_gather(gathered_result, result, group) | |
File "/mnt/xarfuse/uid-192950/7d1ee9d6-seed-af6ca79f-2868-4ec0-8a3d-a4632fa07b85-ns-4026534276/torch/distributed/distributed_c10d.py", line 2053, | |
in all_gather | |
work = group.allgather([tensor_list], [tensor]) | |
RuntimeError: Tensors must be CUDA and dense |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Do you have the `Trainer` config that was used available?