Created
January 27, 2022 05:38
-
-
Save zheyuye/e1d0ebfd4b988cff3bb1e9b0c725f33a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
stderr: /usr/local/lib/python3.6/dist-packages/torch/distributed/launch.py:186: FutureWarning: The module torch.distributed.launch is deprecated | |
stderr: and will be removed in future. Use torchrun. | |
stderr: Note that --use_env is set by default in torchrun. | |
stderr: If your script expects `--local_rank` argument to be set, please | |
stderr: change it to read from `os.environ['LOCAL_RANK']` instead. See | |
stderr: https://pytorch.org/docs/stable/distributed.html#launch-utility for | |
stderr: further instructions | |
stderr: | |
stderr: FutureWarning, | |
stderr: WARNING:torch.distributed.run: | |
stderr: ***************************************** | |
stderr: Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. | |
stderr: ***************************************** | |
stderr: Traceback (most recent call last): | |
stderr: File "/usr/local/lib/python3.6/dist-packages/accelerate/test_utils/test_script.py", line 291, in <module> | |
stderr: main() | |
stderr: File "/usr/local/lib/python3.6/dist-packages/accelerate/test_utils/test_script.py", line 278, in main | |
stderr: rng_sync_check() | |
stderr: File "/usr/local/lib/python3.6/dist-packages/accelerate/test_utils/test_script.py", line 38, in rng_sync_check | |
stderr: synchronize_rng_states(["torch"]) | |
stderr: File "/usr/local/lib/python3.6/dist-packages/accelerate/utils.py", line 110, in synchronize_rng_states | |
stderr: synchronize_rng_state(RNGType(rng_type), generator=generator) | |
stderr: File "/usr/local/lib/python3.6/dist-packages/accelerate/utils.py", line 92, in synchronize_rng_state | |
stderr: torch.distributed.broadcast(rng_state, 0) | |
stderr: File "/usr/local/lib/python3.6/dist-packages/torch/distributed/distributed_c10d.py", line 1159, in broadcast | |
stderr: work = default_pg.broadcast([tensor], opts) | |
stderr: RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:957, unhandled system error, NCCL version 21.0.3 | |
stderr: ncclSystemError: System call (socket, malloc, munmap, etc) failed. | |
stderr: WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1022 closing signal SIGTERM | |
stderr: ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 1023) of binary: /usr/bin/python3 | |
stderr: Traceback (most recent call last): | |
stderr: File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main | |
stderr: "__main__", mod_spec) | |
stderr: File "/usr/lib/python3.6/runpy.py", line 85, in _run_code | |
stderr: exec(code, run_globals) | |
stderr: File "/usr/local/lib/python3.6/dist-packages/torch/distributed/launch.py", line 193, in <module> | |
stderr: main() | |
stderr: File "/usr/local/lib/python3.6/dist-packages/torch/distributed/launch.py", line 189, in main | |
stderr: launch(args) | |
stderr: File "/usr/local/lib/python3.6/dist-packages/torch/distributed/launch.py", line 174, in launch | |
stderr: run(args) | |
stderr: File "/usr/local/lib/python3.6/dist-packages/torch/distributed/run.py", line 713, in run | |
stderr: )(*cmd_args) | |
stderr: File "/usr/local/lib/python3.6/dist-packages/torch/distributed/launcher/api.py", line 131, in __call__ | |
stderr: return launch_agent(self._config, self._entrypoint, list(args)) | |
stderr: File "/usr/local/lib/python3.6/dist-packages/torch/distributed/launcher/api.py", line 261, in launch_agent | |
stderr: failures=result.failures, | |
stderr: torch.distributed.elastic.multiprocessing.errors.ChildFailedError: | |
stderr: ============================================================ | |
stderr: /usr/local/lib/python3.6/dist-packages/accelerate/test_utils/test_script.py FAILED | |
stderr: ------------------------------------------------------------ | |
stderr: Failures: | |
stderr: <NO_OTHER_FAILURES> | |
stderr: ------------------------------------------------------------ | |
stderr: Root Cause (first observed failure): | |
stderr: [0]: | |
stderr: time : 2022-01-27_05:35:53 | |
stderr: host : lshl-devops-k8s-dr-firefly-4 | |
stderr: rank : 1 (local_rank: 1) | |
stderr: exitcode : 1 (pid: 1023) | |
stderr: error_file: <N/A> | |
stderr: traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html | |
stderr: ============================================================ | |
stderr: Traceback (most recent call last): | |
stderr: File "/usr/local/bin/accelerate-launch", line 8, in <module> | |
stderr: sys.exit(main()) | |
stderr: File "/usr/local/lib/python3.6/dist-packages/accelerate/commands/launch.py", line 390, in main | |
stderr: launch_command(args) | |
stderr: File "/usr/local/lib/python3.6/dist-packages/accelerate/commands/launch.py", line 378, in launch_command | |
stderr: multi_gpu_launcher(args) | |
stderr: File "/usr/local/lib/python3.6/dist-packages/accelerate/commands/launch.py", line 176, in multi_gpu_launcher | |
stderr: raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd) | |
stderr: subprocess.CalledProcessError: Command '['/usr/bin/python3', '-m', 'torch.distributed.launch', '--use_env', '--nproc_per_node', '2', '--nnodes', '2', '--node_rank', '0', '--master_addr', '10.219.34.224', '--master_port', '1234', '/usr/local/lib/python3.6/dist-packages/accelerate/test_utils/test_script.py']' returned non-zero exit status 1. | |
Running: accelerate-launch --config_file=None /usr/local/lib/python3.6/dist-packages/accelerate/test_utils/test_script.py | |
stdout: **Initialization** | |
stdout: Testing, testing. 1, 2, 3. | |
stdout: Distributed environment: MULTI_GPU Backend: nccl | |
stdout: Num processes: 4 | |
stdout: Process index: 0 | |
stdout: Local process index: 0 | |
stdout: Device: cuda:0 | |
stdout: Use FP16 precision: False | |
stdout: | |
stdout: | |
stdout: **Test random number generator synchronization** | |
stdout: Distributed environment: MULTI_GPU Backend: nccl | |
stdout: Num processes: 4 | |
stdout: Process index: 1 | |
stdout: Local process index: 1 | |
stdout: Device: cuda:1 | |
stdout: Use FP16 precision: False | |
stdout: | |
Traceback (most recent call last): | |
File "/usr/local/bin/accelerate", line 8, in <module> | |
sys.exit(main()) | |
File "/usr/local/lib/python3.6/dist-packages/accelerate/commands/accelerate_cli.py", line 41, in main | |
args.func(args) | |
File "/usr/local/lib/python3.6/dist-packages/accelerate/commands/test.py", line 52, in test_command | |
result = execute_subprocess_async(cmd, env=os.environ.copy()) | |
File "/usr/local/lib/python3.6/dist-packages/accelerate/test_utils/testing.py", line 135, in execute_subprocess_async | |
f"'{cmd_str}' failed with returncode {result.returncode}\n\n" | |
RuntimeError: 'accelerate-launch --config_file=None /usr/local/lib/python3.6/dist-packages/accelerate/test_utils/test_script.py' failed with returncode 1 | |
The combined stderr from workers follows: | |
/usr/local/lib/python3.6/dist-packages/torch/distributed/launch.py:186: FutureWarning: The module torch.distributed.launch is deprecated | |
and will be removed in future. Use torchrun. | |
Note that --use_env is set by default in torchrun. | |
If your script expects `--local_rank` argument to be set, please | |
change it to read from `os.environ['LOCAL_RANK']` instead. See | |
https://pytorch.org/docs/stable/distributed.html#launch-utility for | |
further instructions | |
FutureWarning, | |
WARNING:torch.distributed.run: | |
***************************************** | |
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. | |
***************************************** | |
Traceback (most recent call last): | |
File "/usr/local/lib/python3.6/dist-packages/accelerate/test_utils/test_script.py", line 291, in <module> | |
main() | |
File "/usr/local/lib/python3.6/dist-packages/accelerate/test_utils/test_script.py", line 278, in main | |
rng_sync_check() | |
File "/usr/local/lib/python3.6/dist-packages/accelerate/test_utils/test_script.py", line 38, in rng_sync_check | |
synchronize_rng_states(["torch"]) | |
File "/usr/local/lib/python3.6/dist-packages/accelerate/utils.py", line 110, in synchronize_rng_states | |
synchronize_rng_state(RNGType(rng_type), generator=generator) | |
File "/usr/local/lib/python3.6/dist-packages/accelerate/utils.py", line 92, in synchronize_rng_state | |
torch.distributed.broadcast(rng_state, 0) | |
File "/usr/local/lib/python3.6/dist-packages/torch/distributed/distributed_c10d.py", line 1159, in broadcast | |
work = default_pg.broadcast([tensor], opts) | |
RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:957, unhandled system error, NCCL version 21.0.3 | |
ncclSystemError: System call (socket, malloc, munmap, etc) failed. | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1022 closing signal SIGTERM | |
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 1023) of binary: /usr/bin/python3 | |
Traceback (most recent call last): | |
File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main | |
"__main__", mod_spec) | |
File "/usr/lib/python3.6/runpy.py", line 85, in _run_code | |
exec(code, run_globals) | |
File "/usr/local/lib/python3.6/dist-packages/torch/distributed/launch.py", line 193, in <module> | |
main() | |
File "/usr/local/lib/python3.6/dist-packages/torch/distributed/launch.py", line 189, in main | |
launch(args) | |
File "/usr/local/lib/python3.6/dist-packages/torch/distributed/launch.py", line 174, in launch | |
run(args) | |
File "/usr/local/lib/python3.6/dist-packages/torch/distributed/run.py", line 713, in run | |
)(*cmd_args) | |
File "/usr/local/lib/python3.6/dist-packages/torch/distributed/launcher/api.py", line 131, in __call__ | |
return launch_agent(self._config, self._entrypoint, list(args)) | |
File "/usr/local/lib/python3.6/dist-packages/torch/distributed/launcher/api.py", line 261, in launch_agent | |
failures=result.failures, | |
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: | |
============================================================ | |
/usr/local/lib/python3.6/dist-packages/accelerate/test_utils/test_script.py FAILED | |
------------------------------------------------------------ | |
Failures: | |
<NO_OTHER_FAILURES> | |
------------------------------------------------------------ | |
Root Cause (first observed failure): | |
[0]: | |
time : 2022-01-27_05:35:53 | |
host : lshl-devops-k8s-dr-firefly-4 | |
rank : 1 (local_rank: 1) | |
exitcode : 1 (pid: 1023) | |
error_file: <N/A> | |
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html | |
============================================================ | |
Traceback (most recent call last): | |
File "/usr/local/bin/accelerate-launch", line 8, in <module> | |
sys.exit(main()) | |
File "/usr/local/lib/python3.6/dist-packages/accelerate/commands/launch.py", line 390, in main | |
launch_command(args) | |
File "/usr/local/lib/python3.6/dist-packages/accelerate/commands/launch.py", line 378, in launch_command | |
multi_gpu_launcher(args) | |
File "/usr/local/lib/python3.6/dist-packages/accelerate/commands/launch.py", line 176, in multi_gpu_launcher | |
raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd) | |
subprocess.CalledProcessError: Command '['/usr/bin/python3', '-m', 'torch.distributed.launch', '--use_env', '--nproc_per_node', '2', '--nnodes', '2', '--node_rank', '0', '--master_addr', '10.219.34.224', '--master_port', '1234', '/usr/local/lib/python3.6/dist-packages/accelerate/test_utils/test_script.py']' returned non-zero exit status 1. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment