Created
September 23, 2020 23:07
-
-
Save froody/f1c8107f7bd1a1650cdd47d64406a7d5 to your computer and use it in GitHub Desktop.
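Output of a torch-ucc crash: segmentation fault (signal 11) in xccl_mem_component_reduce_multi() during an XCCL/UCX allreduce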
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
% TORCH_UCC_COLL_BACKEND=xccl LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/public/apps/ucx/1.9/gcc.7.4.0-cuda.10.1/lib/:/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/lib/ python bug.py | |
TorchUCC: Thread mode multi is not supportedTorchUCC: Thread mode multi is not supportedTorchUCC: Thread mode multi is not supportedTorchUCC: Thread mode multi is not supportedTorchUCC: Thread mode multi is not supportedTorchUCC: Thread mode multi is not supportedTorchUCC: Thread mode multi is not supportedTorchUCC: Thread mode multi is not supported[1600902172.707476] [learnfair1871:25684:0] xccl_ucx_context.c:81 TEAM_UCX WARN Thread mode multiple is not supported | |
[1600902172.736990] [learnfair1871:25685:0] xccl_ucx_context.c:81 TEAM_UCX WARN Thread mode multiple is not supported | |
[1600902172.737302] [learnfair1871:25690:0] xccl_ucx_context.c:81 TEAM_UCX WARN Thread mode multiple is not supported | |
[1600902172.737331] [learnfair1871:25686:0] xccl_ucx_context.c:81 TEAM_UCX WARN Thread mode multiple is not supported | |
[1600902172.737382] [learnfair1871:25689:0] xccl_ucx_context.c:81 TEAM_UCX WARN Thread mode multiple is not supported | |
[1600902172.737496] [learnfair1871:25691:0] xccl_ucx_context.c:81 TEAM_UCX WARN Thread mode multiple is not supported | |
[1600902172.737505] [learnfair1871:25687:0] xccl_ucx_context.c:81 TEAM_UCX WARN Thread mode multiple is not supported | |
[1600902172.737718] [learnfair1871:25688:0] xccl_ucx_context.c:81 TEAM_UCX WARN Thread mode multiple is not supported | |
[learnfair1871:25687:1:25819] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x7fb88a600000) | |
[learnfair1871:25685:1:25820] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x7fbd80c00000) | |
[learnfair1871:25686:0:25686] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x7f4ade600000) | |
[learnfair1871:25688:1:25821] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x7f82b3c00000) | |
[learnfair1871:25691:1:25817] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x7f1895c00000) | |
[learnfair1871:25689:1:25816] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x7f7fcbc00000) | |
[learnfair1871:25690:0:25690] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x7f76a0c00000) | |
==== backtrace (tid: 25820) ==== | |
0 0x0000000000028905 ucs_debug_print_backtrace() /tmp/ucx/ucx-1.9.0/src/ucs/debug/debug.c:656 | |
1 0x0000000000012890 __funlockfile() ???:0 | |
2 0x0000000000010958 xccl_mem_component_reduce_multi() ???:0 | |
3 0x00000000000058e7 xccl_ucx_allreduce_knomial_progress() ???:0 | |
4 0x00000000000039f5 xccl_ucx_collective_test() xccl_ucx_lib.c:0 | |
5 0x000000000008c9f8 c10d::torch_xccl_progress() /private/home/tbirch/src/torch-ucc/src/torch_xccl.cpp:453 | |
6 0x0000000000056be6 c10d::ProcessGroupUCC::progress_loop() /private/home/tbirch/src/torch-ucc/src/torch_ucc.cpp:201 | |
7 0x0000000000070094 std::__invoke_impl<void, void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*>() /usr/include/c++/7/bits/invoke.h:73 | |
8 0x000000000006a7b8 std::__invoke<void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*>() /usr/include/c++/7/bits/invoke.h:95 | |
9 0x0000000000083e6f std::thread::_Invoker<std::tuple<void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*> >::_M_invoke<0ul, 1ul>() /usr/include/c++/7/thread:234 | |
10 0x0000000000083d74 std::thread::_Invoker<std::tuple<void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*> >::operator()() /usr/include/c++/7/thread:243 | |
11 0x0000000000083c84 std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*> > >::_M_run() /usr/include/c++/7/thread:186 | |
12 0x00000000000c8163 execute_native_thread_routine() /home/conda/feedstock_root/build_artifacts/ctng-compilers_1578638331887/work/.build/x86_64-conda_cos6-linux-gnu/src/gcc/libstdc++-v3/src/c++11/thread.cc:80 | |
13 0x00000000000c8163 execute_native_thread_routine() /home/conda/feedstock_root/build_artifacts/ctng-compilers_1578638331887/work/.build/x86_64-conda_cos6-linux-gnu/src/gcc/libstdc++-v3/src/c++11/thread.cc:79 | |
14 0x00000000000076db start_thread() /build/glibc-OTsEL5/glibc-2.27/nptl/pthread_create.c:463 | |
15 0x000000000012188f clone() /build/glibc-OTsEL5/glibc-2.27/misc/../sysdeps/unix/sysv/linux/x86_64/clone.S:95 | |
================================= | |
==== backtrace (tid: 25819) ==== | |
0 0x0000000000028905 ucs_debug_print_backtrace() /tmp/ucx/ucx-1.9.0/src/ucs/debug/debug.c:656 | |
1 0x0000000000012890 __funlockfile() ???:0 | |
2 0x0000000000010958 xccl_mem_component_reduce_multi() ???:0 | |
3 0x00000000000058e7 xccl_ucx_allreduce_knomial_progress() ???:0 | |
4 0x00000000000039f5 xccl_ucx_collective_test() xccl_ucx_lib.c:0 | |
5 0x000000000008c9f8 c10d::torch_xccl_progress() /private/home/tbirch/src/torch-ucc/src/torch_xccl.cpp:453 | |
6 0x0000000000056be6 c10d::ProcessGroupUCC::progress_loop() /private/home/tbirch/src/torch-ucc/src/torch_ucc.cpp:201 | |
7 0x0000000000070094 std::__invoke_impl<void, void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*>() /usr/include/c++/7/bits/invoke.h:73 | |
8 0x000000000006a7b8 std::__invoke<void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*>() /usr/include/c++/7/bits/invoke.h:95 | |
9 0x0000000000083e6f std::thread::_Invoker<std::tuple<void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*> >::_M_invoke<0ul, 1ul>() /usr/include/c++/7/thread:234 | |
10 0x0000000000083d74 std::thread::_Invoker<std::tuple<void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*> >::operator()() /usr/include/c++/7/thread:243 | |
11 0x0000000000083c84 std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*> > >::_M_run() /usr/include/c++/7/thread:186 | |
12 0x00000000000c8163 execute_native_thread_routine() /home/conda/feedstock_root/build_artifacts/ctng-compilers_1578638331887/work/.build/x86_64-conda_cos6-linux-gnu/src/gcc/libstdc++-v3/src/c++11/thread.cc:80 | |
13 0x00000000000c8163 execute_native_thread_routine() /home/conda/feedstock_root/build_artifacts/ctng-compilers_1578638331887/work/.build/x86_64-conda_cos6-linux-gnu/src/gcc/libstdc++-v3/src/c++11/thread.cc:79 | |
14 0x00000000000076db start_thread() /build/glibc-OTsEL5/glibc-2.27/nptl/pthread_create.c:463 | |
15 0x000000000012188f clone() /build/glibc-OTsEL5/glibc-2.27/misc/../sysdeps/unix/sysv/linux/x86_64/clone.S:95 | |
================================= | |
==== backtrace (tid: 25821) ==== | |
0 0x0000000000028905 ucs_debug_print_backtrace() /tmp/ucx/ucx-1.9.0/src/ucs/debug/debug.c:656 | |
1 0x0000000000012890 __funlockfile() ???:0 | |
2 0x0000000000010958 xccl_mem_component_reduce_multi() ???:0 | |
3 0x00000000000058e7 xccl_ucx_allreduce_knomial_progress() ???:0 | |
4 0x00000000000039f5 xccl_ucx_collective_test() xccl_ucx_lib.c:0 | |
5 0x000000000008c9f8 c10d::torch_xccl_progress() /private/home/tbirch/src/torch-ucc/src/torch_xccl.cpp:453 | |
6 0x0000000000056be6 c10d::ProcessGroupUCC::progress_loop() /private/home/tbirch/src/torch-ucc/src/torch_ucc.cpp:201 | |
7 0x0000000000070094 std::__invoke_impl<void, void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*>() /usr/include/c++/7/bits/invoke.h:73 | |
8 0x000000000006a7b8 std::__invoke<void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*>() /usr/include/c++/7/bits/invoke.h:95 | |
9 0x0000000000083e6f std::thread::_Invoker<std::tuple<void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*> >::_M_invoke<0ul, 1ul>() /usr/include/c++/7/thread:234 | |
10 0x0000000000083d74 std::thread::_Invoker<std::tuple<void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*> >::operator()() /usr/include/c++/7/thread:243 | |
11 0x0000000000083c84 std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*> > >::_M_run() /usr/include/c++/7/thread:186 | |
12 0x00000000000c8163 execute_native_thread_routine() /home/conda/feedstock_root/build_artifacts/ctng-compilers_1578638331887/work/.build/x86_64-conda_cos6-linux-gnu/src/gcc/libstdc++-v3/src/c++11/thread.cc:80 | |
13 0x00000000000c8163 execute_native_thread_routine() /home/conda/feedstock_root/build_artifacts/ctng-compilers_1578638331887/work/.build/x86_64-conda_cos6-linux-gnu/src/gcc/libstdc++-v3/src/c++11/thread.cc:79 | |
14 0x00000000000076db start_thread() /build/glibc-OTsEL5/glibc-2.27/nptl/pthread_create.c:463 | |
15 0x000000000012188f clone() /build/glibc-OTsEL5/glibc-2.27/misc/../sysdeps/unix/sysv/linux/x86_64/clone.S:95 | |
================================= | |
==== backtrace (tid: 25816) ==== | |
0 0x0000000000028905 ucs_debug_print_backtrace() /tmp/ucx/ucx-1.9.0/src/ucs/debug/debug.c:656 | |
1 0x0000000000012890 __funlockfile() ???:0 | |
2 0x0000000000010958 xccl_mem_component_reduce_multi() ???:0 | |
3 0x00000000000058e7 xccl_ucx_allreduce_knomial_progress() ???:0 | |
4 0x00000000000039f5 xccl_ucx_collective_test() xccl_ucx_lib.c:0 | |
5 0x000000000008c9f8 c10d::torch_xccl_progress() /private/home/tbirch/src/torch-ucc/src/torch_xccl.cpp:453 | |
6 0x0000000000056be6 c10d::ProcessGroupUCC::progress_loop() /private/home/tbirch/src/torch-ucc/src/torch_ucc.cpp:201 | |
7 0x0000000000070094 std::__invoke_impl<void, void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*>() /usr/include/c++/7/bits/invoke.h:73 | |
8 0x000000000006a7b8 std::__invoke<void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*>() /usr/include/c++/7/bits/invoke.h:95 | |
9 0x0000000000083e6f std::thread::_Invoker<std::tuple<void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*> >::_M_invoke<0ul, 1ul>() /usr/include/c++/7/thread:234 | |
10 0x0000000000083d74 std::thread::_Invoker<std::tuple<void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*> >::operator()() /usr/include/c++/7/thread:243 | |
11 0x0000000000083c84 std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*> > >::_M_run() /usr/include/c++/7/thread:186 | |
12 0x00000000000c8163 execute_native_thread_routine() /home/conda/feedstock_root/build_artifacts/ctng-compilers_1578638331887/work/.build/x86_64-conda_cos6-linux-gnu/src/gcc/libstdc++-v3/src/c++11/thread.cc:80 | |
13 0x00000000000c8163 execute_native_thread_routine() /home/conda/feedstock_root/build_artifacts/ctng-compilers_1578638331887/work/.build/x86_64-conda_cos6-linux-gnu/src/gcc/libstdc++-v3/src/c++11/thread.cc:79 | |
14 0x00000000000076db start_thread() /build/glibc-OTsEL5/glibc-2.27/nptl/pthread_create.c:463 | |
15 0x000000000012188f clone() /build/glibc-OTsEL5/glibc-2.27/misc/../sysdeps/unix/sysv/linux/x86_64/clone.S:95 | |
================================= | |
==== backtrace (tid: 25817) ==== | |
0 0x0000000000028905 ucs_debug_print_backtrace() /tmp/ucx/ucx-1.9.0/src/ucs/debug/debug.c:656 | |
1 0x0000000000012890 __funlockfile() ???:0 | |
2 0x0000000000010958 xccl_mem_component_reduce_multi() ???:0 | |
3 0x00000000000058e7 xccl_ucx_allreduce_knomial_progress() ???:0 | |
4 0x00000000000039f5 xccl_ucx_collective_test() xccl_ucx_lib.c:0 | |
5 0x000000000008c9f8 c10d::torch_xccl_progress() /private/home/tbirch/src/torch-ucc/src/torch_xccl.cpp:453 | |
6 0x0000000000056be6 c10d::ProcessGroupUCC::progress_loop() /private/home/tbirch/src/torch-ucc/src/torch_ucc.cpp:201 | |
7 0x0000000000070094 std::__invoke_impl<void, void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*>() /usr/include/c++/7/bits/invoke.h:73 | |
8 0x000000000006a7b8 std::__invoke<void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*>() /usr/include/c++/7/bits/invoke.h:95 | |
9 0x0000000000083e6f std::thread::_Invoker<std::tuple<void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*> >::_M_invoke<0ul, 1ul>() /usr/include/c++/7/thread:234 | |
10 0x0000000000083d74 std::thread::_Invoker<std::tuple<void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*> >::operator()() /usr/include/c++/7/thread:243 | |
11 0x0000000000083c84 std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (c10d::ProcessGroupUCC::*)(), c10d::ProcessGroupUCC*> > >::_M_run() /usr/include/c++/7/thread:186 | |
12 0x00000000000c8163 execute_native_thread_routine() /home/conda/feedstock_root/build_artifacts/ctng-compilers_1578638331887/work/.build/x86_64-conda_cos6-linux-gnu/src/gcc/libstdc++-v3/src/c++11/thread.cc:80 | |
13 0x00000000000c8163 execute_native_thread_routine() /home/conda/feedstock_root/build_artifacts/ctng-compilers_1578638331887/work/.build/x86_64-conda_cos6-linux-gnu/src/gcc/libstdc++-v3/src/c++11/thread.cc:79 | |
14 0x00000000000076db start_thread() /build/glibc-OTsEL5/glibc-2.27/nptl/pthread_create.c:463 | |
15 0x000000000012188f clone() /build/glibc-OTsEL5/glibc-2.27/misc/../sysdeps/unix/sysv/linux/x86_64/clone.S:95 | |
================================= | |
Traceback (most recent call last): | |
File "/private/home/tbirch/src/fairscale/hello_ucx.py", line 30, in <module> | |
mp.spawn(worker, args=(world_size,), nprocs=world_size, join=True) | |
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 200, in spawn | |
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn') | |
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 158, in start_processes | |
while not context.join(): | |
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 108, in join | |
(error_index, name) | |
Exception: process 3 terminated with signal SIGSEGV |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment