Skip to content

Instantly share code, notes, and snippets.

@froody
Created September 22, 2020 23:08
Show Gist options
  • Save froody/d35d7571b1a8df0638867066d96ecc6c to your computer and use it in GitHub Desktop.
Save froody/d35d7571b1a8df0638867066d96ecc6c to your computer and use it in GitHub Desktop.
segfault creating single-element group
% TORCH_UCC_COLL_BACKEND=xccl LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/public/apps/ucx/1.9/gcc.7.4.0-cuda.10.1/lib/:/private/home/tbirch/.conda/envs/torch160-ucx/lib/python3.7/site-packages/torch/lib/ python hello_ucx.py
TorchUCC: Thread mode multi is not supportedTorchUCC: Thread mode multi is not supported[1600816014.600420] [devfair0133:73575:0] xccl_ucx_context.c:81 TEAM_UCX WARN Thread mode multiple is not supported
[1600816014.600420] [devfair0133:73576:0] xccl_ucx_context.c:81 TEAM_UCX WARN Thread mode multiple is not supported
rank is = 1
rank is = 0
t is tensor([2., 2., 2., 2., 2., 2., 2., 2., 2., 2.], device='cuda:0')
sending tensor([0.9740, 0.3096, 0.8529, 0.4817, 0.5530, 0.1668, 0.1471, 0.9425, 0.9323,
0.9054])
recvd tensor([0.9740, 0.3096, 0.8529, 0.4817, 0.5530, 0.1668, 0.1471, 0.9425, 0.9323,
0.9054])
creating group 0,1
creating group 0,1
TorchUCC: Thread mode multi is not supportedTorchUCC: Thread mode multi is not supported[1600816016.758399] [devfair0133:73576:0] xccl_ucx_context.c:81 TEAM_UCX WARN Thread mode multiple is not supported
[1600816016.758418] [devfair0133:73575:0] xccl_ucx_context.c:81 TEAM_UCX WARN Thread mode multiple is not supported
creating group 0
creating group 1
creating group 0
TorchUCC: Thread mode multi is not supportedTorchUCC: Thread mode multi is not supported[1600816016.937819] [devfair0133:73575:0] xccl_ucx_context.c:81 TEAM_UCX WARN Thread mode multiple is not supported
[1600816016.937869] [devfair0133:73576:0] xccl_ucx_context.c:81 TEAM_UCX WARN Thread mode multiple is not supported
[devfair0133:73575:0:73575] Caught signal 11 (Segmentation fault: address not mapped to object at address 0xffffffff00000001)
[devfair0133:73576:0:73576] Caught signal 11 (Segmentation fault: address not mapped to object at address 0xffffffff00000001)
==== backtrace (tid: 73575) ====
0 0x0000000000028905 ucs_debug_print_backtrace() /tmp/ucx/ucx-1.9.0/src/ucs/debug/debug.c:656
1 0x0000000000012890 __funlockfile() ???:0
2 0x000000000008c008 c10d::torch_ucx_req_test() /private/home/tbirch/src/torch-ucc/include/torch_ucc_sendrecv.hpp:237
3 0x000000000008c97d c10d::oob_allgather_test() /private/home/tbirch/src/torch-ucc/src/torch_xccl.cpp:70
4 0x000000000008ca64 c10d::oob_allgather() /private/home/tbirch/src/torch-ucc/src/torch_xccl.cpp:100
5 0x0000000000002f98 xccl_ucx_team_create_post() ???:0
6 0x00000000000039cb xccl_team_create_post() ???:0
7 0x000000000008cd15 c10d::torch_xccl_comm_init() /private/home/tbirch/src/torch-ucc/src/torch_xccl.cpp:182
8 0x00000000000579db c10d::ProcessGroupUCC::ProcessGroupUCC() /private/home/tbirch/src/torch-ucc/src/torch_ucc.cpp:179
9 0x0000000000083e5e __gnu_cxx::new_allocator<c10d::ProcessGroupUCC>::construct<c10d::ProcessGroupUCC, std::shared_ptr<c10d::Store> const&, int&, int&>() /usr/include/c++/7/ext/new_allocator.h:136
10 0x000000000008336c std::allocator_traits<std::allocator<c10d::ProcessGroupUCC> >::construct<c10d::ProcessGroupUCC, std::shared_ptr<c10d::Store> const&, int&, int&>() /usr/include/c++/7/bits/alloc_traits.h:475
11 0x0000000000081e99 std::_Sp_counted_ptr_inplace<c10d::ProcessGroupUCC, std::allocator<c10d::ProcessGroupUCC>, (__gnu_cxx::_Lock_policy)2>::_Sp_counted_ptr_inplace<std::shared_ptr<c10d::Store> const&, int&, int&>() /usr/include/c++/7/bits/shared_ptr_base.h:526
12 0x000000000007f213 std::__shared_count<(__gnu_cxx::_Lock_policy)2>::__shared_count<c10d::ProcessGroupUCC, std::allocator<c10d::ProcessGroupUCC>, std::shared_ptr<c10d::Store> const&, int&, int&>() /usr/include/c++/7/bits/shared_ptr_base.h:637
13 0x000000000007ba8e std::__shared_ptr<c10d::ProcessGroupUCC, (__gnu_cxx::_Lock_policy)2>::__shared_ptr<std::allocator<c10d::ProcessGroupUCC>, std::shared_ptr<c10d::Store> const&, int&, int&>() /usr/include/c++/7/bits/shared_ptr_base.h:1295
14 0x0000000000076f63 std::shared_ptr<c10d::ProcessGroupUCC>::shared_ptr<std::allocator<c10d::ProcessGroupUCC>, std::shared_ptr<c10d::Store> const&, int&, int&>() /usr/include/c++/7/bits/shared_ptr.h:344
15 0x0000000000071a08 std::allocate_shared<c10d::ProcessGroupUCC, std::allocator<c10d::ProcessGroupUCC>, std::shared_ptr<c10d::Store> const&, int&, int&>() /usr/include/c++/7/bits/shared_ptr.h:691
16 0x000000000006c09c std::make_shared<c10d::ProcessGroupUCC, std::shared_ptr<c10d::Store> const&, int&, int&>() /usr/include/c++/7/bits/shared_ptr.h:707
17 0x00000000000590a3 c10d::ProcessGroupUCC::createProcessGroupUCC() /private/home/tbirch/src/torch-ucc/src/torch_ucc.cpp:476
18 0x000000000007b011 pybind11::detail::argument_loader<std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&>::call_impl<std::shared_ptr<c10d::ProcessGroup>, std::shared_ptr<c10d::ProcessGroup> (*&)(std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&), 0ul, 1ul, 2ul, 3ul, pybind11::detail::void_type>() /private/home/tbirch/.conda/envs/torch160-ucx/lib/python3.7/site-packages/torch/include/pybind11/cast.h:1931
19 0x00000000000766b9 pybind11::detail::argument_loader<std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&>::call<std::shared_ptr<c10d::ProcessGroup>, pybind11::detail::void_type, std::shared_ptr<c10d::ProcessGroup> (*&)(std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&)>() /private/home/tbirch/.conda/envs/torch160-ucx/lib/python3.7/site-packages/torch/include/pybind11/cast.h:1908
20 0x0000000000070c1f pybind11::cpp_function::initialize<std::shared_ptr<c10d::ProcessGroup> (*&)(std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&), std::shared_ptr<c10d::ProcessGroup>, std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&>(std::shared_ptr<c10d::ProcessGroup> (*&)(std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&), std::shared_ptr<c10d::ProcessGroup> (*)(std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&))::{lambda(pybind11::detail::function_call&)#3}::operator()() /private/home/tbirch/.conda/envs/torch160-ucx/lib/python3.7/site-packages/torch/include/pybind11/pybind11.h:155
21 0x0000000000070f54 pybind11::cpp_function::initialize<std::shared_ptr<c10d::ProcessGroup> (*&)(std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&), std::shared_ptr<c10d::ProcessGroup>, std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&>(std::shared_ptr<c10d::ProcessGroup> (*&)(std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&), std::shared_ptr<c10d::ProcessGroup> (*)(std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&))::{lambda(pybind11::detail::function_call&)#3}::_FUN() /private/home/tbirch/.conda/envs/torch160-ucx/lib/python3.7/site-packages/torch/include/pybind11/pybind11.h:133
22 0x0000000000065293 pybind11::cpp_function::dispatcher() /private/home/tbirch/.conda/envs/torch160-ucx/lib/python3.7/site-packages/torch/include/pybind11/pybind11.h:620
23 0x00000000001491f4 _PyMethodDef_RawFastCallKeywords() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Objects/call.c:693
24 0x00000000001c6bb9 _PyCFunction_FastCallKeywords() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Objects/call.c:732
25 0x00000000001c6bb9 call_function() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:4568
26 0x00000000001c6bb9 _PyEval_EvalFrameDefault() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:3124
27 0x0000000000118db2 PyEval_EvalFrameEx() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:547
28 0x0000000000138b63 _PyFunction_FastCallKeywords() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Objects/call.c:433
29 0x000000000017f335 call_function() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:4616
30 0x00000000001c3aee _PyEval_EvalFrameDefault() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:3139
31 0x0000000000118db2 PyEval_EvalFrameEx() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:547
32 0x0000000000138b01 _PyFunction_FastCallKeywords() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Objects/call.c:433
33 0x000000000017f335 call_function() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:4616
34 0x00000000001c6ef2 _PyEval_EvalFrameDefault() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:3093
35 0x0000000000137b68 PyEval_EvalFrameEx() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:547
36 0x0000000000137b68 _PyFunction_FastCallDict() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Objects/call.c:322
37 0x00000000001c43a3 do_call_core() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:4645
38 0x00000000001c43a3 _PyEval_EvalFrameDefault() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:3191
39 0x0000000000137b68 PyEval_EvalFrameEx() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:547
40 0x0000000000137b68 _PyFunction_FastCallDict() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Objects/call.c:322
41 0x00000000001c43a3 do_call_core() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:4645
42 0x00000000001c43a3 _PyEval_EvalFrameDefault() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:3191
43 0x0000000000138767 PyEval_EvalFrameEx() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:547
44 0x0000000000138767 _PyFunction_FastCallKeywords() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Objects/call.c:408
45 0x000000000017f335 call_function() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:4616
46 0x00000000001c2d01 _PyEval_EvalFrameDefault() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:3110
47 0x0000000000138767 PyEval_EvalFrameEx() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:547
48 0x0000000000138767 _PyFunction_FastCallKeywords() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Objects/call.c:408
49 0x000000000017f335 call_function() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:4616
50 0x00000000001c2d01 _PyEval_EvalFrameDefault() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:3110
51 0x0000000000138767 PyEval_EvalFrameEx() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:547
52 0x0000000000138767 _PyFunction_FastCallKeywords() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Objects/call.c:408
53 0x00000000001c2ae5 call_function() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:4616
54 0x00000000001c2ae5 _PyEval_EvalFrameDefault() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:3124
55 0x0000000000118db2 PyEval_EvalFrameEx() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:547
56 0x0000000000138b63 _PyFunction_FastCallKeywords() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Objects/call.c:433
57 0x000000000017f335 call_function() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:4616
58 0x00000000001c3aee _PyEval_EvalFrameDefault() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:3139
=================================
==== backtrace (tid: 73576) ====
0 0x0000000000028905 ucs_debug_print_backtrace() /tmp/ucx/ucx-1.9.0/src/ucs/debug/debug.c:656
1 0x0000000000012890 __funlockfile() ???:0
2 0x000000000008c008 c10d::torch_ucx_req_test() /private/home/tbirch/src/torch-ucc/include/torch_ucc_sendrecv.hpp:237
3 0x000000000008c97d c10d::oob_allgather_test() /private/home/tbirch/src/torch-ucc/src/torch_xccl.cpp:70
4 0x000000000008ca64 c10d::oob_allgather() /private/home/tbirch/src/torch-ucc/src/torch_xccl.cpp:100
5 0x0000000000002f98 xccl_ucx_team_create_post() ???:0
6 0x00000000000039cb xccl_team_create_post() ???:0
7 0x000000000008cd15 c10d::torch_xccl_comm_init() /private/home/tbirch/src/torch-ucc/src/torch_xccl.cpp:182
8 0x00000000000579db c10d::ProcessGroupUCC::ProcessGroupUCC() /private/home/tbirch/src/torch-ucc/src/torch_ucc.cpp:179
9 0x0000000000083e5e __gnu_cxx::new_allocator<c10d::ProcessGroupUCC>::construct<c10d::ProcessGroupUCC, std::shared_ptr<c10d::Store> const&, int&, int&>() /usr/include/c++/7/ext/new_allocator.h:136
10 0x000000000008336c std::allocator_traits<std::allocator<c10d::ProcessGroupUCC> >::construct<c10d::ProcessGroupUCC, std::shared_ptr<c10d::Store> const&, int&, int&>() /usr/include/c++/7/bits/alloc_traits.h:475
11 0x0000000000081e99 std::_Sp_counted_ptr_inplace<c10d::ProcessGroupUCC, std::allocator<c10d::ProcessGroupUCC>, (__gnu_cxx::_Lock_policy)2>::_Sp_counted_ptr_inplace<std::shared_ptr<c10d::Store> const&, int&, int&>() /usr/include/c++/7/bits/shared_ptr_base.h:526
12 0x000000000007f213 std::__shared_count<(__gnu_cxx::_Lock_policy)2>::__shared_count<c10d::ProcessGroupUCC, std::allocator<c10d::ProcessGroupUCC>, std::shared_ptr<c10d::Store> const&, int&, int&>() /usr/include/c++/7/bits/shared_ptr_base.h:637
13 0x000000000007ba8e std::__shared_ptr<c10d::ProcessGroupUCC, (__gnu_cxx::_Lock_policy)2>::__shared_ptr<std::allocator<c10d::ProcessGroupUCC>, std::shared_ptr<c10d::Store> const&, int&, int&>() /usr/include/c++/7/bits/shared_ptr_base.h:1295
14 0x0000000000076f63 std::shared_ptr<c10d::ProcessGroupUCC>::shared_ptr<std::allocator<c10d::ProcessGroupUCC>, std::shared_ptr<c10d::Store> const&, int&, int&>() /usr/include/c++/7/bits/shared_ptr.h:344
15 0x0000000000071a08 std::allocate_shared<c10d::ProcessGroupUCC, std::allocator<c10d::ProcessGroupUCC>, std::shared_ptr<c10d::Store> const&, int&, int&>() /usr/include/c++/7/bits/shared_ptr.h:691
16 0x000000000006c09c std::make_shared<c10d::ProcessGroupUCC, std::shared_ptr<c10d::Store> const&, int&, int&>() /usr/include/c++/7/bits/shared_ptr.h:707
17 0x00000000000590a3 c10d::ProcessGroupUCC::createProcessGroupUCC() /private/home/tbirch/src/torch-ucc/src/torch_ucc.cpp:476
18 0x000000000007b011 pybind11::detail::argument_loader<std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&>::call_impl<std::shared_ptr<c10d::ProcessGroup>, std::shared_ptr<c10d::ProcessGroup> (*&)(std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&), 0ul, 1ul, 2ul, 3ul, pybind11::detail::void_type>() /private/home/tbirch/.conda/envs/torch160-ucx/lib/python3.7/site-packages/torch/include/pybind11/cast.h:1931
19 0x00000000000766b9 pybind11::detail::argument_loader<std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&>::call<std::shared_ptr<c10d::ProcessGroup>, pybind11::detail::void_type, std::shared_ptr<c10d::ProcessGroup> (*&)(std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&)>() /private/home/tbirch/.conda/envs/torch160-ucx/lib/python3.7/site-packages/torch/include/pybind11/cast.h:1908
20 0x0000000000070c1f pybind11::cpp_function::initialize<std::shared_ptr<c10d::ProcessGroup> (*&)(std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&), std::shared_ptr<c10d::ProcessGroup>, std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&>(std::shared_ptr<c10d::ProcessGroup> (*&)(std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&), std::shared_ptr<c10d::ProcessGroup> (*)(std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&))::{lambda(pybind11::detail::function_call&)#3}::operator()() /private/home/tbirch/.conda/envs/torch160-ucx/lib/python3.7/site-packages/torch/include/pybind11/pybind11.h:155
21 0x0000000000070f54 pybind11::cpp_function::initialize<std::shared_ptr<c10d::ProcessGroup> (*&)(std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&), std::shared_ptr<c10d::ProcessGroup>, std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&>(std::shared_ptr<c10d::ProcessGroup> (*&)(std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&), std::shared_ptr<c10d::ProcessGroup> (*)(std::shared_ptr<c10d::Store> const&, int, int, std::chrono::duration<float, std::ratio<1l, 1l> > const&))::{lambda(pybind11::detail::function_call&)#3}::_FUN() /private/home/tbirch/.conda/envs/torch160-ucx/lib/python3.7/site-packages/torch/include/pybind11/pybind11.h:133
22 0x0000000000065293 pybind11::cpp_function::dispatcher() /private/home/tbirch/.conda/envs/torch160-ucx/lib/python3.7/site-packages/torch/include/pybind11/pybind11.h:620
23 0x00000000001491f4 _PyMethodDef_RawFastCallKeywords() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Objects/call.c:693
24 0x00000000001c6bb9 _PyCFunction_FastCallKeywords() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Objects/call.c:732
25 0x00000000001c6bb9 call_function() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:4568
26 0x00000000001c6bb9 _PyEval_EvalFrameDefault() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:3124
27 0x0000000000118db2 PyEval_EvalFrameEx() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:547
28 0x0000000000138b63 _PyFunction_FastCallKeywords() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Objects/call.c:433
29 0x000000000017f335 call_function() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:4616
30 0x00000000001c3aee _PyEval_EvalFrameDefault() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:3139
31 0x0000000000118db2 PyEval_EvalFrameEx() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:547
32 0x0000000000138b01 _PyFunction_FastCallKeywords() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Objects/call.c:433
33 0x000000000017f335 call_function() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:4616
34 0x00000000001c6ef2 _PyEval_EvalFrameDefault() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:3093
35 0x0000000000137b68 PyEval_EvalFrameEx() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:547
36 0x0000000000137b68 _PyFunction_FastCallDict() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Objects/call.c:322
37 0x00000000001c43a3 do_call_core() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:4645
38 0x00000000001c43a3 _PyEval_EvalFrameDefault() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:3191
39 0x0000000000137b68 PyEval_EvalFrameEx() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:547
40 0x0000000000137b68 _PyFunction_FastCallDict() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Objects/call.c:322
41 0x00000000001c43a3 do_call_core() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:4645
42 0x00000000001c43a3 _PyEval_EvalFrameDefault() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:3191
43 0x0000000000138767 PyEval_EvalFrameEx() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:547
44 0x0000000000138767 _PyFunction_FastCallKeywords() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Objects/call.c:408
45 0x000000000017f335 call_function() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:4616
46 0x00000000001c2d01 _PyEval_EvalFrameDefault() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:3110
47 0x0000000000138767 PyEval_EvalFrameEx() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:547
48 0x0000000000138767 _PyFunction_FastCallKeywords() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Objects/call.c:408
49 0x000000000017f335 call_function() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:4616
50 0x00000000001c2d01 _PyEval_EvalFrameDefault() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:3110
51 0x0000000000138767 PyEval_EvalFrameEx() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:547
52 0x0000000000138767 _PyFunction_FastCallKeywords() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Objects/call.c:408
53 0x00000000001c2ae5 call_function() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:4616
54 0x00000000001c2ae5 _PyEval_EvalFrameDefault() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:3124
55 0x0000000000118db2 PyEval_EvalFrameEx() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:547
56 0x0000000000138b63 _PyFunction_FastCallKeywords() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Objects/call.c:433
57 0x000000000017f335 call_function() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:4616
58 0x00000000001c3aee _PyEval_EvalFrameDefault() /home/conda/feedstock_root/build_artifacts/python_1596159872474/work/Python/ceval.c:3139
=================================
Traceback (most recent call last):
File "hello_ucx.py", line 47, in <module>
mp.spawn(worker, args=(world_size,), nprocs=world_size, join=True)
File "/private/home/tbirch/.conda/envs/torch160-ucx/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 200, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/private/home/tbirch/.conda/envs/torch160-ucx/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 158, in start_processes
while not context.join():
File "/private/home/tbirch/.conda/envs/torch160-ucx/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 108, in join
(error_index, name)
Exception: process 0 terminated with signal SIGSEGV
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment