rohan-varma/nccl deadlock

## nccl deadlock
---- Process 0 -----
#0  0x00007fff70b5269e in clock_gettime ()
#1  0x00007fbe97a5a7fd in clock_gettime () from /lib64/libc.so.6
#2  0x00007fbe3e3f99ae in ?? () from /lib64/libcuda.so.1
#3  0x00007fbe3e4c12c7 in ?? () from /lib64/libcuda.so.1
#4  0x00007fbe3e3a1cac in ?? () from /lib64/libcuda.so.1
#5  0x00007fbe3e3d9502 in ?? () from /lib64/libcuda.so.1
#6  0x00007fbe3e3165e8 in ?? () from /lib64/libcuda.so.1
#7  0x00007fbe3e316cd4 in ?? () from /lib64/libcuda.so.1
#8  0x00007fbe8308b1e7 in ?? () from /usr/local/cuda/lib64/libcudart.so.9.2
#9  0x00007fbe830852a0 in ?? () from /usr/local/cuda/lib64/libcudart.so.9.2
#10 0x00007fbe83091616 in ?? () from /usr/local/cuda/lib64/libcudart.so.9.2
#11 0x00007fbe83094031 in ?? () from /usr/local/cuda/lib64/libcudart.so.9.2
#12 0x00007fbe8308723e in ?? () from /usr/local/cuda/lib64/libcudart.so.9.2
#13 0x00007fbe830739ce in ?? () from /usr/local/cuda/lib64/libcudart.so.9.2
#14 0x00007fbe830a8d24 in cudaEventCreate () from /usr/local/cuda/lib64/libcudart.so.9.2
#15 0x00007fbe5bf7be68 in torch::autograd::profiler::(anonymous namespace)::CUDAMethods::record (
    this=0x7fbe777a8128 <torch::autograd::profiler::(anonymous namespace)::RegisterCUDAMethods::RegisterCUDAMethods()::methods>, device=0x7fff70b1c288, event=0x7fff70b1c290,
    cpu_ns=0x7fff70b1c220) at /home/rvarm1/pytorch/torch/csrc/autograd/profiler_cuda.cpp:46
#16 0x00007fbe7b883d71 in torch::autograd::profiler::LegacyEvent::record (this=0x7fff70b1c220, record_cuda=true) at /home/rvarm1/pytorch/torch/csrc/autograd/profiler_legacy.cpp:570
#17 0x00007fbe7b889bc1 in torch::autograd::profiler::LegacyEvent::LegacyEvent (this=0x7fff70b1c220, kind=torch::autograd::profiler::EventKind::Mark, name=..., thread_id=1,
    record_cuda=true, handle=0, shapes=..., node_id=-1) at /home/rvarm1/pytorch/torch/csrc/autograd/profiler_legacy.h:116
#18 0x00007fbe7b881068 in torch::autograd::profiler::ProfilerThreadLocalState::mark (this=0x55f5a180f3b0, name=..., include_cuda=true)
    at /home/rvarm1/pytorch/torch/csrc/autograd/profiler_legacy.cpp:189
#19 0x00007fbe7b882f20 in torch::autograd::profiler::<lambda(int)>::operator()(int) const (__closure=0x55f5a180f550)
    at /home/rvarm1/pytorch/torch/csrc/autograd/profiler_legacy.cpp:519
#20 0x00007fbe7b887194 in std::_Function_handler<void(int), torch::autograd::profiler::enableProfilerLegacy(const torch::autograd::profiler::ProfilerConfig&)::<lambda(int)> >::_M_invoke(const std::_Any_data &, int &&) (__functor=..., __args#0=@0x7fff70b1c3b4: 1) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/std_function.h:316
#21 0x00007fbe82024b1e in std::function<void (int)>::operator()(int) const (this=0x7fff70b1c830, __args#0=1) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/std_function.h:706
#22 0x00007fbe5bf7c219 in torch::autograd::profiler::(anonymous namespace)::CUDAMethods::onEachDevice(std::function<void(int)>) const (
    this=0x7fbe777a8128 <torch::autograd::profiler::(anonymous namespace)::RegisterCUDAMethods::RegisterCUDAMethods()::methods>, op=...)
    at /home/rvarm1/pytorch/torch/csrc/autograd/profiler_cuda.cpp:86
#23 0x00007fbe7b883551 in torch::autograd::profiler::enableProfilerLegacy (new_config=...) at /home/rvarm1/pytorch/torch/csrc/autograd/profiler_legacy.cpp:518
#24 0x00007fbe818743c1 in pybind11::detail::argument_loader<torch::autograd::profiler::ProfilerConfig const&>::call_impl<void, void (*&)(torch::autograd::profiler::ProfilerConfig const&), 0ul, pybind11::detail::void_type>(void (*&)(torch::autograd::profiler::ProfilerConfig const&), std::integer_sequence<unsigned long, 0ul>, pybind11::detail::void_type&&) && (
    this=0x7fff70b1c990, f=@0x55f52c6604a8: 0x7fbe7b8830aa <torch::autograd::profiler::enableProfilerLegacy(torch::autograd::profiler::ProfilerConfig const&)>)
    at /home/rvarm1/pytorch/third_party/pybind11/include/pybind11/cast.h:2010
#25 0x00007fbe81870eaf in pybind11::detail::argument_loader<torch::autograd::profiler::ProfilerConfig const&>::call<void, pybind11::detail::void_type, void (*&)(torch::autograd::profiler::ProfilerConfig const&)>(void (*&)(torch::autograd::profiler::ProfilerConfig const&)) && (this=0x7fff70b1c990,

---- Process 1 -----

#0  0x00007fff70b5269e in clock_gettime ()
#1  0x00007fbe97a5a7fd in clock_gettime () from /lib64/libc.so.6
#2  0x00007fbe3e3f99ae in ?? () from /lib64/libcuda.so.1
#3  0x00007fbe3e4c12c7 in ?? () from /lib64/libcuda.so.1
#4  0x00007fbe3e3a1cac in ?? () from /lib64/libcuda.so.1
#5  0x00007fbe3e3a1e60 in ?? () from /lib64/libcuda.so.1
#6  0x00007fbe3e3cc361 in ?? () from /lib64/libcuda.so.1
#7  0x00007fbe3e541526 in ?? () from /lib64/libcuda.so.1
#8  0x00007fbe3e2dc56b in ?? () from /lib64/libcuda.so.1
#9  0x00007fbe3e2dc7e8 in ?? () from /lib64/libcuda.so.1
#10 0x00007fbe3e2dc82e in ?? () from /lib64/libcuda.so.1
#11 0x00007fbe3e46fcd6 in cuLaunchKernel () from /lib64/libcuda.so.1
#12 0x00007fbe8307d8fd in ?? () from /usr/local/cuda/lib64/libcudart.so.9.2
#13 0x00007fbe8307d987 in ?? () from /usr/local/cuda/lib64/libcudart.so.9.2
#14 0x00007fbe830ab96b in cudaLaunchKernel () from /usr/local/cuda/lib64/libcudart.so.9.2
#15 0x00007fbe5eab6035 in ncclBarrierEnqueueWait (comm=0x7fbdec000dc0) at enqueue.cc:215
#16 0x00007fbe5eaba233 in ncclGroupEnd () at group.cc:282
#17 0x00007fbe82093ce2 in c10d::(anonymous namespace)::AutoNcclGroup::~AutoNcclGroup (this=0x7fff70b1be1f, __in_chrg=<optimized out>)
    at /home/rvarm1/pytorch/torch/lib/c10d/ProcessGroupNCCL.cpp:33
#18 0x00007fbe8209f166 in c10d::ProcessGroupNCCL::collective<c10d::ProcessGroupNCCL::allgather(std::vector<std::vector<at::Tensor> >&, std::vector<at::Tensor>&, const c10d::AllgatherOptions&)::<lambda(at::Tensor&, at::Tensor&, ncclComm_t, c10::cuda::CUDAStream&)>, c10d::ProcessGroupNCCL::allgather(std::vector<std::vector<at::Tensor> >&, std::vector<at::Tensor>&, const c10d::AllgatherOptions&)::<lambda(std::vector<c10::cuda::CUDAStream>&)>, c10d::ProcessGroupNCCL::allgather(std::vector<std::vector<at::Tensor> >&, std::vector<at::Tensor>&, const c10d::AllgatherOptions&)::<lambda(std::vector<c10::cuda::CUDAStream>&)> >(std::vector<at::Tensor, std::allocator<at::Tensor> > &, std::vector<at::Tensor, std::allocator<at::Tensor> > &, c10d::ProcessGroupNCCL::<lambda(at::Tensor&, at::Tensor&, ncclComm_t, c10::cuda::CUDAStream&)>, c10d::ProcessGroupNCCL::<lambda(std::vector<c10::cuda::CUDAStream, std::allocator<c10::cuda::CUDAStream> >&)>, c10d::ProcessGroupNCCL::<lambda(std::vector<c10::cuda::CUDAStream, std::allocator<c10::cuda::CUDAStream> >&)>, c10d::OpType, const char *) (
    this=0x55f52cb1e9c0, inputs=..., outputs=..., fn=..., pre=..., post=..., opType=c10d::OpType::ALLGATHER, profilingTitle=0x7fbe82400974 "nccl:all_gather")
    at /home/rvarm1/pytorch/torch/lib/c10d/ProcessGroupNCCL.cpp:1101
#19 0x00007fbe8209c9a5 in c10d::ProcessGroupNCCL::allgather (this=0x55f52cb1e9c0, outputTensors=..., inputTensors=..., opts=...)
    at /home/rvarm1/pytorch/torch/lib/c10d/ProcessGroupNCCL.cpp:1372
	---- Process 0 -----
	#0 0x00007fff70b5269e in clock_gettime ()
	#1 0x00007fbe97a5a7fd in clock_gettime () from /lib64/libc.so.6
	#2 0x00007fbe3e3f99ae in ?? () from /lib64/libcuda.so.1
	#3 0x00007fbe3e4c12c7 in ?? () from /lib64/libcuda.so.1
	#4 0x00007fbe3e3a1cac in ?? () from /lib64/libcuda.so.1
	#5 0x00007fbe3e3d9502 in ?? () from /lib64/libcuda.so.1
	#6 0x00007fbe3e3165e8 in ?? () from /lib64/libcuda.so.1
	#7 0x00007fbe3e316cd4 in ?? () from /lib64/libcuda.so.1
	#8 0x00007fbe8308b1e7 in ?? () from /usr/local/cuda/lib64/libcudart.so.9.2
	#9 0x00007fbe830852a0 in ?? () from /usr/local/cuda/lib64/libcudart.so.9.2
	#10 0x00007fbe83091616 in ?? () from /usr/local/cuda/lib64/libcudart.so.9.2
	#11 0x00007fbe83094031 in ?? () from /usr/local/cuda/lib64/libcudart.so.9.2
	#12 0x00007fbe8308723e in ?? () from /usr/local/cuda/lib64/libcudart.so.9.2
	#13 0x00007fbe830739ce in ?? () from /usr/local/cuda/lib64/libcudart.so.9.2
	#14 0x00007fbe830a8d24 in cudaEventCreate () from /usr/local/cuda/lib64/libcudart.so.9.2
	#15 0x00007fbe5bf7be68 in torch::autograd::profiler::(anonymous namespace)::CUDAMethods::record (
	this=0x7fbe777a8128 <torch::autograd::profiler::(anonymous namespace)::RegisterCUDAMethods::RegisterCUDAMethods()::methods>, device=0x7fff70b1c288, event=0x7fff70b1c290,
	cpu_ns=0x7fff70b1c220) at /home/rvarm1/pytorch/torch/csrc/autograd/profiler_cuda.cpp:46
	#16 0x00007fbe7b883d71 in torch::autograd::profiler::LegacyEvent::record (this=0x7fff70b1c220, record_cuda=true) at /home/rvarm1/pytorch/torch/csrc/autograd/profiler_legacy.cpp:570
	#17 0x00007fbe7b889bc1 in torch::autograd::profiler::LegacyEvent::LegacyEvent (this=0x7fff70b1c220, kind=torch::autograd::profiler::EventKind::Mark, name=..., thread_id=1,
	record_cuda=true, handle=0, shapes=..., node_id=-1) at /home/rvarm1/pytorch/torch/csrc/autograd/profiler_legacy.h:116
	#18 0x00007fbe7b881068 in torch::autograd::profiler::ProfilerThreadLocalState::mark (this=0x55f5a180f3b0, name=..., include_cuda=true)
	at /home/rvarm1/pytorch/torch/csrc/autograd/profiler_legacy.cpp:189
	#19 0x00007fbe7b882f20 in torch::autograd::profiler::<lambda(int)>::operator()(int) const (__closure=0x55f5a180f550)
	at /home/rvarm1/pytorch/torch/csrc/autograd/profiler_legacy.cpp:519
	#20 0x00007fbe7b887194 in std::_Function_handler<void(int), torch::autograd::profiler::enableProfilerLegacy(const torch::autograd::profiler::ProfilerConfig&)::<lambda(int)> >::_M_invoke(const std::_Any_data &, int &&) (__functor=..., __args#0=@0x7fff70b1c3b4: 1) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/std_function.h:316
	#21 0x00007fbe82024b1e in std::function<void (int)>::operator()(int) const (this=0x7fff70b1c830, __args#0=1) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/std_function.h:706
	#22 0x00007fbe5bf7c219 in torch::autograd::profiler::(anonymous namespace)::CUDAMethods::onEachDevice(std::function<void(int)>) const (
	this=0x7fbe777a8128 <torch::autograd::profiler::(anonymous namespace)::RegisterCUDAMethods::RegisterCUDAMethods()::methods>, op=...)
	at /home/rvarm1/pytorch/torch/csrc/autograd/profiler_cuda.cpp:86
	#23 0x00007fbe7b883551 in torch::autograd::profiler::enableProfilerLegacy (new_config=...) at /home/rvarm1/pytorch/torch/csrc/autograd/profiler_legacy.cpp:518
	#24 0x00007fbe818743c1 in pybind11::detail::argument_loader<torch::autograd::profiler::ProfilerConfig const&>::call_impl<void, void (*&)(torch::autograd::profiler::ProfilerConfig const&), 0ul, pybind11::detail::void_type>(void (*&)(torch::autograd::profiler::ProfilerConfig const&), std::integer_sequence<unsigned long, 0ul>, pybind11::detail::void_type&&) && (
	this=0x7fff70b1c990, f=@0x55f52c6604a8: 0x7fbe7b8830aa <torch::autograd::profiler::enableProfilerLegacy(torch::autograd::profiler::ProfilerConfig const&)>)
	at /home/rvarm1/pytorch/third_party/pybind11/include/pybind11/cast.h:2010
	#25 0x00007fbe81870eaf in pybind11::detail::argument_loader<torch::autograd::profiler::ProfilerConfig const&>::call<void, pybind11::detail::void_type, void (*&)(torch::autograd::profiler::ProfilerConfig const&)>(void (*&)(torch::autograd::profiler::ProfilerConfig const&)) && (this=0x7fff70b1c990,

	---- Process 1 -----

	#0 0x00007fff70b5269e in clock_gettime ()
	#1 0x00007fbe97a5a7fd in clock_gettime () from /lib64/libc.so.6
	#2 0x00007fbe3e3f99ae in ?? () from /lib64/libcuda.so.1
	#3 0x00007fbe3e4c12c7 in ?? () from /lib64/libcuda.so.1
	#4 0x00007fbe3e3a1cac in ?? () from /lib64/libcuda.so.1
	#5 0x00007fbe3e3a1e60 in ?? () from /lib64/libcuda.so.1
	#6 0x00007fbe3e3cc361 in ?? () from /lib64/libcuda.so.1
	#7 0x00007fbe3e541526 in ?? () from /lib64/libcuda.so.1
	#8 0x00007fbe3e2dc56b in ?? () from /lib64/libcuda.so.1
	#9 0x00007fbe3e2dc7e8 in ?? () from /lib64/libcuda.so.1
	#10 0x00007fbe3e2dc82e in ?? () from /lib64/libcuda.so.1
	#11 0x00007fbe3e46fcd6 in cuLaunchKernel () from /lib64/libcuda.so.1
	#12 0x00007fbe8307d8fd in ?? () from /usr/local/cuda/lib64/libcudart.so.9.2
	#13 0x00007fbe8307d987 in ?? () from /usr/local/cuda/lib64/libcudart.so.9.2
	#14 0x00007fbe830ab96b in cudaLaunchKernel () from /usr/local/cuda/lib64/libcudart.so.9.2
	#15 0x00007fbe5eab6035 in ncclBarrierEnqueueWait (comm=0x7fbdec000dc0) at enqueue.cc:215
	#16 0x00007fbe5eaba233 in ncclGroupEnd () at group.cc:282
	#17 0x00007fbe82093ce2 in c10d::(anonymous namespace)::AutoNcclGroup::~AutoNcclGroup (this=0x7fff70b1be1f, __in_chrg=<optimized out>)
	at /home/rvarm1/pytorch/torch/lib/c10d/ProcessGroupNCCL.cpp:33
	#18 0x00007fbe8209f166 in c10d::ProcessGroupNCCL::collective<c10d::ProcessGroupNCCL::allgather(std::vector<std::vector<at::Tensor> >&, std::vector<at::Tensor>&, const c10d::AllgatherOptions&)::<lambda(at::Tensor&, at::Tensor&, ncclComm_t, c10::cuda::CUDAStream&)>, c10d::ProcessGroupNCCL::allgather(std::vector<std::vector<at::Tensor> >&, std::vector<at::Tensor>&, const c10d::AllgatherOptions&)::<lambda(std::vector<c10::cuda::CUDAStream>&)>, c10d::ProcessGroupNCCL::allgather(std::vector<std::vector<at::Tensor> >&, std::vector<at::Tensor>&, const c10d::AllgatherOptions&)::<lambda(std::vector<c10::cuda::CUDAStream>&)> >(std::vector<at::Tensor, std::allocator<at::Tensor> > &, std::vector<at::Tensor, std::allocator<at::Tensor> > &, c10d::ProcessGroupNCCL::<lambda(at::Tensor&, at::Tensor&, ncclComm_t, c10::cuda::CUDAStream&)>, c10d::ProcessGroupNCCL::<lambda(std::vector<c10::cuda::CUDAStream, std::allocator<c10::cuda::CUDAStream> >&)>, c10d::ProcessGroupNCCL::<lambda(std::vector<c10::cuda::CUDAStream, std::allocator<c10::cuda::CUDAStream> >&)>, c10d::OpType, const char *) (
	this=0x55f52cb1e9c0, inputs=..., outputs=..., fn=..., pre=..., post=..., opType=c10d::OpType::ALLGATHER, profilingTitle=0x7fbe82400974 "nccl:all_gather")
	at /home/rvarm1/pytorch/torch/lib/c10d/ProcessGroupNCCL.cpp:1101
	#19 0x00007fbe8209c9a5 in c10d::ProcessGroupNCCL::allgather (this=0x55f52cb1e9c0, outputTensors=..., inputTensors=..., opts=...)
	at /home/rvarm1/pytorch/torch/lib/c10d/ProcessGroupNCCL.cpp:1372