Skip to content

Instantly share code, notes, and snippets.

@taylanbil
Created September 18, 2019 03:30
Show Gist options
  • Save taylanbil/8ff73e6b8cb26e550c8d47be1a844f48 to your computer and use it in GitHub Desktop.
Error encountered while dumping graphs: PyTorch/XLA TPU training fails with "Driver not open" (XRTAllocateFromTensor) during loss aggregation.
| WARNING: 240829 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1422704, 2718830, 2897878, 3673048, 2016896, 2200333, 3886976, 2097242, 3124502, 2871279]
Epoch 1 begin 00:17:55
training/ 00:19:08, device xla:1, step 1, Rate=132.04, GlobalRate=132.04, loss=15.8125, nll_loss=15.8750
training/ 00:20:21, device xla:1, step 2, Rate=54.94, GlobalRate=6.89, loss=15.8125, nll_loss=15.8125
training/ 00:25:46, device xla:1, step 3, Rate=22.92, GlobalRate=2.56, loss=16.0000, nll_loss=16.0000
training/ 00:40:56, device xla:1, step 4, Rate=9.34, GlobalRate=0.98, loss=15.9375, nll_loss=15.9375
training/ 01:58:50, device xla:1, step 5, Rate=3.77, GlobalRate=0.26, loss=15.7500, nll_loss=15.8125
2019-09-18 03:13:04.411218: E tensorflow/compiler/xla/xla_client/tf_logging.cc:11] Check failed: session_work.first->session()->Run( session_work.second.feed_inputs, session_work.second.outputs_handles, &outputs) == ::tensorflow::Status::OK() (Unavailable: From /job:tpu_worker/replica:0/task:0:
Driver not open.
[[{{node XRTAllocateFromTensor_9}}]] vs. OK)
*** Begin stack trace ***
tensorflow::CurrentStackTrace[abi:cxx11]()
xla::XrtComputationClient::TransferToServer(absl::Span<xla::ComputationClient::TensorSource const>)
torch_xla::TensorToXlaData(at::Tensor const&, torch_xla::Device const&)
torch_xla::XLATensor::GetIrValueForTensor(at::Tensor const&, torch_xla::Device const&)
torch_xla::XLATensor::GetIrValue() const
torch_xla::XLATensor::div(torch_xla::XLATensor const&, torch_xla::XLATensor const&)
torch_xla::AtenXlaType::div(at::Tensor const&, at::Tensor const&)
c10::detail::wrap_kernel_functor_unboxed_<c10::detail::WrapKernelFunction_<at::Tensor (at::Tensor const&, at::Tensor const&), &torch_xla::AtenXlaType::div, at::Tensor, c10::guts::typelist::typelist<at::Tensor const&, at::Tensor const&> >, at::Tensor (at::Tensor const&, at::Tensor const&)>::call(c10::KernelCache*, at::Tensor const&, at::Tensor const&)
torch::autograd::VariableType::div(at::Tensor const&, at::Tensor const&)
PyCFunction_Call
PyObject_Call
PyNumber_TrueDivide
_PyEval_EvalFrameDefault
_PyEval_EvalFrameDefault
_PyEval_EvalFrameDefault
_PyEval_EvalFrameDefault
_PyEval_EvalFrameDefault
_PyFunction_FastCallDict
_PyObject_FastCallDict
_PyObject_Call_Prepend
PyObject_Call
_PyEval_EvalFrameDefault
_PyEval_EvalFrameDefault
_PyEval_EvalFrameDefault
_PyFunction_FastCallDict
_PyObject_FastCallDict
_PyObject_Call_Prepend
PyObject_Call
clone
*** End stack trace ***
Exception in model function for device=xla:1: tensorflow/compiler/xla/xla_client/xrt_computation_client.cc:266 : Check failed: session_work.first->session()->Run( session_work.second.feed_inputs, session_work.second.outputs_handles, &outputs) == ::tensorflow::Status::OK() (Unavailable: From /job:tpu_worker/replica:0/task:0:
Driver not open.
[[{{node XRTAllocateFromTensor_9}}]] vs. OK)
*** Begin stack trace ***
tensorflow::CurrentStackTrace[abi:cxx11]()
xla::XrtComputationClient::TransferToServer(absl::Span<xla::ComputationClient::TensorSource const>)
torch_xla::TensorToXlaData(at::Tensor const&, torch_xla::Device const&)
torch_xla::XLATensor::GetIrValueForTensor(at::Tensor const&, torch_xla::Device const&)
torch_xla::XLATensor::GetIrValue() const
torch_xla::XLATensor::div(torch_xla::XLATensor const&, torch_xla::XLATensor const&)
torch_xla::AtenXlaType::div(at::Tensor const&, at::Tensor const&)
c10::detail::wrap_kernel_functor_unboxed_<c10::detail::WrapKernelFunction_<at::Tensor (at::Tensor const&, at::Tensor const&), &torch_xla::AtenXlaType::div, at::Tensor, c10::guts::typelist::typelist<at::Tensor const&, at::Tensor const&> >, at::Tensor (at::Tensor const&, at::Tensor const&)>::call(c10::KernelCache*, at::Tensor const&, at::Tensor const&)
torch::autograd::VariableType::div(at::Tensor const&, at::Tensor const&)
PyCFunction_Call
PyObject_Call
PyNumber_TrueDivide
_PyEval_EvalFrameDefault
_PyEval_EvalFrameDefault
_PyEval_EvalFrameDefault
_PyEval_EvalFrameDefault
_PyEval_EvalFrameDefault
_PyFunction_FastCallDict
_PyObject_FastCallDict
_PyObject_Call_Prepend
PyObject_Call
_PyEval_EvalFrameDefault
_PyEval_EvalFrameDefault
_PyEval_EvalFrameDefault
_PyFunction_FastCallDict
_PyObject_FastCallDict
_PyObject_Call_Prepend
PyObject_Call
clone
*** End stack trace ***
Traceback (most recent call last):
File "/anaconda3/envs/pytorch-nightly/lib/python3.6/site-packages/torch_xla_py/data_parallel.py", line 229, in _module_runner
result.result = loop_fn(module, loader, torch.device(device), context)
File "tpu-examples/fairseq_train_tpu.py", line 289, in train_loop_fn
log_output = trainer.train_step(samples)
File "/home/taylanbil/tpu-examples/deps/fairseq/fairseq/trainer.py", line 328, in train_step
logging_outputs, self.criterion
File "/home/taylanbil/tpu-examples/deps/fairseq/fairseq/tasks/fairseq_task.py", line 258, in aggregate_logging_outputs
return criterion.__class__.aggregate_logging_outputs(logging_outputs)
File "/home/taylanbil/tpu-examples/deps/fairseq/fairseq/criterions/label_smoothed_cross_entropy.py", line 74, in aggregate_logging_outputs
'loss': sum(log.get('loss', 0) for log in logging_outputs) / sample_size / math.log(2) if sample_size > 0 else 0.,
RuntimeError: tensorflow/compiler/xla/xla_client/xrt_computation_client.cc:266 : Check failed: session_work.first->session()->Run( session_work.second.feed_inputs, session_work.second.outputs_handles, &outputs) == ::tensorflow::Status::OK() (Unavailable: From /job:tpu_worker/replica:0/task:0:
Driver not open.
[[{{node XRTAllocateFromTensor_9}}]] vs. OK)
*** Begin stack trace ***
tensorflow::CurrentStackTrace[abi:cxx11]()
xla::XrtComputationClient::TransferToServer(absl::Span<xla::ComputationClient::TensorSource const>)
torch_xla::TensorToXlaData(at::Tensor const&, torch_xla::Device const&)
torch_xla::XLATensor::GetIrValueForTensor(at::Tensor const&, torch_xla::Device const&)
torch_xla::XLATensor::GetIrValue() const
torch_xla::XLATensor::div(torch_xla::XLATensor const&, torch_xla::XLATensor const&)
torch_xla::AtenXlaType::div(at::Tensor const&, at::Tensor const&)
c10::detail::wrap_kernel_functor_unboxed_<c10::detail::WrapKernelFunction_<at::Tensor (at::Tensor const&, at::Tensor const&), &torch_xla::AtenXlaType::div, at::Tensor, c10::guts::typelist::typelist<at::Tensor const&, at::Tensor const&> >, at::Tensor (at::Tensor const&, at::Tensor const&)>::call(c10::KernelCache*, at::Tensor const&, at::Tensor const&)
torch::autograd::VariableType::div(at::Tensor const&, at::Tensor const&)
PyCFunction_Call
PyObject_Call
PyNumber_TrueDivide
_PyEval_EvalFrameDefault
_PyEval_EvalFrameDefault
_PyEval_EvalFrameDefault
_PyEval_EvalFrameDefault
_PyEval_EvalFrameDefault
_PyFunction_FastCallDict
_PyObject_FastCallDict
_PyObject_Call_Prepend
PyObject_Call
_PyEval_EvalFrameDefault
_PyEval_EvalFrameDefault
_PyEval_EvalFrameDefault
_PyFunction_FastCallDict
_PyObject_FastCallDict
_PyObject_Call_Prepend
PyObject_Call
clone
*** End stack trace ***
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment