Created
June 15, 2017 21:36
-
-
Save mckees/4b01ea9bcf09f6224ffda88611bbc666 to your computer and use it in GitHub Desktop.
Tensorflow GPU Out of Memory
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2017-06-15 11:32:50.740362: W tensorflow/core/common_runtime/bfc_allocator.cc:277] *****************x******************************x*****************************x***x***************xx | |
2017-06-15 11:32:50.740369: W tensorflow/core/framework/op_kernel.cc:1165] Resource exhausted: OOM when allocating tensor with shape[1152,1024,4,4] | |
2017-06-15 11:32:50.740655: W tensorflow/core/framework/op_kernel.cc:1165] Resource exhausted: OOM when allocating tensor with shape[1152,1024,4,4] | |
[[Node: ssd_300_vgg/conv6/convolution = Conv2D[T=DT_FLOAT, data_format="NCHW", padding="VALID", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](ssd_300_vgg/conv6/convolution/SpaceToBatchND, ssd_300_vgg/conv6/weights/read/_223)]] | |
INFO:tensorflow:Error reported to Coordinator: <class 'tensorflow.python.framework.errors_impl.ResourceExhaustedError'>, OOM when allocating tensor with shape[1152,1024,4,4] | |
[[Node: ssd_300_vgg/conv6/convolution = Conv2D[T=DT_FLOAT, data_format="NCHW", padding="VALID", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](ssd_300_vgg/conv6/convolution/SpaceToBatchND, ssd_300_vgg/conv6/weights/read/_223)]] | |
[[Node: zero_fraction_9/Mean/_315 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_816_zero_fraction_9/Mean", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]] | |
Caused by op u'ssd_300_vgg/conv6/convolution', defined at: | |
File "train_ssd_network.py", line 390, in <module> | |
tf.app.run() | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 48, in run | |
_sys.exit(main(_sys.argv[:1] + flags_passthrough)) | |
File "train_ssd_network.py", line 291, in main | |
clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue]) | |
File "/my/path/SSD-Tensorflow/deployment/model_deploy.py", line 196, in create_clones | |
outputs = model_fn(*args, **kwargs) | |
File "train_ssd_network.py", line 275, in clone_fn | |
ssd_net.net(b_image, is_training=True) | |
File "/my/path/SSD-Tensorflow/nets/ssd_vgg_300.py", line 155, in net | |
scope=scope) | |
File "/my/path/SSD-Tensorflow/nets/ssd_vgg_300.py", line 474, in ssd_net | |
net = slim.conv2d(net, 1024, [3, 3], rate=6, scope='conv6') | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 181, in func_with_args | |
return func(*args, **current_args) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/layers/python/layers/layers.py", line 949, in convolution | |
outputs = layer.apply(inputs) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/layers/base.py", line 492, in apply | |
return self.__call__(inputs, *args, **kwargs) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/layers/base.py", line 441, in __call__ | |
outputs = self.call(inputs, *args, **kwargs) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/layers/convolutional.py", line 158, in call | |
data_format=utils.convert_data_format(self.data_format, self.rank + 2)) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py", line 670, in convolution | |
op=op) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py", line 453, in with_space_to_batch | |
result = op(input_converted, num_spatial_dims, "VALID") | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py", line 662, in op | |
name=name) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py", line 131, in _non_atrous_convolution | |
name=name) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_nn_ops.py", line 399, in conv2d | |
data_format=data_format, name=name) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op | |
op_def=op_def) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2528, in create_op | |
original_op=self._default_original_op, op_def=op_def) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1203, in __init__ | |
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access | |
ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[1152,1024,4,4] | |
[[Node: ssd_300_vgg/conv6/convolution = Conv2D[T=DT_FLOAT, data_format="NCHW", padding="VALID", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](ssd_300_vgg/conv6/convolution/SpaceToBatchND, ssd_300_vgg/conv6/weights/read/_223)]] | |
[[Node: zero_fraction_9/Mean/_315 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_816_zero_fraction_9/Mean", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]] | |
Traceback (most recent call last): | |
File "train_ssd_network.py", line 390, in <module> | |
tf.app.run() | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 48, in run | |
_sys.exit(main(_sys.argv[:1] + flags_passthrough)) | |
File "train_ssd_network.py", line 386, in main | |
sync_optimizer=None) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/slim/python/slim/learning.py", line 761, in train | |
sv.stop(threads, close_summary_writer=True) | |
File "/usr/lib/python2.7/contextlib.py", line 35, in __exit__ | |
self.gen.throw(type, value, traceback) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/supervisor.py", line 964, in managed_session | |
self.stop(close_summary_writer=close_summary_writer) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/supervisor.py", line 792, in stop | |
stop_grace_period_secs=self._stop_grace_secs) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/coordinator.py", line 389, in join | |
six.reraise(*self._exc_info_to_raise) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/coordinator.py", line 296, in stop_on_exception | |
yield | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/coordinator.py", line 494, in run | |
self.run_loop() | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/supervisor.py", line 994, in run_loop | |
self._sv.global_step]) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 896, in run | |
run_metadata_ptr) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1108, in _run | |
feed_dict_tensor, options, run_metadata) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1261, in _do_run | |
options, run_metadata) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1280, in _do_call | |
raise type(e)(node_def, op, message) | |
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[1152,1024,4,4] | |
[[Node: ssd_300_vgg/conv6/convolution = Conv2D[T=DT_FLOAT, data_format="NCHW", padding="VALID", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](ssd_300_vgg/conv6/convolution/SpaceToBatchND, ssd_300_vgg/conv6/weights/read/_223)]] | |
[[Node: zero_fraction_9/Mean/_315 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_816_zero_fraction_9/Mean", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]] | |
Caused by op u'ssd_300_vgg/conv6/convolution', defined at: | |
File "train_ssd_network.py", line 390, in <module> | |
tf.app.run() | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 48, in run | |
_sys.exit(main(_sys.argv[:1] + flags_passthrough)) | |
File "train_ssd_network.py", line 291, in main | |
clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue]) | |
File "/my/path/SSD-Tensorflow/deployment/model_deploy.py", line 196, in create_clones | |
outputs = model_fn(*args, **kwargs) | |
File "train_ssd_network.py", line 275, in clone_fn | |
ssd_net.net(b_image, is_training=True) | |
File "/my/path/SSD-Tensorflow/nets/ssd_vgg_300.py", line 155, in net | |
scope=scope) | |
File "/my/path/SSD-Tensorflow/nets/ssd_vgg_300.py", line 474, in ssd_net | |
net = slim.conv2d(net, 1024, [3, 3], rate=6, scope='conv6') | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 181, in func_with_args | |
return func(*args, **current_args) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/layers/python/layers/layers.py", line 949, in convolution | |
outputs = layer.apply(inputs) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/layers/base.py", line 492, in apply | |
return self.__call__(inputs, *args, **kwargs) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/layers/base.py", line 441, in __call__ | |
outputs = self.call(inputs, *args, **kwargs) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/layers/convolutional.py", line 158, in call | |
data_format=utils.convert_data_format(self.data_format, self.rank + 2)) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py", line 670, in convolution | |
op=op) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py", line 453, in with_space_to_batch | |
result = op(input_converted, num_spatial_dims, "VALID") | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py", line 662, in op | |
name=name) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py", line 131, in _non_atrous_convolution | |
name=name) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_nn_ops.py", line 399, in conv2d | |
data_format=data_format, name=name) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op | |
op_def=op_def) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2528, in create_op | |
original_op=self._default_original_op, op_def=op_def) | |
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1203, in __init__ | |
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access | |
ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[1152,1024,4,4] | |
[[Node: ssd_300_vgg/conv6/convolution = Conv2D[T=DT_FLOAT, data_format="NCHW", padding="VALID", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](ssd_300_vgg/conv6/convolution/SpaceToBatchND, ssd_300_vgg/conv6/weights/read/_223)]] | |
[[Node: zero_fraction_9/Mean/_315 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_816_zero_fraction_9/Mean", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment