Skip to content

Instantly share code, notes, and snippets.

@mckees
Created June 15, 2017 21:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mckees/4b01ea9bcf09f6224ffda88611bbc666 to your computer and use it in GitHub Desktop.
Save mckees/4b01ea9bcf09f6224ffda88611bbc666 to your computer and use it in GitHub Desktop.
Tensorflow GPU Out of Memory
2017-06-15 11:32:50.740362: W tensorflow/core/common_runtime/bfc_allocator.cc:277] *****************x******************************x*****************************x***x***************xx
2017-06-15 11:32:50.740369: W tensorflow/core/framework/op_kernel.cc:1165] Resource exhausted: OOM when allocating tensor with shape[1152,1024,4,4]
2017-06-15 11:32:50.740655: W tensorflow/core/framework/op_kernel.cc:1165] Resource exhausted: OOM when allocating tensor with shape[1152,1024,4,4]
[[Node: ssd_300_vgg/conv6/convolution = Conv2D[T=DT_FLOAT, data_format="NCHW", padding="VALID", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](ssd_300_vgg/conv6/convolution/SpaceToBatchND, ssd_300_vgg/conv6/weights/read/_223)]]
INFO:tensorflow:Error reported to Coordinator: <class 'tensorflow.python.framework.errors_impl.ResourceExhaustedError'>, OOM when allocating tensor with shape[1152,1024,4,4]
[[Node: ssd_300_vgg/conv6/convolution = Conv2D[T=DT_FLOAT, data_format="NCHW", padding="VALID", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](ssd_300_vgg/conv6/convolution/SpaceToBatchND, ssd_300_vgg/conv6/weights/read/_223)]]
[[Node: zero_fraction_9/Mean/_315 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_816_zero_fraction_9/Mean", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
Caused by op u'ssd_300_vgg/conv6/convolution', defined at:
File "train_ssd_network.py", line 390, in <module>
tf.app.run()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "train_ssd_network.py", line 291, in main
clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
File "/my/path/SSD-Tensorflow/deployment/model_deploy.py", line 196, in create_clones
outputs = model_fn(*args, **kwargs)
File "train_ssd_network.py", line 275, in clone_fn
ssd_net.net(b_image, is_training=True)
File "/my/path/SSD-Tensorflow/nets/ssd_vgg_300.py", line 155, in net
scope=scope)
File "/my/path/SSD-Tensorflow/nets/ssd_vgg_300.py", line 474, in ssd_net
net = slim.conv2d(net, 1024, [3, 3], rate=6, scope='conv6')
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 181, in func_with_args
return func(*args, **current_args)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/layers/python/layers/layers.py", line 949, in convolution
outputs = layer.apply(inputs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/layers/base.py", line 492, in apply
return self.__call__(inputs, *args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/layers/base.py", line 441, in __call__
outputs = self.call(inputs, *args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/layers/convolutional.py", line 158, in call
data_format=utils.convert_data_format(self.data_format, self.rank + 2))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py", line 670, in convolution
op=op)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py", line 453, in with_space_to_batch
result = op(input_converted, num_spatial_dims, "VALID")
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py", line 662, in op
name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py", line 131, in _non_atrous_convolution
name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_nn_ops.py", line 399, in conv2d
data_format=data_format, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2528, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1203, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[1152,1024,4,4]
[[Node: ssd_300_vgg/conv6/convolution = Conv2D[T=DT_FLOAT, data_format="NCHW", padding="VALID", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](ssd_300_vgg/conv6/convolution/SpaceToBatchND, ssd_300_vgg/conv6/weights/read/_223)]]
[[Node: zero_fraction_9/Mean/_315 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_816_zero_fraction_9/Mean", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
Traceback (most recent call last):
File "train_ssd_network.py", line 390, in <module>
tf.app.run()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "train_ssd_network.py", line 386, in main
sync_optimizer=None)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/slim/python/slim/learning.py", line 761, in train
sv.stop(threads, close_summary_writer=True)
File "/usr/lib/python2.7/contextlib.py", line 35, in __exit__
self.gen.throw(type, value, traceback)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/supervisor.py", line 964, in managed_session
self.stop(close_summary_writer=close_summary_writer)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/supervisor.py", line 792, in stop
stop_grace_period_secs=self._stop_grace_secs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/coordinator.py", line 389, in join
six.reraise(*self._exc_info_to_raise)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/coordinator.py", line 296, in stop_on_exception
yield
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/coordinator.py", line 494, in run
self.run_loop()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/supervisor.py", line 994, in run_loop
self._sv.global_step])
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 896, in run
run_metadata_ptr)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1108, in _run
feed_dict_tensor, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1261, in _do_run
options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1280, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[1152,1024,4,4]
[[Node: ssd_300_vgg/conv6/convolution = Conv2D[T=DT_FLOAT, data_format="NCHW", padding="VALID", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](ssd_300_vgg/conv6/convolution/SpaceToBatchND, ssd_300_vgg/conv6/weights/read/_223)]]
[[Node: zero_fraction_9/Mean/_315 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_816_zero_fraction_9/Mean", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
Caused by op u'ssd_300_vgg/conv6/convolution', defined at:
File "train_ssd_network.py", line 390, in <module>
tf.app.run()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "train_ssd_network.py", line 291, in main
clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
File "/my/path/SSD-Tensorflow/deployment/model_deploy.py", line 196, in create_clones
outputs = model_fn(*args, **kwargs)
File "train_ssd_network.py", line 275, in clone_fn
ssd_net.net(b_image, is_training=True)
File "/my/path/SSD-Tensorflow/nets/ssd_vgg_300.py", line 155, in net
scope=scope)
File "/my/path/SSD-Tensorflow/nets/ssd_vgg_300.py", line 474, in ssd_net
net = slim.conv2d(net, 1024, [3, 3], rate=6, scope='conv6')
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 181, in func_with_args
return func(*args, **current_args)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/layers/python/layers/layers.py", line 949, in convolution
outputs = layer.apply(inputs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/layers/base.py", line 492, in apply
return self.__call__(inputs, *args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/layers/base.py", line 441, in __call__
outputs = self.call(inputs, *args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/layers/convolutional.py", line 158, in call
data_format=utils.convert_data_format(self.data_format, self.rank + 2))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py", line 670, in convolution
op=op)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py", line 453, in with_space_to_batch
result = op(input_converted, num_spatial_dims, "VALID")
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py", line 662, in op
name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py", line 131, in _non_atrous_convolution
name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_nn_ops.py", line 399, in conv2d
data_format=data_format, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2528, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1203, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[1152,1024,4,4]
[[Node: ssd_300_vgg/conv6/convolution = Conv2D[T=DT_FLOAT, data_format="NCHW", padding="VALID", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](ssd_300_vgg/conv6/convolution/SpaceToBatchND, ssd_300_vgg/conv6/weights/read/_223)]]
[[Node: zero_fraction_9/Mean/_315 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_816_zero_fraction_9/Mean", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment