Skip to content

Instantly share code, notes, and snippets.

@lispc
Last active April 29, 2017 08:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lispc/27c9f4fe935abade5de90c18276e1742 to your computer and use it in GitHub Desktop.
Save lispc/27c9f4fe935abade5de90c18276e1742 to your computer and use it in GitHub Desktop.
TensorFlow FIFOQueue bug
from __future__ import print_function
import os
os.environ["CUDA_VISIBLE_DEVICES"]=''
import time
import threading
import numpy as np
import tensorflow as tf
# Intended capacity (in elements) of the shared FIFO queue.
queue_size = 10

# Shape of every element stored in the queue; also the shape of the
# placeholder and of the NumPy arrays fed to the enqueue op.
queue_shape = [6, 200, 200, 3]

# A single job named 'train' with two tasks, both on this machine.
cluster_spec_config = {
    'train': [
        'localhost:22222',  # task 0
        'localhost:23333',  # task 1
    ],
}

# Device on which the queue op is created: the CPU of task 0 in job 'train'.
queue_loc = '/job:train/task:0/cpu:0'
def run_pop():
    """Dequeue from the shared queue forever, logging what was dequeued.

    Creates a handle to the FIFOQueue registered under the shared name
    ``shared_queue`` on ``queue_loc``, opens a session against
    ``grpc://localhost:22222`` and then repeatedly runs one grouped op that
    is built from five separate ``dequeue`` ops, each wrapped in
    ``tf.Print`` so the dequeued tensor shows up in the TensorFlow log.

    This function loops forever and never returns.
    """
    with tf.device(queue_loc):
        # Use the module-level capacity instead of repeating the literal so
        # every handle to the shared queue is declared the same way.
        queue = tf.FIFOQueue(queue_size, tf.uint8, queue_shape,
                             name='queue', shared_name='shared_queue')
    sess = tf.Session('grpc://localhost:22222')

    # Five independent dequeue ops; each is routed through tf.Print so its
    # value is logged when the op executes.
    all_data = []
    for _ in range(5):
        data = queue.dequeue()
        all_data.append(tf.Print(data, [data], 'data:'))
    # A single op that triggers all five printed dequeues in one sess.run().
    pop_op = tf.group(*all_data)

    while True:
        print('running pop_op')
        sess.run(pop_op)
def run_push():
    """Enqueue batches of constant-valued tensors into the shared queue forever.

    Creates a handle to the FIFOQueue registered under the shared name
    ``shared_queue`` on ``queue_loc`` and opens a session against
    ``grpc://localhost:23333``. Every five seconds it prints ``pushing`` and
    enqueues four ``uint8`` arrays of shape ``queue_shape``; each array is
    filled with the current counter value, and the counter advances after
    every enqueue so consecutive elements are distinguishable in the log.

    This function loops forever and never returns; it is meant to run in a
    background thread.
    """
    with tf.device(queue_loc):
        # Use the module-level capacity instead of repeating the literal so
        # every handle to the shared queue is declared the same way.
        queue = tf.FIFOQueue(queue_size, tf.uint8, queue_shape,
                             name='queue', shared_name='shared_queue')
    sess = tf.Session('grpc://localhost:23333')
    # NOTE(review): no tf.Variable is created anywhere in this file, so this
    # looks like a no-op; kept so the reproduction's session activity is
    # unchanged.
    sess.run(tf.global_variables_initializer())

    place_holder = tf.placeholder(tf.uint8, queue_shape, 'place_holder')
    enqueue_op = queue.enqueue(place_holder)

    idx = 0
    while True:
        time.sleep(5)
        print('pushing')
        for _ in range(4):
            payload = np.full(queue_shape, idx, np.uint8)
            sess.run(enqueue_op, feed_dict={place_holder: payload})
            # Wrap explicitly so the fill value always fits in uint8 instead
            # of depending on how NumPy casts an out-of-range Python int.
            idx = (idx + 1) % 256
def test_dequeue():
    """Run the producer/consumer reproduction on a two-task local cluster.

    Starts two in-process ``tf.train.Server`` instances (tasks 0 and 1 of
    job ``train``), launches ``run_push`` in a daemon thread, waits 20
    seconds so the producer has had time to enqueue, and then calls
    ``run_pop`` in the calling thread. ``run_pop`` loops forever, so this
    function does not return.
    """
    cluster_spec = tf.train.ClusterSpec(cluster_spec_config)
    # Both servers are bound to local names so they stay referenced for as
    # long as this function is running.
    server0 = tf.train.Server(cluster_spec, job_name='train', task_index=0)
    server1 = tf.train.Server(cluster_spec, job_name='train', task_index=1)

    # The queue itself lives on task 0 (see queue_loc). The producer pushes
    # through a session on server1 (localhost:23333); the consumer pops
    # through a session on server0 (localhost:22222).
    t = threading.Thread(target=run_push)
    # Daemon thread: it must not keep the process alive on its own.
    # The attribute form replaces the deprecated Thread.setDaemon().
    t.daemon = True
    t.start()

    # Give the producer time to enqueue before the consumer starts.
    time.sleep(20)
    run_pop()
def main():
    """Script entry point: run the distributed dequeue reproduction."""
    test_dequeue()


# Launch the reproduction only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
'''
Log:
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcublas.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcudnn.so.5 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcufft.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcuda.so.1 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcurand.so.8.0 locally
W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE3 instructions, but these are available on your machine and could speed up CPU computations.
W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.1 instructions, but these are available on your machine and could speed up CPU computations.
W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX2 instructions, but these are available on your machine and could speed up CPU computations.
W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use FMA instructions, but these are available on your machine and could speed up CPU computations.
E tensorflow/stream_executor/cuda/cuda_driver.cc:509] failed call to cuInit: CUDA_ERROR_NO_DEVICE
I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:158] retrieving CUDA diagnostic information for host: ficusmexico
I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:165] hostname: ficusmexico
I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:189] libcuda reported version is: 367.57.0
I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:363] driver version file contents: """NVRM version: NVIDIA UNIX x86_64 Kernel Module 367.57 Mon Oct 3 20:37:01 PDT 2016
GCC version: gcc version 4.9.4 (Ubuntu 4.9.4-2ubuntu1~14.04.1)
"""
I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:193] kernel reported version is: 367.57.0
I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:300] kernel version seems to match DSO: 367.57.0
I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:200] Initialize GrpcChannelCache for job train -> {0 -> localhost:22222, 1 -> localhost:23333}
I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:221] Started server with target: grpc://localhost:22222
I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:200] Initialize GrpcChannelCache for job train -> {0 -> localhost:22222, 1 -> localhost:23333}
I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:221] Started server with target: grpc://localhost:23333
I tensorflow/core/distributed_runtime/master_session.cc:1012] Start master session 7796303ae7a69d4d with config:
pushing
pushing
pushing
running pop_op
I tensorflow/core/distributed_runtime/master_session.cc:1012] Start master session ec298cfbc9003ece with config:
I tensorflow/core/kernels/logging_ops.cc:79] data:[[[[0 0 0]]]...]
I tensorflow/core/kernels/logging_ops.cc:79] data:[[[[0 0 0]]]...]
I tensorflow/core/kernels/logging_ops.cc:79] data:[[[[0 0 0]]]...]
I tensorflow/core/kernels/logging_ops.cc:79] data:[[[[0 0 0]]]...]
I tensorflow/core/kernels/logging_ops.cc:79] data:[[[[0 0 0]]]...]
running pop_op
I tensorflow/core/kernels/logging_ops.cc:79] data:[[[[1 1 1]]]...]
I tensorflow/core/kernels/logging_ops.cc:79] data:[[[[1 1 1]]]...]
I tensorflow/core/kernels/logging_ops.cc:79] data:[[[[1 1 1]]]...]
I tensorflow/core/kernels/logging_ops.cc:79] data:[[[[1 1 1]]]...]
I tensorflow/core/kernels/logging_ops.cc:79] data:[[[[1 1 1]]]...]
running pop_op
I tensorflow/core/kernels/logging_ops.cc:79] data:[[[[2 2 2]]]...]
I tensorflow/core/kernels/logging_ops.cc:79] data:[[[[2 2 2]]]...]
I tensorflow/core/kernels/logging_ops.cc:79] data:[[[[2 2 2]]]...]
I tensorflow/core/kernels/logging_ops.cc:79] data:[[[[2 2 2]]]...]
I tensorflow/core/kernels/logging_ops.cc:79] data:[[[[2 2 2]]]...]
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment