@byronyi
Created January 17, 2019 04:40
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting job # 0 in a separate thread.\n"
]
}
],
"source": [
"%%python --bg\n",
"\n",
"import os\n",
"import json\n",
"\n",
"import tensorflow as tf\n",
"\n",
"os.environ['TF_CONFIG'] = json.dumps({\n",
" 'cluster': {\n",
" 'worker': ['localhost:5000', 'localhost:5001']\n",
" },\n",
" 'task': {'type': 'worker', 'index': 0}\n",
"})\n",
"\n",
"tf.contrib.distribute.run_standard_tensorflow_server().join()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting job # 2 in a separate thread.\n"
]
}
],
"source": [
"%%python --bg\n",
"\n",
"import os\n",
"import json\n",
"\n",
"import tensorflow as tf\n",
"\n",
"os.environ['TF_CONFIG'] = json.dumps({\n",
" 'cluster': {\n",
" 'worker': ['localhost:5000', 'localhost:5001']\n",
" },\n",
" 'task': {'type': 'worker', 'index': 1}\n",
"})\n",
"\n",
"tf.contrib.distribute.run_standard_tensorflow_server().join()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:Not all devices in `tf.distribute.Strategy` are visible to TensorFlow.\n",
"INFO:tensorflow:CollectiveAllReduceStrategy with local_devices = ('/device:CPU:0',)\n",
"INFO:tensorflow:Initializing RunConfig with distribution strategies.\n",
"INFO:tensorflow:RunConfig initialized for Distribute Coordinator with STANDALONE_CLIENT mode\n",
"WARNING:tensorflow:Using temporary folder as model directory: /var/folders/gn/sjntndrs1fs22kfr302697mr0000gn/T/tmpMkv3Vq\n",
"INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_global_id_in_cluster': 0, '_is_chief': True, '_cluster_spec': {'worker': ['localhost:5000', 'localhost:5001']}, '_model_dir': '/var/folders/gn/sjntndrs1fs22kfr302697mr0000gn/T/tmpMkv3Vq', '_protocol': None, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_session_config': allow_soft_placement: true\n",
"graph_options {\n",
" rewrite_options {\n",
" meta_optimizer_iterations: ONE\n",
" }\n",
"}\n",
", '_tf_random_seed': None, '_save_summary_steps': 100, '_device_fn': None, '_experimental_distribute': DistributeConfig(train_distribute=<tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x12293ebd0>, eval_distribute=<tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x12293ebd0>, remote_cluster={'worker': ['localhost:5000', 'localhost:5001']}), '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_evaluation_master': '', '_eval_distribute': <tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x12293ebd0>, '_train_distribute': <tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x12293ebd0>, '_master': '', '_distribute_coordinator_mode': 'standalone_client'}\n",
"INFO:tensorflow:Running `train_and_evaluate` with Distribute Coordinator.\n",
"INFO:tensorflow:Running Distribute Coordinator with mode = 'standalone_client', cluster_spec = {'worker': ['localhost:5000', 'localhost:5001']}, task_type = None, task_id = None, environment = None, rpc_layer = 'grpc'\n",
"INFO:tensorflow:Device is available but not used by distribute strategy: /device:CPU:0\n",
"INFO:tensorflow:Device is available but not used by distribute strategy: /device:CPU:0\n",
"WARNING:tensorflow:Not all devices in `tf.distribute.Strategy` are visible to TensorFlow.\n",
"WARNING:tensorflow:Not all devices in `tf.distribute.Strategy` are visible to TensorFlow.\n",
"INFO:tensorflow:Multi-worker CollectiveAllReduceStrategy with cluster_spec = {'worker': ['localhost:5000', 'localhost:5001']}, task_type = 'worker', task_id = 0, num_workers = 2, local_devices = ('/job:worker/task:0',)\n",
"INFO:tensorflow:Multi-worker CollectiveAllReduceStrategy with cluster_spec = {'worker': ['localhost:5000', 'localhost:5001']}, task_type = 'worker', task_id = 1, num_workers = 2, local_devices = ('/job:worker/task:1',)\n",
"INFO:tensorflow:Updated config: {'_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
"graph_options {\n",
" rewrite_options {\n",
" meta_optimizer_iterations: ONE\n",
" }\n",
"}\n",
", '_keep_checkpoint_max': 5, '_task_type': 'worker', '_train_distribute': <tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x123d21590>, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x123d2c710>, '_model_dir': '/var/folders/gn/sjntndrs1fs22kfr302697mr0000gn/T/tmpMkv3Vq', '_protocol': None, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_save_summary_steps': 100, '_device_fn': None, '_experimental_distribute': DistributeConfig(train_distribute=<tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x123d2c2d0>, eval_distribute=<tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x123d2c2d0>, remote_cluster={'worker': ['localhost:5000', 'localhost:5001']}), '_num_worker_replicas': 2, '_task_id': 0, '_log_step_count_steps': 100, '_evaluation_master': 'grpc://localhost:5000', '_eval_distribute': <tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x123d2c2d0>, '_global_id_in_cluster': 0, '_master': 'grpc://localhost:5000', '_distribute_coordinator_mode': 'standalone_client'}\n",
"INFO:tensorflow:Updated config: {'_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
"graph_options {\n",
" rewrite_options {\n",
" meta_optimizer_iterations: ONE\n",
" }\n",
"}\n",
", '_keep_checkpoint_max': 5, '_task_type': 'worker', '_train_distribute': <tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x123d21c50>, '_is_chief': False, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x123d2cb90>, '_model_dir': '/var/folders/gn/sjntndrs1fs22kfr302697mr0000gn/T/tmpMkv3Vq', '_protocol': None, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_save_summary_steps': 100, '_device_fn': None, '_experimental_distribute': DistributeConfig(train_distribute=<tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x123d2cb50>, eval_distribute=<tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x123d2cb50>, remote_cluster={'worker': ['localhost:5000', 'localhost:5001']}), '_num_worker_replicas': 2, '_task_id': 1, '_log_step_count_steps': 100, '_evaluation_master': 'grpc://localhost:5001', '_eval_distribute': <tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x123d2cb50>, '_global_id_in_cluster': 1, '_master': 'grpc://localhost:5001', '_distribute_coordinator_mode': 'standalone_client'}\n",
"WARNING:tensorflow:From /usr/local/lib/python2.7/site-packages/tensorflow/python/data/ops/dataset_ops.py:1763: make_initializable_iterator (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_initializable_iterator(dataset)`.\n",
"WARNING:tensorflow:From /usr/local/lib/python2.7/site-packages/tensorflow/python/data/ops/dataset_ops.py:1458: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"Colocations handled automatically by placer.\n",
"INFO:tensorflow:Calling model_fn.\n",
"WARNING:tensorflow:From /usr/local/lib/python2.7/site-packages/tensorflow/python/ops/init_ops.py:1253: calling __init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"Call initializer instance with the dtype argument instead of passing it to the constructor\n",
"INFO:tensorflow:Calling model_fn.\n",
"INFO:tensorflow:Collective All-reduce invoked with batches size = 2, num_workers = 2\n",
"INFO:tensorflow:Collective All-reduce invoked with batches size = 2, num_workers = 2\n",
"INFO:tensorflow:Done calling model_fn.\n",
"INFO:tensorflow:Done calling model_fn.\n",
"INFO:tensorflow:Create CheckpointSaverHook.\n",
"INFO:tensorflow:Creating chief session creator with config: device_filters: \"/job:worker/task:0\"\n",
"allow_soft_placement: true\n",
"graph_options {\n",
" rewrite_options {\n",
" meta_optimizer_iterations: ONE\n",
" scoped_allocator_optimization: ON\n",
" scoped_allocator_opts {\n",
" enable_op: \"CollectiveReduce\"\n",
" }\n",
" }\n",
"}\n",
"experimental {\n",
" collective_group_leader: \"/job:worker/replica:0/task:0\"\n",
"}\n",
"\n",
"INFO:tensorflow:Graph was finalized.\n",
"INFO:tensorflow:Create CheckpointSaverHook.\n",
"INFO:tensorflow:Creating chief session creator with config: device_filters: \"/job:worker/task:1\"\n",
"allow_soft_placement: true\n",
"graph_options {\n",
" rewrite_options {\n",
" meta_optimizer_iterations: ONE\n",
" scoped_allocator_optimization: ON\n",
" scoped_allocator_opts {\n",
" enable_op: \"CollectiveReduce\"\n",
" }\n",
" }\n",
"}\n",
"experimental {\n",
" collective_group_leader: \"/job:worker/replica:0/task:0\"\n",
"}\n",
"\n",
"INFO:tensorflow:Graph was finalized.\n",
"INFO:tensorflow:Running local_init_op.\n",
"INFO:tensorflow:Running local_init_op.\n",
"INFO:tensorflow:Done running local_init_op.\n",
"INFO:tensorflow:Done running local_init_op.\n",
"INFO:tensorflow:Saving checkpoints for 0 into /var/folders/gn/sjntndrs1fs22kfr302697mr0000gn/T/tmpMkv3Vq/model.ckpt.\n",
"INFO:tensorflow:loss = 1.3263894e-05, step = 0\n",
"INFO:tensorflow:loss = 1.3263894e-05, step = 0\n",
"WARNING:tensorflow:It seems that global step (tf.train.get_global_step) has not been increased. Current value (could be stable): 0 vs previous value: 0. You could increase the global step by passing tf.train.get_global_step() to Optimizer.apply_gradients or Optimizer.minimize.\n",
"INFO:tensorflow:Loss for final step: 0.0.\n",
"INFO:tensorflow:Loss for final step: 0.0.\n"
]
}
],
"source": [
"import tensorflow as tf\n",
"from tensorflow.data import Dataset as tfd\n",
"from tensorflow.contrib.distribute import CollectiveAllReduceStrategy\n",
"from tensorflow.contrib.distribute import DistributeConfig\n",
"\n",
"\n",
"def input_fn():\n",
" features = tfd.from_tensors([[1.]]).repeat(32)\n",
" labels = tfd.from_tensors(1).repeat(32)\n",
" return tfd.zip((features, labels))\n",
"\n",
"def model_fn(features, labels, mode):\n",
" layer = tf.layers.Dense(1)\n",
" logits = layer(features)\n",
"\n",
" if mode == tf.estimator.ModeKeys.PREDICT:\n",
" predictions = {\"logits\": logits}\n",
" return tf.estimator.EstimatorSpec(mode, predictions=predictions)\n",
" \n",
" loss = tf.losses.mean_squared_error(\n",
" labels=labels, predictions=tf.reshape(logits, []))\n",
"\n",
" if mode == tf.estimator.ModeKeys.EVAL:\n",
" return tf.estimator.EstimatorSpec(mode, loss=loss)\n",
"\n",
" if mode == tf.estimator.ModeKeys.TRAIN:\n",
" train_op = tf.train.GradientDescentOptimizer(0.2).minimize(loss)\n",
" return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)\n",
"\n",
"train_spec = tf.estimator.TrainSpec(input_fn=input_fn)\n",
"eval_spec = tf.estimator.EvalSpec(input_fn=input_fn)\n",
"\n",
"distribution = CollectiveAllReduceStrategy(num_gpus_per_worker=0)\n",
"\n",
"config = tf.estimator.RunConfig(\n",
" experimental_distribute=DistributeConfig(\n",
" train_distribute=distribution,\n",
" eval_distribute=distribution,\n",
" remote_cluster={\n",
" 'worker': ['localhost:5000', 'localhost:5001'],\n",
" },\n",
" )\n",
")\n",
"\n",
"estimator = tf.estimator.Estimator(model_fn=model_fn, config=config)\n",
"tf.estimator.train_and_evaluate(estimator=estimator,\n",
" train_spec=train_spec,\n",
" eval_spec=eval_spec)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"All background processes were killed.\n"
]
}
],
"source": [
"%killbgscripts"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
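For readers who, like the commenters below, run the cells as standalone scripts rather than in a notebook, here is a minimal sketch of a worker launcher (one process per worker). It only restates the first two cells; the script name worker.py and the command-line worker index are illustrative additions, not part of the original gist, and it assumes a TensorFlow 1.12/1.13 build where tf.contrib.distribute is still available.

# worker.py -- start one in-process TensorFlow server; run once per worker:
#   python worker.py 0
#   python worker.py 1
import json
import os
import sys

import tensorflow as tf

task_index = int(sys.argv[1])  # 0 or 1, chosen on the command line

os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {
        'worker': ['localhost:5000', 'localhost:5001']
    },
    'task': {'type': 'worker', 'index': task_index}
})

# Blocks forever; kill the process to stop the worker.
tf.contrib.distribute.run_standard_tensorflow_server().join()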
@threeleafzerg
byronyi,
I tried your scripts and followed the same steps exactly.
The worker script (#3) always gets stuck with the following log (any ideas?).
(tf-estimator-nightly 1.13.0.dev2019010910, tf-nightly 1.13.0.dev20190116)
Log:
[zhouhaiy@mlt-skx052 temp]$ python worker.py
2019-01-17 15:28:47.684458: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
2019-01-17 15:28:47.699155: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2500000000 Hz
2019-01-17 15:28:47.709979: I tensorflow/compiler/xla/service/service.cc:162] XLA service 0x7e17fa0 executing computations on platform Host. Devices:
2019-01-17 15:28:47.710022: I tensorflow/compiler/xla/service/service.cc:169] StreamExecutor device (0): ,
WARNING:tensorflow:Not all devices in tf.distribute.Strategy are visible to TensorFlow.
WARNING:tensorflow:Using temporary folder as model directory: /tmp/tmpQBujoJ
WARNING:tensorflow:Not all devices in tf.distribute.Strategy are visible to TensorFlow.
WARNING:tensorflow:Not all devices in tf.distribute.Strategy are visible to TensorFlow.
WARNING:tensorflow:From /home/zhouhaiy/.local/lib/python2.7/site-packages/tensorflow/python/data/ops/dataset_ops.py:1763: make_initializable_iterator (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_initializable_iterator(dataset)`.
WARNING:tensorflow:From /home/zhouhaiy/.local/lib/python2.7/site-packages/tensorflow/python/data/ops/dataset_ops.py:1458: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
WARNING:tensorflow:From /home/zhouhaiy/.local/lib/python2.7/site-packages/tensorflow/python/ops/init_ops.py:1253: calling __init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor

@threeleafzerg
I also tried these three scripts on my development machine (a rather clean environment).
The #3 script still fails with an error. Does this feature need any prerequisite packages?
Log from #3:
WARNING:tensorflow:Not all devices in tf.distribute.Strategy are visible to TensorFlow.
WARNING:tensorflow:Not all devices in tf.distribute.Strategy are visible to TensorFlow.
WARNING:tensorflow:From /home/sunbear/miniconda2/lib/python2.7/site-packages/tensorflow/python/data/ops/dataset_ops.py:1763: make_initializable_iterator (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_initializable_iterator(dataset)`.
WARNING:tensorflow:From /home/sunbear/miniconda2/lib/python2.7/site-packages/tensorflow/python/data/ops/dataset_ops.py:1458: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
WARNING:tensorflow:From /home/sunbear/miniconda2/lib/python2.7/site-packages/tensorflow/python/ops/init_ops.py:1253: calling __init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
E0117 19:12:52.497343637 4517 http_proxy.cc:62] 'https' scheme not supported in proxy URI
E0117 19:12:53.510391454 4515 http_proxy.cc:62] 'https' scheme not supported in proxy URI
WARNING:tensorflow:It seems that global step (tf.train.get_global_step) has not been increased. Current value (could be stable): 0 vs previous value: 0. You could increase the global step by passing tf.train.get_global_step() to Optimizer.apply_gradients or Optimizer.minimize.

Log from #1 or #2:
2019-01-17 19:12:29.215574: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2019-01-17 19:12:29.237936: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 3600000000 Hz
2019-01-17 19:12:29.238280: I tensorflow/compiler/xla/service/service.cc:162] XLA service 0x55a4bafcda60 executing computations on platform Host. Devices:
2019-01-17 19:12:29.238303: I tensorflow/compiler/xla/service/service.cc:169] StreamExecutor device (0): ,
2019-01-17 19:12:29.239407: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:250] Initialize GrpcChannelCache for job worker -> {0 -> localhost:6000, 1 -> localhost:6001}
2019-01-17 19:12:29.240302: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:387] Started server with target: grpc://localhost:6000
2019-01-17 19:12:29.240321: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:391] Server already started (target: grpc://localhost:6000)
2019-01-17 19:12:29.240340: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:391] Server already started (target: grpc://localhost:6000)
2019-01-17 19:12:53.513087: I tensorflow/core/distributed_runtime/master_session.cc:1192] Start master session 5a3ad4f61cec82ac with config: device_filters: "/job:worker/task:0" device_filters: "/job:worker/task:0" allow_soft_placement: true graph_options { rewrite_options { meta_optimizer_iterations: ONE scoped_allocator_optimization: ON scoped_allocator_opts { enable_op: "CollectiveReduce" enable_op: "CollectiveReduce" } } } experimental { collective_group_leader: "/job:worker/replica:0/task:0" }
E0117 19:12:53.525456321 4578 http_proxy.cc:62] 'https' scheme not supported in proxy URI
E0117 19:12:53.525458685 4577 http_proxy.cc:62] 'https' scheme not supported in proxy URI
E0117 19:12:53.525475494 4579 http_proxy.cc:62] 'https' scheme not supported in proxy URI
2019-01-17 19:12:53.701830: W tensorflow/core/common_runtime/base_collective_executor.cc:203] BaseCollectiveExecutor::StartAbort Out of range: End of sequence
[[{{node IteratorGetNext}}]]
