-
-
Save byronyi/ca1b55e5a5423d5b3abb9efc6fd34b80 to your computer and use it in GitHub Desktop.
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Starting job # 0 in a separate thread.\n" | |
] | |
} | |
], | |
"source": [ | |
"%%python --bg\n", | |
"\n", | |
"import os\n", | |
"import json\n", | |
"\n", | |
"import tensorflow as tf\n", | |
"\n", | |
"# This process is task 0 of a two-worker cluster hosted on localhost.\n", | |
"cluster_spec = {'worker': ['localhost:5000', 'localhost:5001']}\n", | |
"task_spec = {'type': 'worker', 'index': 0}\n", | |
"os.environ['TF_CONFIG'] = json.dumps({'cluster': cluster_spec, 'task': task_spec})\n", | |
"\n", | |
"# Start the standard TensorFlow server and join it (presumably this blocks\n", | |
"# for the lifetime of the background process; the server is expected to\n", | |
"# read the TF_CONFIG set above).\n", | |
"server = tf.contrib.distribute.run_standard_tensorflow_server()\n", | |
"server.join()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Starting job # 2 in a separate thread.\n" | |
] | |
} | |
], | |
"source": [ | |
"%%python --bg\n", | |
"\n", | |
"import os\n", | |
"import json\n", | |
"\n", | |
"import tensorflow as tf\n", | |
"\n", | |
"# This process is task 1 of a two-worker cluster hosted on localhost.\n", | |
"cluster_spec = {'worker': ['localhost:5000', 'localhost:5001']}\n", | |
"task_spec = {'type': 'worker', 'index': 1}\n", | |
"os.environ['TF_CONFIG'] = json.dumps({'cluster': cluster_spec, 'task': task_spec})\n", | |
"\n", | |
"# Start the standard TensorFlow server and join it (presumably this blocks\n", | |
"# for the lifetime of the background process; the server is expected to\n", | |
"# read the TF_CONFIG set above).\n", | |
"server = tf.contrib.distribute.run_standard_tensorflow_server()\n", | |
"server.join()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"WARNING:tensorflow:Not all devices in `tf.distribute.Strategy` are visible to TensorFlow.\n", | |
"INFO:tensorflow:CollectiveAllReduceStrategy with local_devices = ('/device:CPU:0',)\n", | |
"INFO:tensorflow:Initializing RunConfig with distribution strategies.\n", | |
"INFO:tensorflow:RunConfig initialized for Distribute Coordinator with STANDALONE_CLIENT mode\n", | |
"WARNING:tensorflow:Using temporary folder as model directory: /var/folders/gn/sjntndrs1fs22kfr302697mr0000gn/T/tmpMkv3Vq\n", | |
"INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_global_id_in_cluster': 0, '_is_chief': True, '_cluster_spec': {'worker': ['localhost:5000', 'localhost:5001']}, '_model_dir': '/var/folders/gn/sjntndrs1fs22kfr302697mr0000gn/T/tmpMkv3Vq', '_protocol': None, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_session_config': allow_soft_placement: true\n", | |
"graph_options {\n", | |
" rewrite_options {\n", | |
" meta_optimizer_iterations: ONE\n", | |
" }\n", | |
"}\n", | |
", '_tf_random_seed': None, '_save_summary_steps': 100, '_device_fn': None, '_experimental_distribute': DistributeConfig(train_distribute=<tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x12293ebd0>, eval_distribute=<tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x12293ebd0>, remote_cluster={'worker': ['localhost:5000', 'localhost:5001']}), '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_evaluation_master': '', '_eval_distribute': <tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x12293ebd0>, '_train_distribute': <tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x12293ebd0>, '_master': '', '_distribute_coordinator_mode': 'standalone_client'}\n", | |
"INFO:tensorflow:Running `train_and_evaluate` with Distribute Coordinator.\n", | |
"INFO:tensorflow:Running Distribute Coordinator with mode = 'standalone_client', cluster_spec = {'worker': ['localhost:5000', 'localhost:5001']}, task_type = None, task_id = None, environment = None, rpc_layer = 'grpc'\n", | |
"INFO:tensorflow:Device is available but not used by distribute strategy: /device:CPU:0\n", | |
"INFO:tensorflow:Device is available but not used by distribute strategy: /device:CPU:0\n", | |
"WARNING:tensorflow:Not all devices in `tf.distribute.Strategy` are visible to TensorFlow.\n", | |
"WARNING:tensorflow:Not all devices in `tf.distribute.Strategy` are visible to TensorFlow.\n", | |
"INFO:tensorflow:Multi-worker CollectiveAllReduceStrategy with cluster_spec = {'worker': ['localhost:5000', 'localhost:5001']}, task_type = 'worker', task_id = 0, num_workers = 2, local_devices = ('/job:worker/task:0',)\n", | |
"INFO:tensorflow:Multi-worker CollectiveAllReduceStrategy with cluster_spec = {'worker': ['localhost:5000', 'localhost:5001']}, task_type = 'worker', task_id = 1, num_workers = 2, local_devices = ('/job:worker/task:1',)\n", | |
"INFO:tensorflow:Updated config: {'_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n", | |
"graph_options {\n", | |
" rewrite_options {\n", | |
" meta_optimizer_iterations: ONE\n", | |
" }\n", | |
"}\n", | |
", '_keep_checkpoint_max': 5, '_task_type': 'worker', '_train_distribute': <tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x123d21590>, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x123d2c710>, '_model_dir': '/var/folders/gn/sjntndrs1fs22kfr302697mr0000gn/T/tmpMkv3Vq', '_protocol': None, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_save_summary_steps': 100, '_device_fn': None, '_experimental_distribute': DistributeConfig(train_distribute=<tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x123d2c2d0>, eval_distribute=<tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x123d2c2d0>, remote_cluster={'worker': ['localhost:5000', 'localhost:5001']}), '_num_worker_replicas': 2, '_task_id': 0, '_log_step_count_steps': 100, '_evaluation_master': 'grpc://localhost:5000', '_eval_distribute': <tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x123d2c2d0>, '_global_id_in_cluster': 0, '_master': 'grpc://localhost:5000', '_distribute_coordinator_mode': 'standalone_client'}\n", | |
"INFO:tensorflow:Updated config: {'_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n", | |
"graph_options {\n", | |
" rewrite_options {\n", | |
" meta_optimizer_iterations: ONE\n", | |
" }\n", | |
"}\n", | |
", '_keep_checkpoint_max': 5, '_task_type': 'worker', '_train_distribute': <tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x123d21c50>, '_is_chief': False, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x123d2cb90>, '_model_dir': '/var/folders/gn/sjntndrs1fs22kfr302697mr0000gn/T/tmpMkv3Vq', '_protocol': None, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_save_summary_steps': 100, '_device_fn': None, '_experimental_distribute': DistributeConfig(train_distribute=<tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x123d2cb50>, eval_distribute=<tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x123d2cb50>, remote_cluster={'worker': ['localhost:5000', 'localhost:5001']}), '_num_worker_replicas': 2, '_task_id': 1, '_log_step_count_steps': 100, '_evaluation_master': 'grpc://localhost:5001', '_eval_distribute': <tensorflow.contrib.distribute.python.collective_all_reduce_strategy.CollectiveAllReduceStrategy object at 0x123d2cb50>, '_global_id_in_cluster': 1, '_master': 'grpc://localhost:5001', '_distribute_coordinator_mode': 'standalone_client'}\n", | |
"WARNING:tensorflow:From /usr/local/lib/python2.7/site-packages/tensorflow/python/data/ops/dataset_ops.py:1763: make_initializable_iterator (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.\n", | |
"Instructions for updating:\n", | |
"Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_initializable_iterator(dataset)`.\n", | |
"WARNING:tensorflow:From /usr/local/lib/python2.7/site-packages/tensorflow/python/data/ops/dataset_ops.py:1458: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n", | |
"Instructions for updating:\n", | |
"Colocations handled automatically by placer.\n", | |
"INFO:tensorflow:Calling model_fn.\n", | |
"WARNING:tensorflow:From /usr/local/lib/python2.7/site-packages/tensorflow/python/ops/init_ops.py:1253: calling __init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", | |
"Instructions for updating:\n", | |
"Call initializer instance with the dtype argument instead of passing it to the constructor\n", | |
"INFO:tensorflow:Calling model_fn.\n", | |
"INFO:tensorflow:Collective All-reduce invoked with batches size = 2, num_workers = 2\n", | |
"INFO:tensorflow:Collective All-reduce invoked with batches size = 2, num_workers = 2\n", | |
"INFO:tensorflow:Done calling model_fn.\n", | |
"INFO:tensorflow:Done calling model_fn.\n", | |
"INFO:tensorflow:Create CheckpointSaverHook.\n", | |
"INFO:tensorflow:Creating chief session creator with config: device_filters: \"/job:worker/task:0\"\n", | |
"allow_soft_placement: true\n", | |
"graph_options {\n", | |
" rewrite_options {\n", | |
" meta_optimizer_iterations: ONE\n", | |
" scoped_allocator_optimization: ON\n", | |
" scoped_allocator_opts {\n", | |
" enable_op: \"CollectiveReduce\"\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
"experimental {\n", | |
" collective_group_leader: \"/job:worker/replica:0/task:0\"\n", | |
"}\n", | |
"\n", | |
"INFO:tensorflow:Graph was finalized.\n", | |
"INFO:tensorflow:Create CheckpointSaverHook.\n", | |
"INFO:tensorflow:Creating chief session creator with config: device_filters: \"/job:worker/task:1\"\n", | |
"allow_soft_placement: true\n", | |
"graph_options {\n", | |
" rewrite_options {\n", | |
" meta_optimizer_iterations: ONE\n", | |
" scoped_allocator_optimization: ON\n", | |
" scoped_allocator_opts {\n", | |
" enable_op: \"CollectiveReduce\"\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
"experimental {\n", | |
" collective_group_leader: \"/job:worker/replica:0/task:0\"\n", | |
"}\n", | |
"\n", | |
"INFO:tensorflow:Graph was finalized.\n", | |
"INFO:tensorflow:Running local_init_op.\n", | |
"INFO:tensorflow:Running local_init_op.\n", | |
"INFO:tensorflow:Done running local_init_op.\n", | |
"INFO:tensorflow:Done running local_init_op.\n", | |
"INFO:tensorflow:Saving checkpoints for 0 into /var/folders/gn/sjntndrs1fs22kfr302697mr0000gn/T/tmpMkv3Vq/model.ckpt.\n", | |
"INFO:tensorflow:loss = 1.3263894e-05, step = 0\n", | |
"INFO:tensorflow:loss = 1.3263894e-05, step = 0\n", | |
"WARNING:tensorflow:It seems that global step (tf.train.get_global_step) has not been increased. Current value (could be stable): 0 vs previous value: 0. You could increase the global step by passing tf.train.get_global_step() to Optimizer.apply_gradients or Optimizer.minimize.\n", | |
"INFO:tensorflow:Loss for final step: 0.0.\n", | |
"INFO:tensorflow:Loss for final step: 0.0.\n" | |
] | |
} | |
], | |
"source": [ | |
"import tensorflow as tf\n", | |
"from tensorflow.data import Dataset as tfd\n", | |
"from tensorflow.contrib.distribute import CollectiveAllReduceStrategy\n", | |
"from tensorflow.contrib.distribute import DistributeConfig\n", | |
"\n", | |
"\n", | |
"def input_fn():\n", | |
"    \"\"\"Return a dataset of 32 identical (feature, label) pairs.\n", | |
"\n", | |
"    Every element pairs the feature `[[1.]]` with the label `1`; the two\n", | |
"    component datasets are built separately and zipped together.\n", | |
"    \"\"\"\n", | |
"    feature_ds = tfd.from_tensors([[1.]]).repeat(32)\n", | |
"    label_ds = tfd.from_tensors(1).repeat(32)\n", | |
"    paired = tfd.zip((feature_ds, label_ds))\n", | |
"    return paired\n", | |
"\n", | |
"def model_fn(features, labels, mode):\n", | |
"    \"\"\"Build the EstimatorSpec for a single Dense(1) regression model.\n", | |
"\n", | |
"    Args:\n", | |
"        features: input tensor fed to the dense layer.\n", | |
"        labels: target tensor; not used in PREDICT mode.\n", | |
"        mode: one of the `tf.estimator.ModeKeys` values.\n", | |
"\n", | |
"    Returns:\n", | |
"        A `tf.estimator.EstimatorSpec` carrying predictions (PREDICT),\n", | |
"        the loss (EVAL), or the loss plus a train op (TRAIN).\n", | |
"    \"\"\"\n", | |
"    layer = tf.layers.Dense(1)\n", | |
"    logits = layer(features)\n", | |
"\n", | |
"    if mode == tf.estimator.ModeKeys.PREDICT:\n", | |
"        predictions = {\"logits\": logits}\n", | |
"        return tf.estimator.EstimatorSpec(mode, predictions=predictions)\n", | |
"\n", | |
"    # The logits are reshaped to a scalar before being compared to the label.\n", | |
"    loss = tf.losses.mean_squared_error(\n", | |
"        labels=labels, predictions=tf.reshape(logits, []))\n", | |
"\n", | |
"    if mode == tf.estimator.ModeKeys.EVAL:\n", | |
"        return tf.estimator.EstimatorSpec(mode, loss=loss)\n", | |
"\n", | |
"    if mode == tf.estimator.ModeKeys.TRAIN:\n", | |
"        # Hand the global step to minimize() so every update increments it.\n", | |
"        # Without it the step stays at 0 and TensorFlow warns that the\n", | |
"        # global step has not been increased.\n", | |
"        train_op = tf.train.GradientDescentOptimizer(0.2).minimize(\n", | |
"            loss, global_step=tf.train.get_global_step())\n", | |
"        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)\n", | |
"\n", | |
"# Training and evaluation both read from the same input function.\n", | |
"train_spec = tf.estimator.TrainSpec(input_fn=input_fn)\n", | |
"eval_spec = tf.estimator.EvalSpec(input_fn=input_fn)\n", | |
"\n", | |
"# One CPU-only strategy instance is shared by training and evaluation.\n", | |
"distribution = CollectiveAllReduceStrategy(num_gpus_per_worker=0)\n", | |
"\n", | |
"# The remote cluster lists the two worker addresses on localhost.\n", | |
"distribute_config = DistributeConfig(\n", | |
"    train_distribute=distribution,\n", | |
"    eval_distribute=distribution,\n", | |
"    remote_cluster={\n", | |
"        'worker': ['localhost:5000', 'localhost:5001'],\n", | |
"    },\n", | |
")\n", | |
"config = tf.estimator.RunConfig(experimental_distribute=distribute_config)\n", | |
"\n", | |
"estimator = tf.estimator.Estimator(model_fn=model_fn, config=config)\n", | |
"tf.estimator.train_and_evaluate(\n", | |
"    estimator=estimator, train_spec=train_spec, eval_spec=eval_spec)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"All background processes were killed.\n" | |
] | |
} | |
], | |
"source": [ | |
"%killbgscripts" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"---" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.15" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
I also tried these three scripts on my development machine (a rather clean environment).
The #3 script still fails with an error. Does this feature need any prerequisite packages?
Log from #3:
WARNING:tensorflow:Not all devices in `tf.distribute.Strategy` are visible to TensorFlow.
WARNING:tensorflow:Not all devices in `tf.distribute.Strategy` are visible to TensorFlow.
WARNING:tensorflow:From /home/sunbear/miniconda2/lib/python2.7/site-packages/tensorflow/python/data/ops/dataset_ops.py:1763: make_initializable_iterator (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_initializable_iterator(dataset)`.
WARNING:tensorflow:From /home/sunbear/miniconda2/lib/python2.7/site-packages/tensorflow/python/data/ops/dataset_ops.py:1458: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
WARNING:tensorflow:From /home/sunbear/miniconda2/lib/python2.7/site-packages/tensorflow/python/ops/init_ops.py:1253: calling __init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
E0117 19:12:52.497343637 4517 http_proxy.cc:62] 'https' scheme not supported in proxy URI
E0117 19:12:53.510391454 4515 http_proxy.cc:62] 'https' scheme not supported in proxy URI
WARNING:tensorflow:It seems that global step (tf.train.get_global_step) has not been increased. Current value (could be stable): 0 vs previous value: 0. You could increase the global step by passing tf.train.get_global_step() to Optimizer.apply_gradients or Optimizer.minimize.
Log from #1 or #2:
2019-01-17 19:12:29.215574: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2019-01-17 19:12:29.237936: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 3600000000 Hz
2019-01-17 19:12:29.238280: I tensorflow/compiler/xla/service/service.cc:162] XLA service 0x55a4bafcda60 executing computations on platform Host. Devices:
2019-01-17 19:12:29.238303: I tensorflow/compiler/xla/service/service.cc:169] StreamExecutor device (0): ,
2019-01-17 19:12:29.239407: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:250] Initialize GrpcChannelCache for job worker -> {0 -> localhost:6000, 1 -> localhost:6001}
2019-01-17 19:12:29.240302: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:387] Started server with target: grpc://localhost:6000
2019-01-17 19:12:29.240321: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:391] Server already started (target: grpc://localhost:6000)
2019-01-17 19:12:29.240340: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:391] Server already started (target: grpc://localhost:6000)
2019-01-17 19:12:53.513087: I tensorflow/core/distributed_runtime/master_session.cc:1192] Start master session 5a3ad4f61cec82ac with config: device_filters: "/job:worker/task:0" device_filters: "/job:worker/task:0" allow_soft_placement: true graph_options { rewrite_options { meta_optimizer_iterations: ONE scoped_allocator_optimization: ON scoped_allocator_opts { enable_op: "CollectiveReduce" enable_op: "CollectiveReduce" } } } experimental { collective_group_leader: "/job:worker/replica:0/task:0" }
E0117 19:12:53.525456321 4578 http_proxy.cc:62] 'https' scheme not supported in proxy URI
E0117 19:12:53.525458685 4577 http_proxy.cc:62] 'https' scheme not supported in proxy URI
E0117 19:12:53.525475494 4579 http_proxy.cc:62] 'https' scheme not supported in proxy URI
2019-01-17 19:12:53.701830: W tensorflow/core/common_runtime/base_collective_executor.cc:203] BaseCollectiveExecutor::StartAbort Out of range: End of sequence
[[{{node IteratorGetNext}}]]
byronyi,
I tried your scripts and followed exactly the same steps.
The worker script (#3) always gets stuck with the following log (any ideas?):
(tf-estimator-nightly 1.13.0.dev2019010910 tf-nightly 1.13.0.dev20190116)
Log:
[zhouhaiy@mlt-skx052 temp]$ python worker.py
2019-01-17 15:28:47.684458: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
2019-01-17 15:28:47.699155: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2500000000 Hz
2019-01-17 15:28:47.709979: I tensorflow/compiler/xla/service/service.cc:162] XLA service 0x7e17fa0 executing computations on platfo rm Host. Devices:
2019-01-17 15:28:47.710022: I tensorflow/compiler/xla/service/service.cc:169] StreamExecutor device (0): ,
WARNING:tensorflow:Not all devices in
tf.distribute.Strategy
are visible to TensorFlow.WARNING:tensorflow:Using temporary folder as model directory: /tmp/tmpQBujoJ
WARNING:tensorflow:Not all devices in
tf.distribute.Strategy
are visible to TensorFlow.WARNING:tensorflow:Not all devices in
tf.distribute.Strategy
are visible to TensorFlow.WARNING:tensorflow:From /home/zhouhaiy/.local/lib/python2.7/site-packages/tensorflow/python/data/ops/dataset_ops.py:1763: make_initi alizable_iterator (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use
for ... in dataset:
to iterate over a dataset. If usingtf.estimator
, return theDataset
object directly from your input f unction. As a last resort, you can usetf.compat.v1.data.make_initializable_iterator(dataset)
.WARNING:tensorflow:From /home/zhouhaiy/.local/lib/python2.7/site-packages/tensorflow/python/data/ops/dataset_ops.py:1458: colocate_w ith (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
WARNING:tensorflow:From /home/zhouhaiy/.local/lib/python2.7/site-packages/tensorflow/python/ops/init_ops.py:1253: calling init ( from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor