
@TuranTimur
Created March 11, 2018 19:54
horovod - tensorflow mpi: single-GPU ResNet-101 tf_cnn_benchmarks run (GTX 1070)
root@98c54cfbe5b9:/examples# mpirun -np 1 -H localhost:1 -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -mca pml ob1 -mca btl ^openib python benchmarks-horovod_v2/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --model resnet101 --batch_size 16 --variable_update horovod
/usr/local/lib/python3.5/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
from ._conv import register_converters as _register_converters
TensorFlow: 1.5
Model: resnet101
Dataset: imagenet (synthetic)
Mode: training
SingleSess: False
Batch size: 16 global
16.0 per device
Num batches: 100
Num epochs: 0.00
Devices: ['horovod/gpu:0']
Data format: NCHW
Layout optimizer: False
Optimizer: sgd
Variables: horovod
==========
Generating model
W0311 19:20:55.139298 139969835271936 tf_logging.py:118] From /examples/benchmarks-horovod_v2/scripts/tf_cnn_benchmarks/convnet_builder.py:372: calling reduce_mean (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
W0311 19:21:00.482042 139969835271936 tf_logging.py:118] From /examples/benchmarks-horovod_v2/scripts/tf_cnn_benchmarks/benchmark_cnn.py:1342: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version.
Instructions for updating:
Please switch to tf.train.MonitoredTrainingSession
2018-03-11 19:21:01.715803: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2018-03-11 19:21:01.900144: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:895] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2018-03-11 19:21:01.901436: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1105] Found device 0 with properties:
name: GeForce GTX 1070 major: 6 minor: 1 memoryClockRate(GHz): 1.7085
pciBusID: 0000:00:0a.0
totalMemory: 7.92GiB freeMemory: 7.82GiB
2018-03-11 19:21:01.901475: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1195] Creating TensorFlow device (/device:GPU:0) -> (device: 0, name: GeForce GTX 1070, pci bus id: 0000:00:0a.0, compute capability: 6.1)
Running warm up
98c54cfbe5b9:48:55 [0] misc/ibvwrap.cu:60 WARN Failed to open libibverbs.so[.1]
98c54cfbe5b9:48:55 [0] INFO Using internal Network Socket
98c54cfbe5b9:48:55 [0] INFO Using NCCL Low-latency algorithm for sizes below 16384
98c54cfbe5b9:48:55 [0] INFO NET : Using interface eth0:172.17.0.4<0>
98c54cfbe5b9:48:55 [0] INFO NET/Socket : 1 interfaces found
NCCL version 2.1.4+cuda9.0
98c54cfbe5b9:48:55 [0] INFO Using 256 threads
98c54cfbe5b9:48:55 [0] INFO Min Comp Cap 6
98c54cfbe5b9:48:55 [0] INFO NCCL_SINGLE_RING_THRESHOLD=131072
Done warm up
Step Img/sec total_loss
1 images/sec: 63.6 +/- 0.0 (jitter = 0.0) 10.861
10 images/sec: 63.1 +/- 0.1 (jitter = 0.6) 9.914
20 images/sec: 62.8 +/- 0.2 (jitter = 0.6) 9.680
30 images/sec: 62.9 +/- 0.2 (jitter = 0.6) 9.571
40 images/sec: 62.8 +/- 0.1 (jitter = 0.6) 9.485
50 images/sec: 62.8 +/- 0.1 (jitter = 0.6) 9.475
60 images/sec: 62.9 +/- 0.1 (jitter = 0.5) 10.058
70 images/sec: 62.9 +/- 0.1 (jitter = 0.6) 9.692
80 images/sec: 62.8 +/- 0.1 (jitter = 0.6) 9.649
90 images/sec: 62.8 +/- 0.1 (jitter = 0.5) 9.369
100 images/sec: 62.7 +/- 0.1 (jitter = 0.5) 9.570
----------------------------------------------------------------
total images/sec: 62.70
----------------------------------------------------------------
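The per-step lines above show a running mean images/sec, a "+/-" error term, and a jitter figure. As a rough sketch of how such columns can be computed (the exact statistics tf_cnn_benchmarks uses may differ; here the error term is the standard error of the mean and jitter is the median absolute deviation, both assumptions):

```python
import statistics

def summarize(throughputs):
    """Summarize per-step images/sec samples: running mean, standard
    error of the mean (the "+/-" column), and a jitter estimate.

    Sketch only: tf_cnn_benchmarks' exact definitions may differ.
    Here jitter is the median absolute deviation from the median.
    """
    mean = statistics.mean(throughputs)
    if len(throughputs) > 1:
        # Standard error of the mean; shrinks as more steps accumulate.
        stderr = statistics.stdev(throughputs) / len(throughputs) ** 0.5
    else:
        stderr = 0.0
    med = statistics.median(throughputs)
    # Median absolute deviation: robust spread of the per-step samples.
    jitter = statistics.median(abs(x - med) for x in throughputs)
    return mean, stderr, jitter
```

For example, feeding in the first few per-step samples from the table (63.6, 63.1, 62.8) yields a mean near the log's running values.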
root@98c54cfbe5b9:/examples# ls
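For reference, the launch command at the top of the session can be sketched as an annotated script. The flag values are copied verbatim from the log; the grouping into variables and the comments are explanatory additions:

```shell
#!/bin/sh
# Annotated reconstruction of the mpirun launch from the session above.
# Each variable holds one group of flags; values match the log verbatim.

PROCS="-np 1 -H localhost:1"             # one MPI rank, one slot on localhost
BINDING="-bind-to none -map-by slot"     # no core binding; map ranks to slots
ENVVARS="-x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH"  # forward env vars to ranks
TRANSPORT="-mca pml ob1 -mca btl ^openib"        # TCP only; ^openib disables InfiniBand verbs
SCRIPT="python benchmarks-horovod_v2/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py"
ARGS="--model resnet101 --batch_size 16 --variable_update horovod"

# Print the assembled command (replace 'echo' with 'exec' to launch it).
echo mpirun $PROCS $BINDING $ENVVARS $TRANSPORT $SCRIPT $ARGS
```

Scaling to more GPUs would mean raising `-np` and the slot counts in `-H` accordingly, per the usual Horovod launch convention.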