test code for IBM LMS
"""
Testing code for IBM LMS / CUDA Unified Memory
Run this script with CUDA Unified Memory by
```
python LMS_UM_test.py --image_size=224 --batch_size=256 --gpu_id=1 --cuda_memory=5
```
Run this script with IBM Large Model Support by
```
python LMS_UM_test.py --image_size=224 --batch_size=256 --gpu_id=1 --use_lms=True
```
"""
import numpy as np
import time
import os
import tensorflow as tf
import tensorflow.contrib.slim as slim
import tensorflow.contrib.slim.nets as slimNet
tf.logging.set_verbosity(tf.logging.INFO)
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('f', '', 'kernel')  # dummy flag so the script can also run inside a Jupyter/IPython kernel
tf.app.flags.DEFINE_string("gpu_id", "0", "idx of GPU using")
tf.app.flags.DEFINE_string("model", "resnet50", "select from resnet50, googlenet")
tf.app.flags.DEFINE_integer("batch_size", 512, "Batch size")
tf.app.flags.DEFINE_integer("image_size", 224, "Image size")
tf.app.flags.DEFINE_float("cuda_memory", 1, "pre-alloctaed of CUDA unified memory")
tf.app.flags.DEFINE_bool("use_lms", False, "To Use LMS")
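# pin TensorFlow to the GPU selected by --gpu_id (PCI_BUS_ID makes the index match nvidia-smi)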
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu_id
# generate synthetic data (all-zero images; pixel values do not matter for a throughput test)
x = np.random.randint(0, 1, size=(FLAGS.batch_size, FLAGS.image_size, FLAGS.image_size, 3))
x = x.astype("float32")
y = np.random.randint(0, 1000, size=FLAGS.batch_size)
y = tf.keras.utils.to_categorical(y, 1000)
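# labels are one-hot encoded (1000 classes) to match tf.losses.softmax_cross_entropy below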
# define the tf.data.Dataset input pipeline
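# the synthetic batch is fed through placeholders; batch + filter keeps only full
# batches of FLAGS.batch_size, and repeat(500) supplies enough batches for the
# warmup plus the 60 timed steps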
features_placeholder = tf.placeholder(x.dtype, x.shape)
labels_placeholder = tf.placeholder(y.dtype, y.shape)
dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder))
dataset = dataset.batch(FLAGS.batch_size).filter(lambda features, labels: tf.equal(tf.shape(labels)[0], FLAGS.batch_size))
dataset = dataset.repeat(500)
iterator = dataset.make_initializable_iterator()
inputs, labels = iterator.get_next()
# build model
if FLAGS.model == "resnet50":
    with slim.arg_scope(slimNet.resnet_utils.resnet_arg_scope(batch_norm_decay=0.99)):
        _, layers_dict = slimNet.resnet_v1.resnet_v1_50(inputs, num_classes=1000, global_pool=True, is_training=True)
    logits = layers_dict['resnet_v1_50/logits']
    logits = tf.keras.layers.Flatten()(logits)
elif FLAGS.model == "googlenet":
    with slim.arg_scope(slimNet.inception.inception_v1_arg_scope()):
        _, layers_dict = slimNet.inception.inception_v1(inputs, spatial_squeeze=False, num_classes=1000, is_training=True)
    fmap = layers_dict['Logits']
    output = tf.keras.layers.GlobalAveragePooling2D()(fmap)
    logits = tf.keras.layers.Dense(1000)(output)
else:
    raise ValueError("Unsupported model: %s" % FLAGS.model)
loss = tf.losses.softmax_cross_entropy(onehot_labels=labels, logits=logits)
# Create training op.
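# the scope is named 'adam_optimizer' even though the optimizer is Adagrad;
# this scope name is what gets passed to LMS below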
with tf.name_scope('adam_optimizer'):
    optimizer = tf.train.AdagradOptimizer(learning_rate=0.01)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_step = optimizer.minimize(loss, global_step=tf.train.get_global_step())
# import LMS and use
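# when --use_lms=True, LMS rewrites the graph with swap-out/swap-in ops so large
# activations can be staged in host memory instead of exhausting GPU memory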
if FLAGS.use_lms:
    print("USING IBM LARGE MODEL SUPPORT")
    from tensorflow.contrib.lms import LMS
    lms_obj = LMS({'adam_optimizer'})
    lms_obj.run(graph=tf.get_default_graph())
# setup tf.ConfigProto for CUDA Unified memory
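# on builds that support it, setting per_process_gpu_memory_fraction above 1.0
# allocates CUDA unified (managed) memory, oversubscribing GPU memory with host RAM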
config = tf.ConfigProto()
if FLAGS.cuda_memory > 1:
    config.gpu_options.per_process_gpu_memory_fraction = FLAGS.cuda_memory
    print("USING CUDA UNIFIED MEMORY")
res = []
# Start session and training
with tf.train.MonitoredTrainingSession(config=config) as sess:
    sess.run(iterator.initializer, feed_dict={features_placeholder: x,
                                              labels_placeholder: y})
    print("RUNNING WARMUP")
    for w in range(5):
        sess.run(train_step)
    print("WARMUP DONE")
    for b in range(1, 61):
        t = time.time()
        sess.run(train_step)
        t1 = time.time()
        _loss = sess.run(loss)
        if b % 10 == 0:
            print("Num:", b, ", Loss: ", _loss, ", Elapsed time: ", t1 - t, "Images/sec: ", (FLAGS.batch_size / (t1 - t)))
            res.append(FLAGS.batch_size / (t1 - t))
print(np.mean(res), " +- ", np.std(res))