[TF CUDA Optimization Options] All the TensorFlow-with-CUDA optimization options you need! Gives a speedup of at least 1.3x on the Volta and Turing architectures. Only works with a TensorFlow GPU build compiled from source with CUDA/cuDNN support, or with the Docker images from NVIDIA GPU Cloud <ngc.nvidia.com>. #tensorflow #cuda
# Mainly from:
# 1. https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars
# 2. https://github.com/NVIDIA/DeepLearningExamples/issues/57
# 3. https://docs.nvidia.com/deeplearning/frameworks/tensorflow-user-guide/index.html#variablesaddtf
import multiprocessing
import os

import tensorflow as tf

try:
    import horovod.tensorflow as hvd
except ImportError:
    hvd = None


def is_using_hvd():
    """Return True when launched under mpirun/Horovod (Open MPI sets these variables)."""
    env_vars = ["OMPI_COMM_WORLD_RANK", "OMPI_COMM_WORLD_SIZE"]
    return all(var in os.environ for var in env_vars)
def cuda_opt(use_mixed_precision=True):
    # ============================================
    # Optimisation flags - do not remove
    # ============================================
    os.environ['CUDA_CACHE_DISABLE'] = '0'
    os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
    os.environ['TF_GPU_THREAD_COUNT'] = '1' if not is_using_hvd() else str(hvd.size())
    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
    os.environ['TF_ADJUST_HUE_FUSED'] = '1'
    os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
    # NVIDIA Tools Extension (NVTX) ranges are only needed for debugging and profiling
    os.environ['TF_DISABLE_NVTX_RANGES'] = '1'
    # Fast math: let FP32 ops in cuBLAS/cuDNN use Tensor Cores
    os.environ["TF_ENABLE_CUBLAS_TENSOR_OP_MATH_FP32"] = "1"
    os.environ["TF_ENABLE_CUDNN_TENSOR_OP_MATH_FP32"] = "1"
    os.environ["TF_ENABLE_CUDNN_RNN_TENSOR_OP_MATH_FP32"] = "1"
    if use_mixed_precision:
        # TF automatic mixed precision performs these operations automatically:
        # 1. Inserts the appropriate cast ops into your TensorFlow graph so that
        #    eligible ops execute and store in float16 -- this enables Tensor
        #    Cores along with memory-storage and bandwidth savings. Can also be
        #    enabled alone via "TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE".
        # 2. Turns on automatic loss scaling inside the training Optimizer
        #    object. Can also be enabled alone via
        #    "TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING".
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1"
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "1"
def get_session_config(is_training, use_xla=True):
    config = tf.ConfigProto()
    config.allow_soft_placement = True
    config.log_device_placement = False
    config.gpu_options.allow_growth = True

    # Horovod: pin each process to the GPU matching its local rank (one GPU per process)
    if is_using_hvd():
        config.gpu_options.visible_device_list = str(hvd.local_rank())

    if use_xla:
        os.environ["TF_ENABLE_XLA"] = "1"
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    config.gpu_options.force_gpu_compatible = True  # force pinned (page-locked) host memory

    if is_training:
        config.intra_op_parallelism_threads = 1  # avoid a pool of Eigen threads
        if is_using_hvd():
            config.inter_op_parallelism_threads = max(2, (multiprocessing.cpu_count() // hvd.size()) - 2)
        else:
            config.inter_op_parallelism_threads = 4

    return config
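

# Example usage: a minimal sketch of wiring the helpers above into a TF 1.x
# training script. `build_train_op` is a hypothetical stand-in for your own
# model/optimizer construction; everything else uses only the functions
# defined in this gist.
if __name__ == "__main__":
    if is_using_hvd():
        hvd.init()  # must run before hvd.size()/hvd.local_rank() are queried

    cuda_opt(use_mixed_precision=True)
    config = get_session_config(is_training=True, use_xla=True)

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        # train_op = build_train_op()  # hypothetical: build your model here
        # sess.run(train_op)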