[TF CUDA Optimization Options] All the TensorFlow-with-CUDA optimization options you need! Gives a speedup of at least 1.3x on the Volta and Turing architectures. Only works with a TensorFlow GPU build compiled from source with CUDA/cuDNN support, or with the Docker images from NVIDIA GPU Cloud <ngc.nvidia.com>. #tensorflow #cuda
# Mainly from:
# 1. https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars
# 2. https://github.com/NVIDIA/DeepLearningExamples/issues/57
# 3. https://docs.nvidia.com/deeplearning/frameworks/tensorflow-user-guide/index.html#variablesaddtf
import multiprocessing
import os

import tensorflow as tf

try:
    import horovod.tensorflow as hvd
except ImportError:
    hvd = None


def is_using_hvd():
    """Return True when launched under mpirun/Horovod (Open MPI sets these variables)."""
    env_vars = ["OMPI_COMM_WORLD_RANK", "OMPI_COMM_WORLD_SIZE"]
    return all(var in os.environ for var in env_vars)
def cuda_opt(use_mixed_precision=True):
    # ============================================
    # Optimisation flags - do not remove
    # ============================================
    os.environ['CUDA_CACHE_DISABLE'] = '0'
    os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
    os.environ['TF_GPU_THREAD_COUNT'] = '1' if not is_using_hvd() else str(hvd.size())
    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
    os.environ['TF_ADJUST_HUE_FUSED'] = '1'
    os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
    # NVIDIA Tools Extension (NVTX) ranges are only needed for debugging and profiling
    os.environ['TF_DISABLE_NVTX_RANGES'] = '1'
    # Fast math: let FP32 ops in cuBLAS/cuDNN use Tensor Cores
    os.environ["TF_ENABLE_CUBLAS_TENSOR_OP_MATH_FP32"] = "1"
    os.environ["TF_ENABLE_CUDNN_TENSOR_OP_MATH_FP32"] = "1"
    os.environ["TF_ENABLE_CUDNN_RNN_TENSOR_OP_MATH_FP32"] = "1"
    if use_mixed_precision:
        # TF automatic mixed precision performs these operations automatically:
        # 1. Inserts the appropriate cast ops into your TensorFlow graph so that
        #    eligible ops execute and store in float16 -- this enables Tensor
        #    Cores along with memory-storage and bandwidth savings. Can also be
        #    enabled alone via "TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE".
        # 2. Turns on automatic loss scaling inside the training Optimizer
        #    object. Can also be enabled alone via
        #    "TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING".
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1"
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "1"
def get_session_config(is_training, use_xla=True):
    config = tf.ConfigProto()
    config.allow_soft_placement = True
    config.log_device_placement = False
    config.gpu_options.allow_growth = True

    # Horovod: pin each process to the GPU matching its local rank (one GPU per process)
    if is_using_hvd():
        config.gpu_options.visible_device_list = str(hvd.local_rank())

    if use_xla:
        os.environ["TF_ENABLE_XLA"] = "1"
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    config.gpu_options.force_gpu_compatible = True  # force pinned (page-locked) host memory

    if is_training:
        config.intra_op_parallelism_threads = 1  # avoid a pool of Eigen threads
        if is_using_hvd():
            config.inter_op_parallelism_threads = max(2, (multiprocessing.cpu_count() // hvd.size()) - 2)
        else:
            config.inter_op_parallelism_threads = 4

    return config
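

# Example usage: a minimal sketch of wiring the helpers above into a TF 1.x
# training script. `build_train_op` is a hypothetical stand-in for your own
# model/optimizer construction; everything else uses only the functions
# defined in this gist.
if __name__ == "__main__":
    if is_using_hvd():
        hvd.init()  # must run before hvd.size()/hvd.local_rank() are queried

    cuda_opt(use_mixed_precision=True)
    config = get_session_config(is_training=True, use_xla=True)

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        # train_op = build_train_op()  # hypothetical: build your model here
        # sess.run(train_op)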