Last active
June 19, 2020 10:06
-
-
Save ptheywood/ce4e5a600216dd085cae5bd5802d35aa to your computer and use it in GitHub Desktop.
Select the CUDA device which is being least-utilised in a system.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# As of 2020-06-19 this is no longer required for University of Sheffield HPC systems using SGE (Iceberg or ShARC).
# The SGE GPU allocation process has been improved to limit the devices available.
# If you request one device, your session can now only see one device, instead of all available.
# Outputs the number of GPUs requested by an SGE job.
#
# The result is the per-core "gpu=" hard resource request multiplied by the
# slot count of an smp/openmp parallel environment (1 slot when the job has no
# such parallel environment).
#
# Globals:   SGE_ROOT (read), JOB_ID (read), PATH (extended so qstat is found)
# Arguments: none
# Outputs:   the GPU count on stdout; 0 when no gpu resource was requested or
#            the job could not be queried
function getSGERequestedGPUCount {
  export PATH="$SGE_ROOT/bin/lx-amd64/:$PATH"

  # Find job id even when using qrsh: when JOB_ID is unset, fall back to the
  # third path component of this process's cpuset, minus anything after a dot.
  local job_id
  job_id=${JOB_ID-$(cut -d/ -f3 "/proc/$$/cpuset" | cut -d. -f1)}

  # Query the job once and reuse the text for every lookup below.
  # SGE_SINGLE_LINE=1 is documented by qstat to keep long lines unbroken, so
  # the resource list can be matched on a single line.
  local job_info
  job_info=$(SGE_SINGLE_LINE=1 qstat -j "$job_id")

  # GPUs requested per core; anything that is not a positive integer
  # (including "no match") counts as zero.
  local gpus_per_core
  gpus_per_core=$(grep -Po '^hard resource_list.*gpu=\K[1-9][0-9]*' <<<"$job_info" | head -n 1)
  if [[ ! $gpus_per_core =~ ^[1-9][0-9]*$ ]]; then
    gpus_per_core=0
  fi

  # Slot count: only smp/openmp parallel environments multiply the request.
  local ncpus=1
  if grep -Eq '^parallel environment: .*(smp|openmp).*' <<<"$job_info"; then
    ncpus=$(grep -Po 'parallel environment: .*(smp|openmp).* range: \K[1-9][0-9]*' <<<"$job_info" | head -n 1)
    # An unparsable range must not break the arithmetic below; assume 1 slot.
    if [[ ! $ncpus =~ ^[1-9][0-9]*$ ]]; then
      ncpus=1
    fi
  fi

  echo "$(( ncpus * gpus_per_core ))"
}
# Function to set the value of CUDA_VISIBLE_DEVICES to the id(s) of the least-utilised device(s) in the system.
# This will not set the device if CUDA_VISIBLE_DEVICES has already been set, to prevent issues on the ShARC DGX-1 (Node 126) or other private nodes which use the device allocation model.
#
# Globals:   CUDA_VISIBLE_DEVICES (read; exported on success)
#            CUDA_DEVICE_ORDER (exported on success)
# Arguments: $1 - number of devices to select; positive integer, default 1
# Outputs:   the chosen value on stdout; diagnostics on stderr
# Returns:   0 when the variable was set, was already set, or nvidia-smi is
#            not installed; 1 on an invalid argument, an nvidia-smi failure,
#            or when fewer devices exist than were requested.
#            Errors use `return` rather than `exit` so that sourcing this file
#            never terminates the calling shell.
function setCUDAVisibileDevices {
  # Respect an existing allocation and just report it.
  if [[ -n "${CUDA_VISIBLE_DEVICES:-}" ]]; then
    >&2 echo "CUDA_VISIBLE_DEVICES is already set. CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
    return 0
  fi

  # Without nvidia-smi there is nothing to rank; report and leave the
  # environment untouched.
  if ! command -v nvidia-smi >/dev/null 2>&1; then
    >&2 echo "nvidia-smi is not available. Aborting."
    return 0
  fi

  # Requested device count: first argument if one was passed, otherwise 1.
  local num_gpus_requested=${1-1}
  if [[ ! $num_gpus_requested =~ ^[1-9][0-9]*$ ]]; then
    >&2 echo "Error: ${FUNCNAME[0]} only accepts integer values larger than 0"
    return 1
  fi

  # Query nvidia-smi for device utilisation information. A failing query must
  # not be parsed: its output would be an error message, not a device list.
  local nvidiasmi_output
  if ! nvidiasmi_output=$(nvidia-smi --query-gpu=index,name,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv,noheader,nounits); then
    >&2 echo "Error: nvidia-smi failed to query the available devices"
    return 1
  fi

  # Count only non-blank lines so that empty output means zero devices.
  local devices_found
  devices_found=$(awk 'NF { n++ } END { print n + 0 }' <<<"$nvidiasmi_output")
  if [[ "$devices_found" -lt "$num_gpus_requested" ]]; then
    >&2 echo "Error: Requested $num_gpus_requested but only found $devices_found"
    return 1
  fi

  # Rank devices least-busy first: most free memory, then lowest GPU
  # utilisation, memory utilisation and temperature, then highest index.
  # Keep the first num_gpus_requested indices as a comma separated list.
  local deviceid
  deviceid=$(printf '%s\n' "$nvidiasmi_output" \
    | sort -t, -k3,3nr -k4,4n -k5,5n -k6,6n -k1,1nr \
    | head -n "$num_gpus_requested" \
    | cut -d, -f1 \
    | paste -sd "," -)

  # Output and export the cuda visible devices variable
  echo "Setting CUDA_VISIBLE_DEVICES=$deviceid"
  export CUDA_VISIBLE_DEVICES=$deviceid
  # Also export CUDA_DEVICE_ORDER=PCI_BUS_ID, so that the cuda runtime enumerates in the same order as nvidia-smi, rather than in performance order. Requires CUDA 7 runtime or newer.
  export CUDA_DEVICE_ORDER=PCI_BUS_ID
}
# Set the property correctly: pass the GPU count reported by
# getSGERequestedGPUCount to setCUDAVisibileDevices. The substitution is left
# unquoted on purpose so that empty output means "no argument" (default of 1).
setCUDAVisibileDevices $(getSGERequestedGPUCount)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
As of 2020-06 it is no longer necessary to use this on SGE based HPC systems at the University of Sheffield (Iceberg and ShARC).