MPS SLURM wrapper script
#!/bin/bash
# Author: Simon Mutch <smutch.astro@gmail.com>
# Date: 2018-08-14
set -e # exit on error
get_log_dir() {
    echo "$JOBFS/log_mps-$1"
}

get_pipe_dir() {
    echo "$JOBFS/mps-$1"
}
# number of devices on each node
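# SLURM_STEP_GPUS is a comma-separated list of the GPU indices allocated to the
# step (e.g. "0,1,2,3"); stripping the commas and counting characters (wc -c
# includes the trailing newline, hence the -1) gives the device count. Note that
# this assumes single-digit device indices.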
NDEVICES=$(( $(wc -c <<< $(sed s/,//g <<< $SLURM_STEP_GPUS))-1 ))
# use the first rank of each node to spawn an mps controller
if [[ $SLURM_LOCALID == 0 ]]; then
    for (( i = 0; i < $NDEVICES; i++ )); do
        log_dir=$(get_log_dir $i)
        pipe_dir=$(get_pipe_dir $i)
        mkdir "$log_dir"
        mkdir "$pipe_dir"
        CUDA_VISIBLE_DEVICES=$i CUDA_MPS_LOG_DIRECTORY=$log_dir CUDA_MPS_PIPE_DIRECTORY=$pipe_dir \
            nvidia-cuda-mps-control -d
        echo "$(hostname): Started mps-control for device $i in $pipe_dir"
    done
fi
# Set the environment variables for each rank.
# Note that this is where the decision of which rank uses which GPU is being made...
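# SLURM_JOB_CPUS_PER_NODE looks like "32" or "32(x2)"; stripping the "(xN)"
# repeat suffix leaves the per-node CPU count, which is used here as the number
# of ranks per node (i.e. one rank per allocated CPU is assumed).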
local_size=$(sed s/\(.*\)//g <<< $SLURM_JOB_CPUS_PER_NODE)
# # TEMP --------
# local_size=32
# NDEVICES=1
# SLURM_LOCALID=$OMPI_COMM_WORLD_NODE_RANK
# # ---------------
my_device=$(( ($SLURM_LOCALID * $NDEVICES / $local_size) % $NDEVICES )) # consecutive ranks share
# --- OR ---
# my_device=$(( $SLURM_LOCALID % $NDEVICES )) # alternate
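# Illustrative example (assumed values): with local_size=8 ranks per node and
# NDEVICES=4 GPUs, the "consecutive ranks share" mapping gives
#   ranks 0,1 -> GPU 0; ranks 2,3 -> GPU 1; ranks 4,5 -> GPU 2; ranks 6,7 -> GPU 3
# whereas the "alternate" mapping would give
#   rank 0 -> GPU 0; rank 1 -> GPU 1; rank 2 -> GPU 2; rank 3 -> GPU 3; rank 4 -> GPU 0; ...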
# Each MPS control daemon was started with only one GPU visible, so its clients
# see that GPU as device 0; the pipe directory chosen here determines which
# physical GPU this rank actually uses.
export CUDA_VISIBLE_DEVICES=0
export CUDA_MPS_PIPE_DIRECTORY=$JOBFS/mps-$my_device
cleanup() {
    # Once the wrapped program has exited, stop the controller, copy the logs and clean up.
    if [[ $SLURM_LOCALID == 0 ]]; then
        for (( i = 0; i < $NDEVICES; i++ )); do
            log_dir=$(get_log_dir $i)
            pipe_dir=$(get_pipe_dir $i)
            echo "quit" | CUDA_MPS_PIPE_DIRECTORY=$pipe_dir nvidia-cuda-mps-control
            cp "$log_dir/control.log" "./mps-control-$(hostname)_$i.log"
            cp "$log_dir/server.log" "./mps-server-$(hostname)_$i.log"
            rm -r "$log_dir"
            rm -r "$pipe_dir"
            echo "$(hostname): Stopped mps-control for device $i at $pipe_dir"
        done
    fi
}
trap cleanup EXIT SIGINT SIGTERM ERR # (SIGKILL cannot be trapped, so it is not listed)
# Run the actual executable!
# Note the absence of 'srun' or 'mpiexec' etc. here...
"$@"