Skip to content

Instantly share code, notes, and snippets.

@fanurs
Created May 3, 2024 22:07
Show Gist options
  • Save fanurs/3888e74f7cefdc7597f5bc85813ff36f to your computer and use it in GitHub Desktop.
Save fanurs/3888e74f7cefdc7597f5bc85813ff36f to your computer and use it in GitHub Desktop.
#!/bin/bash
#
# Keeps a single SLURM job named $JOB_NAME running for the current user and
# prints the node and port of that job.
#
# Configuration constants (read-only so no function can overwrite them by accident).
readonly TIMEOUT=60       # maximum time to wait for a state change, in seconds
readonly TIMESTEP=3       # polling interval, in seconds
readonly JOB_NAME="tunnel" # any unique name
# Batch script that is submitted with sbatch; modify the path accordingly.
# The batch script starts an OpenSSH server to enable interactive sessions,
# allowing IDEs like VSCode to connect remotely.
# Here is an example script: https://crc-pages.pitt.edu/user-manual/slurm/vscode/#steps-performed-only-once
readonly SCRIPT="/path/to/interactive.sbatch"
# Submit $SCRIPT with sbatch. Exits the script with status 1 when the
# submission is rejected, instead of silently waiting for a job that will
# never appear.
_submit_job() {
  local sbatch_output
  if ! sbatch_output=$(sbatch "$SCRIPT" 2>&1); then
    echo "Error: Failed to submit '$SCRIPT': $sbatch_output"
    exit 1
  fi
}

# Ensure that exactly one job named $JOB_NAME is running for the current user,
# then print its node and port (via echo_nc).
#
# Arguments:
#   (none)         reuse the existing job, or submit one if needed
#   -r, --restart  submit a replacement job, wait until it runs, then cancel
#                  the old job (node/port are not printed in this mode)
#   -h, --help     print the usage text and exit 0
# Globals read: SCRIPT, JOB_NAME, USER
# Exits with status 1 on an invalid argument, an unmet --restart precondition,
# a failed submission, or when a wait times out.
main() {
  # parse arguments (only the first one is inspected)
  local restart_mode=0
  if (( $# > 0 )); then
    case "$1" in
      -r | --restart)
        restart_mode=1
        ;;
      -h | --help)
        echo_usage
        exit 0
        ;;
      *)
        echo "Error: Invalid argument."
        echo_usage
        exit 1
        ;;
    esac
  fi

  # query all jobs named $JOB_NAME, sorted by time left
  local -a job_id=() job_state=()
  local id state
  while read -r id state; do
    # ignore blank lines so they are not counted as jobs
    [[ -n "$id" ]] || continue
    job_id+=("$id")
    job_state+=("$state")
  done < <(sq --Format='JobID,State' --sort='TimeLeft')
  local n_jobs=${#job_id[@]}

  # --restart: bring up a second job first, then cancel the old one
  if (( restart_mode )); then
    if (( n_jobs != 1 )) || [[ "${job_state[0]}" != "RUNNING" ]]; then
      echo "Error: --restart flag requires exactly one job and it must be RUNNING."
      exit 1
    fi
    echo "Restarting by submitting a new job."
    _submit_job
    echo "Submitted the new job."
    echo "Waiting for the new job to start."
    wait_till_timeout \
      '[ "$(sq --states=RUNNING | grep -c .)" -eq 2 ]' \
      "Timeout reached. Job is not started."
    echo "New job is running."
    # kill the old job
    echo "Will be cancelling the old job (this terminal instance)."
    echo "You may need to reload this terminal instance to get into the new job."
    scancel "${job_id[0]}" &> /dev/null
    return
  fi

  if (( n_jobs == 1 )); then
    if [[ "${job_state[0]}" == "RUNNING" || "${job_state[0]}" == "PENDING" ]]; then
      # the job is usable: wait until it is running and report it
      wait_till_timeout \
        '[ "$(sq --states=RUNNING | grep -c .)" -eq 1 ]' \
        "Timeout reached. Job is not started."
      echo_nc
      return
    fi
    # the job is neither running nor pending: cancel it, then fall through
    # to submit a replacement (without a new submission nothing would ever
    # reach the RUNNING state and the wait below would always time out)
    scancel "${job_id[0]}" &> /dev/null
    wait_till_timeout \
      '[ "$(sq | grep -c .)" -eq 0 ]' \
      "Timeout reached. Failed to cancel the job."
  elif (( n_jobs >= 2 )); then
    # more than one job: cancel all of them and start from a clean slate
    scancel --user="$USER" --name="$JOB_NAME" &> /dev/null
    wait_till_timeout \
      '[ "$(sq | grep -c .)" -eq 0 ]' \
      "Timeout reached.. Failed to cancel all jobs."
  fi

  # no job is left at this point: submit a new one and wait for it to run
  _submit_job
  wait_till_timeout \
    '[ "$(sq --states=RUNNING | grep -c .)" -eq 1 ]' \
    "Timeout reached. Job is not started."
  echo_nc
}
# Print the help text (usage line, options and a summary of the behavior)
# to stdout, one entry of the array below per output line.
echo_usage() {
  local -a help_lines=(
    "Usage: $0 [OPTIONS]"
    "Options:"
    " -r, --restart Resubmit a new job. Requires exactly one job named '$JOB_NAME' to be currently active."
    " -h, --help Display this help message and exit."
    ""
    "Description:"
    " Here, the term 'job' always refers to a SLURM job with the name '$JOB_NAME' under the current user only."
    " It ensures that there is a single active running job of this name."
    " Here's how it handles different scenarios:"
    " - If no jobs are found, it submits a new job."
    " - If exactly one job is found, it waits until this job is running."
    " - If more than one job is found, it cancels all such jobs and submits a new one."
  )
  printf '%s\n' "${help_lines[@]}"
}
# Shorthand for the squeue query this script cares about: jobs of the current
# user named $JOB_NAME, printed without a header line.
# Arguments: any extra squeue options (e.g. --states, --Format, --sort).
# Expansions are quoted so that a job name containing spaces or glob
# characters is passed to squeue as a single argument.
sq() {
  squeue --user="$USER" --name="$JOB_NAME" --local --noheader "$@"
}
# Poll a condition every $TIMESTEP seconds until it holds or $TIMEOUT seconds
# of waiting have accumulated.
# Arguments:
#   $1 - shell expression, evaluated with eval on every poll; the wait ends
#        as soon as it succeeds (only pass trusted, script-defined strings)
#   $2 - message printed when the timeout is reached
# Globals read: TIMESTEP, TIMEOUT
# Returns 0 once the condition holds; on timeout prints $2 and exits the
# script with status 1.
wait_till_timeout() {
  local wait_condition=$1
  local timeout_message=$2
  # local, so that a caller's own "elapsed" variable is not clobbered
  local elapsed=0
  until eval "$wait_condition"; do
    sleep "$TIMESTEP"
    elapsed=$((elapsed + TIMESTEP))
    if (( elapsed >= TIMEOUT )); then
      echo "$timeout_message"
      exit 1
    fi
  done
}
# Print "<node> <port>" for the running job with the most time left, where
# node is the job's NodeList and port is taken from the job's Comment field.
# The query is repeated every $TIMESTEP seconds until it yields a non-empty
# node and an all-digit port, or until $TIMEOUT seconds of waiting have
# accumulated.
# Globals read: TIMESTEP, TIMEOUT
# Returns 0 after printing a valid pair; on timeout prints an error message
# and exits the script with status 1.
echo_nc() {
  local node port
  local elapsed=0
  while true; do
    node=''
    port=''
    # the list is sorted by time left, so the last line is the job with the
    # most time left; read fails when there is no line at all, which simply
    # leaves both fields empty for the validation below
    read -r node port \
      < <(sq --states=RUNNING --sort='TimeLeft' --Format='NodeList,Comment' | tail -n 1) \
      || true
    # accept the answer only when it is well-formed (e.g. the comment may
    # not hold a port number yet)
    if [[ -n "$node" && "$port" =~ ^[0-9]+$ ]]; then
      break
    fi
    sleep "$TIMESTEP"
    elapsed=$((elapsed + TIMESTEP))
    if (( elapsed >= TIMEOUT )); then
      echo "Timeout reached. Failed to get valid node and port."
      exit 1
    fi
  done
  printf '%s %s\n' "$node" "$port"
}
# Entry point: run main with all command-line arguments passed to the script.
main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment