Skip to content

Instantly share code, notes, and snippets.

@tcooper
Last active August 29, 2015 14:21
Show Gist options
  • Save tcooper/bd40bac59c4d6660ac92 to your computer and use it in GitHub Desktop.
Save tcooper/bd40bac59c4d6660ac92 to your computer and use it in GitHub Desktop.
Slurm utilization for Graphite
#!/bin/bash
#
# Usage : Show queue utilization for running & pending jobs
#
# Summarises the Slurm queue (running and pending jobs) and prints one
# "path value timestamp" line per metric on stdout.
#
# NOTE: this script must run under bash, not a plain POSIX sh: it relies on
# bash-only constructs such as [[ ]], arrays and declare.

# Exit status used when an unknown command-line option is given.
readonly E_INVALID_OPT=1
# Print a one-line usage summary on stderr and terminate the script with
# exit status 1.
usage() {
  echo "Usage: $0" >&2
  exit 1
}
# Remove the temporary file that holds the squeue snapshot.
# Globals: _squeue_tmp (read) - path of the temp file; may still be unset or
#          empty when the script stops before the file has been created.
# Returns: 0 when there is nothing to remove, otherwise the status of rm.
cleanup() {
  if [ -n "${_squeue_tmp:-}" ]; then
    # Quoted and preceded by "--" so unusual paths are handled; -f keeps a
    # second invocation (file already gone) from reporting an error.
    /bin/rm -f -- "${_squeue_tmp}"
  fi
}
# Also remove the file when the script exits before reaching its explicit
# cleanup call.
trap cleanup EXIT
# Command-line handling: -h shows the usage text, any other option is
# rejected. The leading ':' in the option string puts getopts in silent mode,
# so the message for an unknown option is printed here rather than by getopts.
while getopts ":h" opt; do
  if [ "${opt}" = "h" ]; then
    usage
  elif [ "${opt}" = "?" ]; then
    echo "Error: Invalid option \"-$OPTARG\"" >&2
    exit $E_INVALID_OPT
  fi
done
# Store timestamp and hostname for later use...
_timestamp=$(date +%s)
# Short host name; fall back to bash's HOSTNAME variable (cut at the first
# dot) and finally to a fixed placeholder so the metric path never contains
# an empty component.
_hostname=$(hostname -s 2>/dev/null) || _hostname=${HOSTNAME%%.*}
_hostname=${_hostname:-unknown}
# Metric path prefix. It deliberately has NO trailing dot: the reporting
# lines append ".squeue...." themselves, so a trailing dot here would yield
# "host..squeue" with an empty path component.
_prefix="hosts.hpc.${_hostname}"
# Temp file...
_squeue_tmp=$(mktemp) || {
  echo "Error: unable to create temporary file" >&2
  exit 1
}
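# Sample of the squeue output parsed below (the first line names the columns; squeue is run with -h, so no header is actually printed)...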
#JOBID PARTITION ST SUBMIT_TIME START_TIME TIME TIME_LIMIT TIME_LEFT NODES CPUS GRES
#540278 compute R 2015-05-12T14:13:05 2015-05-12T14:33:37 9:52:29 18:00:00 8:07:31 2 48 null
#539917 compute R 2015-05-11T10:08:07 2015-05-12T11:28:17 12:57:49 1-00:00:00 11:02:11 72 1728 null
#529933 compute R 2015-05-08T02:07:24 2015-05-08T17:10:18 4-07:15:48 5-00:00:00 16:44:12 2 48 null
#512211 compute PD 2015-04-27T13:23:19 N/A 0:00 2-00:00:00 2-00:00:00 2 48 null
#528687 compute PD 2015-05-07T12:29:57 N/A 0:00 2-00:00:00 2-00:00:00 10 240 null
#540326 shared R 2015-05-12T18:46:00 2015-05-12T21:52:28 2:34:48 7-00:00:00 6-21:25:12 1 23 null
#534022 shared PD 2015-05-08T17:32:47 2015-05-14T00:27:00 0:00 37:00 37:00 1 24 null
#538537_[0-27] gpu PD 2015-05-10T06:37:09 2015-05-14T00:26:00 0:00 2-00:00:00 2-00:00:00 6 24 gpu:4
#540058 gpu PD 2015-05-11T20:39:04 2015-05-14T00:26:00 0:00 5:00 5:00 24 24 gpu:1
#540059 gpu-shared PD 2015-05-11T20:39:19 2015-05-14T00:26:00 0:00 5:00 5:00 1 1 gpu:1
#531366 gpu PD 2015-05-08T14:41:09 2015-05-14T00:26:00 0:00 1:30:00 1:30:00 1 24 gpu:4
# Call Slurm squeue command for running & pending jobs with custom output format and replace an
# empty gres, printed as '(null)', with a bare 'null'. Store result in temp file and parse later
# line by line...
/usr/bin/squeue -h --state=running,pending \
  --format="%i %P %t %V %S %M %l %L %D %C %b" \
  | /bin/sed 's/(null)/null/g' > "${_squeue_tmp}"
# The pipeline's own exit status is that of sed, so inspect every stage:
# a failed squeue would otherwise leave an empty snapshot and the script
# would report zero jobs instead of reporting nothing.
_squeue_status=("${PIPESTATUS[@]}")
if [[ ${_squeue_status[0]} -ne 0 || ${_squeue_status[1]} -ne 0 ]]; then
  echo "Error: unable to collect squeue output" >&2
  /bin/rm -f -- "${_squeue_tmp}"
  exit 2
fi
# Convert variable length time string to seconds
# Parsable inputs are currently... D-HH:MM:SS, HH:MM:SS, MM:SS or SS
# Arguments: $1 - time string in one of the forms above
# Outputs:   the number of seconds on stdout; 0 when $1 is not in one of
#            the forms above (for example "N/A" or an empty string)
# Returns:   0 on success, 1 when the input could not be parsed
dhmsToSecs() {
  local -r _time_re='^([0-9]+-)?[0-9]+(:[0-9]+){0,2}$'
  local -i secs=0
  local -a _dhms=()

  # Reject anything that is not purely digits separated by '-' and ':';
  # feeding such text to the arithmetic below would abort with an error.
  if [[ ! ${1:-} =~ ${_time_re} ]]; then
    echo 0
    return 1
  fi

  # Split into fields on '-' and ':'. IFS is overridden for this single read
  # only, so the caller's IFS is left untouched.
  IFS='-:' read -r -a _dhms <<< "$1"

  # NOTE: The 10# prefix forces base 10, so zero-padded fields such as 08
  # and 09 are NOT rejected as invalid octal numbers.
  case ${#_dhms[@]} in
    4)
      secs=$(( (10#${_dhms[0]} * 86400) + (10#${_dhms[1]} * 3600) + (10#${_dhms[2]} * 60) + 10#${_dhms[3]} ))
      ;;
    3)
      secs=$(( (10#${_dhms[0]} * 3600) + (10#${_dhms[1]} * 60) + 10#${_dhms[2]} ))
      ;;
    2)
      secs=$(( (10#${_dhms[0]} * 60) + 10#${_dhms[1]} ))
      ;;
    *)
      secs=$(( 10#${_dhms[0]} ))
      ;;
  esac

  echo "${secs}"
}
# Multiply SUs by job array task count
# Input: ArrayJobID, SUs/Job
# Output: Total SUs
#
# Only a grouped array id of the form "<jobid>_[<indices>]" is multiplied.
# <indices> is a comma-separated list of single indices and "lo-hi" ranges,
# optionally with a ":step" on a range and a trailing "%limit" on the list.
# NOTE(review): these are the forms sbatch --array accepts; presumably squeue
# prints them the same way - confirm against real output.
# Any other id (plain job, single array task such as "123_5"), an index list
# that cannot be parsed, or a non-numeric SU value yields $2 unchanged.
arrayJobMultiplier() {
  local -r _jobid=${1:-} _sus=${2:-}
  local -r _group_re='^[0-9]+_\[(.*)\]$'
  local -r _range_re='^([0-9]+)-([0-9]+)(:([0-9]+))?$'
  local -r _index_re='^[0-9]+$'
  local -a _items=()
  local _item _spec
  local -i _tasks=0 _lo=0 _hi=0 _step=1

  if [[ ${_sus} =~ ${_index_re} && ${_jobid} =~ ${_group_re} ]]; then
    _spec=${BASH_REMATCH[1]}
    # Drop a trailing "%N" limit; it does not change the number of tasks.
    _spec=${_spec%%'%'*}
    IFS=',' read -r -a _items <<< "${_spec}"

    for _item in "${_items[@]}"; do
      if [[ ${_item} =~ ${_range_re} ]]; then
        # 10# keeps zero-padded indices from being read as octal.
        _lo=$(( 10#${BASH_REMATCH[1]} ))
        _hi=$(( 10#${BASH_REMATCH[2]} ))
        _step=$(( 10#${BASH_REMATCH[4]:-1} ))
        if (( _step < 1 || _hi < _lo )); then
          _tasks=0
          break
        fi
        _tasks=$(( _tasks + (_hi - _lo) / _step + 1 ))
      elif [[ ${_item} =~ ${_index_re} ]]; then
        _tasks=$(( _tasks + 1 ))
      else
        # Unparsable element (e.g. a truncated list): give up on counting and
        # fall back to the single-task value below.
        _tasks=0
        break
      fi
    done

    if (( _tasks > 1 )); then
      echo $(( 10#${_sus} * _tasks ))
      return 0
    fi
  fi

  # Not a grouped array, or the task count is unknown: SUs for a single task.
  printf '%s\n' "${_sus}"
}
# Totals accumulated over all jobs in the snapshot: _r_* for running jobs,
# _pd_* for pending jobs. Declared as integers (-i) and all initialised to 0,
# including _pd_nodes.
declare -i _r_jobs=0 _r_sus=0 _pd_jobs=0 _pd_sus=0
declare -i _r_cpus=0 _r_nodes=0 _pd_cpus=0 _pd_nodes=0
# Walk the squeue snapshot one job per line and accumulate, separately for
# pending (state "PD") and all other jobs, the job count, CPU count, node
# count and service units (SUs).
#
# SUs for a job = cpus * seconds * cpu factor / 3600 (integer division),
# with a minimum of 1. Pending jobs are charged their time limit, all other
# jobs the time they have used so far.
#
# Per-job scratch values; reset at the top of every iteration.
declare -i _cpuFactor _gresFactor _memFactor
declare -i _secs _cpuSecs _gresSecs _memSecs
declare -i _sus

# The snapshot is redirected into the loop, so the script's own stdin is
# never replaced and no extra file descriptor is left open.
while read -r _jobid _partition _state _submit_time _start_time _time _time_limit _time_left _nodes _cpus _gres; do
  # A blank line carries no job; do not let it count as a running job.
  [[ -z "${_jobid}" ]] && continue

  _cpuFactor=1 _gresFactor=1 _memFactor=1
  _secs=0 _cpuSecs=0 _gresSecs=0 _memSecs=0
  _sus=0

  if [[ "${_partition}" == "gpu" || "${_partition}" == "gpu-shared" ]]; then
    # GPU's cost 2x/cpu
    _cpuFactor=2
  fi

  # Bare variable names inside $(( )) make a counter that was never
  # initialised count from 0 instead of expanding to a malformed "10#".
  if [[ "${_state}" == "PD" ]]; then
    _pd_jobs=$(( _pd_jobs + 1 ))
    _pd_cpus=$(( _pd_cpus + _cpus ))
    _pd_nodes=$(( _pd_nodes + _nodes ))
    _secs=$(dhmsToSecs "${_time_limit}")
  else
    _r_jobs=$(( _r_jobs + 1 ))
    _r_cpus=$(( _r_cpus + _cpus ))
    _r_nodes=$(( _r_nodes + _nodes ))
    _secs=$(dhmsToSecs "${_time}")
  fi

  _cpuSecs=$(( _cpus * _secs * _cpuFactor ))
  _gresSecs=0 #_gresSecs=$((${_gres}*${_secs}*${_gresFactor}))
  _sus=$(( (_cpuSecs + _gresSecs) / 3600 ))

  # All jobs cost a minimum of 1 SU
  if (( _sus < 1 )); then
    _sus=1
  fi

  # Array jobs in pending state are listed as a group, running array jobs can be counted individually
  if [[ ${_jobid} =~ [0-9]+_ ]]; then
    _sus=$(arrayJobMultiplier "${_jobid}" "${_sus}")
  fi

  # Track queued and running SUs separately
  if [[ "${_state}" == "PD" ]]; then
    _pd_sus=$(( _pd_sus + _sus ))
  else
    _r_sus=$(( _r_sus + _sus ))
  fi
done < "${_squeue_tmp}"
# Report the collected totals on stdout, one "path value timestamp" line per
# metric, then remove the temporary snapshot and finish successfully.
printf '%s\n' \
  "${_prefix}.squeue.nodes.util ${_r_nodes} ${_timestamp}" \
  "${_prefix}.squeue.cpu.util ${_r_cpus} ${_timestamp}" \
  "${_prefix}.squeue.jobs.running ${_r_jobs} ${_timestamp}" \
  "${_prefix}.squeue.jobs.queued ${_pd_jobs} ${_timestamp}" \
  "${_prefix}.squeue.sus.running ${_r_sus} ${_timestamp}" \
  "${_prefix}.squeue.sus.queued ${_pd_sus} ${_timestamp}"

cleanup
exit 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment