Last active
April 23, 2024 14:51
-
-
Save JiankunW/ade629366f0c43beb6d3cb1f5fbfd0c1 to your computer and use it in GitHub Desktop.
[M]on[I]toring GPU usage of a Slurm cluster with multiple nodes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# mi - [M]on[I]toring GPU usage on the nodes of a Slurm cluster.
#
# Runs `gpustat` through `srun` on the nodes that sinfo reports as mixed or
# idle, and lists the remaining nodes as unavailable.
#
# Usage:
#   mi -a                   stats for every available node
#   mi -w NODE_NAME         stats for one node
#   mi -p PARTITION_NAME    stats for the available nodes of one partition
#   mi -h | --help          usage
#
# Parsing assumes sinfo's default column layouts, i.e.
#   sinfo -N : NODELIST NODES PARTITION STATE
#   sinfo    : PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
# NOTE(review): a SINFO_FORMAT override in the environment would break this.

#######################################
# Prints the usage message.
# Outputs: usage text on stdout
#######################################
usage() {
  echo "Usage: mi [options]"
  echo "[M]on[I]toring GPU usage on nodes of slurm cluster"
  echo "Options: "
  echo " [-h, --help] show this help message"
  echo " [-a] show all stats on each node (take dozens of seconds)"
  echo " [-w] NODE_NAME show the stats on a specific node"
  echo " [-p] PARTITION_NAME show the stats on a specific partition"
}

#######################################
# Lists every node/partition pair known to sinfo.
# Outputs: one "NODE PARTITION STATE" line per pair; the "*" that sinfo
#          appends to the default partition's name is stripped so the
#          name can be handed to `srun -p` unchanged.
#######################################
list_nodes() {
  sinfo -N --noheader \
    | awk 'NF >= 4 { part = $3; sub(/\*$/, "", part); print $1, part, $4 }'
}

#######################################
# Tells whether a sinfo state counts as available (mixed or idle).
# Only the state string is inspected, so node or partition names that
# happen to contain "mix" or "idle" cannot cause a false match.
# Arguments: $1 - sinfo state (may carry a suffix, e.g. "idle*" or "mix-")
# Returns:   0 if available, 1 otherwise
#######################################
is_available_state() {
  [[ "$1" == *mix* || "$1" == *idle* ]]
}

#######################################
# Runs gpustat on one node.
# Arguments: $1 - node name, $2 - partition name
# Returns:   exit status of srun
#######################################
run_gpustat() {
  local node=$1
  local partition=$2
  # stdin comes from /dev/null so srun cannot swallow the node list that
  # feeds the caller's read loop. stderr is discarded as in the original
  # script, presumably to hide srun's job-allocation messages.
  srun -p "$partition" -w "$node" gpustat -p --color 2>/dev/null </dev/null
}

#######################################
# -a: gpustat for every available node, then the unavailable rows of sinfo.
# Outputs: report on stdout
#######################################
show_all() {
  local node partition state

  echo "Available (MIXED or IDLE) nodes:"
  while read -r node partition state; do
    if is_available_state "$state"; then
      run_gpustat "$node" "$partition"
    fi
  done < <(list_nodes)

  echo "Unavailable (NOT MIXED and NOT IDLE) nodes:"
  # Keep the header (line 1) and every row whose STATE column (5th field of
  # the partition-oriented listing) is neither mixed nor idle.
  sinfo | awk 'NR == 1 || $5 !~ /mix|idle/'
}

#######################################
# -w: gpustat for a single node, looked up by exact name.
# Arguments: $1 - node name
# Outputs:   gpustat output on stdout; diagnostics on stderr
# Returns:   srun's status, or 1 if the name is missing or unknown
#######################################
show_node() {
  local target=$1
  local node partition state

  if [[ -z "$target" ]]; then
    echo "Please designate the node name. Options: [-w] NODE_NAME" >&2
    return 1
  fi

  while read -r node partition state; do
    # Exact comparison: "node1" must not match "node10". A node listed in
    # several partitions is queried through the first one reported.
    if [[ "$node" == "$target" ]]; then
      run_gpustat "$node" "$partition"
      return
    fi
  done < <(list_nodes)

  echo "Node $target not found." >&2
  return 1
}

#######################################
# -p: gpustat for the available nodes of one partition, then the
# partition's unavailable nodes.
# Arguments: $1 - partition name (without the default-partition "*")
# Outputs:   report on stdout; diagnostics on stderr
# Returns:   1 if the name is missing or no node belongs to the partition
#######################################
show_partition() {
  local target=$1
  local node partition state
  local found=0
  local -a available=()

  if [[ -z "$target" ]]; then
    echo "Please designate the partition name. Options: [-p] PARTITION_NAME" >&2
    return 1
  fi

  while read -r node partition state; do
    [[ "$partition" == "$target" ]] || continue
    found=1
    if is_available_state "$state"; then
      available+=("$node")
    fi
  done < <(list_nodes)

  # A partition whose nodes are all busy still exists; only report
  # "not found" when sinfo lists no node for it at all.
  if (( found == 0 )); then
    echo "Partition $target not found." >&2
    return 1
  fi

  echo "Available (MIXED or IDLE) nodes on $target:"
  for node in "${available[@]}"; do
    run_gpustat "$node" "$target"
  done

  echo "Unavailable (NOT MIXED and NOT IDLE) nodes on $target:"
  # Header line, then the partition's rows whose STATE column (4th field)
  # is neither mixed nor idle.
  sinfo -N | awk -v want="$target" '
    NR == 1 { print; next }
    { part = $3; sub(/\*$/, "", part) }
    part == want && $4 !~ /mix|idle/
  '
}

#######################################
# Entry point: dispatches on the first command-line argument.
# Arguments: $1 - option, $2 - node or partition name where required
# Returns:   0 on success or help, non-zero on error
#######################################
main() {
  case "${1-}" in
    -a) show_all ;;
    -w) show_node "${2-}" ;;
    -p) show_partition "${2-}" ;;
    -h | --help | '') usage ;;
    *)
      echo "Unknown option: $1" >&2
      usage >&2
      return 2
      ;;
  esac
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment