Skip to content

Instantly share code, notes, and snippets.

@nathanieltarshish
Last active November 7, 2021 17:24
Show Gist options
  • Save nathanieltarshish/a401f8434aebad22755013dc58988385 to your computer and use it in GitHub Desktop.
Save nathanieltarshish/a401f8434aebad22755013dc58988385 to your computer and use it in GitHub Desktop.
network topology profiler
#!/usr/bin/env bash
# This script relies on bash-only features (arrays, readarray, here-strings),
# so it has to be interpreted by bash rather than a plain POSIX /bin/sh.

# Print the usage summary and a short explanation of the report to stdout.
# Takes no arguments.
Help()
{
  # Display Help. Every string keeps its original trailing whitespace so the
  # emitted text is unchanged; the empty strings produce the blank lines.
  printf '%s\n' \
    "Probes the network topology of a SLURM cluster." \
    "" \
    "Syntax: network_topology.sh [-h -p partition]" \
    "options:" \
    "p only profile specific partition." \
    "h Print this Help." \
    "" \
    "Nodes have the fastest communication with other nodes that " \
    "are connected to the same physical switch. This script displays " \
    "the switches defined in the cluster's topology.conf file. The " \
    "resources (nodes, cpus, memory, features) associated with each " \
    "switch are also shown. For jobs, use sbatch --switches=<count> "
}
# Parse the command line and load the topology listing into $switches.
#
# Options:
#   -h            print the help text and exit
#   -p partition  keep only the topology lines that match "partition"
# Globals written:
#   partition - the -p argument ("" when -p was not given)
#   switches  - output of "scontrol show topology", one switch per line,
#               filtered through grep when a partition was requested
# Returns: non-zero when scontrol fails or the filter matches nothing.
load_switches() {
  local option
  local OPTIND=1 # restart getopts for every call of this function

  partition=""
  # ":hp:" -> silent error reporting; -p takes a mandatory argument
  while getopts ":hp:" option; do
    case "$option" in
      h) # display Help
        Help
        exit 0
        ;;
      p) # only profile a partition
        # OPTARG is correct wherever -p sits on the command line and also
        # for the attached form (-pNAME); a fixed "$2" is not.
        partition=$OPTARG
        ;;
      :)
        echo "Error: option -$OPTARG requires an argument." >&2
        exit 1
        ;;
      \?)
        echo "Error: invalid option -$OPTARG" >&2
        exit 1
        ;;
    esac
  done

  # Query SLURM once, and only after parsing, so -h works without SLURM.
  if ! switches=$(scontrol show topology); then
    echo "Error: 'scontrol show topology' failed." >&2
    return 1
  fi
  if [ -n "$partition" ]; then
    switches=$(printf '%s\n' "$switches" | grep -- "$partition")
  fi
}
load_switches "$@"
# Print a resource table for every leaf (level 0) switch listed in $switches.
#
# Globals read:
#   switches - newline-separated "scontrol show topology" records, each made
#              of whitespace-separated key=value fields (SwitchName=, Level=,
#              Nodes=, ...)
# Outputs: one framed section per leaf switch on stdout. Each section holds
#          either a table of node count / CPUs / memory / features per group
#          of identical nodes, or a notice that sinfo knows none of the nodes.
report_leaf_switches() {
  local record field name level nodes info
  local count cpus memory_mb memory features
  local -a records fields

  #slurm text block -> array of single switch info per line
  readarray -t records <<<"$switches"
  for record in "${records[@]}"; do
    #convert info from string to array; read -a splits on whitespace without
    #glob-expanding hostlists such as "node[01-04]"
    read -r -a fields <<<"$record"
    name="" level="" nodes=""
    #pick the fields by key instead of by position and fixed offset
    for field in "${fields[@]}"; do
      case "$field" in
        SwitchName=*) name=${field#SwitchName=} ;;
        Level=*) level=${field#Level=} ;;
        Nodes=*) nodes=${field#Nodes=} ;;
      esac
    done

    #check if leaf switch; blank or malformed records have no level and are
    #skipped instead of breaking the numeric test
    [ "$level" = "0" ] || continue

    #"Nodes" makes sinfo report the node count of each group itself; counting
    #commas in the compressed Nodelist would treat "node[01-04]" as one node
    info=""
    if [ -n "$nodes" ]; then
      info=$(sinfo -n "$nodes" -O "Nodelist:400,Nodes,CPUs,Memory,Features" -e --noheader)
    fi

    printf '%s\n' "------------ Switch ${name}----------------"
    #check if info is empty. This occurs if nodes no longer exist on cluster but
    #are still listed in an out-of-date SLURM topology.conf file
    if [ -n "$info" ]; then
      {
        printf '%s\n' "Nodes | CPUs | Memory | Features"
        #columns: nodelist (ignored), node count, cpus, memory in MB, features
        while read -r _ count cpus memory_mb features _; do
          [ -n "$count" ] || continue
          if [[ "$memory_mb" =~ ^[0-9]+$ ]]; then
            #10# forces base 10 so a leading zero is not read as octal
            memory="$((10#$memory_mb / 1024)) Gb"
          else
            #show non-numeric values as reported instead of failing on them
            memory=$memory_mb
          fi
          printf '%s\n' "$count | $cpus | $memory | $features"
        done <<<"$info"
      } | column -t -s '|'
    else
      printf '%s\n' \
        "sinfo did not return information for nodes: ${nodes}" \
        "These nodes may no longer exist and switch" \
        "configuration is out-of-date. "
    fi
    printf '%s\n' "---------------------------------------------"
  done
}
report_leaf_switches
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment