Skip to content

Instantly share code, notes, and snippets.

Last active July 8, 2016 19:25
Show Gist options
  • Save giovtorres/2ce295be802554904665 to your computer and use it in GitHub Desktop.
Save giovtorres/2ce295be802554904665 to your computer and use it in GitHub Desktop.
"""
This script uses the Python SLURM bindings to get cluster utilization,
which accounts for nodes being allocated exclusively or totally allocated.
"""
import hostlist
import pyslurm
__author__ = "Giovanni Torres"
def human_readable(num, suffix="B"):
    """Convert bytes to a human readable form.

    IN:  (number) quantity to format, expressed in base units
         (str) suffix appended after the binary prefix (default "B")
    OUT: (str) the value scaled by powers of 1024 with one decimal place,
         e.g. "1.5 KB"
    """
    # Zero is reported with a fixed label, independent of ``suffix``.
    if num == 0:
        return "0.0 GB"

    for unit in ('', 'K', 'M', 'G', 'T', 'P'):
        if abs(num) < 1024.0:
            return "%.1f %s%s" % (num, unit, suffix)
        num /= 1024.0

    # The loop divided once more after the 'P' check, so ``num`` is now in
    # exa-units. Report it instead of falling through and returning None.
    return "%.1f %s%s" % (num, 'E', suffix)
def get_part_info():
    """Collect per-partition memory defaults and a node-to-partition map.

    OUT: (tuple) two dictionaries:
         - partition name -> that partition's "def_mem_per_cpu" value
         - node name -> partition name, leaving out the "interactive",
           "quick" and "maint" partitions
    """
    skipped_parts = ["interactive", "quick", "maint"]
    mem_defaults = {}
    node_to_part = {}

    for name, info in pyslurm.partition().get().items():
        mem_defaults[name] = info["def_mem_per_cpu"]
        # The host list is expanded for every partition, skipped ones too.
        members = hostlist.expand_hostlist(info["nodes"])
        if name in skipped_parts:
            continue
        # A node listed in several partitions keeps the last one visited.
        node_to_part.update(dict.fromkeys(members, name))

    return (mem_defaults, node_to_part)
def get_util(nodes):
    """Tally node, CPU and memory usage across all nodes.

    IN:  (dict) dictionary of all nodes from pyslurm.node().get()
    OUT: (dict) running totals keyed "total_<resource>_<state>", where
         resource is nodes/cpus/memory

    Every node lands in exactly one of the down, allocated, mixed or idle
    buckets. The assertions at the end check that the per-state totals add
    up to the configured totals.
    """
    all_metrics = {"total_cpus_alloc": 0,
                   "total_cpus_idle": 0,
                   "total_cpus_down": 0,
                   "total_cpus_unalloc": 0,
                   "total_cpus_config": 0,
                   "total_memory_alloc": 0,
                   "total_memory_idle": 0,
                   "total_memory_down": 0,
                   "total_memory_unalloc": 0,
                   "total_memory_overalloc": 0,
                   "total_memory_config": 0,
                   "total_nodes_mixed": 0,
                   "total_nodes_alloc": 0,
                   "total_nodes_idle": 0,
                   "total_nodes_down": 0,
                   "total_nodes_config": 0}

    defmempercpu, all_nodes = get_part_info()

    for node in nodes:
        nodeinfo = nodes.get(node)
        state = nodeinfo.get("state").upper()
        cpus_alloc = nodeinfo.get("alloc_cpus")
        cpus_total = nodeinfo.get("cpus")
        memory_alloc = nodeinfo.get("alloc_mem")
        memory_real = nodeinfo.get("real_memory")
        shared = nodeinfo.get("shared")

        all_metrics["total_nodes_config"] += 1
        all_metrics["total_cpus_config"] += cpus_total
        all_metrics["total_memory_config"] += memory_real

        if "DOWN" in state or "DRAIN" in state:
            # Down or draining: the whole node counts as unavailable.
            all_metrics["total_nodes_down"] += 1
            all_metrics["total_cpus_down"] += cpus_total
            all_metrics["total_memory_down"] += memory_real
        else:
            all_metrics["total_cpus_alloc"] += cpus_alloc

            # NOTE(review): shared == 0 presumably marks a node held
            # exclusively -- confirm against the pyslurm node schema.
            if ("ALLOCATED" in state) or (shared == 0):
                all_metrics["total_nodes_alloc"] += 1
                # CPUs left over on such a node cannot be handed out.
                all_metrics["total_cpus_unalloc"] += cpus_total - cpus_alloc

                if memory_alloc > memory_real:
                    # Cap allocated memory at what the node has and track
                    # the excess separately.
                    all_metrics["total_memory_alloc"] += memory_real
                    all_metrics["total_memory_unalloc"] += 0
                    all_metrics["total_memory_overalloc"] += memory_alloc - memory_real
                else:
                    all_metrics["total_memory_alloc"] += memory_alloc
                    all_metrics["total_memory_unalloc"] += memory_real - memory_alloc

            elif "MIXED" in state:
                all_metrics["total_nodes_mixed"] += 1

                if memory_alloc > memory_real:
                    all_metrics["total_cpus_unalloc"] += cpus_total - cpus_alloc
                    all_metrics["total_memory_alloc"] += memory_real
                    all_metrics["total_memory_unalloc"] += 0
                    all_metrics["total_memory_overalloc"] += memory_alloc - memory_real
                # Free memory below twice the partition's default memory per
                # CPU: the remaining CPUs and memory count as unallocatable.
                # NOTE(review): get_part_info() leaves nodes that belong only
                # to excluded partitions out of all_nodes, so this lookup
                # raises KeyError for them -- confirm that cannot happen.
                elif (memory_real - memory_alloc) < (2 * defmempercpu[all_nodes[node]]):
                    all_metrics["total_cpus_unalloc"] += cpus_total - cpus_alloc
                    all_metrics["total_memory_alloc"] += memory_alloc
                    all_metrics["total_memory_unalloc"] += memory_real - memory_alloc
                else:
                    all_metrics["total_cpus_idle"] += cpus_total - cpus_alloc
                    all_metrics["total_memory_alloc"] += memory_alloc
                    all_metrics["total_memory_idle"] += memory_real - memory_alloc

            elif "IDLE" in state:
                all_metrics["total_nodes_idle"] += 1
                all_metrics["total_cpus_idle"] += cpus_total
                all_metrics["total_memory_idle"] += memory_real

            else:
                # Unrecognised state: name the node. It is counted in no
                # node bucket, so the node-count check below will fail.
                print(node)

    assert (all_metrics["total_nodes_alloc"] +
            all_metrics["total_nodes_mixed"] +
            all_metrics["total_nodes_idle"] +
            all_metrics["total_nodes_down"] == all_metrics["total_nodes_config"])

    assert (all_metrics["total_cpus_alloc"] +
            all_metrics["total_cpus_idle"] +
            all_metrics["total_cpus_unalloc"] +
            all_metrics["total_cpus_down"] == all_metrics["total_cpus_config"])

    assert (all_metrics["total_memory_alloc"] +
            all_metrics["total_memory_idle"] +
            all_metrics["total_memory_unalloc"] +
            all_metrics["total_memory_down"] == all_metrics["total_memory_config"])

    return all_metrics
def display_metrics(metrics):
    """Print cluster utilization.

    IN:  (dict) dictionary of all node, cpu and memory states

    Memory totals are multiplied by 1024 * 1024 before being formatted as
    bytes. Percentages are relative to eligible (configured minus down)
    resources and are truncated to whole numbers.
    """
    def _percent(part, whole):
        """Return part as a whole-number percentage of whole (0 if empty)."""
        # With every node down there are no eligible resources to divide by.
        if whole == 0:
            return 0
        return part * 100 // whole

    eligible_nodes = metrics["total_nodes_config"] - metrics["total_nodes_down"]
    eligible_cpus = metrics["total_cpus_config"] - metrics["total_cpus_down"]
    eligible_memory = (metrics["total_memory_config"] -
                       metrics["total_memory_down"])

    print("")
    print("Total Allocated Nodes : {0:>8}".format(
        metrics["total_nodes_alloc"]))
    print("Total Mixed Nodes : {0:>8}".format(
        metrics["total_nodes_mixed"]))
    print("Total Idle Nodes : {0:>8}".format(
        metrics["total_nodes_idle"]))
    print("Total Down/Offline Nodes : {0:>8}".format(
        metrics["total_nodes_down"]))
    print("Total Eligible Nodes : {0:>8}".format(
        eligible_nodes))
    print("Total Configured Nodes : {0:>8}".format(
        metrics["total_nodes_config"]))
    print("")
    print("Total Allocated CPUs : {0:>8}".format(
        metrics["total_cpus_alloc"]))
    print("Total Idle CPUs : {0:>8}".format(
        metrics["total_cpus_idle"]))
    print("Total Down CPUs : {0:>8}".format(
        metrics["total_cpus_down"]))
    print("Total Unallocatable CPUs : {0:>8}".format(
        metrics["total_cpus_unalloc"]))
    print("Total Eligible CPUs : {0:>8}".format(
        eligible_cpus))
    print("Total Configured CPUs : {0:>8}".format(
        metrics["total_cpus_config"]))
    print("Cluster CPU % Unallocatable : {0:>7}%".format(
        _percent(metrics["total_cpus_unalloc"], eligible_cpus)))
    print("Cluster CPU % (Alloc + Unalloc) : {0:>7}%".format(
        _percent(metrics["total_cpus_alloc"] + metrics["total_cpus_unalloc"],
                 eligible_cpus)))
    print("")
    print("Total Allocated Memory : {0:>8}".format(
        human_readable(metrics["total_memory_alloc"] * 1024 * 1024)))
    print("Total Idle Memory : {0:>8}".format(
        human_readable(metrics["total_memory_idle"] * 1024 * 1024)))
    print("Total Down Memory : {0:>8}".format(
        human_readable(metrics["total_memory_down"] * 1024 * 1024)))
    print("Total Unallocatable Memory : {0:>8}".format(
        human_readable(metrics["total_memory_unalloc"] * 1024 * 1024)))
    print("Total Overallocated Memory : {0:>8}".format(
        human_readable(metrics["total_memory_overalloc"] * 1024 * 1024)))
    print("Total Eligible Memory : {0:>8}".format(
        human_readable(eligible_memory * 1024 * 1024)))
    print("Total Configured Memory : {0:>8}".format(
        human_readable(metrics["total_memory_config"] * 1024 * 1024)))
    print("Cluster Memory % Unallocatable : {0:>7}%".format(
        _percent(metrics["total_memory_unalloc"], eligible_memory)))
    print("Cluster Memory % (Alloc + Unalloc): {0:>7}%".format(
        _percent(metrics["total_memory_alloc"] +
                 metrics["total_memory_unalloc"], eligible_memory)))
    print("")
if __name__ == "__main__":
    try:
        # Make sure pyslurm works or else exit here
        pyslurmnode = pyslurm.node()
        # Get all node info
        nodes = pyslurmnode.get()
    except ValueError as e:
        print('Query failed - %s' % (e))
        # Without node data there is nothing to report; stop with a non-zero
        # status instead of continuing with ``nodes`` undefined.
        raise SystemExit(1)

    metrics = get_util(nodes)
    # Show the computed totals; otherwise the script produces no output.
    display_metrics(metrics)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment