Skip to content

Instantly share code, notes, and snippets.

@giovtorres
Last active July 8, 2016 19:25
Show Gist options
  • Save giovtorres/2ce295be802554904665 to your computer and use it in GitHub Desktop.
Save giovtorres/2ce295be802554904665 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
"""
show-cluster-util
This script uses the Python SLURM bindings (pyslurm) to report cluster
utilization, accounting for nodes that are allocated exclusively or are
fully allocated.
"""
import sys

import hostlist
import pyslurm
__author__ = "Giovanni Torres"
def human_readable(num, suffix="B"):
    """Convert a byte count to a human readable string.

    IN:  (int/float) num    - number of bytes; may be negative
         (str)       suffix - text appended after the scale prefix
    OUT: (str) the value scaled by powers of 1024 and formatted with one
         decimal place, e.g. "1.5 KB"

    Zero is always reported as the literal "0.0 GB", regardless of suffix;
    this is kept unchanged so existing report output stays the same.
    """
    if num == 0:
        return "0.0 GB"
    for unit in ('', 'K', 'M', 'G', 'T', 'P', 'E', 'Z'):
        if abs(num) < 1024.0:
            return "%.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    # Larger than every prefix in the loop: report in the biggest unit
    # instead of falling off the end and returning None.
    return "%.1f %s%s" % (num, 'Y', suffix)
def get_part_info():
    """Collect partition memory defaults and a node-to-partition map.

    OUT: (tuple) two dictionaries:
         - partition name -> that partition's "def_mem_per_cpu" value
         - node name -> partition name, built only from partitions other
           than "interactive", "quick" and "maint"; when a node appears in
           several partitions the one visited last wins
    """
    skipped_parts = ("interactive", "quick", "maint")
    mem_defaults = {}
    node_to_part = {}
    for name, info in pyslurm.partition().get().items():
        mem_defaults[name] = info["def_mem_per_cpu"]
        members = hostlist.expand_hostlist(info["nodes"])
        if name in skipped_parts:
            continue
        node_to_part.update(dict.fromkeys(members, name))
    return (mem_defaults, node_to_part)
def get_util(nodes):
    """Tally cluster-wide node, CPU and memory counters by state.

    IN:  (dict) dictionary of all nodes from pyslurm.node().get()
    OUT: (dict) counters keyed "total_nodes_*", "total_cpus_*" and
         "total_memory_*"
    """
    totals = dict.fromkeys((
        "total_cpus_alloc",
        "total_cpus_idle",
        "total_cpus_down",
        "total_cpus_unalloc",
        "total_cpus_config",
        "total_memory_alloc",
        "total_memory_idle",
        "total_memory_down",
        "total_memory_unalloc",
        "total_memory_overalloc",
        "total_memory_config",
        "total_nodes_mixed",
        "total_nodes_alloc",
        "total_nodes_idle",
        "total_nodes_down",
        "total_nodes_config",
    ), 0)
    defmempercpu, all_nodes = get_part_info()

    for name, info in nodes.items():
        state = info.get("state").upper()
        cpu_used = info.get("alloc_cpus")
        cpu_count = info.get("cpus")
        mem_used = info.get("alloc_mem")
        mem_real = info.get("real_memory")
        shared = info.get("shared")

        # Every node counts towards the configured totals.
        totals["total_nodes_config"] += 1
        totals["total_cpus_config"] += cpu_count
        totals["total_memory_config"] += mem_real

        # Down or draining nodes contribute all their resources as "down".
        if "DOWN" in state or "DRAIN" in state:
            totals["total_nodes_down"] += 1
            totals["total_cpus_down"] += cpu_count
            totals["total_memory_down"] += mem_real
            continue

        totals["total_cpus_alloc"] += cpu_used

        # NOTE(review): shared == 0 presumably marks an exclusively
        # allocated node -- confirm against the pyslurm node fields.
        if "ALLOCATED" in state or shared == 0:
            # Whatever is left on the node cannot be handed out.
            totals["total_nodes_alloc"] += 1
            totals["total_cpus_unalloc"] += cpu_count - cpu_used
            if mem_used > mem_real:
                # Cap allocated memory at what the node has; track the
                # excess separately.
                totals["total_memory_alloc"] += mem_real
                totals["total_memory_overalloc"] += mem_used - mem_real
            else:
                totals["total_memory_alloc"] += mem_used
                totals["total_memory_unalloc"] += mem_real - mem_used
        elif "MIXED" in state:
            totals["total_nodes_mixed"] += 1
            if mem_used > mem_real:
                totals["total_cpus_unalloc"] += cpu_count - cpu_used
                totals["total_memory_alloc"] += mem_real
                totals["total_memory_overalloc"] += mem_used - mem_real
            elif (mem_real - mem_used) < 2 * defmempercpu[all_nodes[name]]:
                # Less than twice the partition's default memory per CPU
                # remains, so the leftovers count as unallocatable.
                totals["total_cpus_unalloc"] += cpu_count - cpu_used
                totals["total_memory_alloc"] += mem_used
                totals["total_memory_unalloc"] += mem_real - mem_used
            else:
                totals["total_cpus_idle"] += cpu_count - cpu_used
                totals["total_memory_alloc"] += mem_used
                totals["total_memory_idle"] += mem_real - mem_used
        elif "IDLE" in state:
            totals["total_nodes_idle"] += 1
            totals["total_cpus_idle"] += cpu_count
            totals["total_memory_idle"] += mem_real
        else:
            # Unrecognised state: show the node name.
            print(name)

    # Sanity checks: the per-state buckets must add up to the configured
    # totals.
    assert (totals["total_nodes_alloc"] +
            totals["total_nodes_mixed"] +
            totals["total_nodes_idle"] +
            totals["total_nodes_down"]) == totals["total_nodes_config"]
    assert (totals["total_cpus_alloc"] +
            totals["total_cpus_idle"] +
            totals["total_cpus_unalloc"] +
            totals["total_cpus_down"]) == totals["total_cpus_config"]
    assert (totals["total_memory_alloc"] +
            totals["total_memory_idle"] +
            totals["total_memory_unalloc"] +
            totals["total_memory_down"]) == totals["total_memory_config"]
    return totals
def display_metrics(metrics):
    """Print cluster utilization.

    IN: (dict) dictionary of all node, cpu and memory states, keyed
        "total_nodes_*", "total_cpus_*" and "total_memory_*"

    Percentages are taken against the eligible (configured minus down)
    totals. When nothing is eligible they are shown as 0 instead of
    raising ZeroDivisionError.
    """
    def percent(part, whole):
        # Floor division keeps the whole-number percentages that integer
        # "/" produces under Python 2.
        if not whole:
            return 0
        return part * 100 // whole

    def mem(amount):
        # Memory counters are scaled by 1024 * 1024 before being handed to
        # the byte formatter.
        return human_readable(amount * 1024 * 1024)

    eligible_nodes = (metrics["total_nodes_config"] -
                      metrics["total_nodes_down"])
    eligible_cpus = (metrics["total_cpus_config"] -
                     metrics["total_cpus_down"])
    eligible_memory = (metrics["total_memory_config"] -
                       metrics["total_memory_down"])

    print("")
    print("Total Allocated Nodes : {0:>8}".format(
        metrics["total_nodes_alloc"]))
    print("Total Mixed Nodes : {0:>8}".format(
        metrics["total_nodes_mixed"]))
    print("Total Idle Nodes : {0:>8}".format(
        metrics["total_nodes_idle"]))
    print("Total Down/Offline Nodes : {0:>8}".format(
        metrics["total_nodes_down"]))
    print("Total Eligible Nodes : {0:>8}".format(
        eligible_nodes))
    print("Total Configured Nodes : {0:>8}".format(
        metrics["total_nodes_config"]))
    print("")
    print("Total Allocated CPUs : {0:>8}".format(
        metrics["total_cpus_alloc"]))
    print("Total Idle CPUs : {0:>8}".format(
        metrics["total_cpus_idle"]))
    print("Total Down CPUs : {0:>8}".format(
        metrics["total_cpus_down"]))
    print("Total Unallocatable CPUs : {0:>8}".format(
        metrics["total_cpus_unalloc"]))
    print("Total Eligible CPUs : {0:>8}".format(
        eligible_cpus))
    print("Total Configured CPUs : {0:>8}".format(
        metrics["total_cpus_config"]))
    print("Cluster CPU % Unallocatable : {0:>7}%".format(
        percent(metrics["total_cpus_unalloc"], eligible_cpus)))
    print("Cluster CPU % (Alloc + Unalloc) : {0:>7}%".format(
        percent(metrics["total_cpus_alloc"] + metrics["total_cpus_unalloc"],
                eligible_cpus)))
    print("")
    print("Total Allocated Memory : {0:>8}".format(
        mem(metrics["total_memory_alloc"])))
    print("Total Idle Memory : {0:>8}".format(
        mem(metrics["total_memory_idle"])))
    print("Total Down Memory : {0:>8}".format(
        mem(metrics["total_memory_down"])))
    print("Total Unallocatable Memory : {0:>8}".format(
        mem(metrics["total_memory_unalloc"])))
    print("Total Overallocated Memory : {0:>8}".format(
        mem(metrics["total_memory_overalloc"])))
    print("Total Eligible Memory : {0:>8}".format(
        mem(eligible_memory)))
    print("Total Configured Memory : {0:>8}".format(
        mem(metrics["total_memory_config"])))
    print("Cluster Memory % Unallocatable : {0:>7}%".format(
        percent(metrics["total_memory_unalloc"], eligible_memory)))
    print("Cluster Memory % (Alloc + Unalloc): {0:>7}%".format(
        percent(metrics["total_memory_alloc"] +
                metrics["total_memory_unalloc"],
                eligible_memory)))
    print("")
if __name__ == "__main__":
    # Script entry point: query SLURM for every node, then print the
    # utilization report.
    try:
        # Make sure pyslurm works or else exit here
        pyslurmnode = pyslurm.node()
        # Get all node info
        nodes = pyslurmnode.get()
    except ValueError as e:
        print('Query failed - %s' % (e))
        # Relies on the module-level "import sys"; without it this line
        # raised NameError instead of exiting with status 1.
        sys.exit(1)
    metrics = get_util(nodes)
    display_metrics(metrics)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment