Skip to content

Instantly share code, notes, and snippets.

@giovtorres
Last active April 3, 2020 08:41
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save giovtorres/a26bcd754bf0eaa4b4e10b8e48bdfa22 to your computer and use it in GitHub Desktop.
Graph Slurm's sdiag with PySlurm and Graphite
#!/usr/bin/python
# vim: set ts=4 sw=4 et
"""
slurm_sched_stats.py
A script that uses PySlurm to get the slurm scheduler statistics.
"""
import pickle
import socket
import struct
import sys
import time
import pyslurm
__author__ = "Giovanni Torres"
CARBON_SERVER = "127.0.0.1"
CARBON_PICKLE_PORT = 2004
DELAY=30
def get_sched_stats():
    """Collect Slurm scheduler statistics via PySlurm's sdiag interface.

    Returns:
        dict: metric name -> value, suitable for shipping to Graphite,
        or ``None`` when the statistics could not be retrieved (the
        caller treats a ``None`` as "skip this sample").
    """
    try:
        sdiag = pyslurm.statistics().get()
    except Exception:
        # Narrowed from a bare ``except:`` so SystemExit/KeyboardInterrupt
        # still propagate; any pyslurm failure just yields "no data".
        return None

    stats = {}

    # Slurmctld stats
    stats["server_thread_count"] = sdiag.get("server_thread_count")
    stats["agent_queue_size"] = sdiag.get("agent_queue_size")

    # Job counters
    stats["jobs_submitted"] = sdiag.get("jobs_submitted")
    stats["jobs_started"] = sdiag.get("jobs_started")
    stats["jobs_completed"] = sdiag.get("jobs_completed")
    stats["jobs_canceled"] = sdiag.get("jobs_canceled")
    stats["jobs_failed"] = sdiag.get("jobs_failed")

    # Main scheduler stats (counters hoisted to locals to avoid
    # repeated dict lookups of the same keys).
    main_cycles = sdiag.get("schedule_cycle_counter")
    stats["main_last_cycle"] = sdiag.get("schedule_cycle_last")
    stats["main_max_cycle"] = sdiag.get("schedule_cycle_max")
    stats["main_total_cycles"] = main_cycles
    if main_cycles > 0:
        stats["main_mean_cycle"] = sdiag.get("schedule_cycle_sum") / main_cycles
        stats["main_mean_depth_cycle"] = sdiag.get("schedule_cycle_depth") / main_cycles
    elapsed = sdiag.get("req_time") - sdiag.get("req_time_start")
    if elapsed > 60:
        stats["main_cycles_per_minute"] = main_cycles / (elapsed / 60)
    stats["main_last_queue_length"] = sdiag.get("schedule_queue_len")

    # Backfilling stats
    bf_cycles = sdiag.get("bf_cycle_counter")
    stats["bf_total_jobs_since_slurm_start"] = sdiag.get("bf_backfilled_jobs")
    stats["bf_total_jobs_since_cycle_start"] = sdiag.get("bf_last_backfilled_jobs")
    stats["bf_total_cycles"] = bf_cycles
    stats["bf_last_cycle"] = sdiag.get("bf_cycle_last")
    stats["bf_max_cycle"] = sdiag.get("bf_cycle_max")
    stats["bf_queue_length"] = sdiag.get("bf_queue_len")
    if bf_cycles > 0:
        stats["bf_mean_cycle"] = sdiag.get("bf_cycle_sum") / bf_cycles
        stats["bf_depth_mean"] = sdiag.get("bf_depth_sum") / bf_cycles
        stats["bf_depth_mean_try"] = sdiag.get("bf_depth_try_sum") / bf_cycles
        stats["bf_queue_length_mean"] = sdiag.get("bf_queue_len_sum") / bf_cycles
    stats["bf_last_depth_cycle"] = sdiag.get("bf_last_depth")
    stats["bf_last_depth_cycle_try"] = sdiag.get("bf_last_depth_try")

    return stats
def run(sock, delay):
    """Periodically sample scheduler stats and ship them to Carbon.

    Runs forever: every ``delay`` seconds, collects the metrics and sends
    them over ``sock`` using Graphite's pickle protocol — a 4-byte
    big-endian length header followed by the pickled list of
    ``(metric_path, (timestamp, value))`` tuples.

    Args:
        sock: a connected TCP socket to the Carbon pickle receiver.
        delay: seconds to sleep between samples.
    """
    # Loop-invariant metric-path prefix, hoisted out of the loop.
    prefix = "cluster.slurm_sched_stats.gauge-"
    while True:
        now = int(time.time())
        stats = get_sched_stats()
        if stats is not None:
            # Was a manual append loop seeded with the confusing
            # literal ``([])``; a comprehension says the same thing.
            metrics = [(prefix + key, (now, value))
                       for key, value in stats.items()]
            # Protocol 1 keeps the payload compatible with carbon's
            # pickle receiver.
            payload = pickle.dumps(metrics, 1)
            header = struct.pack('!L', len(payload))
            try:
                sock.sendall(header)
                sock.sendall(payload)
            except socket.error:
                # Best-effort: drop this sample if carbon is unreachable.
                pass
        time.sleep(delay)
def main():
    """Connect to the Carbon pickle receiver and stream stats forever.

    Raises:
        SystemExit: if the initial connection to Carbon fails, or on
        CTRL-c (exit status 0).
    """
    sock = socket.socket()
    try:
        sock.connect((CARBON_SERVER, CARBON_PICKLE_PORT))
    except socket.error:
        # The original split this string with a backslash continuation
        # *inside* the literal, embedding a run of stray spaces in the
        # message; build it cleanly instead.
        raise SystemExit(
            "Couldn't connect to %s on port %d. Is carbon-cache running?"
            % (CARBON_SERVER, CARBON_PICKLE_PORT)
        )
    try:
        run(sock, DELAY)
    except KeyboardInterrupt:
        sys.stderr.write("\nExiting on CTRL-c\n")
        sys.exit(0)
    finally:
        # Ensure the socket is released even when run() raises.
        sock.close()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment