SLURM job efficiency
#!/usr/bin/env python3
# port of https://github.com/SchedMD/slurm/blob/master/contribs/seff/seff
#
# equivalent plain-text sacct queries:
# sacct -P -n -a --format JobID,State,Cluster,AllocCPUS,REQMEM,TotalCPU,Elapsed,MaxRSS,ExitCode,NNodes,NTasks -j 12655203
# sacct --format=Jobid,JobName,State,QOS,Partition%5,Timelimit,Elapsed,NCPU,CPUTime,TotalCpu,SystemCPU,UserCPU,ReqMem,MaxRSS,MaxVMSize $@
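#
# All script arguments are passed through to sacct unchanged, so the usual
# selection flags work. Script name and filter values below are only examples:
#   ./seff.py -a --partition xyz --start 2022-09-23
#   ./seff.py -j 12655203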
import sys
import subprocess
import json
import operator
import statistics
from collections import Counter
from functools import reduce

#sacct_cmd = ['/usr/bin/sacct', '--json', '-n', '-a', '--partition', 'xyz', '-A', 'someaccount', '--start', '2022-09-23']
sacct_cmd = ['/usr/bin/sacct', '--json'] + sys.argv[1:]

sacct = subprocess.Popen(sacct_cmd, stdout=subprocess.PIPE)
json_out = json.load(sacct.stdout)
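# Sketch of the subset of the sacct JSON output this script relies on (field
# names as used below; the exact schema varies between Slurm versions, so
# treat this as an illustration, not a spec):
#   jobs[]: job_id, name, account, partition, qos, state.current,
#           time.{elapsed, limit, submission},
#           steps[]: step.name, nodes.count,
#                    time.{start, elapsed, total.{seconds, microseconds}},
#                    tres.{allocated, requested.total}[]: {type, name, id, count}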
job_state_cnt = Counter()
job_account_cnt = Counter()
job_partition_cnt = Counter()
job_qos_cnt = Counter()

skip_states = set(['PENDING', 'RUNNING'])          # no usable accounting data yet
detail_states = set(['FAILED', 'NODE_FAIL', 'OUT_OF_MEMORY', 'TIMEOUT'])  # print per-job details
good_states = set(['COMPLETED'])                   # only these enter the global statistics

print(f"showing job details for states: {', '.join(detail_states)}")
def print_jobstats(job_state, info, force=False):
    # per-job details are only printed for jobs in detail_states (or when forced)
    if force or job_state in detail_states:
        print(info)

global_stats = []
for slurm_job in json_out['jobs']:
    job_stats = {'waiting': 0.0}
    job_time = slurm_job['time']
    job_state = slurm_job['state']['current']

    job_state_cnt.update([job_state])
    job_account_cnt.update([slurm_job['account']])
    job_partition_cnt.update([slurm_job['partition']])
    job_qos_cnt.update([slurm_job['qos']])

    # pending or running jobs have no usable accounting information yet
    if job_state in skip_states:
        continue

    # collect global stats of good (COMPLETED) jobs only; job_stats is filled
    # in below, the dict is shared by reference
    if job_state in good_states:
        global_stats.append(job_stats)
    # elapsed is reported in seconds, the time limit in minutes
    job_stats['elapsed_eff'] = float(job_time['elapsed']) / (float(job_time['limit']) * 60.0)
    # FIXME: bug in sacct: on the job level, submission == start (the submission
    # time is the correct one; the actual start time is not available on the job
    # level and is taken from the 'batch' step below instead):
    # job_stats['waiting'] = (float(slurm_job['time']['start']) - float(slurm_job['time']['submission'])) / 60.0 / 60.0
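    # a quick sanity check of the formula with made-up numbers: a job that ran
    # for 6 hours against a 24 hour limit gives
    #   elapsed_eff = 21600 / (1440 * 60.0) = 0.25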
    print_jobstats(job_state, f"job: {slurm_job['job_id']} {slurm_job['name']} ({job_state})")

    # each step is weighted by its share of the job's total elapsed time
    step_total_weight = job_time['elapsed']
    list_stats = []
    for job_step in slurm_job['steps']:
        # ignore the 'extern' step, its usage will mostly be 0
        if job_step['step']['name'] == 'extern':
            continue
        step_stats = {}
        list_stats.append(step_stats)
        tres_alloc = {}
        step_time = job_step['time']

        step_stats['weight'] = 0.0
        if step_total_weight > 0.0:
            step_stats['weight'] = step_time['elapsed'] / step_total_weight

        step_duration_hrs = float(step_time['elapsed']) / 60.0 / 60.0
        step_stats['node_hrs'] = job_step['nodes']['count'] * step_duration_hrs
        # each tres entry has the shape {type, name, id, count}
        for tres in job_step['tres']['allocated']:
            tres_type = tres['type']
            hrs_name = tres_type + '_hrs'
            if tres['count']:
                step_stats[hrs_name] = tres['count'] * step_duration_hrs
                # memory is allocated in MB, report GB hours instead
                if tres_type == 'mem':
                    step_stats[hrs_name] = tres['count'] * step_duration_hrs / 1024.0
            tres_alloc[tres_type] = tres['count']
        for tres in job_step['tres']['requested']['total']:
            tres_type = tres['type']
            eff_name = tres_type + '_eff'
            # skip if either side of the ratio is missing
            if tres_type not in tres_alloc or tres_alloc[tres_type] is None or tres['count'] is None:
                continue
            tres_factor = tres['count'] / tres_alloc[tres_type]
            # memory usage is reported in bytes, the allocation in MB
            if tres_type == 'mem':
                tres_factor = tres['count'] / (tres_alloc[tres_type] * 1024.0 * 1024.0)
            if step_stats.get(eff_name) is None:
                step_stats[eff_name] = tres_factor
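        # e.g. (made-up numbers) 2 GiB of measured memory usage against a
        # 4096 MB allocation: 2147483648 / (4096 * 1024.0 * 1024.0) = 0.5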
        # steps without a measurable share of the runtime contribute nothing
        if step_stats['weight'] == 0:
            continue

        if job_step['step']['name'] == 'batch':
            # use the 'batch' step as the actual start time, the start timestamp
            # on the "job" object level is incorrect (see FIXME above)
            job_stats['waiting'] = (float(step_time['start']) - float(slurm_job['time']['submission'])) / 60.0 / 60.0

        # cpu efficiency: consumed CPU time vs. allocated CPUs * wall time
        step_stats['cpu_eff'] = float(step_time['total']['seconds'] + (step_time['total']['microseconds'] / 1000000.0)) / float(tres_alloc['cpu']) / float(step_time['elapsed'])
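        # a quick check with made-up numbers: 3600 s of consumed CPU time on
        # 4 allocated CPUs over 1800 s of wall time gives
        #   cpu_eff = 3600.0 / 4.0 / 1800.0 = 0.50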
        # print the per-step statistics line (details only)
        step_stats_str = ''
        for key, val in step_stats.items():
            step_stats_str += f" {key}: {val:4.2f}"
        print_jobstats(job_state, f"  step {job_step['step']['name']:8s}: {step_stats_str}")
    # add up the weighted step stats; steps that were skipped before an
    # efficiency was computed simply contribute 0.0
    for eff in ['cpu_eff', 'mem_eff']:
        job_stats[eff] = sum(item.get(eff, 0.0) * item['weight'] for item in list_stats)
    for hrs in ['cpu_hrs', 'mem_hrs', 'gres_hrs', 'node_hrs']:
        job_stats[hrs] = sum(item.get(hrs, 0.0) for item in list_stats)
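    # a minimal worked example of the weighting (made-up numbers): a 'batch'
    # step at cpu_eff 0.90 covering 80% of the elapsed time plus a second step
    # at cpu_eff 0.50 covering the remaining 20% gives
    #   job cpu_eff = 0.90 * 0.8 + 0.50 * 0.2 = 0.82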
    summary_stats_str = ''
    if job_stats['waiting'] > 0.01:
        summary_stats_str += f" waiting: {job_stats['waiting']:4.2f} hrs "
    summary_stats_str += '\n job eff: '
    for stat in job_stats:
        if stat.endswith('_eff'):
            summary_stats_str += f"{stat}: {100.0 * job_stats[stat]:2.1f}% "
    summary_stats_str += '\n resource hours: '
    for stat in job_stats:
        if stat.endswith('_hrs'):
            summary_stats_str += f" {stat}: {job_stats[stat]:2.1f}h"
    print_jobstats(job_state, summary_stats_str)
def counter_summary(cc):
    return ', '.join(f"{key}: {cnt}" for key, cnt in cc.items())
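# e.g. counter_summary(Counter({'COMPLETED': 40, 'FAILED': 2}))
# returns 'COMPLETED: 40, FAILED: 2'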
def print_stats(data, name, unit, scale=1.0):
    value_list = [stats[name] for stats in data]
    if not value_list:
        # nothing to report (e.g. no COMPLETED jobs at all)
        print(f" {name:12s}: no data")
        return
    stat_mean = statistics.mean(value_list)
    stat_median = statistics.median(value_list)
    # geometric mean over the non-zero values only (a single zero would
    # otherwise collapse it to 0)
    non_zero_list = [x for x in value_list if x > 0.0]
    stat_geomean = 0.0
    if len(non_zero_list) > 0:
        stat_geomean = float(reduce(operator.mul, non_zero_list, 1)) ** (1.0 / float(len(non_zero_list)))
    # python 3.8+ could use statistics.geometric_mean() and statistics.quantiles() instead
    print(f" {name:12s}: median: {scale * stat_median:6.2f} {unit}, geomean: {scale * stat_geomean:6.2f} {unit}, avg: {scale * stat_mean:6.2f} {unit}, min: {scale * min(value_list):6.2f} {unit}, max: {scale * max(value_list):6.2f} {unit}, total: {scale * sum(value_list):6.2f} {unit}")
print ("done.")
print()
print ("generated from output of: " + " ".join(sacct_cmd))
print(f"SUMMARY of { sum(job_state_cnt.values()) } jobs:")
print("job states:", counter_summary(job_state_cnt))
print("job accounts:", counter_summary(job_account_cnt))
print("job partition:", counter_summary(job_partition_cnt))
print("job qos:", counter_summary(job_qos_cnt))
print(f"stats for {len(global_stats)} COMPLETED jobs:")
print_stats(global_stats, 'waiting', 'h')
for k in ['elapsed_eff', 'cpu_eff', 'mem_eff']:
    # efficiencies are stored as fractions, scale to percent for display
    print_stats(global_stats, k, '%', scale=100.0)
# resource hours
print("consumed resource hours:")
for k in ['cpu_hrs', 'gres_hrs', 'mem_hrs', 'node_hrs']:
print_stats(global_stats, k, 'h')