Skip to content

Instantly share code, notes, and snippets.

@ebirn
Created August 1, 2018 13:31
Show Gist options
  • Save ebirn/91523d28d9423756d1befe89d5310bf9 to your computer and use it in GitHub Desktop.
Save ebirn/91523d28d9423756d1befe89d5310bf9 to your computer and use it in GitHub Desktop.
SLURM job resource utilization things
<!DOCTYPE html>
<html lang="en">
<head>
<link href="bootstrap/css/bootstrap.min.css" rel="stylesheet">
<link href="bootstrap/css/bootstrap-responsive.min.css" rel="stylesheet">
<title>CIR Status Board</title>
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<script type="text/javascript" src="date.js"></script>
<script type="text/javascript" src="jquery-1.9.1.min.js"></script>
<script type="text/javascript" src="bootstrap/js/bootstrap.min.js"></script>
<script>
/**
 * Re-assigns an image's `src` to itself and refreshes the "last updated" label.
 *
 * Both DOM lookups are guarded: a page that lacks the image or the
 * `update-timestamp` element no longer throws a TypeError.
 *
 * @param {string} id  DOM id of the image element to refresh.
 * @returns {boolean}  Always false, so the function can be used directly as a
 *                     click handler that suppresses the default action.
 */
function reloadImg(id) {
    var obj = document.getElementById(id);
    if (obj) {
        // Same URL is written back unchanged; a cache-busting variant
        // (src + '&v=' + date.getTime()) was tried and disabled by the author.
        obj.src = obj.src;
    }

    var date = new Date();
    // NOTE(review): the format argument is presumably interpreted by date.js
    // (loaded in <head>); plain Date#toString ignores it -- confirm.
    var stamp = date.toString('dd.MM.yyyy HH:mm');
    var updated = 'last updated: ' + stamp;

    var text = document.getElementById("update-timestamp");
    if (text) {
        text.innerHTML = updated;
    }
    return false;
}
</script>
</head>
<body>
<?php
date_default_timezone_set('Europe/Vienna');
include 'menu.inc.php';
?>
<div class="container">
<?php
# echo '<div class="row-fluid">';
# echo "\n";
# echo '<div class="span6">';
# --- Database connection ---------------------------------------------------
# NOTE(review): credentials are hard-coded in a web-served file; consider
# moving them to a configuration file outside the document root.
$db_name="slurm_accounting";
$db_user="slurm";
$db_pass="SUPER_SECRET_PASSWORD";
$mysql = new mysqli('bender.cir.meduniwien.ac.at', $db_user, $db_pass, $db_name);
if ($mysql->connect_errno) {
    # Every section below reads from the database, so stop here with a readable
    # message instead of failing on the first query result.
    die('<p class="alert alert-danger">Unable to connect to the SLURM accounting database.</p></div></body></html>');
}

# --- Current user ----------------------------------------------------------
# posix_getpwnam() fields: name, passwd, uid, gid, gecos, dir, shell
# $my_uid_number is concatenated into SQL further down, so it must always be an
# integer. -1 is used when the request carries no HTTP auth user or the name is
# unknown to the system, which keeps those queries valid.
$auth_user = isset($_SERVER['PHP_AUTH_USER']) ? $_SERVER['PHP_AUTH_USER'] : '';
$posix_data = ($auth_user !== '') ? posix_getpwnam($auth_user) : false;
$my_uid_number = is_array($posix_data) ? (int) $posix_data['uid'] : -1;

# --- Job state lookup tables -------------------------------------------------
# Both arrays are indexed by the numeric `state` column of the job/step tables:
# display name and the CSS label class used for it.
$job_state=array('PENDING', 'RUNNING', 'SUSPENDED', 'COMPLETED', 'CANCELLED', 'FAILED', 'TIMEOUT', 'NODE FAILED', 'PREEMPTED', 'BOOT FAIL', 'JOB END');
$job_state_label=array('label-primary', 'label-success', 'label-warning', 'label-success', 'label-warning', 'label-danger', 'label-danger', 'label-danger', 'label-warning', 'label-danger', 'label-info');

# --- Queries whose results are consumed by the sections below ----------------
$global_stats = $mysql->query("select state, count(*) as count from circ_job_table group by state order by state");
$reservations = $mysql->query("select * from circ_resv_table where deleted=0 and time_end > unix_timestamp() order by time_start ASC");
$usage_query_hours = $mysql->query("select * from circ_usage_hour_table order by creation_time DESC limit 24");
$usage_query_days = $mysql->query("select * from circ_usage_day_table order by creation_time DESC limit 30");
$usage_query_months = $mysql->query("select * from circ_usage_month_table order by creation_time DESC limit 12");
$time_now = time();
$global_counts=array();
# Normalise a memory figure to bytes.
# Values below 1000000 are multiplied by 1024*1024 (i.e. treated as megabytes);
# anything at or above that threshold is returned untouched.
function read_memory($mem) {
    return ($mem < 1000000) ? $mem * 1024 * 1024 : $mem;
}
# Build $global_counts: state name => number of jobs in that state.
# Every known state starts at zero so states without rows still have an entry.
foreach ($job_state as $state) {
    $global_counts[$state] = 0;
}
# The numeric state of each row is the index into $job_state, so the name is
# looked up directly instead of searching the array once per state per row.
# A failed query (false) simply leaves all counts at zero.
if ($global_stats) {
    while ($state_row = $global_stats->fetch_assoc()) {
        $state_index = (int) $state_row['state'];
        if (isset($job_state[$state_index])) {
            $global_counts[$job_state[$state_index]] = $state_row['count'];
        }
    }
}
# Render a unix timestamp for display.
#   $unixtime  seconds since the epoch; anything not greater than 0 yields "unknown"
#   $withtime  when truthy, the date is followed by the time of day in bold
# Returns "Y-m-d", optionally followed by " <b>H:i</b>", in the default timezone.
function format_timestamp($unixtime, $withtime=TRUE) {
    if (!($unixtime > 0)) {
        return "unknown";
    }
    $day_part = date('Y-m-d', $unixtime);
    if (!$withtime) {
        return $day_part;
    }
    return $day_part . ' <b>' . date('H:i', $unixtime) . '</b>';
}
?>
<div class="row">
<h3>my SLURM jobs</h3>
<p>Global job stats: total jobs: <?php echo $global_counts['RUNNING'] + $global_counts['PENDING']; ?> (running: <?php echo $global_counts['RUNNING']; ?>, pending: <?php echo $global_counts['PENDING'] ?>). Total reservations: <?php echo $reservations->num_rows; ?></p>
<p>For details of what you can do and how to do it, see the <a href="https://www.cir.meduniwien.ac.at/wiki/internal:computingenvironment">computing environment documentation</a> in the wiki. These pages may be of particular interest:
<ul>
<li><a href="https://www.cir.meduniwien.ac.at/wiki/internal:softwaremodules">Software modules</a></li>
<li><a href="https://www.cir.meduniwien.ac.at/wiki/internal:slurm_scheduler_tutorial">SLURM scheduler</a></li>
</ul>
</div>
<div class="tabbable"> <!-- Only required for left/right tabs -->
<ul class="nav nav-tabs">
<li class="active"><a href="#jobs_active" data-toggle="tab">Jobs Active (<?php echo $global_counts['RUNNING'] + $global_counts['PENDING']; ?>)</a></li>
<li><a href="#jobs_done" data-toggle="tab">Jobs Done</a></li>
<li><a href="#reservations" data-toggle="tab">Reservations</a></li>
<li><a href="#usage" data-toggle="tab">Usage</a></li>
</ul>
<div class="tab-content">
<div class="tab-pane active" id="jobs_active">
<?php
# job_db_inx | mod_time | deleted | account | cpus_req | cpus_alloc | derived_ec | derived_es | exit_code | job_name | id_assoc | id_block | id_job | id_qos | id_resv | id_wckey | id_user | id_group | kill_requid | mem_req | nodelist | nodes_alloc | node_inx | partition | priority | state | timelimit | time_submit | time_eligible | time_start | time_end | time_suspended | gres_req | gres_alloc | gres_used | wckey | track_steps
# | deleted | cpus_alloc | exit_code | id_step | kill_requid | nodelist | nodes_alloc | node_inx | state | step_name | task_cnt | task_dist | time_start | time_end | time_suspended | user_sec | user_usec | sys_sec | sys_usec | max_pages | max_pages_task | max_pages_node | ave_pages | max_rss | max_rss_task | max_rss_node | ave_rss | max_vsize | max_vsize_task | max_vsize_node | ave_vsize | min_cpu | min_cpu_task | min_cpu_node | ave_cpu | act_cpufreq | consumed_energy | req_cpufreq | max_disk_read | max_disk_read_task | max_disk_read_node | ave_disk_read | max_disk_write | max_disk_write_task | max_disk_write_node | ave_disk_write |
# Active jobs are those with state < 3, each joined with its QOS name.
# The same ordering is applied to the inner selection and to the final result.
$select_fields = 'jt.*';
$queue_order = "time_start DESC, priority DESC, id_job ASC";
$query_str = 'select jt.*, qos.name as qos_name from '
    . '(select * from circ_job_table where state < 3 order by ' . $queue_order . ') jt '
    . 'left join qos_table qos ON (jt.id_qos=qos.id) '
    . 'left join circ_step_table st USING (job_db_inx) '
    . 'ORDER BY ' . $queue_order;
$active_jobs = $mysql->query($query_str);
if ($active_jobs->num_rows > 0) {
?>
<p>for similar output at the cmd line use: <code>squeue -o '%A %8u %16j %20S %10l (%10L) %6Q %10T %8N %3C %10m %6b'</code></p>
<table class="table table-striped">
<thead>
<tr>
<th class="span1">Job ID</th>
<th>User</th>
<th class="span2">Name</th>
<th>Started</th>
<th>Timelimit (remaining)</th>
<th>Priority</th>
<th>CPUs</th>
<th>Memory</th>
<th>gen. Res.</th>
<th>QOS</th>
<th>State</th>
</tr>
</thead>
<tbody>
<?php
# Emit one table row per active job.
# The job query can return the same job on consecutive rows; only the first
# occurrence is rendered.
$prev_job=-1;
while ($job = $active_jobs->fetch_assoc()) {
    if ($prev_job == $job['id_job']) {
        continue;
    }
    $prev_job = $job['id_job'];

    $is_my_job = ($job["id_user"] == $my_uid_number);
    $state_label = $job_state_label[$job['state']];

    # posix_getpwuid() returns false for uids unknown on this host; show the
    # numeric uid in that case.
    $job_posix_user = posix_getpwuid($job["id_user"]);
    $job_username = is_array($job_posix_user) ? $job_posix_user['name'] : $job["id_user"];

    # The limit is added to the start time as (60 * timelimit) seconds.
    $startTime = new DateTime();
    $startTime->setTimestamp($job['time_start']);
    $endTime = new DateTime();
    $endTime->setTimestamp($job['time_start'] + (60 * $job['timelimit']));
    $timelimit = $endTime->diff($startTime);
    $timeRemaining = $endTime->diff(new DateTime());

    # D-HH:MM:SS with total days (%a) and zero-padded fields, matching squeue.
    $span_format = "%a-%H:%I:%S";
    $limit_text = $timelimit->format($span_format);
    if ($job['time_start'] > 0) {
        # Remaining time only makes sense once the job has actually started.
        $limit_text .= " (" . $timeRemaining->format($span_format) . ")";
    }

    # Job name, user name and gres are free text and are escaped before output.
    echo "<tr class=\"" . ($is_my_job ? "info" : "") ."\">";
    echo "<td>". $job["id_job"] ."</td>";
    echo "<td>". htmlspecialchars((string) $job_username, ENT_QUOTES, 'UTF-8') ."</td>";
    echo "<td>". htmlspecialchars((string) $job["job_name"], ENT_QUOTES, 'UTF-8') ."</td>";
    echo "<td>". format_timestamp($job['time_start']) ."</td>";
    echo "<td>". $limit_text . "</td>";
    echo "<td>". $job['priority'] ."</td>";
    echo "<td>". $job['cpus_req'] ."</td>";
    echo "<td>". read_memory($job['mem_req'])/1024/1024 ." MB</td>";
    echo "<td>". htmlspecialchars((string) $job['gres_req'], ENT_QUOTES, 'UTF-8') ."</td>";
    echo "<td>". htmlspecialchars((string) $job['qos_name'], ENT_QUOTES, 'UTF-8') ."</td>";
    echo "<td><span class=\"label $state_label\">". $job_state[$job['state']] ."</span></td>";
    echo "</tr>";
}
?>
</tbody>
</table>
<?php
} // close if: num rows > 0
else {
echo "<p>no active jobs.</p>";
}
?>
</div>
<!-- COMPLETED JOBS -->
<div class="tab-pane" id="jobs_done">
<div class="panel-group" id="job-complete-panel" role="tablist" aria-multiselectable="true">
<?php
# Fetch the current user's most recent finished jobs (state > 2), newest
# submission first, limited to $job_count rows.
$job_count = 50;
$inner_select = 'select * from circ_job_table where id_user=' . $my_uid_number
    . ' and state > 2 order by time_submit DESC, job_db_inx dESC limit ' . $job_count;
$query_str = 'select * from (' . $inner_select . ') jt ORDER BY time_submit DESC';
$past_job_result = $mysql->query($query_str);
while ($job = $past_job_result->fetch_assoc()) {
$state_label = $job_state_label[$job['state']];
?>
<div class="panel panel-default">
<?php /* Panel heading for one finished job: name/id, start/end, resources, state and the toggle for its steps. Free-text fields are HTML-escaped. */ ?>
<div class="panel-heading" role="tab" id="job-collapse-heading-<?php echo $job['id_job']; ?>">
<div class="row">
<div class="col-md-3">
<h4 class="panel-title"> <?php echo htmlspecialchars((string) $job["job_name"], ENT_QUOTES, 'UTF-8'); ?></h4>
<p>(ID <?php echo $job["id_job"]; ?>)</p>
</div>
<div class="col-md-3">
<p>Start: <?php echo format_timestamp($job['time_start']); ?> </p>
<p>End: <?php echo format_timestamp($job['time_end']); ?> </p>
</div>
<div class="col-md-3">
<p>Nodes: <?php echo htmlspecialchars((string) $job['nodelist'], ENT_QUOTES, 'UTF-8'); ?> (CPUs: <?php echo $job['cpus_req']; ?>) </p>
<p>Memory: <?php echo round(read_memory($job['mem_req'])/1024/1024); ?>MB</p>
</div>
<div class="col-md-1">
</div>
<div class="col-md-2">
<p><?php echo "<span class=\"label $state_label\">". $job_state[$job['state']] ."</span> "; ?></p>
<p>
<a data-toggle="collapse" data-parent="#job-complete-panel" href="#job-collapse-<?php echo $job['id_job']; ?>" aria-expanded="false" aria-controls="job-collapse-<?php echo $job['id_job']; ?>">(show job details)</a>
</p>
</div>
</div> <!-- row / col -->
</div> <!-- panel header-->
<div id="job-collapse-<?php echo $job['id_job']; ?>" class="panel-collapse collapse" role="tabpanel" aria-labelledby="job-collapse-heading-<?php echo $job['id_job']; ?>">
<div class="panel-body">
<?php
// render job step
//
//<!-- step fields st.state as step_state, st.time_start as step_time_start, st.time_end as step_time_end, st.id_step-->
$step_query = 'select * from circ_step_table where job_db_inx='.$job['job_db_inx'].' ORDER BY id_step';
$step_result = $mysql->query($step_query);
while($step = $step_result->fetch_assoc()) {
$state_label = $job_state_label[$step['state']];
?>
<div class="row">
<div class="col-md-2">
<p><b>Step: <?php echo htmlspecialchars((string) $step['step_name'], ENT_QUOTES, 'UTF-8'); ?> </b></p>
<?php
# Wall-clock duration of the step, from its start and end timestamps.
$dateStart = new DateTime();
$dateStart->setTimestamp($step['time_start']);
$dateEnd = new DateTime();
$dateEnd->setTimestamp($step['time_end']);
$duration = $dateStart->diff($dateEnd);
# Shown as D-HH:MM:SS: %a is the total day count, %I/%S are zero-padded.
?>
<p>Duration: <?php echo $duration->format('%a-%H:%I:%S'); ?> </p>
</div>
<div class="col-md-2">
<?php
# Share of the step's CPU time spent in user mode, system mode, and idle.
# NOTE(review): the available CPU time is taken as
# wall time * task_cnt * cpus_alloc -- confirm that cpus_alloc is per task,
# otherwise this over-states the denominator.
$taskFactor = $step['task_cnt'];
$cpuFactor = $step['cpus_alloc'];
$userTime = $step['user_sec'];
$sysTime = $step['sys_sec'];
$totalTime = $step['time_end'] - $step['time_start'];
$totalCpuTime = $totalTime * $taskFactor * $cpuFactor;
$userPercent = $totalCpuTime > 0 ? ($userTime / $totalCpuTime * 100) : 0;
$sysPercent = $totalCpuTime > 0 ? ($sysTime / $totalCpuTime * 100) : 0;
# Never let the idle share go negative when user+sys exceed the estimate.
$idlePercent = max(0.0, 100.0-$userPercent-$sysPercent);
?>
<p>Time user/sys/idle</p>
<p>
<div class="progress">
<div class="progress-bar progress-bar-success" style="width: <?php echo $userPercent; ?>%">
<span class="">user <?php echo round($userPercent); ?>%</span>
</div>
<div class="progress-bar progress-bar-danger" style="width: <?php echo $sysPercent; ?>%">
<span class="">sys <?php echo round($sysPercent); ?>%</span>
</div>
<div class="progress-bar progress-bar-info" style="width: <?php echo $idlePercent; ?>%">
<span class="">idle <?php echo round($idlePercent); ?>%</span>
</div>
</div>
</p>
</div>
<div class="col-md-2">
<p>Nodes: <?php echo $step['nodes_alloc']; ?></p>
<p>Tasks: <?php echo $step['task_cnt']; ?> (CPUs: <?php echo $step['cpus_alloc']; ?>)</p>
</div>
<div class="col-md-2">
<p>avg disk IN: <?php echo round($step['ave_disk_read']); ?> MB</p>
<p>avg disk OUT: <?php echo round($step['ave_disk_write']); ?> MB</p>
</div>
<?php
# Peak memory of the step as a percentage of the job's requested memory.
# max_rss is scaled by 1024 before being compared with the byte value from
# read_memory(); a request of 0 is reported as 0% instead of dividing by zero.
$memRequestedBytes = read_memory($job['mem_req']);
$memUsedPercent = $memRequestedBytes > 0
    ? round($step['max_rss'] / $memRequestedBytes * 1024 * 100)
    : 0;
# The bar cannot be wider than its container; the label keeps the real value.
$memBarWidth = max(0, min(100, $memUsedPercent));

# Low utilisation of the request is what gets flagged: under 25% is
# highlighted as danger, under 50% as warning.
$alertClass = "";
$mem_bar_type = 'info';
if($memUsedPercent < 25.0) {
    $mem_bar_type = "danger";
    $alertClass = "alert alert-danger";
}
else if($memUsedPercent < 50.0) {
    $mem_bar_type = "warning";
}
else if($memUsedPercent < 75.0) {
    $mem_bar_type = "info";
}
else {
    $mem_bar_type = "success";
}
?>
<div class="<?php echo "col-md-2 " . $alertClass; ?>" >
<p>Memory: <?php echo round($step['max_rss']/1024); ?> MB</p>
<p>
<div class="progress">
<div class="progress-bar progress-bar-<?php echo $mem_bar_type; ?>" style="width: <?php echo $memBarWidth;?>%">
<span class=""><?php echo round($memUsedPercent); ?>%</span>
</div>
</div>
</p>
</div>
<div class="col-md-2">
</div>
<div class="col-md-2">
<p>Exit code: <?php echo $step['exit_code']; ?></p>
<p><?php echo "<span class=\"label $state_label\">". $job_state[$step['state']] ."</span> "; ?></p>
</div>
</div>
<?php
}
?>
</div> <!-- steps panel body-->
</div> <!-- collapse steps panel body-->
</div><!-- job group panel -->
<?php
} // jobs while loop
?>
</div> <!-- job complete container panel -->
</div> <!-- tab pane jobs_done -->
<div class="tab-pane" id="reservations">
<p>resources reserved for a user or group of users</p>
<table class="table table-striped table-condensed">
<thead>
<tr>
<th>Name</th>
<th>Start</th>
<th>End</th>
<th>Nodes</th>
<th>CPUs</th>
</tr>
</thead>
<tbody>
<?php
# One table row per reservation; a reservation whose window contains the
# current time is highlighted with the "info" row class.
while ($resv = $reservations->fetch_assoc()) {
    $has_started = $resv['time_start'] < $time_now;
    $not_finished = $time_now < $resv['time_end'];
    $is_active = ($has_started && $not_finished) ? "info" : "";

    echo "<tr class=\"". $is_active ."\">"
        . "<td>" . $resv['resv_name'] . "</td>"
        . "<td>" . format_timestamp($resv['time_start']) . "</td>"
        . "<td>" . format_timestamp($resv['time_end']) . "</td>"
        . "<td>" . $resv['nodelist'] . "</td>"
        . "<td>" . $resv['cpus'] . "</td>"
        . "</tr>";
}
?>
</tbody>
</table>
</div>
<div class="tab-pane" id="usage">
<h4>current wait time</h4>
<?php
# Build the SQL that lists every QOS together with the average queue wait
# (time_start - time_eligible) and job count of jobs started after $fromTime.
# QOS entries without matching jobs still appear (left join) with NULL values.
#   $fromTime  unix timestamp; concatenated into the query as-is
function wait_query($fromTime) {
    $per_qos_wait = 'select AVG(time_start-time_eligible) as avg_wait, id_qos, count(*) as job_count'
        . ' from circ_job_table jt'
        . ' where time_eligible>0 and state not in (0,4) and time_start > ' . $fromTime
        . ' group by id_qos';

    return 'select avg_wait, job_count, name, description, priority, usage_factor'
        . ' from qos_table qt'
        . ' left join (' . $per_qos_wait . ') wt ON (wt.id_qos = qt.id)'
        . ' ORDER BY priority';
}
# Run the wait-time query for three look-back windows.
# $waitResult[0] = last 24h, [1] = last 7 days, [2] = last 30 days.
$now = time();
$day = 24 * 60 * 60;
$lastDay = $now - $day;
$lastWeek = $now - 7 * $day;
$lastMonth = $now - 30 * $day;
$waitResult = array();
foreach (array($lastDay, $lastWeek, $lastMonth) as $cutoff) {
    $waitResult[] = $mysql->query(wait_query($cutoff));
}
?>
<table class="table table-striped">
<thead>
<tr>
<th>QOS</th>
<th>last 24h (jobs)</th>
<th>last 7 days (jobs)</th>
<th>last 30 days (jobs)</th>
</tr>
</thead>
<tbody>
<?php
# Format an average wait time for the wait-time table.
#   $dur        duration in seconds; may be a decimal string as produced by
#               SQL AVG(), or null when there were no jobs
#   $job_count  number of jobs the average is based on, appended in parentheses
# Returns "no jobs" when $dur is null/empty, otherwise e.g. "01h 2m 5s (4)".
# Days and hours are only shown when non-zero.
function render_duration($dur, $job_count) {
    $waitText = "no jobs";
    if($dur != null) {
        # Drop the fractional part explicitly; DateTime only accepts whole seconds.
        $seconds = (int) $dur;
        # '@<timestamp>' pins both instants to UTC, so the difference does not
        # depend on the default timezone or its DST rules.
        $zeroDate = new DateTime('@0');
        $waitDate = new DateTime('@' . $seconds);
        $diff = $zeroDate->diff($waitDate, true);
        $fmt = '%im %ss';
        # ->days / %a is the total day count; ->d / %d would drop whole months.
        if($diff->days > 0) $fmt = "%ad %Hh " . $fmt;
        else if($diff->h > 0) $fmt = "%Hh " . $fmt;
        $waitText = $diff->format($fmt) . " (" . $job_count. ")";
    }
    return $waitText;
}
# One row per QOS. The three result sets are read in lock-step, one row from
# each per iteration; name and description are taken from the 24h result.
$qos_total = $waitResult[0]->num_rows;
$row = 0;
while ($row < $qos_total) {
    $dayRow = $waitResult[0]->fetch_object();
    $weekRow = $waitResult[1]->fetch_object();
    $monthRow = $waitResult[2]->fetch_object();

    echo "<tr>";
    echo "<td><span title=\"" . $dayRow->description . "\">" . $dayRow->name . "</span></td>";
    foreach (array($dayRow, $weekRow, $monthRow) as $window) {
        echo "<td>". render_duration($window->avg_wait, $window->job_count) . "</td>";
    }
    echo "</tr>";

    $row++;
}
?>
</tbody>
</table>
<h4>Usage by CPU core reservations</h4>
<?php
# Express CPU-seconds as a percentage of the capacity of one reporting period.
#   $seconds    CPU-seconds spent in some state
#   $cpu_count  number of CPUs available during the period
#   $time_unit  length of the period in seconds
# Returns the percentage rounded to two decimals; 0.0 when the capacity
# (cpu_count * time_unit) is zero or negative, instead of dividing by zero.
function percent_time($seconds, $cpu_count, $time_unit) {
    if ($cpu_count * $time_unit <= 0) {
        return 0.0;
    }
    return round($seconds / $cpu_count / $time_unit * 100.0, 2);
}
# Print a usage table: one row per record of $usage_query with a stacked
# progress bar showing how the period's CPU time was split.
#   $time_unit    length in seconds of the period each record covers; the
#                 callers pass one hour, one day and 30 days
#   $usage_query  result object providing fetch_assoc(); the columns read are
#                 creation_time, cpu_count and the *_cpu_secs counters below
#   $withtime     forwarded to format_timestamp(): also show the time of day
# Output is written directly to the page; nothing is returned.
function render_usage($time_unit, $usage_query, $withtime=FALSE) {
?>
<table class="table table-condensed">
<thead>
<tr>
<th>Time</th>
<th>CPUs</th>
<th>usage</th>
</tr>
</thead>
<tbody>
<?php
while($usage = $usage_query->fetch_assoc()) {
$cpu_count = $usage['cpu_count'];
# Each counter is converted to a percentage of cpu_count * time_unit.
$percent_alloc = percent_time($usage['alloc_cpu_secs'], $cpu_count, $time_unit);
$percent_down = percent_time($usage['down_cpu_secs'], $cpu_count, $time_unit);
# NOTE: the pdown percentage is computed but not rendered in the bar below.
$percent_pdown = percent_time($usage['pdown_cpu_secs'], $cpu_count, $time_unit);
$percent_idle = percent_time($usage['idle_cpu_secs'], $cpu_count, $time_unit);
$percent_resv = percent_time($usage['resv_cpu_secs'], $cpu_count, $time_unit);
$percent_over = percent_time($usage['over_cpu_secs'], $cpu_count, $time_unit);
echo "<tr>";
echo "<td>" . format_timestamp($usage['creation_time'], $withtime) . "</td>";
echo "<td>" . $cpu_count . "</td>";
?>
<td>
<div class="progress">
<div class="progress-bar progress-bar-info" style="width: <?php echo $percent_alloc; ?>%">
<span class=""><?php echo $percent_alloc; ?>% alloc</span>
</div>
<div class="progress-bar progress-bar-success" style="width: <?php echo $percent_idle; ?>%">
<span class=""><?php echo $percent_idle; ?>% idle</span>
</div>
<div class="progress-bar progress-bar-warning" style="width: <?php echo $percent_resv; ?>%">
<span class=""><?php echo $percent_resv; ?>% reserved</span>
</div>
<div class="progress-bar progress-bar-danger" style="width: <?php echo $percent_down; ?>%">
<span class=""><?php echo $percent_down; ?>% down</span>
</div>
<div class="progress-bar progress-bar-danger" style="width: <?php echo $percent_over; ?>%">
<span class=""><?php echo $percent_over; ?>% over</span>
</div>
</div>
</td>
<?php
# Close the row opened before the bar markup.
echo "</tr>";
}
?>
</tbody>
</table>
<?php
}
?>
<div class="panel-group" id="usage-accordion" role="tablist" aria-multiselectable="true">
<div class="panel panel-default">
<div class="panel-heading">
<h4 class="panel-title">
<a class="accordion-toggle" data-toggle="collapse" data-parent="#usage-accordion" href="#collapseHour">Last 24h</a>
</h4>
</div>
<div id="collapseHour" class="panel-collapse collapse in">
<div class="accordion-inner">
<?php render_usage(60*60, $usage_query_hours, TRUE); ?>
</div>
</div>
</div>
<div class="panel panel-default">
<div class="panel-heading">
<h4 class="panel-title">
<a class="accordion-toggle" data-toggle="collapse" data-parent="#usage-accordion" href="#collapseDay">Last 30 days</a>
</h4>
</div>
<div id="collapseDay" class="panel-collapse collapse">
<div class="accordion-inner">
<?php render_usage(60*60*24, $usage_query_days); ?>
</div>
</div>
</div>
<div class="panel panel-default">
<div class="panel-heading">
<h4 class="panel-title">
<a class="accordion-toggle" data-toggle="collapse" data-parent="#usage-accordion" href="#collapseMonth">Last 12 months</a>
</h4>
</div>
<div id="collapseMonth" class="panel-collapse collapse">
<div class="accordion-inner">
<?php render_usage(60*60*24*30, $usage_query_months); ?>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
<?php
# check circ_step_table for details (usage) of jobs, steps
# select job_db_inx,id_step,state,step_name,task_cnt,max_rss,max_rss_task from circ_step_table where job_db_inx=1912;
# select * from circ_step_table st left join circ_job_table jt USING (job_db_inx) where st.job_db_inx=1919;
$mysql->close();
?>
</div>
</body>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment