Skip to content

Instantly share code, notes, and snippets.

@giovtorres
Created April 10, 2016 00:46
Show Gist options
  • Save giovtorres/cfad76aa4cbfd8ddd550554a7b2a7870 to your computer and use it in GitHub Desktop.
Save giovtorres/cfad76aa4cbfd8ddd550554a7b2a7870 to your computer and use it in GitHub Desktop.
{
"id": 23,
"title": "SLURM Scheduler Statistics",
"originalTitle": "SLURM Scheduler Statistics",
"tags": [
"Slurm"
],
"style": "dark",
"timezone": "browser",
"editable": true,
"hideControls": false,
"sharedCrosshair": true,
"rows": [
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"content": "**VALUES ARE RESET AT MIDNIGHT UTC TIME BY DEFAULT.**\n\n**Server thread count** The number of current active slurmctld threads. A high number would mean a high load processing events like job submissions, jobs dispatching, jobs completing, etc. If this is often close to `MAX_SERVER_THREADS (256)` it could point to a potential bottleneck.\n\n**Agent queue size** Slurm design has scalability in mind and sending messages to thousands of nodes is not a trivial task. The agent mechanism helps to control communication between the slurm daemons and the controller for a best effort. If (**agent queue size** + `AGENT_THREAD_COUNT` + 2) > `MAX_SERVER_THREADS`, there could be some delays affecting jobs management. `AGENT_THREAD_COUNT (10)`\n\n**Jobs submitted** Number of jobs submitted since last reset\n\n**Jobs started** Number of jobs started since last reset. This includes backfilled jobs.\n\n**Jobs completed** Number of jobs completed since last reset.\n\n**Jobs canceled** Number of jobs canceled since last reset.\n\n**Jobs failed** Number of jobs failed since last reset.\n",
"editable": true,
"error": false,
"id": 6,
"isNew": true,
"links": [],
"mode": "markdown",
"span": 12,
"style": {},
"title": "Slurmctld Execution",
"type": "text"
}
],
"title": "New row"
},
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {
"Agent Queue Size": "#CCA300",
"Jobs Canceled": "#BF1B00",
"Jobs Completed": "#E5AC0E",
"Jobs Failed": "#962D82",
"Jobs Started": "#6ED0E0",
"Server Thread Count": "#6ED0E0"
},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": 0,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 7,
"isNew": true,
"leftYAxisLabel": "thread count",
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"rightYAxisLabel": "queue size",
"seriesOverrides": [
{
"alias": "Agent Queue Size",
"yaxis": 2
}
],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"refId": "A",
"target": "alias(cluster.slurm_sched_stats.gauge-server_thread_count, 'Server Thread Count')"
},
{
"refId": "B",
"target": "alias(cluster.slurm_sched_stats.gauge-agent_queue_size, 'Agent Queue Size')"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Server Thread Count / Agent Queue Size",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"short",
"short"
]
},
{
"aliasColors": {
"Jobs Submitted": "#E24D42"
},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 1,
"isNew": true,
"leftYAxisLabel": "jobs",
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"rightYAxisLabel": "jobs per second",
"seriesOverrides": [
{
"alias": "Jobs Submitted per second",
"bars": true,
"lines": false,
"yaxis": 2
}
],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"refId": "A",
"target": "alias(cluster.slurm_sched_stats.gauge-jobs_submitted, 'Jobs Submitted')"
},
{
"refId": "C",
"target": "alias(nonNegativeDerivative(cluster.slurm_sched_stats.gauge-jobs_submitted), 'Jobs Submitted per second')"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Jobs Submitted",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"short",
"short"
]
},
{
"aliasColors": {
"Jobs Started": "#6ED0E0",
"Jobs Started per second": "#7EB26D"
},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 2,
"isNew": true,
"leftYAxisLabel": "jobs",
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"rightYAxisLabel": "jobs / sec",
"seriesOverrides": [
{
"alias": "Jobs Started per second",
"bars": true,
"lines": false,
"yaxis": 2
}
],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"refId": "A",
"target": "alias(cluster.slurm_sched_stats.gauge-jobs_started, 'Jobs Started')"
},
{
"refId": "B",
"target": "alias(nonNegativeDerivative(cluster.slurm_sched_stats.gauge-jobs_started), 'Jobs Started per second')"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Jobs Started",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"short",
"short"
]
},
{
"aliasColors": {
"Jobs Completed": "#E5AC0E",
"Jobs Completed per second": "#5195CE",
"Jobs Started": "#6ED0E0"
},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 3,
"isNew": true,
"leftYAxisLabel": "jobs",
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"rightYAxisLabel": "jobs / sec",
"seriesOverrides": [
{
"alias": "Jobs Completed per second",
"bars": true,
"lines": false,
"yaxis": 2
}
],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"refId": "A",
"target": "alias(cluster.slurm_sched_stats.gauge-jobs_completed, 'Jobs Completed')"
},
{
"refId": "B",
"target": "alias(nonNegativeDerivative(cluster.slurm_sched_stats.gauge-jobs_completed), 'Jobs Completed per second')"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Jobs Completed",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"short",
"short"
]
},
{
"aliasColors": {
"Jobs Canceled": "#BF1B00",
"Jobs Completed": "#E5AC0E",
"Jobs Started": "#6ED0E0"
},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 4,
"isNew": true,
"leftYAxisLabel": "jobs",
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"rightYAxisLabel": "jobs / sec",
"seriesOverrides": [
{
"alias": "Jobs Canceled per second",
"bars": true,
"lines": false,
"yaxis": 2
}
],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"refId": "A",
"target": "alias(cluster.slurm_sched_stats.gauge-jobs_canceled, 'Jobs Canceled')"
},
{
"refId": "B",
"target": "alias(nonNegativeDerivative(cluster.slurm_sched_stats.gauge-jobs_canceled), 'Jobs Canceled per second')"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Jobs Canceled",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"short",
"short"
]
},
{
"aliasColors": {
"Jobs Canceled": "#BF1B00",
"Jobs Completed": "#E5AC0E",
"Jobs Failed": "#962D82",
"Jobs Started": "#6ED0E0"
},
"bars": true,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 5,
"isNew": true,
"leftYAxisLabel": "jobs",
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": false,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"refId": "A",
"target": "alias(cluster.slurm_sched_stats.gauge-jobs_failed, 'Jobs Failed')"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Jobs Failed",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"short",
"short"
]
}
],
"title": "Row"
},
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"content": "A scheduling cycle implies to get the job_write_lock lock, then trying to get resources for jobs pending, starting from the most priority one and going in descending order. Once a job can not get the resources the loop keeps going but just for jobs requesting other partitions. Jobs with dependencies or affected by accounts limits are not processed.\n\n**Last cycle** Time in microseconds for last scheduling cycle.\n\n**Max cycle** Time in microseconds for the maximum scheduling cycle since last reset.\n\n**Total cycles** Number of scheduling cycles since last reset. Scheduling is done periodically and when a job is submitted or a job is completed.\n\n**Mean cycle** Mean of scheduling cycles since last reset.\n\n**Mean depth cycle** Mean of cycle depth. Depth means number of jobs processed in a scheduling cycle.\n\n**Cycles per minute** Counter of scheduling executions per minute\n\n**Last queue length** Length of jobs pending queue.\n",
"editable": true,
"error": false,
"id": 8,
"isNew": true,
"links": [],
"mode": "markdown",
"span": 12,
"style": {},
"title": "Main Scheduling Algorithm",
"type": "text"
}
],
"title": "New row"
},
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 9,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"refId": "A",
"target": "alias(cluster.slurm_sched_stats.gauge-main_last_cycle, 'Time for last scheduling cycle')"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Last Cycle Time",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"µs",
"short"
]
},
{
"aliasColors": {
"Number of scheduling cycles": "#CCA300",
"Scheduling cycles / second": "#BF1B00"
},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": 0,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 11,
"isNew": true,
"leftYAxisLabel": "cycles",
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"rightYAxisLabel": "cycles per second",
"seriesOverrides": [
{
"alias": "Scheduling cycles / second",
"bars": true,
"lines": false,
"yaxis": 2
}
],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"refId": "A",
"target": "alias(cluster.slurm_sched_stats.gauge-main_total_cycles, 'Number of scheduling cycles')"
},
{
"refId": "B",
"target": "alias(nonNegativeDerivative(cluster.slurm_sched_stats.gauge-main_total_cycles), 'Scheduling cycles / second')"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Total Cycles",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"none",
"none"
]
},
{
"aliasColors": {
"Mean of scheduling cycles": "#0A50A1"
},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 12,
"isNew": true,
"leftYAxisLabel": "seconds / cycle",
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"refId": "A",
"target": "alias(scale(cluster.slurm_sched_stats.gauge-main_mean_cycle, 0.000001), 'Mean of scheduling cycles')"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Mean Cycle",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"none",
"short"
]
},
{
"aliasColors": {
"Mean of jobs processed in a scheduling cycle": "#1F78C1"
},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 13,
"isNew": true,
"leftYAxisLabel": "jobs",
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"refId": "A",
"target": "alias(cluster.slurm_sched_stats.gauge-main_mean_depth_cycle, 'Mean of jobs processed in a scheduling cycle')"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Mean Depth Cycle",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"none",
"short"
]
},
{
"aliasColors": {
"Scheduling executions per minute": "#806EB7"
},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 14,
"isNew": true,
"leftYAxisLabel": "executions per minute",
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"refId": "A",
"target": "alias(cluster.slurm_sched_stats.gauge-main_cycles_per_minute, 'Scheduling executions per minute')"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Cycles per minute",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"none",
"short"
]
},
{
"aliasColors": {
"Length of jobs pending queue": "#BF1B00"
},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 15,
"isNew": true,
"leftYAxisLabel": "jobs",
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"refId": "A",
"target": "alias(cluster.slurm_sched_stats.gauge-main_last_queue_length, 'Length of jobs pending queue')"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Last Queue Length",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"none",
"short"
]
}
],
"title": "New row"
},
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"content": "A backfilling scheduling cycle implies to get locks for jobs, nodes and partitions objects then trying to get resources for jobs pending. Jobs are processed based on priorities. If a job can not get resources the algorithm calculates when it could get them obtaining a future start time for the job. Then next job is processed and the algorithm tries to get resources for that job but avoiding to affect the previous ones, and again it calculates the future start time if not current resources available. The backfilling algorithm takes more time for each new job to process since more priority jobs can not be affected. The algorithm itself takes measures for avoiding a long execution cycle and for taking all the locks for too long.\n\n**Total backfilled jobs (since last slurm start)** Number of jobs started thanks to backfilling since last slurm start.\n\n**Total backfilled jobs (since last stats cycle start)** Number of jobs started thanks to backfilling since last time stats were reset. By default these values are reset at midnight UTC time.\n\n**Total cycles** Number of scheduling cycles since last reset\n\n**Last cycle** Time in microseconds of last backfilling cycle. It counts only execution time removing sleep time inside a scheduling cycle when it takes too much time. Note that locks are released during the sleep time so that other work can proceed.\n\n**Max cycle** Time in microseconds of maximum backfilling cycle execution since last reset. It counts only execution time removing sleep time inside a scheduling cycle when it takes too much time. Note that locks are released during the sleep time so that other work can proceed.\n\n**Mean cycle** Mean of backfilling scheduling cycles in microseconds since last reset\n\n**Last depth cycle** Number of processed jobs during last backfilling scheduling cycle. It counts every process even if it has no option to execute due to dependencies or limits.\n\n**Last depth cycle (try sched)** Number of processed jobs during last backfilling scheduling cycle. It counts only processes with a chance to run waiting for available resources. These jobs are which makes the backfilling algorithm heavier.\n\n**Depth Mean** Mean of processed jobs during backfilling scheduling cycles since last reset.\n\n**Depth Mean (try sched)** Mean of processed jobs during backfilling scheduling cycles since last reset. It counts only processes with a chance to run waiting for available resources. These jobs are which makes the backfilling algorithm heavier.\n\n**Last queue length** Number of jobs pending to be processed by backfilling algorithm. A job appears as much times as partitions it requested.\n\n**Queue length Mean** Mean of jobs pending to be processed by backfilling algorithm.",
"editable": true,
"error": false,
"id": 16,
"isNew": true,
"links": [],
"mode": "markdown",
"span": 12,
"style": {},
"title": "Backfilling Scheduling",
"type": "text"
}
],
"title": "New row"
},
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {
"Backfill Jobs Started": "#447EBC"
},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 17,
"isNew": true,
"leftYAxisLabel": "jobs",
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"refId": "A",
"target": "alias(cluster.slurm_sched_stats.gauge-bf_total_jobs_since_slurm_start, 'Backfill Jobs Started')"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Total backfilled jobs (since last slurm start)",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"none",
"short"
]
},
{
"aliasColors": {
"Backfill Jobs Started": "#5195CE"
},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 18,
"isNew": true,
"leftYAxisLabel": "jobs",
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"rightYAxisLabel": "jobs / sec",
"seriesOverrides": [
{
"alias": "Backfill Jobs Started per second",
"bars": true,
"lines": false,
"yaxis": 2
}
],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"refId": "A",
"target": "alias(cluster.slurm_sched_stats.gauge-bf_total_jobs_since_cycle_start, 'Backfill Jobs Started')"
},
{
"refId": "B",
"target": "alias(nonNegativeDerivative(cluster.slurm_sched_stats.gauge-bf_total_jobs_since_cycle_start), 'Backfill Jobs Started per second')"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Total backfilled jobs (since last stats cycle start)",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"none",
"none"
]
},
{
"aliasColors": {
"Backfill scheduling cycles": "#E24D42"
},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 19,
"isNew": true,
"leftYAxisLabel": "cycles",
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"rightYAxisLabel": "cycles / second",
"seriesOverrides": [
{
"alias": "Backfill scheduling cycles per second",
"bars": true,
"lines": false,
"yaxis": 2
}
],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"refId": "A",
"target": "alias(cluster.slurm_sched_stats.gauge-bf_total_cycles, 'Backfill scheduling cycles')"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Total Backfill Cycles",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"none",
"short"
]
},
{
"aliasColors": {},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 20,
"isNew": true,
"leftYAxisLabel": "",
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"refId": "A",
"target": "alias(cluster.slurm_sched_stats.gauge-bf_last_cycle, 'Time of last backfilling cycle')"
},
{
"refId": "B",
"target": "alias(cluster.slurm_sched_stats.gauge-bf_mean_cycle, 'Mean of backfilling scheduling cycles')"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Last/Mean Backfill Cycle Time",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"µs",
"short"
]
},
{
"aliasColors": {
"Processed jobs during last backfill": "#806EB7",
"Processed jobs during last backfill (try sched)": "#DEDAF7"
},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 21,
"isNew": true,
"leftYAxisLabel": "jobs",
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"refId": "A",
"target": "alias(cluster.slurm_sched_stats.gauge-bf_last_depth_cycle, 'Processed jobs during last backfill')"
},
{
"refId": "B",
"target": "alias(cluster.slurm_sched_stats.gauge-bf_last_depth_cycle_try, 'Processed jobs during last backfill (try sched)')"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Last Depth Cycle",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"none",
"short"
]
},
{
"aliasColors": {
"Jobs pending backfill processing": "#BF1B00",
"Processed jobs during last backfill": "#806EB7",
"Processed jobs during last backfill (try sched)": "#DEDAF7"
},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 22,
"isNew": true,
"leftYAxisLabel": "jobs",
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"refId": "A",
"target": "alias(cluster.slurm_sched_stats.gauge-bf_queue_length, 'Jobs pending backfill processing')"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Last Backfill Queue Length",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"none",
"short"
]
}
],
"title": "New row"
}
],
"time": {
"from": "now-24h",
"to": "now"
},
"timepicker": {
"now": true,
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"templating": {
"list": []
},
"annotations": {
"list": []
},
"refresh": "5m",
"schemaVersion": 8,
"version": 30,
"links": []
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment