Created
January 26, 2022 16:58
-
-
Save benjvi/e5c44f05e77f146de5ee7c177464ee8b to your computer and use it in GitHub Desktop.
Backup Execution Dashboard - CronJobs & Velero
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"__inputs": [ | |
{ | |
"name": "DS_PROMETHEUS", | |
"label": "prometheus", | |
"description": "", | |
"type": "datasource", | |
"pluginId": "prometheus", | |
"pluginName": "Prometheus" | |
} | |
], | |
"__requires": [ | |
{ | |
"type": "grafana", | |
"id": "grafana", | |
"name": "Grafana", | |
"version": "7.5.4" | |
}, | |
{ | |
"type": "panel", | |
"id": "graph", | |
"name": "Graph", | |
"version": "" | |
}, | |
{ | |
"type": "panel", | |
"id": "piechart", | |
"name": "Pie chart v2", | |
"version": "" | |
}, | |
{ | |
"type": "datasource", | |
"id": "prometheus", | |
"name": "Prometheus", | |
"version": "1.0.0" | |
}, | |
{ | |
"type": "panel", | |
"id": "stat", | |
"name": "Stat", | |
"version": "" | |
}, | |
{ | |
"type": "panel", | |
"id": "table", | |
"name": "Table", | |
"version": "" | |
} | |
], | |
"annotations": { | |
"list": [ | |
{ | |
"builtIn": 1, | |
"datasource": "-- Grafana --", | |
"enable": true, | |
"hide": true, | |
"iconColor": "rgba(0, 211, 255, 1)", | |
"name": "Annotations & Alerts", | |
"type": "dashboard" | |
} | |
] | |
}, | |
"editable": true, | |
"gnetId": null, | |
"graphTooltip": 0, | |
"id": null, | |
"links": [], | |
"panels": [ | |
{ | |
"collapsed": false, | |
"datasource": null, | |
"gridPos": { | |
"h": 1, | |
"w": 24, | |
"x": 0, | |
"y": 0 | |
}, | |
"id": 13, | |
"panels": [], | |
"title": "CronJobs", | |
"type": "row" | |
}, | |
{ | |
"datasource": "${DS_PROMETHEUS}", | |
"fieldConfig": { | |
"defaults": { | |
"color": { | |
"mode": "thresholds" | |
}, | |
"custom": { | |
"align": null, | |
"filterable": false | |
}, | |
"mappings": [], | |
"noValue": "-", | |
"thresholds": { | |
"mode": "absolute", | |
"steps": [ | |
{ | |
"color": "green", | |
"value": null | |
}, | |
{ | |
"color": "#EAB839", | |
"value": 12 | |
}, | |
{ | |
"color": "red", | |
"value": 24 | |
} | |
] | |
} | |
}, | |
"overrides": [ | |
{ | |
"matcher": { | |
"id": "byName", | |
"options": "Last Successful (h)" | |
}, | |
"properties": [ | |
{ | |
"id": "custom.displayMode", | |
"value": "color-background" | |
} | |
] | |
}, | |
{ | |
"matcher": { | |
"id": "byName", | |
"options": "Last Scheduled (h)" | |
}, | |
"properties": [ | |
{ | |
"id": "custom.displayMode", | |
"value": "color-background" | |
} | |
] | |
}, | |
{ | |
"matcher": { | |
"id": "byName", | |
"options": "Last Failed (d)" | |
}, | |
"properties": [ | |
{ | |
"id": "custom.displayMode", | |
"value": "color-background" | |
}, | |
{ | |
"id": "thresholds", | |
"value": { | |
"mode": "absolute", | |
"steps": [ | |
{ | |
"color": "green", | |
"value": null | |
}, | |
{ | |
"color": "red", | |
"value": 0.0001 | |
}, | |
{ | |
"color": "orange", | |
"value": 3 | |
}, | |
{ | |
"color": "#EAB839", | |
"value": 10 | |
}, | |
{ | |
"color": "green", | |
"value": 30 | |
} | |
] | |
} | |
}, | |
{ | |
"id": "custom.width", | |
"value": null | |
} | |
] | |
} | |
] | |
}, | |
"gridPos": { | |
"h": 7, | |
"w": 24, | |
"x": 0, | |
"y": 1 | |
}, | |
"id": 2, | |
"options": { | |
"showHeader": true, | |
"sortBy": [] | |
}, | |
"pluginVersion": "7.5.4", | |
"targets": [ | |
{ | |
"exemplar": true, | |
"expr": "# note: doesn't account for schedules with no successful runs in time period\n(\n time()\n - \n (\n max(\n # get jobs associated to a Cronjob\n kube_job_owner{owner_kind=\"CronJob\"} \n * \n on (namespace, job_name) \n group_right(owner_name) \n (\n (kube_job_status_succeeded == 1) \n * \n on (namespace, job_name) \n group_right(owner_name) \n (kube_job_status_completion_time) \n )\n ) by (namespace, owner_name) \n )\n) / 3600", | |
"format": "table", | |
"instant": true, | |
"interval": "", | |
"legendFormat": "", | |
"queryType": "randomWalk", | |
"refId": "A" | |
}, | |
{ | |
"exemplar": true, | |
"expr": "# note: doesn't account for schedules with no failed runs in time period\n(\n time()\n - \n (\n max(\n # get jobs associated to a Cronjob\n kube_job_owner{owner_kind=\"CronJob\"} \n * \n on (namespace, job_name) \n group_right(owner_name) \n (\n (kube_job_status_failed == 1) \n * \n on (namespace, job_name) \n group_right(owner_name) \n (kube_job_status_start_time) \n )\n ) by (namespace, owner_name) \n )\n) / 86400", | |
"format": "table", | |
"hide": false, | |
"instant": true, | |
"interval": "", | |
"legendFormat": "", | |
"refId": "B" | |
}, | |
{ | |
"exemplar": true, | |
"expr": "(\n time()\n -\n max(\n label_replace(kube_cronjob_status_last_schedule_time, \"owner_name\", \"$1\", \"cronjob\", \"(.*)\")\n ) by (namespace, owner_name) \n) / 3600", | |
"format": "table", | |
"hide": false, | |
"instant": true, | |
"interval": "", | |
"legendFormat": "", | |
"refId": "C" | |
} | |
], | |
"title": "Age of Last CronJob Runs", | |
"transformations": [ | |
{ | |
"id": "merge", | |
"options": {} | |
}, | |
{ | |
"id": "filterFieldsByName", | |
"options": {} | |
}, | |
{ | |
"id": "organize", | |
"options": { | |
"excludeByName": {}, | |
"indexByName": {}, | |
"renameByName": { | |
"Value #A": "Last Successful (h)", | |
"Value #B": "Last Failed (d)", | |
"Value #C": "Last Scheduled (h)", | |
"owner_name": "" | |
} | |
} | |
} | |
], | |
"type": "table" | |
}, | |
{ | |
"datasource": "${DS_PROMETHEUS}", | |
"description": "Jobs in the current time period *AND* retained according to the CronJob's retention policy", | |
"fieldConfig": { | |
"defaults": { | |
"color": { | |
"mode": "thresholds" | |
}, | |
"custom": { | |
"align": null, | |
"filterable": false | |
}, | |
"mappings": [], | |
"thresholds": { | |
"mode": "absolute", | |
"steps": [ | |
{ | |
"color": "green", | |
"value": null | |
}, | |
{ | |
"color": "#EAB839", | |
"value": 1 | |
}, | |
{ | |
"color": "red", | |
"value": 2 | |
} | |
] | |
} | |
}, | |
"overrides": [ | |
{ | |
"matcher": { | |
"id": "byName", | |
"options": "Not Successful" | |
}, | |
"properties": [ | |
{ | |
"id": "custom.displayMode", | |
"value": "color-background" | |
} | |
] | |
}, | |
{ | |
"matcher": { | |
"id": "byName", | |
"options": "Last Run (h)" | |
}, | |
"properties": [ | |
{ | |
"id": "custom.displayMode", | |
"value": "color-background" | |
} | |
] | |
} | |
] | |
}, | |
"gridPos": { | |
"h": 7, | |
"w": 24, | |
"x": 0, | |
"y": 8 | |
}, | |
"id": 11, | |
"options": { | |
"showHeader": true, | |
"sortBy": [ | |
{ | |
"desc": false, | |
"displayName": "Total" | |
} | |
] | |
}, | |
"pluginVersion": "7.5.4", | |
"targets": [ | |
{ | |
"exemplar": true, | |
"expr": "count(\n # get jobs associated to a Cronjob\n kube_job_owner{owner_kind=\"CronJob\"} \n * \n on (namespace, job_name) \n group_right(owner_name) \n avg(kube_job_status_succeeded == 1) by (namespace,job_name)\n * \n on (namespace, job_name) \n group_right(owner_name) \n kube_job_status_start_time > ${__from:date:seconds}\n) by (namespace, owner_name) ", | |
"format": "table", | |
"hide": false, | |
"instant": true, | |
"interval": "", | |
"legendFormat": "", | |
"queryType": "randomWalk", | |
"refId": "A" | |
}, | |
{ | |
"exemplar": true, | |
"expr": "count(\n # get jobs associated to a Cronjob\n kube_job_owner{owner_kind=\"CronJob\"} \n * \n on (namespace, job_name) \n group_right(owner_name) \n avg(kube_job_status_failed == 1) by (namespace,job_name)\n * \n on (namespace, job_name) \n group_right(owner_name) \n kube_job_status_start_time > ${__from:date:seconds}\n) by (namespace, owner_name) ", | |
"format": "table", | |
"hide": false, | |
"instant": true, | |
"interval": "", | |
"legendFormat": "", | |
"refId": "B" | |
}, | |
{ | |
"exemplar": true, | |
"expr": "count(\n # get jobs associated to a Cronjob\n kube_job_owner{owner_kind=\"CronJob\"} \n * \n on (namespace, job_name) \n group_right(owner_name) \n avg(kube_job_info) by (namespace,job_name)\n * \n on (namespace, job_name) \n group_right(owner_name) \n kube_job_status_start_time > ${__from:date:seconds}\n) by (namespace, owner_name) ", | |
"format": "table", | |
"hide": false, | |
"instant": true, | |
"interval": "", | |
"legendFormat": "", | |
"refId": "C" | |
}, | |
{ | |
"exemplar": true, | |
"expr": "count(\n # get jobs associated to a Cronjob\n kube_job_owner{owner_kind=\"CronJob\"} \n * \n on (namespace, job_name) \n group_right(owner_name) \n avg(kube_job_info) by (namespace,job_name)\n * \n on (namespace, job_name) \n group_right(owner_name) \n kube_job_status_start_time > ${__from:date:seconds}\n) by (namespace, owner_name) \n-\ncount(\n # get jobs associated to a Cronjob\n kube_job_owner{owner_kind=\"CronJob\"} \n * \n on (namespace, job_name) \n group_right(owner_name) \n avg(kube_job_status_succeeded == 1) by (namespace,job_name)\n * \n on (namespace, job_name) \n group_right(owner_name) \n kube_job_status_start_time > ${__from:date:seconds}\n) by (namespace, owner_name) ", | |
"format": "table", | |
"hide": false, | |
"instant": true, | |
"interval": "", | |
"legendFormat": "", | |
"refId": "D" | |
} | |
], | |
"title": "CronJob Jobs Recently Run", | |
"transformations": [ | |
{ | |
"id": "merge", | |
"options": {} | |
}, | |
{ | |
"id": "filterFieldsByName", | |
"options": {} | |
}, | |
{ | |
"id": "organize", | |
"options": { | |
"excludeByName": {}, | |
"indexByName": { | |
"Time": 0, | |
"Value #A": 4, | |
"Value #C": 3, | |
"Value #D": 5, | |
"namespace": 1, | |
"owner_name": 2 | |
}, | |
"renameByName": { | |
"Value #A": "Successful", | |
"Value #B": "Failed", | |
"Value #C": "Total", | |
"Value #D": "Not Successful", | |
"{namespace=\"argo-events\", owner_name=\"load-askgit-data\"}": "Not" | |
} | |
} | |
} | |
], | |
"type": "table" | |
}, | |
{ | |
"collapsed": false, | |
"datasource": null, | |
"gridPos": { | |
"h": 1, | |
"w": 24, | |
"x": 0, | |
"y": 15 | |
}, | |
"id": 4, | |
"panels": [], | |
"title": "Velero", | |
"type": "row" | |
}, | |
{ | |
"aliasColors": {}, | |
"bars": false, | |
"dashLength": 10, | |
"dashes": false, | |
"datasource": "${DS_PROMETHEUS}", | |
"fieldConfig": { | |
"defaults": {}, | |
"overrides": [] | |
}, | |
"fill": 2, | |
"fillGradient": 0, | |
"gridPos": { | |
"h": 8, | |
"w": 7, | |
"x": 0, | |
"y": 16 | |
}, | |
"hiddenSeries": false, | |
"id": 6, | |
"legend": { | |
"avg": false, | |
"current": false, | |
"max": false, | |
"min": false, | |
"show": true, | |
"total": false, | |
"values": false | |
}, | |
"lines": true, | |
"linewidth": 1, | |
"nullPointMode": "null", | |
"options": { | |
"alertThreshold": true | |
}, | |
"percentage": false, | |
"pluginVersion": "7.5.4", | |
"pointradius": 2, | |
"points": false, | |
"renderer": "flot", | |
"seriesOverrides": [ | |
{ | |
"$$hashKey": "object:209", | |
"alias": "attempts", | |
"stack": false | |
} | |
], | |
"spaceLength": 10, | |
"stack": true, | |
"steppedLine": false, | |
"targets": [ | |
{ | |
"exemplar": true, | |
"expr": "sum(velero_backup_attempt_total)", | |
"format": "time_series", | |
"hide": false, | |
"instant": false, | |
"interval": "", | |
"legendFormat": "attempts", | |
"queryType": "randomWalk", | |
"refId": "A" | |
}, | |
{ | |
"exemplar": true, | |
"expr": "sum(velero_backup_partial_failure_total)", | |
"format": "time_series", | |
"hide": false, | |
"interval": "", | |
"legendFormat": "completed with partial failures", | |
"refId": "B" | |
}, | |
{ | |
"exemplar": true, | |
"expr": "sum(velero_backup_success_total)", | |
"hide": false, | |
"interval": "", | |
"legendFormat": "succeeded", | |
"refId": "C" | |
}, | |
{ | |
"exemplar": true, | |
"expr": "sum(velero_backup_validation_failure_total)", | |
"hide": false, | |
"interval": "", | |
"legendFormat": "failed", | |
"refId": "D" | |
} | |
], | |
"thresholds": [], | |
"timeFrom": null, | |
"timeRegions": [], | |
"timeShift": null, | |
"title": "Backups", | |
"tooltip": { | |
"shared": true, | |
"sort": 0, | |
"value_type": "individual" | |
}, | |
"type": "graph", | |
"xaxis": { | |
"buckets": null, | |
"mode": "time", | |
"name": null, | |
"show": true, | |
"values": [] | |
}, | |
"yaxes": [ | |
{ | |
"$$hashKey": "object:51", | |
"format": "short", | |
"label": null, | |
"logBase": 1, | |
"max": null, | |
"min": "0", | |
"show": true | |
}, | |
{ | |
"$$hashKey": "object:52", | |
"format": "short", | |
"label": null, | |
"logBase": 1, | |
"max": null, | |
"min": null, | |
"show": true | |
} | |
], | |
"yaxis": { | |
"align": false, | |
"alignLevel": null | |
} | |
}, | |
{ | |
"datasource": "${DS_PROMETHEUS}", | |
"fieldConfig": { | |
"defaults": { | |
"color": { | |
"mode": "palette-classic" | |
}, | |
"mappings": [], | |
"thresholds": { | |
"mode": "absolute", | |
"steps": [ | |
{ | |
"color": "green", | |
"value": null | |
}, | |
{ | |
"color": "red", | |
"value": 80 | |
} | |
] | |
} | |
}, | |
"overrides": [] | |
}, | |
"gridPos": { | |
"h": 8, | |
"w": 7, | |
"x": 7, | |
"y": 16 | |
}, | |
"id": 8, | |
"options": { | |
"displayLabels": [ | |
"value" | |
], | |
"legend": { | |
"displayMode": "list", | |
"placement": "right", | |
"values": [] | |
}, | |
"pieType": "pie", | |
"reduceOptions": { | |
"calcs": [ | |
"last" | |
], | |
"fields": "", | |
"values": false | |
}, | |
"text": {} | |
}, | |
"pluginVersion": "7.5.4", | |
"targets": [ | |
{ | |
"exemplar": true, | |
"expr": "sum(delta(velero_backup_attempt_total[$__range])) by (schedule) + on(schedule) (((time() - velero_backup_last_successful_timestamp) / 86400 < 7 ) * 0)", | |
"format": "time_series", | |
"hide": false, | |
"instant": true, | |
"interval": "", | |
"legendFormat": "schedule: - {{ schedule }}", | |
"queryType": "randomWalk", | |
"refId": "A" | |
}, | |
{ | |
"exemplar": true, | |
"expr": "velero_backup_attempt_total + on(schedule) (((time() - velero_backup_last_successful_timestamp) / 86400 < 7 ) * 0)", | |
"hide": true, | |
"interval": "", | |
"legendFormat": "", | |
"refId": "B" | |
} | |
], | |
"timeFrom": null, | |
"timeShift": null, | |
"title": "Attempted Backups By Schedule In Time Range", | |
"type": "piechart" | |
}, | |
{ | |
"datasource": "${DS_PROMETHEUS}", | |
"fieldConfig": { | |
"defaults": { | |
"color": { | |
"mode": "thresholds" | |
}, | |
"mappings": [], | |
"thresholds": { | |
"mode": "absolute", | |
"steps": [ | |
{ | |
"color": "green", | |
"value": null | |
}, | |
{ | |
"color": "#EAB839", | |
"value": 24 | |
}, | |
{ | |
"color": "red", | |
"value": 48 | |
} | |
] | |
} | |
}, | |
"overrides": [] | |
}, | |
"gridPos": { | |
"h": 8, | |
"w": 5, | |
"x": 14, | |
"y": 16 | |
}, | |
"id": 10, | |
"options": { | |
"colorMode": "value", | |
"graphMode": "area", | |
"justifyMode": "auto", | |
"orientation": "auto", | |
"reduceOptions": { | |
"calcs": [ | |
"lastNotNull" | |
], | |
"fields": "", | |
"values": false | |
}, | |
"text": {}, | |
"textMode": "auto" | |
}, | |
"pluginVersion": "7.5.4", | |
"targets": [ | |
{ | |
"exemplar": true, | |
"expr": "min((time() - velero_backup_last_successful_timestamp) / 3600)", | |
"format": "time_series", | |
"instant": false, | |
"interval": "", | |
"legendFormat": "", | |
"queryType": "randomWalk", | |
"refId": "A" | |
} | |
], | |
"title": "Hours since last successful backup", | |
"type": "stat" | |
} | |
], | |
"refresh": false, | |
"schemaVersion": 27, | |
"style": "dark", | |
"tags": [], | |
"templating": { | |
"list": [] | |
}, | |
"time": { | |
"from": "now-7d", | |
"to": "now" | |
}, | |
"timepicker": {}, | |
"timezone": "", | |
"title": "Backup Execution", | |
"uid": "LcwXjBcnz", | |
"version": 14 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment