Last active
April 20, 2023 10:50
-
-
Save benjvi/a843ecbfeb354379b79ed9abffb7a642 to your computer and use it in GitHub Desktop.
BOSH System-level Alerts for Healthwatch2, based on BOSH-Prometheus alerts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Prometheus alerting rules for BOSH-managed VMs (rule group `bosh-system`).
#
# Conventions shared by every rule in this group:
#   - Each expression aggregates per VM with
#     by(system_domain, deployment, exported_job, index); those four labels
#     are what the summary/description templates print.
#   - Every selector drops jobs whose name starts with `compilation` and
#     drops the `bosh-health` deployment.
#   - `for:` is how long the condition must hold before the alert fires; the
#     same duration is repeated in the description text, so keep them in sync.
groups:
  - name: bosh-system
    rules:
      # Average RAM usage above 90% for 10 minutes.
      - alert: BOSHVMLowFreeRAM
        expr: avg(system_mem_percent{exported_job!~"^compilation.*",deployment!="bosh-health"}) by(system_domain, deployment, exported_job, index) > 90
        for: 10m
        labels:
          service: bosh-system
          severity: warning
        annotations:
          summary: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` is reporting low free RAM"
          description: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` has used more than 90% of its RAM for 10m: {{$value}}"

      # System disk usage above 90% for 30 minutes.
      - alert: BOSHVMSystemDiskFull
        expr: avg(system_disk_system_percent{exported_job!~"^compilation.*",deployment!="bosh-health"}) by(system_domain, deployment, exported_job, index) > 90
        for: 30m
        labels:
          service: bosh-system
          severity: critical
        annotations:
          summary: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` is running out of system disk"
          description: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` has used more than 90% of its system disk for 30m: {{$value}}"

      # Ephemeral disk usage above 80% for 30 minutes (note: lower threshold
      # than the other disk-full rules).
      - alert: BOSHVMEphemeralDiskFull
        expr: avg(system_disk_ephemeral_percent{exported_job!~"^compilation.*",deployment!="bosh-health"}) by(system_domain, deployment, exported_job, index) > 80
        for: 30m
        labels:
          service: bosh-system
          severity: critical
        annotations:
          summary: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` is running out of ephemeral disk"
          description: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` has used more than 80% of its ephemeral disk for 30m: {{$value}}"

      # Persistent disk usage above 90% for 30 minutes.
      - alert: BOSHVMPersistentDiskFull
        expr: avg(system_disk_persistent_percent{exported_job!~"^compilation.*",deployment!="bosh-health"}) by(system_domain, deployment, exported_job, index) > 90
        for: 30m
        labels:
          service: bosh-system
          severity: critical
        annotations:
          summary: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` is running out of persistent disk"
          description: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` has used more than 90% of its persistent disk for 30m: {{$value}}"

      # Persistent disk inode usage above 90%.
      # NOTE(review): the duration was blank in the source (an unrendered
      # placeholder), which would make the alert fire on the first evaluation.
      # 30m is chosen to match the other disk rules -- confirm the intended
      # value.
      - alert: BOSHVMPersistentDiskInodesExhausted
        expr: avg(system_disk_persistent_inode_percent{exported_job!~"^compilation.*",deployment!="bosh-health"}) by(system_domain, deployment, exported_job, index) > 90
        for: 30m
        labels:
          service: bosh-system
          severity: critical
        annotations:
          summary: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` is running out of inodes"
          description: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` has used more than 90% of its persistent disk inodes for 30m: {{$value}}"

      # Predictive rule: a linear fit over the last 6h projects system disk
      # usage above 95% four days out (4*24*60*60 = 345600 s), AND current
      # usage is already above 60%. Both must hold for 30 minutes.
      # The annotations pass the horizon as the literal 345600 because Go
      # templates cannot evaluate arithmetic such as 4*24*60*60.
      - alert: BOSHVMSystemDiskPredictWillFill
        expr: |
          (
            avg(predict_linear(system_disk_system_percent{exported_job!~"^compilation.*",deployment!="bosh-health"}[6h], 4*24*60*60)) by(system_domain, deployment, exported_job, index) > 95
            and
            avg(system_disk_system_percent{exported_job!~"^compilation.*",deployment!="bosh-health"}) by(system_domain, deployment, exported_job, index) > 60
          )
        for: 30m
        labels:
          service: bosh-system
          severity: warning
        annotations:
          summary: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` will run out of system disk in {{humanizeDuration 345600}}"
          description: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` system disk will be used more than 95% in {{humanizeDuration 345600}}"

      # Predictive rule for ephemeral disk; same shape as the system-disk
      # rule. Additionally drops jobs whose name starts with `smoke-tests`.
      # The `bosh-health` deployment filter was missing here only; it is added
      # so this rule agrees with every other rule in the group.
      - alert: BOSHVMEphemeralDiskPredictWillFill
        expr: |
          (
            avg(predict_linear(system_disk_ephemeral_percent{exported_job!~"^(compilation|smoke-tests).*",deployment!="bosh-health"}[6h], 4*24*60*60)) by(system_domain, deployment, exported_job, index) > 95
            and
            avg(system_disk_ephemeral_percent{exported_job!~"^(compilation|smoke-tests).*",deployment!="bosh-health"}) by(system_domain, deployment, exported_job, index) > 60
          )
        for: 30m
        labels:
          service: bosh-system
          severity: warning
        annotations:
          summary: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` will run out of ephemeral disk in {{humanizeDuration 345600}}"
          description: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` ephemeral disk will be used more than 95% in {{humanizeDuration 345600}}"

      # Predictive rule for persistent disk; same shape as the system-disk
      # rule.
      - alert: BOSHVMPersistentDiskPredictWillFill
        expr: |
          (
            avg(predict_linear(system_disk_persistent_percent{exported_job!~"^compilation.*",deployment!="bosh-health"}[6h], 4*24*60*60)) by(system_domain, deployment, exported_job, index) > 95
            and
            avg(system_disk_persistent_percent{exported_job!~"^compilation.*",deployment!="bosh-health"}) by(system_domain, deployment, exported_job, index) > 60
          )
        for: 30m
        labels:
          service: bosh-system
          severity: warning
        annotations:
          summary: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` will run out of persistent disk in {{humanizeDuration 345600}}"
          description: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` persistent disk will be used more than 95% in {{humanizeDuration 345600}}"

      # Fires when the maximum of `system_healthy` for a VM stays below 1 for
      # 30 minutes (presumably 1 means healthy -- verify against the exporter).
      - alert: BOSHJobExtendedUnhealthy
        expr: max(system_healthy{exported_job!~"^compilation.*",deployment!="bosh-health"}) by(system_domain, deployment, exported_job, index) < 1
        for: 30m
        labels:
          service: bosh-system
          severity: critical
        annotations:
          summary: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` has been unhealthy for a long time"
          description: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` has been reported unhealthy for more than 30m"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment