Skip to content

Instantly share code, notes, and snippets.

@benjvi
Last active April 20, 2023 10:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save benjvi/a843ecbfeb354379b79ed9abffb7a642 to your computer and use it in GitHub Desktop.
Save benjvi/a843ecbfeb354379b79ed9abffb7a642 to your computer and use it in GitHub Desktop.
BOSH System-level Alerts for Healthwatch2, based on BOSH-Prometheus alerts
# Prometheus alerting rules for BOSH-managed VMs (rule group `bosh-system`).
#
# Every rule aggregates per VM, identified by the label set
# (system_domain, deployment, exported_job, index), and ignores jobs whose
# `exported_job` name starts with `compilation`. Most rules additionally skip
# the `bosh-health` deployment.
groups:
  - name: bosh-system
    rules:
      # --- Threshold alerts: current usage above a fixed percentage ---------

      # RAM usage above 90% for 10 minutes.
      - alert: BOSHVMLowFreeRAM
        expr: avg(system_mem_percent{exported_job!~"^compilation.*",deployment!="bosh-health"}) by(system_domain, deployment, exported_job, index) > 90
        for: 10m
        labels:
          service: bosh-system
          severity: warning
        annotations:
          summary: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` is reporting low free RAM"
          description: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` has used more than 90% of its RAM for 10m: {{$value}}"

      # System disk usage above 90% for 30 minutes.
      - alert: BOSHVMSystemDiskFull
        expr: avg(system_disk_system_percent{exported_job!~"^compilation.*",deployment!="bosh-health"}) by(system_domain, deployment, exported_job, index) > 90
        for: 30m
        labels:
          service: bosh-system
          severity: critical
        annotations:
          summary: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` is running out of system disk"
          description: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` has used more than 90% of its system disk for 30m: {{$value}}"

      # Ephemeral disk usage above 80% for 30 minutes (lower threshold than
      # the other disk alerts).
      - alert: BOSHVMEphemeralDiskFull
        expr: avg(system_disk_ephemeral_percent{exported_job!~"^compilation.*",deployment!="bosh-health"}) by(system_domain, deployment, exported_job, index) > 80
        for: 30m
        labels:
          service: bosh-system
          severity: critical
        annotations:
          summary: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` is running out of ephemeral disk"
          description: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` has used more than 80% of its ephemeral disk for 30m: {{$value}}"

      # Persistent disk usage above 90% for 30 minutes.
      - alert: BOSHVMPersistentDiskFull
        expr: avg(system_disk_persistent_percent{exported_job!~"^compilation.*",deployment!="bosh-health"}) by(system_domain, deployment, exported_job, index) > 90
        for: 30m
        labels:
          service: bosh-system
          severity: critical
        annotations:
          summary: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` is running out of persistent disk"
          description: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` has used more than 90% of its persistent disk for 30m: {{$value}}"

      # Persistent disk inode usage above 90%.
      # NOTE(review): the duration was blank in the original (`for:` with no
      # value, and "for : " in the description). 30m is used here to match the
      # other disk-usage alerts in this group -- confirm the intended value.
      - alert: BOSHVMPersistentDiskInodesExhausted
        expr: avg(system_disk_persistent_inode_percent{exported_job!~"^compilation.*",deployment!="bosh-health"}) by(system_domain, deployment, exported_job, index) > 90
        for: 30m
        labels:
          service: bosh-system
          severity: critical
        annotations:
          summary: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` is running out of inodes"
          description: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` has used more than 90% of its persistent disk inodes for 30m: {{$value}}"

      # --- Predictive alerts: linear extrapolation of the last 6h -----------
      #
      # Each rule fires when the 6h trend projects usage above 95% within
      # 4*24*60*60 = 345600 seconds (4 days) AND current usage is already
      # above 60%. The annotations pass the literal 345600 to
      # `humanizeDuration` because Go templates have no arithmetic operators;
      # keep it in sync with the horizon used in `expr`.

      - alert: BOSHVMSystemDiskPredictWillFill
        expr: |
          (
            avg(predict_linear(system_disk_system_percent{exported_job!~"^compilation.*",deployment!="bosh-health"}[6h], 4*24*60*60)) by(system_domain, deployment, exported_job, index) > 95
            and
            avg(system_disk_system_percent{exported_job!~"^compilation.*",deployment!="bosh-health"}) by(system_domain, deployment, exported_job, index) > 60
          )
        for: 30m
        labels:
          service: bosh-system
          severity: warning
        annotations:
          summary: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` will run out of system disk in {{humanizeDuration 345600}}"
          description: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` system disk will be used more than 95% in {{humanizeDuration 345600}}"

      # NOTE(review): unlike its siblings, this rule also excludes
      # `smoke-tests` jobs and does NOT exclude the `bosh-health` deployment.
      # Left unchanged -- confirm the difference is intentional.
      - alert: BOSHVMEphemeralDiskPredictWillFill
        expr: |
          (
            avg(predict_linear(system_disk_ephemeral_percent{exported_job!~"^(compilation|smoke-tests).*"}[6h], 4*24*60*60)) by(system_domain, deployment, exported_job, index) > 95
            and
            avg(system_disk_ephemeral_percent{exported_job!~"^(compilation|smoke-tests).*"}) by(system_domain, deployment, exported_job, index) > 60
          )
        for: 30m
        labels:
          service: bosh-system
          severity: warning
        annotations:
          summary: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` will run out of ephemeral disk in {{humanizeDuration 345600}}"
          description: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` ephemeral disk will be used more than 95% in {{humanizeDuration 345600}}"

      - alert: BOSHVMPersistentDiskPredictWillFill
        expr: |
          (
            avg(predict_linear(system_disk_persistent_percent{exported_job!~"^compilation.*",deployment!="bosh-health"}[6h], 4*24*60*60)) by(system_domain, deployment, exported_job, index) > 95
            and
            avg(system_disk_persistent_percent{exported_job!~"^compilation.*",deployment!="bosh-health"}) by(system_domain, deployment, exported_job, index) > 60
          )
        for: 30m
        labels:
          service: bosh-system
          severity: warning
        annotations:
          summary: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` will run out of persistent disk in {{humanizeDuration 345600}}"
          description: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` persistent disk will be used more than 95% in {{humanizeDuration 345600}}"

      # --- Health ------------------------------------------------------------

      # Fires when the highest `system_healthy` sample for a VM stays below 1
      # for 30 minutes.
      - alert: BOSHJobExtendedUnhealthy
        expr: max(system_healthy{exported_job!~"^compilation.*",deployment!="bosh-health"}) by(system_domain, deployment, exported_job, index) < 1
        for: 30m
        labels:
          service: bosh-system
          severity: critical
        annotations:
          summary: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` has been unhealthy for a long time"
          description: "BOSH VM `{{$labels.system_domain}}/{{$labels.deployment}}/{{$labels.exported_job}}/{{$labels.index}}` has been reported unhealthy for more than 30m"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment