groups:
- name: prometheus
  rules:
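  # prometheus_config_last_reload_successful is 1 when the last configuration
  # reload succeeded and 0 when it failed, so this fires on a broken config.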
  - alert: PrometheusConfigurationReload
    expr: prometheus_config_last_reload_successful != 1
    for: 5m
    labels:
      severity: error
    annotations:
      summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})"
      description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

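  # prometheus_notifications_alertmanagers_discovered counts the Alertmanager
  # instances Prometheus currently knows about; 0 means alerts have nowhere to go.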
  - alert: PrometheusNotConnectedToAlertmanager
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 5m
    labels:
      severity: error
    annotations:
      summary: "Prometheus not connected to alertmanager (instance {{ $labels.instance }})"
      description: "Prometheus cannot connect to the alertmanager\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

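  # Same pattern as above, but for Alertmanager's own configuration reloads.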
  - alert: AlertmanagerConfigurationReload
    expr: alertmanager_config_last_reload_successful != 1
    for: 5m
    labels:
      severity: error
    annotations:
      summary: "AlertManager configuration reload failure (instance {{ $labels.instance }})"
      description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

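  # The up metric is set to 0 for every scrape target Prometheus fails to reach,
  # so this catches any exporter (or other target) that stops responding.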
  - alert: ExporterDown
    expr: up == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Exporter down (instance {{ $labels.instance }})"
      description: "Prometheus exporter down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

- name: node-exporter
  rules:
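  # Fires when available memory drops below 10% of the host's total memory.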
  - alert: OutOfMemory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Out of memory (instance {{ $labels.instance }})"
      description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

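  # irate() gives the per-second receive rate in bytes; dividing by 1024*1024
  # converts it to megabytes per second, summed over all interfaces per instance.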
  - alert: UnusualNetworkThroughputIn
    expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual network throughput in (instance {{ $labels.instance }})"
      description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

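  # Same as the previous rule, for the transmit direction.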
  - alert: UnusualNetworkThroughputOut
    expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual network throughput out (instance {{ $labels.instance }})"
      description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

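  # Per-second bytes read across all block devices, converted to megabytes per second.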
  - alert: UnusualDiskReadRate
    expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual disk read rate (instance {{ $labels.instance }})"
      description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

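  # Per-second bytes written across all block devices, converted to megabytes per second.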
  - alert: UnusualDiskWriteRate
    expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual disk write rate (instance {{ $labels.instance }})"
      description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

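  # mountpoint="/rootfs" assumes node-exporter runs in a container with the host
  # root filesystem bind-mounted at /rootfs; adjust the label (e.g. mountpoint="/")
  # if your setup differs.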
  - alert: OutOfDiskSpace
    expr: (node_filesystem_avail_bytes{mountpoint="/rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/rootfs"} < 10
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Out of disk space (instance {{ $labels.instance }})"
      description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

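  # 100 minus the average idle CPU time per instance gives overall CPU usage in percent.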
  - alert: HighCpuLoad
    expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "High CPU load (instance {{ $labels.instance }})"
      description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

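  # 1 - free/total is the fraction of swap in use, multiplied by 100 for percent.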
  - alert: SwapIsFillingUp
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Swap is filling up (instance {{ $labels.instance }})"
      description: "Swap is filling up (> 80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

- name: smartmon
  rules:
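  # smartmon_device_smart_healthy is not a stock node-exporter metric; it is assumed
  # to come from a S.M.A.R.T. monitoring script (e.g. smartmon.sh) exposed through
  # node-exporter's textfile collector.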
  - alert: DiskUnhealthy
    expr: smartmon_device_smart_healthy < 1
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Disk is unhealthy (instance {{ $labels.instance }})"
      description: "Disk {{ $labels.disk }} is unhealthy\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

- name: cadvisor
  rules:
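  # cAdvisor metric: per-container CPU time rated over 3 minutes and expressed as a
  # percentage; name=~".+" keeps only series that carry a container name label.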
  - alert: ContainerCpuUsage
    expr: (sum(rate(container_cpu_usage_seconds_total{name=~".+"}[3m])) BY (instance, name) * 100) > 30
    for: 5m
    labels:
      severity: error
    annotations:
      summary: "Container CPU usage (instance {{ $labels.instance }})"
      description: "Container CPU usage is above 30%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"