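alert.rules — the alerting rules that Prometheus loads via the rule_files entry in the main configuration further down. Each rule fires once its expression has held for the full `for` duration, and the summary/description annotations are what the Slack templates at the bottom render into the notification.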
groups:
- name: prometheus
  rules:
  - alert: PrometheusConfigurationReload
    expr: prometheus_config_last_reload_successful != 1
    for: 5m
    labels:
      severity: error
    annotations:
      summary: "Prometheus configuration reload (instance {{ $labels.instance }})"
      description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PrometheusNotConnectedToAlertmanager
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 5m
    labels:
      severity: error
    annotations:
      summary: "Prometheus not connected to alertmanager (instance {{ $labels.instance }})"
      description: "Prometheus cannot connect to the alertmanager\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: AlertmanagerConfigurationReload
    expr: alertmanager_config_last_reload_successful != 1
    for: 5m
    labels:
      severity: error
    annotations:
      summary: "AlertManager configuration reload (instance {{ $labels.instance }})"
      description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: ExporterDown
    expr: up == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Exporter down (instance {{ $labels.instance }})"
      description: "Prometheus exporter down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- name: node-exporter
  rules:
  - alert: OutOfMemory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Out of memory (instance {{ $labels.instance }})"
      description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: UnusualNetworkThroughputIn
    expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual network throughput in (instance {{ $labels.instance }})"
      description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: UnusualNetworkThroughputOut
    expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual network throughput out (instance {{ $labels.instance }})"
      description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: UnusualDiskReadRate
    expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual disk read rate (instance {{ $labels.instance }})"
      description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: UnusualDiskWriteRate
    expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual disk write rate (instance {{ $labels.instance }})"
      description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: OutOfDiskSpace
    expr: (node_filesystem_avail_bytes{mountpoint="/rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/rootfs"} < 10
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Out of disk space (instance {{ $labels.instance }})"
      description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: HighCpuLoad
    expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "High CPU load (instance {{ $labels.instance }})"
      description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: SwapIsFillingUp
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Swap is filling up (instance {{ $labels.instance }})"
      description: "Swap is filling up (> 80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- name: smartmon
  rules:
  - alert: DiskUnhealthy
    expr: smartmon_device_smart_healthy < 1
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Disk is unhealthy (instance {{ $labels.instance }})"
      description: "Disk {{ $labels.disk }} is unhealthy\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- name: cadvisor
  rules:
  - alert: ContainerCpuUsage
    expr: (sum(rate(container_cpu_usage_seconds_total{name=~".+"}[3m])) by (instance, name) * 100) > 30
    for: 5m
    labels:
      severity: error
    annotations:
      summary: "Container CPU usage (instance {{ $labels.instance }})"
      description: "Container CPU usage is above 30%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    monitor: czech-in-japan
rule_files:
- alert.rules
alerting:
  alertmanagers:
  - scheme: https
    basic_auth:
      username: <supersecret>
      password: <supersecret>
    static_configs:
    - targets:
      - 'alertmanager.milanvit.net'
scrape_configs:
- job_name: prometheus
  scrape_interval: 15s
  static_configs:
  - targets:
    - 'localhost:9090'
- job_name: cadvisor
  scrape_interval: 15s
  scheme: https
  basic_auth:
    username: <supersecret>
    password: <supersecret>
  static_configs:
  - targets:
    - 'cadvisor.milanvit.net'
- job_name: node-exporter
  scrape_interval: 15s
  scheme: https
  basic_auth:
    username: <supersecret>
    password: <supersecret>
  static_configs:
  - targets:
    - 'node-exporter.milanvit.net'
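The Alertmanager configuration below (typically alertmanager.yml; name assumed) routes every alert to a single Slack receiver and pulls the message title and body from the custom templates in /alertmanager/notifications.tmpl. It can be validated before deployment with amtool, e.g.:

amtool check-config alertmanager.yml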
route:
  receiver: 'slack'
receivers:
- name: 'slack'
  slack_configs:
  - send_resolved: true
    username: '<slack-username>'
    channel: '<channel-name>'
    api_url: '<hook-url>'
    title: '{{ template "custom_title" . }}'
    text: '{{ template "custom_slack_message" . }}'
templates:
- /alertmanager/notifications.tmpl
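notifications.tmpl — the Slack templates referenced above. custom_title collapses to a single alert's summary when exactly one alert is firing or resolved; otherwise custom_slack_message lists every firing and resolved alert with its summary and description.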
{{ define "__single_message_title" }}{{ range .Alerts.Firing }}{{ .Labels.alertname }} @ {{ .Annotations.summary }}{{ end }}{{ range .Alerts.Resolved }}{{ .Labels.alertname }} @ {{ .Annotations.summary }}{{ end }}{{ end }}
{{ define "custom_title" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ if or (and (eq (len .Alerts.Firing) 1) (eq (len .Alerts.Resolved) 0)) (and (eq (len .Alerts.Firing) 0) (eq (len .Alerts.Resolved) 1)) }}{{ template "__single_message_title" . }}{{ end }}{{ end }}
{{ define "custom_slack_message" }}
{{ if or (and (eq (len .Alerts.Firing) 1) (eq (len .Alerts.Resolved) 0)) (and (eq (len .Alerts.Firing) 0) (eq (len .Alerts.Resolved) 1)) }}
{{ range .Alerts.Firing }}{{ .Annotations.description }}{{ end }}{{ range .Alerts.Resolved }}{{ .Annotations.description }}{{ end }}
{{ else }}
{{ if gt (len .Alerts.Firing) 0 }}
*Alerts Firing:*
{{ range .Alerts.Firing }}- {{ .Annotations.summary }}: {{ .Annotations.description }}
{{ end }}{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}
*Alerts Resolved:*
{{ range .Alerts.Resolved }}- {{ .Annotations.summary }}: {{ .Annotations.description }}
{{ end }}{{ end }}
{{ end }}
{{ end }}