Last active
July 16, 2021 19:38
-
-
Save Cellane/7ee4d8cb4b54eb245701605814350021 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Prometheus alerting rules, organised into named rule groups.
groups:
# Health of the monitoring pipeline: Prometheus, Alertmanager and scrape targets.
- name: prometheus
  rules:
  # prometheus_config_last_reload_successful has been != 1 for 5 minutes.
  - alert: PrometheusConfigurationReload
    expr: prometheus_config_last_reload_successful != 1
    for: 5m
    labels:
      severity: error
    annotations:
      summary: 'Prometheus configuration reload (instance {{ $labels.instance }})'
      description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  # Fewer than one Alertmanager has been discovered for 5 minutes.
  - alert: PrometheusNotConnectedToAlertmanager
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 5m
    labels:
      severity: error
    annotations:
      summary: 'Prometheus not connected to alertmanager (instance {{ $labels.instance }})'
      description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  # alertmanager_config_last_reload_successful has been != 1 for 5 minutes.
  - alert: AlertmanagerConfigurationReload
    expr: alertmanager_config_last_reload_successful != 1
    for: 5m
    labels:
      severity: error
    annotations:
      summary: 'AlertManager configuration reload (instance {{ $labels.instance }})'
      description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  # Any scrape target has reported up == 0 for 5 minutes.
  - alert: ExporterDown
    expr: up == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Exporter down (instance {{ $labels.instance }})'
      description: "Prometheus exporter down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
# Host-level alerts built on node-exporter metrics.
# Counter-based expressions use rate() rather than irate(): irate() only looks
# at the last two samples in the window, so a brief dip can reset the `for`
# timer and keep a sustained problem from ever firing.
- name: node-exporter
  rules:
  # Less than 10% of total memory is available.
  - alert: OutOfMemory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Out of memory (instance {{ $labels.instance }})'
      description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  # Receive rate summed per instance, converted from bytes/s to MiB/s.
  - alert: UnusualNetworkThroughputIn
    expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Unusual network throughput in (instance {{ $labels.instance }})'
      description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  # Transmit rate summed per instance, converted from bytes/s to MiB/s.
  - alert: UnusualNetworkThroughputOut
    expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Unusual network throughput out (instance {{ $labels.instance }})'
      description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  # Disk read rate summed per instance, converted from bytes/s to MiB/s.
  - alert: UnusualDiskReadRate
    expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Unusual disk read rate (instance {{ $labels.instance }})'
      description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  # Disk write rate summed per instance, converted from bytes/s to MiB/s.
  - alert: UnusualDiskWriteRate
    expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Unusual disk write rate (instance {{ $labels.instance }})'
      description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  # Less than 10% free on the filesystem mounted at /rootfs.
  - alert: OutOfDiskSpace
    expr: (node_filesystem_avail_bytes{mountpoint="/rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/rootfs"} < 10
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Out of disk space (instance {{ $labels.instance }})'
      description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  # 100 minus the average idle percentage across the instance's CPUs.
  - alert: HighCpuLoad
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'High CPU load (instance {{ $labels.instance }})'
      description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  # More than 80% of swap is in use.
  - alert: SwapIsFillingUp
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Swap is filling up (instance {{ $labels.instance }})'
      description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
# Disk health alerts based on the smartmon_* metrics.
- name: smartmon
  rules:
  # smartmon_device_smart_healthy has been below 1 for one minute.
  - alert: DiskUnhealthy
    expr: smartmon_device_smart_healthy < 1
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'Disk is unhealthy (instance {{ $labels.instance }})'
      description: "Disk {{ $labels.disk }} is unhealthy\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
# Container-level alerts based on cAdvisor metrics.
- name: cadvisor
  rules:
  # CPU-seconds rate summed per (instance, container name), as a percentage.
  # The name=~".+" matcher keeps only series that carry a non-empty name label.
  - alert: ContainerCpuUsage
    expr: sum by (instance, name) (rate(container_cpu_usage_seconds_total{name=~".+"}[3m])) * 100 > 30
    for: 5m
    labels:
      severity: error
    annotations:
      summary: 'Container CPU usage (instance {{ $labels.instance }})'
      description: "Container CPU usage is above 30%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Server-wide defaults.
global:
  # Scrape targets and evaluate rules every 15 seconds.
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    monitor: czech-in-japan

# Rule files to load.
rule_files:
- alert.rules
# Where fired alerts are sent: one Alertmanager over HTTPS with basic auth.
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - 'alertmanager.milanvit.net'
    scheme: https
    # Placeholders; substitute the real credentials at deploy time.
    basic_auth:
      username: '<supersecret>'
      password: '<supersecret>'
# Scrape targets. Every remote job uses HTTPS with basic auth; the
# '<supersecret>' values are placeholders to substitute at deploy time.
scrape_configs:
# Prometheus scraping its own metrics endpoint.
- job_name: prometheus
  scrape_interval: 15s
  static_configs:
  - targets:
    - 'localhost:9090'
# Alertmanager's own metrics. Without this job nothing collects
# alertmanager_config_last_reload_successful, so the
# AlertmanagerConfigurationReload rule has no series to evaluate.
# NOTE(review): assumes /metrics is reachable on the same host and with the
# same credentials as the alerting endpoint - confirm before deploying.
- job_name: alertmanager
  scrape_interval: 15s
  scheme: https
  basic_auth:
    username: '<supersecret>'
    password: '<supersecret>'
  static_configs:
  - targets:
    - 'alertmanager.milanvit.net'
- job_name: cadvisor
  scrape_interval: 15s
  scheme: https
  basic_auth:
    username: '<supersecret>'
    password: '<supersecret>'
  static_configs:
  - targets:
    - 'cadvisor.milanvit.net'
- job_name: node-exporter
  scrape_interval: 15s
  scheme: https
  basic_auth:
    username: '<supersecret>'
    password: '<supersecret>'
  static_configs:
  - targets:
    - 'node-exporter.milanvit.net'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Notification template files; the named templates used under
# slack_configs below are presumably defined here - confirm.
templates:
- /alertmanager/notifications.tmpl

# The root route sends every alert to the single Slack receiver.
route:
  receiver: slack

receivers:
- name: slack
  slack_configs:
  # Placeholders; substitute the real webhook, channel and bot name.
  - api_url: '<hook-url>'
    channel: '<channel-name>'
    username: '<slack-username>'
    # Also notify when alerts resolve.
    send_resolved: true
    title: '{{ template "custom_title" . }}'
    text: '{{ template "custom_slack_message" . }}'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{{/* Emits "<alertname> @ <summary>" for each firing alert, then for each resolved alert, with no separator between entries. The trim markers keep the rendered output free of the line breaks used here for readability. */}}
{{ define "__single_message_title" -}}
{{ range .Alerts.Firing }}{{ .Labels.alertname }} @ {{ .Annotations.summary }}{{ end -}}
{{ range .Alerts.Resolved }}{{ .Labels.alertname }} @ {{ .Annotations.summary }}{{ end -}}
{{ end }}
{{/* Title: "[STATUS]" (with ":<firing count>" appended while firing), a space, and - only when the notification carries exactly one alert - that alert's single-message title. Every alert is either firing or resolved, so "exactly one alert" is the same test as (1 firing, 0 resolved) or (0 firing, 1 resolved). */}}
{{ define "custom_title" -}}
[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ len .Alerts.Firing }}{{ end }}] {{ if eq (len .Alerts) 1 }}{{ template "__single_message_title" . }}{{ end -}}
{{ end }}
{{/* Message body. With exactly one alert, prints only that alert's description. Otherwise prints an "Alerts Firing" list and/or an "Alerts Resolved" list of "- <summary>: <description>" lines; a section is omitted when it has no alerts. Line breaks inside the define are part of the rendered text and are kept as-is. */}}
{{ define "custom_slack_message" }}
{{ if eq (len .Alerts) 1 }}
{{ range .Alerts }}{{ .Annotations.description }}{{ end }}
{{ else }}
{{ if .Alerts.Firing }}
*Alerts Firing:*
{{ range .Alerts.Firing }}- {{ .Annotations.summary }}: {{ .Annotations.description }}
{{ end }}{{ end }}
{{ if .Alerts.Resolved }}
*Alerts Resolved:*
{{ range .Alerts.Resolved }}- {{ .Annotations.summary }}: {{ .Annotations.description }}
{{ end }}{{ end }}
{{ end }}
{{ end }}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment