groups:
- name: prometheus
  rules:
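  # prometheus_config_last_reload_successful is 1 when the last configuration
  # reload succeeded and 0 when it failed, so this fires on a broken config.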
  - alert: PrometheusConfigurationReload
    expr: prometheus_config_last_reload_successful != 1
    for: 5m
    labels:
      severity: error
    annotations:
      summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})"
      description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

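  # prometheus_notifications_alertmanagers_discovered counts the Alertmanager
  # instances Prometheus currently knows about; 0 means alerts have nowhere to go.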
  - alert: PrometheusNotConnectedToAlertmanager
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 5m
    labels:
      severity: error
    annotations:
      summary: "Prometheus not connected to alertmanager (instance {{ $labels.instance }})"
      description: "Prometheus cannot connect to the alertmanager\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

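  # Same pattern as above, but for Alertmanager's own configuration reloads.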
  - alert: AlertmanagerConfigurationReload
    expr: alertmanager_config_last_reload_successful != 1
    for: 5m
    labels:
      severity: error
    annotations:
      summary: "AlertManager configuration reload failure (instance {{ $labels.instance }})"
      description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

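  # The up metric is set to 0 for every scrape target Prometheus fails to reach,
  # so this catches any exporter (or other target) that stops responding.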
  - alert: ExporterDown
    expr: up == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Exporter down (instance {{ $labels.instance }})"
      description: "Prometheus exporter down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

- name: node-exporter
  rules:
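  # Fires when available memory drops below 10% of the host's total memory.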
  - alert: OutOfMemory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Out of memory (instance {{ $labels.instance }})"
      description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

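  # irate() gives the per-second receive rate in bytes; dividing by 1024*1024
  # converts it to megabytes per second, summed over all interfaces per instance.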
  - alert: UnusualNetworkThroughputIn
    expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual network throughput in (instance {{ $labels.instance }})"
      description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

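  # Same as the previous rule, for the transmit direction.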
  - alert: UnusualNetworkThroughputOut
    expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual network throughput out (instance {{ $labels.instance }})"
      description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

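  # Per-second bytes read across all block devices, converted to megabytes per second.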
  - alert: UnusualDiskReadRate
    expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual disk read rate (instance {{ $labels.instance }})"
      description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

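  # Per-second bytes written across all block devices, converted to megabytes per second.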
  - alert: UnusualDiskWriteRate
    expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual disk write rate (instance {{ $labels.instance }})"
      description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

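  # mountpoint="/rootfs" assumes node-exporter runs in a container with the host
  # root filesystem bind-mounted at /rootfs; adjust the label (e.g. mountpoint="/")
  # if your setup differs.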
  - alert: OutOfDiskSpace
    expr: (node_filesystem_avail_bytes{mountpoint="/rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/rootfs"} < 10
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Out of disk space (instance {{ $labels.instance }})"
      description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

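  # 100 minus the average idle CPU time per instance gives overall CPU usage in percent.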
  - alert: HighCpuLoad
    expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "High CPU load (instance {{ $labels.instance }})"
      description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

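  # 1 - free/total is the fraction of swap in use, multiplied by 100 for percent.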
  - alert: SwapIsFillingUp
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Swap is filling up (instance {{ $labels.instance }})"
      description: "Swap is filling up (> 80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

- name: smartmon
  rules:
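  # smartmon_device_smart_healthy is not a stock node-exporter metric; it is assumed
  # to come from a S.M.A.R.T. monitoring script (e.g. smartmon.sh) exposed through
  # node-exporter's textfile collector.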
  - alert: DiskUnhealthy
    expr: smartmon_device_smart_healthy < 1
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Disk is unhealthy (instance {{ $labels.instance }})"
      description: "Disk {{ $labels.disk }} is unhealthy\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

- name: cadvisor
  rules:
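  # cAdvisor metric: per-container CPU time rated over 3 minutes and expressed as a
  # percentage; name=~".+" keeps only series that carry a container name label.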
  - alert: ContainerCpuUsage
    expr: (sum(rate(container_cpu_usage_seconds_total{name=~".+"}[3m])) BY (instance, name) * 100) > 30
    for: 5m
    labels:
      severity: error
    annotations:
      summary: "Container CPU usage (instance {{ $labels.instance }})"
      description: "Container CPU usage is above 30%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"