Created
December 10, 2019 09:48
-
-
Save dholbach/25dce6cbb72ab85bb72e018d53a7ee19 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| |
# Here's what an alert definition looks like: | |
# ALERT AlertName | |
# IF expr | |
# FOR duration | |
# LABELS { severity="critical/warning/etc" } | |
# ANNOTATIONS { | |
# summary = "Description of the alert.", | |
# impact = "Impact for users.", | |
# dashboardURL = "https://dashboard.json", | |
# playbookURL = "https://playbook.md", | |
# detail = "A label {{$labels.pod}} has {{$value}} value", | |
# } | |
# | |
# For more information see: | |
# https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ | |
# https://prometheus.io/docs/alerting/notification_examples/ | |
# | |
# Weave Cloud treats certain annotations specially: | |
# Summary | |
# A short description of the alert; preferably it should not contain any variables.
# It is shown as the title of the notification.
# | |
# Impact | |
# A vital piece of information that needs to be in the notification; preferably
# it should not contain any variables. If an alerting rule defines no impact,
# the notification will contain the warning:
# No impact defined. Please add one or disable this alert.
# | |
# Playbook URL | |
# A link to playbook | |
# | |
# Dashboard URL | |
# A link to dashboard | |
# | |
# Detail | |
# A section with the labels and values of the alert. Keep it short: when a
# single rule produces multiple alerts, the notification lists the detail
# for only the first 10 of them. Example:
# Disk xvda on node ip-172-20-2-83.ec2.internal | |
# Disk xvdb on node ip-172-20-3-81.ec2.internal | |
# Disk xvdc on node ip-172-20-1-81.ec2.internal | |
# Disk xvda on node ip-172-20-3-81.ec2.internal | |
# | |
# Other annotations will be listed below the Details section. | |
# But in case of multiple alerts for a single rule, this list will contain only | |
# annotations for the first alert. | |
# | |
# Kubernetes Default Alerts | |
| |
# DaemonSet-related alerts | |
| |
# Fires when, for 5 minutes, a DaemonSet's desired number of scheduled pods
# differs from the number currently scheduled.
ALERT DaemonSetFailedToSchedule
  IF kube_daemonset_status_desired_number_scheduled != kube_daemonset_status_current_number_scheduled
  FOR 5m
  LABELS {
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "DaemonSet failed to schedule all pods.",
    impact = "Hard to say. You should check it out.",
    detail = "DaemonSet {{$labels.namespace}}/{{$labels.daemonset}}",
  }
| |
# Pod-related alerts | |
| |
# Fires when a container's restart rate, measured over a 10-minute window,
# stays above 1/(10*60) per second (i.e. more than one restart per 10 minutes
# on average) for a full hour.
ALERT ContainerRestartingTooMuch
  IF rate(kube_pod_container_status_restarts_total[10m]) > 1 / (10 * 60)
  FOR 1h
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Container is restarting too much.",
    impact = "Probably a serious user-facing bug, up to complete outage",
    detail = "Container {{$labels.namespace}}/{{$labels.pod}} ({{$labels.container}})",
  }
| |
# Fires when a pod that was not created by a Job has had its Ready condition
# not equal to 1 for 2 hours.
#
# The comparison binds tighter than AND, so the expression reads as:
#   kube_pod_info{...} AND ON (pod, namespace) (kube_pod_status_ready{...} != 1)
# i.e. kube_pod_info series are kept only for pods whose readiness is not 1.
#
# The summary says "not ready" rather than "not running": the rule checks the
# Ready condition, so a pod can be running and still trigger this alert.
ALERT PodNotReady
  IF kube_pod_info{created_by_kind!="Job"} AND ON (pod, namespace) kube_pod_status_ready{condition="true"} != 1
  FOR 2h
  LABELS { severity="warning" }
  ANNOTATIONS {
    summary = "Pod exists, but is not ready.",
    impact = "Probably a serious user-facing bug, up to complete outage",
    detail = "Pod {{$labels.namespace}}/{{$labels.pod}}",
  }
| |
# Jobs-related alerts | |
| |
# Fires when a Job has reported at least one failed pod for 2 hours.
#
# The detail template uses the `namespace` label, matching the other rules in
# this file; the previous `namespaces` label does not exist and rendered empty.
# NOTE(review): depending on the kube-state-metrics version, the Kubernetes Job
# name may be exposed as `job_name`, with `job` being the scrape job - confirm
# which label your deployment provides.
ALERT JobFailed
  IF kube_job_status_failed > 0
  FOR 2h
  LABELS { severity="warning" }
  ANNOTATIONS {
    summary = "Job failed.",
    impact = "Hard to say. You should check it out.",
    detail = "Job {{$labels.namespace}}/{{$labels.job}} didn't exit successfully",
  }
| |
# Other stuff | |
| |
# Because we attempt to scrape all pods by default, we do expect the scraping
# to fail on some pods. If you use the "opt-in" configuration, i.e. only scrape
# pods that are known to expose Prometheus metrics, uncomment this alert. To
# use the "opt-in" Prometheus configuration, please see the "Default Scraping
# Policy" section at:
# https://www.weave.works/docs/cloud/latest/tasks/monitor/configuration-k8s/ | |
# | |
# ALERT ScrapeFailed | |
# IF up != 1 | |
# FOR 10m | |
# LABELS { severity="warning" } | |
# ANNOTATIONS { | |
# summary = "Scrape failed.", | |
# impact = "We have no monitoring data for some pods. At worst, it's completely down. At best, we cannot reliably respond to operational issues.", | |
# detail = "{{if eq $labels.job \"kubernetes-pods\"}}Pod: {{$labels.kubernetes_pod_name}}{{else}}Job: {{$labels.job}}{{end}} Target: {{$labels.instance}}", | |
# } | |
| |
# Fires after 5 minutes when either:
#   * no `up` series exists for job "kubernetes-apiservers", unless there are
#     no `up` series at all (that case is left to the NoMetrics alert), or
#   * the sum of `up` across the API server targets is below 1.
# The parentheses only make the default operator precedence explicit
# (comparison binds tighter than `unless`, which binds tighter than `or`).
ALERT ApiServerDown
  IF (absent(up{job="kubernetes-apiservers"}) unless ignoring (job) absent(up)) or (sum(up{job="kubernetes-apiservers"}) < 1)
  FOR 5m
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Kubernetes API server has been down for at least 5 minutes.",
    impact = "Our Kubernetes cluster is inoperable. User impact uncertain.",
  }
| |
# Fires when a linear extrapolation of the last 30 minutes of available
# filesystem space predicts a value below zero 24 hours (24 * 3600 seconds)
# ahead, and that prediction has held for 1 hour. Only ext4 filesystems
# reported by the prom-node-exporter service in the weave namespace are checked.
ALERT DiskWillFillIn24Hours
  IF predict_linear(node_filesystem_avail{kubernetes_namespace="weave", _weave_service="prom-node-exporter", fstype="ext4"}[30m], 24 * 3600) < 0
  FOR 1h
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Disk space will run out in 24 hours.",
    impact = "Random things are about to break for our users",
    detail = "Disk {{$labels.device}} on node {{$labels.node}} ",
  }
| |
# Fires when the absolute NTP drift reported for a node has exceeded
# 15 seconds for 5 minutes.
# NOTE(review): the alert name "ClockSkewiff" looks like a typo, but it is kept
# unchanged because notification routing or silences may match on it - confirm
# before renaming.
ALERT ClockSkewiff
  IF abs(node_ntp_drift_seconds) > 15
  FOR 5m
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Clock is out of sync with NTP.",
    impact = "Random things are about to break for our users",
    detail = "Node: {{$labels.node}} is {{$value}} seconds off",
  }
| |
# Fires when a node's node_timex_sync_status has been anything other than 1
# for 5 minutes.
ALERT ClockSyncBroken
  IF node_timex_sync_status != 1
  FOR 5m
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "The clock is not being synced.",
    impact = "Random things are about to break for our users",
    detail = "Node: {{$labels.node}}",
  }
| |
# Fires when no `up` series at all has existed for 5 minutes, i.e. nothing is
# being received from any scrape target.
# The last annotation now ends with a trailing comma like every other rule in
# this file, so adding another annotation later cannot break the rule.
ALERT NoMetrics
  IF absent(up)
  FOR 5m
  LABELS { severity="warning" }
  ANNOTATIONS {
    summary = "Received no metrics in the last 5 minutes. Agent disconnected?",
    impact = "No alerts fire, and monitoring data may be lost. At worst, the entire cluster may be down.",
  }
| |
# Fires when a node's CPU usage has been above 90% for 5 minutes.
# Usage is computed as 1 - (idle CPU rate summed per node / core count), where
# the core count comes from machine_cpu_cores after copying its `instance`
# label into a `node` label so both sides can be matched per node.
ALERT NodeHighCPUUsage
  IF (1 - (sum by (node) (irate(node_cpu{mode="idle"}[1m])) / max by (node) (label_replace(machine_cpu_cores{job=~".*cadvisor"}, "node", "$1", "instance", "(.*)")))) > 0.9
  FOR 5m
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "CPU usage has been over 90% in the last 5 minutes.",
    impact = "Pods running on that overcommitted node may cause further issues down the line",
    detail = "Node: {{$labels.node}}",
  }
| |
# Fires when a target of the "kubernetes-nodes" scrape job has reported
# up == 0 for 5 minutes.
ALERT NodeDown
  IF up{job="kubernetes-nodes"} == 0
  FOR 5m
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Kubernetes node is not available.",
    impact = "The cluster will be operating at a reduced capacity for scheduling workloads.",
    detail = "Node: {{$labels.instance}}",
  }
| |
# Fires when `up` for a "kubernetes-nodes" target has dropped more than once
# within a 5-minute window, and that condition has persisted for 5 minutes.
ALERT NodeAvailabilityIntermittent
  IF resets(up{job="kubernetes-nodes"}[5m]) > 1
  FOR 5m
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Kubernetes node has been intermittently available.",
    impact = "The node might be stuck in a restart cycle and be disrupting the normal scheduling of workloads.",
    detail = "Node: {{$labels.instance}}",
  }
| |
# Fires when a Prometheus pod in the weave namespace has had a non-zero rate
# of failed remote-storage samples (per pod, over 1 minute) for 5 minutes.
# The impact text ends with a colon because the detail annotation supplies
# the kubectl command to run.
ALERT PromRemoteStorageFailures
  IF sum(rate(prometheus_remote_storage_failed_samples_total{kubernetes_namespace="weave"}[1m])) by (kubernetes_pod_name) > 0
  FOR 5m
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Prometheus failed to send samples.",
    impact = "Prometheus cannot send samples to Cortex. Look at the Prometheus logs to check details of the error (or contact Weave Cloud support help@weave.works):",
    detail = "kubectl logs -f -n weave {{$labels.kubernetes_pod_name}} prometheus; {{$value | humanize}} errors/sec",
  }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment