# Here's what an alert definition looks like:
# ALERT AlertName
# IF expr
# FOR duration
# LABELS { severity="critical/warning/etc" }
# ANNOTATIONS {
# summary = "Description of the alert.",
# impact = "Impact for users.",
# dashboardURL = "https://dashboard.json",
# playbookURL = "https://playbook.md",
# detail = "A label {{$labels.pod}} has {{$value}} value",
# }
#
# For more information see:
# https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
# https://prometheus.io/docs/alerting/notification_examples/
#
# Weave Cloud treats certain annotations specially:
# Summary
# A short description of the alert; it should preferably not contain any variables.
# It shows up as the title of the notification.
#
# Impact
# A vital piece of information that needs to be in the notification; it should
# preferably not contain any variables. If there is no impact annotation in the
# alerting rule, the notification will contain the warning:
# No impact defined. Please add one or disable this alert.
#
# Playbook URL
# A link to the playbook.
#
# Dashboard URL
# A link to the dashboard.
#
# Detail
# A section with the labels and values of the alert; keep it short, because in
# case of multiple alerts for a single rule the notification will list the detail
# for only the first 10 of them. Example:
# Disk xvda on node ip-172-20-2-83.ec2.internal
# Disk xvdb on node ip-172-20-3-81.ec2.internal
# Disk xvdc on node ip-172-20-1-81.ec2.internal
# Disk xvda on node ip-172-20-3-81.ec2.internal
#
# Any other annotations will be listed below the Detail section.
# In case of multiple alerts for a single rule, however, this list will contain
# only the annotations of the first alert.
#
# Kubernetes Default Alerts
# DaemonSet-related alerts
ALERT DaemonSetFailedToSchedule
IF kube_daemonset_status_desired_number_scheduled != kube_daemonset_status_current_number_scheduled
FOR 5m
LABELS { severity="critical" }
ANNOTATIONS {
summary = "DaemonSet failed to schedule all pods.",
impact = "Hard to say. You should check it out.",
detail = "DaemonSet {{$labels.namespace}}/{{$labels.daemonset}}",
}
# Pod-related alerts
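# ContainerRestartingTooMuch: rate() yields restarts per second, so the
# 1/(10*60) threshold corresponds to roughly one restart every 10 minutes,
# and the rate has to stay above it for the whole 1h FOR period before the
# alert fires.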
ALERT ContainerRestartingTooMuch
IF rate(kube_pod_container_status_restarts_total[10m]) > 1/(10*60)
FOR 1h
LABELS { severity="warning" }
ANNOTATIONS {
summary = "Container is restarting too much.",
impact = "Probably a serious user-facing bug, up to complete outage",
detail = "Container {{$labels.namespace}}/{{$labels.pod}} ({{$labels.container}})",
}
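# PodNotReady: the right-hand side keeps only kube_pod_status_ready series
# whose value is not 1, and the AND ON (pod, namespace) join then keeps a
# kube_pod_info series only when such a not-ready series exists for the same
# pod and namespace. Pods created by Jobs are excluded because they are
# expected to terminate.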
ALERT PodNotReady
IF kube_pod_info{created_by_kind!="Job"} AND ON (pod, namespace) kube_pod_status_ready{condition="true"} != 1
FOR 2h
LABELS { severity="warning" }
ANNOTATIONS {
summary = "Pod exists, but is not running.",
impact = "Probably a serious user-facing bug, up to complete outage",
detail = "Pod {{$labels.namespace}}/{{$labels.pod}}",
}
# Jobs-related alerts
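# JobFailed: kube_job_status_failed reports the number of pods that have
# failed for each Job object. Note that the metric typically stays non-zero
# until the Job itself is deleted, so this alert keeps firing until the
# failed Job is cleaned up.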
ALERT JobFailed
IF kube_job_status_failed > 0
FOR 2h
LABELS { severity="warning" }
ANNOTATIONS {
summary = "Job failed.",
impact = "Hard to say. You should check it out.",
detail = "Job {{$labels.namespaces}}/{{$labels.job}} didn't exit successfully",
}
# Other stuff
# Because we attempt to scrape all pods by default, we do expect the scraping
# to fail on some pods. If you use the "opt-in" configuration, i.e. only scrape
# pods that are known to expose Prometheus metrics, uncomment this alert. To
# use the "opt-in" prometheus configuration, please see the "Default Scraping
# Policy" section at:
# https://www.weave.works/docs/cloud/latest/tasks/monitor/configuration-k8s/
#
# ALERT ScrapeFailed
# IF up != 1
# FOR 10m
# LABELS { severity="warning" }
# ANNOTATIONS {
# summary = "Scrape failed.",
# impact = "We have no monitoring data for some pods. At worst, it's completely down. At best, we cannot reliably respond to operational issues.",
# detail = "{{if eq $labels.job \"kubernetes-pods\"}}Pod: {{$labels.kubernetes_pod_name}}{{else}}Job: {{$labels.job}}{{end}} Target: {{$labels.instance}}",
# }
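# ApiServerDown: the absent() clause fires when no up{job="kubernetes-apiservers"}
# series exists at all, while 'unless ignoring (job) absent(up)' suppresses it
# when there are no up series whatsoever (that situation is covered by the
# NoMetrics alert below). The final clause fires when apiserver targets are
# being scraped but none of them report as up.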
ALERT ApiServerDown
IF absent(up{job="kubernetes-apiservers"}) unless ignoring (job) absent(up) or sum(up{job="kubernetes-apiservers"}) < 1
FOR 5m
LABELS { severity="warning" }
ANNOTATIONS {
summary = "Kubernetes API server has been down for at least 5 minutes.",
impact = "Our Kubernetes cluster is inoperable. User impact uncertain.",
}
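# DiskWillFillIn24Hours: predict_linear() fits a linear trend to the available
# bytes observed over the last 30 minutes and extrapolates 24 hours (24*3600
# seconds) ahead; a predicted value below zero means the filesystem is on
# course to fill up within a day. The matchers restrict this to ext4
# filesystems reported by the prom-node-exporter in the weave namespace.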
ALERT DiskWillFillIn24Hours
IF predict_linear(node_filesystem_avail{kubernetes_namespace='weave',_weave_service='prom-node-exporter',fstype='ext4'}[30m], 24*3600) < 0
FOR 1h
LABELS { severity="warning" }
ANNOTATIONS {
summary = "Disk space will run out in 24 hours.",
impact = "Random things are about to break for our users",
detail = "Disk {{$labels.device}} on node {{$labels.node}} ",
}
ALERT ClockSkewiff
IF abs(node_ntp_drift_seconds) > 15
FOR 5m
LABELS { severity="warning" }
ANNOTATIONS {
summary = "Clock is out of sync with NTP.",
impact = "Random things are about to break for our users",
detail = "Node: {{$labels.node}} is {{$value}} seconds off",
}
ALERT ClockSyncBroken
IF node_timex_sync_status != 1
FOR 5m
LABELS { severity="warning" }
ANNOTATIONS {
summary = "The clock is not being synced.",
impact = "Random things are about to break for our users",
detail = "Node: {{$labels.node}}",
}
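# NoMetrics: absent(up) only returns a value when there is not a single up
# series left, i.e. no scrape target of any kind is reporting; combined with
# FOR 5m this means metric ingestion has stopped entirely.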
ALERT NoMetrics
IF absent(up)
FOR 5m
LABELS { severity="warning" }
ANNOTATIONS {
summary = "Received no metrics in the last 5 minutes. Agent disconnected?",
impact = "No alerts fire, and monitoring data may be lost. At worst, the entire cluster may be down."
}
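# NodeHighCPUUsage: idle CPU seconds per second are summed per node and divided
# by the node's core count to get an idle fraction; 1 minus that is the node's
# utilisation. label_replace() copies cAdvisor's 'instance' label into a 'node'
# label so machine_cpu_cores can be matched against node_cpu by node.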
ALERT NodeHighCPUUsage
IF (1 - (sum(irate(node_cpu{mode="idle"}[1m])) by (node) / max (label_replace(machine_cpu_cores{job=~".*cadvisor"},"node","$1","instance","(.*)")) by (node))) > 0.9
FOR 5m
LABELS { severity="warning" }
ANNOTATIONS {
summary = "CPU usage has been over 90% in the last 5 minutes.",
impact = "Pods running on that overcommitted node may cause further issues down the line",
detail = "Node: {{$labels.node}}",
}
ALERT NodeDown
IF up{job="kubernetes-nodes"} == 0
FOR 5m
LABELS { severity="warning" }
ANNOTATIONS {
summary = "Kubernetes node is not available.",
impact = "The cluster will be operating at a reduced capacity for scheduling workloads.",
detail = "Node: {{$labels.instance}}",
}
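# NodeAvailabilityIntermittent: resets() counts how often the value dropped
# within the window; applied to the 0/1 up metric, more than one drop in 5
# minutes means the node's target has been flapping between up and down.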
ALERT NodeAvailabilityIntermittent
IF resets(up{job="kubernetes-nodes"}[5m]) > 1
FOR 5m
LABELS { severity="warning" }
ANNOTATIONS {
summary = "Kubernetes node has been intermittently available.",
impact = "The node might be stuck in a restart cycle and be disrupting the normal scheduling of workloads.",
detail = "Node: {{$labels.instance}}",
}
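# PromRemoteStorageFailures: prometheus_remote_storage_failed_samples_total
# counts samples that could not be sent to remote storage, so a sustained
# non-zero rate here generally means samples are being lost on their way to
# Cortex.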
ALERT PromRemoteStorageFailures
IF sum by(kubernetes_pod_name)(rate(prometheus_remote_storage_failed_samples_total{kubernetes_namespace="weave"}[1m])) > 0
FOR 5m
LABELS { severity="warning" }
ANNOTATIONS {
summary = "Prometheus failed to send samples.",
impact = "Prometheus cannot send samples to Cortex. Look at the Prometheus logs to check details of the error (or contact Weave Cloud support help@weave.works):",
detail = "kubectl logs -f -n weave {{$labels.kubernetes_pod_name}} prometheus; {{$value | humanize}} errors/sec",
}