# Here's what an alert definition looks like:
# ALERT AlertName
# IF expr
# FOR duration
# LABELS { severity="critical/warning/etc" }
# ANNOTATIONS {
# summary = "Description of the alert.",
# impact = "Impact for users.",
# dashboardURL = "https://dashboard.json",
# playbookURL = "https://playbook.md",
# detail = "A label {{$labels.pod}} has {{$value}} value",
# }
#
# For more information see:
# https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
# https://prometheus.io/docs/alerting/notification_examples/
#
# Weave Cloud treats certain annotations specially:
# Summary
# A short description of the alert; it should preferably not contain any variables.
# It shows up as the title of the notification.
#
# Impact
# A vital piece of information that needs to be in the notification; it should
# preferably not contain any variables. If there is no impact annotation in the
# alerting rule, the notification will contain the warning:
# No impact defined. Please add one or disable this alert.
#
# Playbook URL
# A link to the playbook.
#
# Dashboard URL
# A link to the dashboard.
#
# Detail
# A section with the labels and values of the alert; keep it short, because in
# case of multiple alerts for a single rule the notification will list the detail
# for only the first 10 of them. Example:
# Disk xvda on node ip-172-20-2-83.ec2.internal
# Disk xvdb on node ip-172-20-3-81.ec2.internal
# Disk xvdc on node ip-172-20-1-81.ec2.internal
# Disk xvda on node ip-172-20-3-81.ec2.internal
#
# Any other annotations will be listed below the Detail section.
# In case of multiple alerts for a single rule, however, this list will contain
# only the annotations of the first alert.
#
# Kubernetes Default Alerts
# DaemonSet-related alerts
ALERT DaemonSetFailedToSchedule
IF kube_daemonset_status_desired_number_scheduled != kube_daemonset_status_current_number_scheduled
FOR 5m
LABELS { severity="critical" }
ANNOTATIONS {
summary = "DaemonSet failed to schedule all pods.",
impact = "Hard to say. You should check it out.",
detail = "DaemonSet {{$labels.namespace}}/{{$labels.daemonset}}",
}
# Pod-related alerts
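# ContainerRestartingTooMuch: rate() yields restarts per second, so the
# 1/(10*60) threshold corresponds to roughly one restart every 10 minutes,
# and the rate has to stay above it for the whole 1h FOR period before the
# alert fires.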
ALERT ContainerRestartingTooMuch
IF rate(kube_pod_container_status_restarts_total[10m]) > 1/(10*60)
FOR 1h
LABELS { severity="warning" }
ANNOTATIONS {
summary = "Container is restarting too much.",
impact = "Probably a serious user-facing bug, up to complete outage",
detail = "Container {{$labels.namespace}}/{{$labels.pod}} ({{$labels.container}})",
}
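# PodNotReady: the right-hand side keeps only kube_pod_status_ready series
# whose value is not 1, and the AND ON (pod, namespace) join then keeps a
# kube_pod_info series only when such a not-ready series exists for the same
# pod and namespace. Pods created by Jobs are excluded because they are
# expected to terminate.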
ALERT PodNotReady
IF kube_pod_info{created_by_kind!="Job"} AND ON (pod, namespace) kube_pod_status_ready{condition="true"} != 1
FOR 2h
LABELS { severity="warning" }
ANNOTATIONS {
summary = "Pod exists, but is not running.",
impact = "Probably a serious user-facing bug, up to complete outage",
detail = "Pod {{$labels.namespace}}/{{$labels.pod}}",
}
# Jobs-related alerts
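# JobFailed: kube_job_status_failed reports the number of pods that have
# failed for each Job object. Note that the metric typically stays non-zero
# until the Job itself is deleted, so this alert keeps firing until the
# failed Job is cleaned up.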
ALERT JobFailed
IF kube_job_status_failed > 0
FOR 2h
LABELS { severity="warning" }
ANNOTATIONS {
summary = "Job failed.",
impact = "Hard to say. You should check it out.",
detail = "Job {{$labels.namespaces}}/{{$labels.job}} didn't exit successfully",
}
# Other stuff
# Because we attempt to scrape all pods by default, we do expect the scraping
# to fail on some pods. If you use the "opt-in" configuration, i.e. only scrape
# pods that are known to expose Prometheus metrics, uncomment this alert. To
# use the "opt-in" prometheus configuration, please see the "Default Scraping
# Policy" section at:
# https://www.weave.works/docs/cloud/latest/tasks/monitor/configuration-k8s/
#
# ALERT ScrapeFailed
# IF up != 1
# FOR 10m
# LABELS { severity="warning" }
# ANNOTATIONS {
# summary = "Scrape failed.",
# impact = "We have no monitoring data for some pods. At worst, it's completely down. At best, we cannot reliably respond to operational issues.",
# detail = "{{if eq $labels.job \"kubernetes-pods\"}}Pod: {{$labels.kubernetes_pod_name}}{{else}}Job: {{$labels.job}}{{end}} Target: {{$labels.instance}}",
# }
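# ApiServerDown: the absent() clause fires when no up{job="kubernetes-apiservers"}
# series exists at all, while 'unless ignoring (job) absent(up)' suppresses it
# when there are no up series whatsoever (that situation is covered by the
# NoMetrics alert below). The final clause fires when apiserver targets are
# being scraped but none of them report as up.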
ALERT ApiServerDown
IF absent(up{job="kubernetes-apiservers"}) unless ignoring (job) absent(up) or sum(up{job="kubernetes-apiservers"}) < 1
FOR 5m
LABELS { severity="warning" }
ANNOTATIONS {
summary = "Kubernetes API server has been down for at least 5 minutes.",
impact = "Our Kubernetes cluster is inoperable. User impact uncertain.",
}
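# DiskWillFillIn24Hours: predict_linear() fits a linear trend to the available
# bytes observed over the last 30 minutes and extrapolates 24 hours (24*3600
# seconds) ahead; a predicted value below zero means the filesystem is on
# course to fill up within a day. The matchers restrict this to ext4
# filesystems reported by the prom-node-exporter in the weave namespace.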
ALERT DiskWillFillIn24Hours
IF predict_linear(node_filesystem_avail{kubernetes_namespace='weave',_weave_service='prom-node-exporter',fstype='ext4'}[30m], 24*3600) < 0
FOR 1h
LABELS { severity="warning" }
ANNOTATIONS {
summary = "Disk space will run out in 24 hours.",
impact = "Random things are about to break for our users",
detail = "Disk {{$labels.device}} on node {{$labels.node}} ",
}
ALERT ClockSkewiff
IF abs(node_ntp_drift_seconds) > 15
FOR 5m
LABELS { severity="warning" }
ANNOTATIONS {
summary = "Clock is out of sync with NTP.",
impact = "Random things are about to break for our users",
detail = "Node: {{$labels.node}} is {{$value}} seconds off",
}
ALERT ClockSyncBroken
IF node_timex_sync_status != 1
FOR 5m
LABELS { severity="warning" }
ANNOTATIONS {
summary = "The clock is not being synced.",
impact = "Random things are about to break for our users",
detail = "Node: {{$labels.node}}",
}
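# NoMetrics: absent(up) only returns a value when there is not a single up
# series left, i.e. no scrape target of any kind is reporting; combined with
# FOR 5m this means metric ingestion has stopped entirely.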
ALERT NoMetrics
IF absent(up)
FOR 5m
LABELS { severity="warning" }
ANNOTATIONS {
summary = "Received no metrics in the last 5 minutes. Agent disconnected?",
impact = "No alerts fire, and monitoring data may be lost. At worst, the entire cluster may be down."
}
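# NodeHighCPUUsage: idle CPU seconds per second are summed per node and divided
# by the node's core count to get an idle fraction; 1 minus that is the node's
# utilisation. label_replace() copies cAdvisor's 'instance' label into a 'node'
# label so machine_cpu_cores can be matched against node_cpu by node.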
ALERT NodeHighCPUUsage
IF (1 - (sum(irate(node_cpu{mode="idle"}[1m])) by (node) / max (label_replace(machine_cpu_cores{job=~".*cadvisor"},"node","$1","instance","(.*)")) by (node))) > 0.9
FOR 5m
LABELS { severity="warning" }
ANNOTATIONS {
summary = "CPU usage has been over 90% in the last 5 minutes.",
impact = "Pods running on that overcommitted node may cause further issues down the line",
detail = "Node: {{$labels.node}}",
}
ALERT NodeDown
IF up{job="kubernetes-nodes"} == 0
FOR 5m
LABELS { severity="warning" }
ANNOTATIONS {
summary = "Kubernetes node is not available.",
impact = "The cluster will be operating at a reduced capacity for scheduling workloads.",
detail = "Node: {{$labels.instance}}",
}
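# NodeAvailabilityIntermittent: resets() counts how often the value dropped
# within the window; applied to the 0/1 up metric, more than one drop in 5
# minutes means the node's target has been flapping between up and down.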
ALERT NodeAvailabilityIntermittent
IF resets(up{job="kubernetes-nodes"}[5m]) > 1
FOR 5m
LABELS { severity="warning" }
ANNOTATIONS {
summary = "Kubernetes node has been intermittently available.",
impact = "The node might be stuck in a restart cycle and be disrupting the normal scheduling of workloads.",
detail = "Node: {{$labels.instance}}",
}
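# PromRemoteStorageFailures: prometheus_remote_storage_failed_samples_total
# counts samples that could not be sent to remote storage, so a sustained
# non-zero rate here generally means samples are being lost on their way to
# Cortex.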
ALERT PromRemoteStorageFailures
IF sum by(kubernetes_pod_name)(rate(prometheus_remote_storage_failed_samples_total{kubernetes_namespace="weave"}[1m])) > 0
FOR 5m
LABELS { severity="warning" }
ANNOTATIONS {
summary = "Prometheus failed to send samples.",
impact = "Prometheus cannot send samples to Cortex. Look at the Prometheus logs to check details of the error (or contact Weave Cloud support help@weave.works):",
detail = "kubectl logs -f -n weave {{$labels.kubernetes_pod_name}} prometheus; {{$value | humanize}} errors/sec",
}