@atopuzov
Last active September 29, 2017 11:36
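Unified diff between two renders of the kube-prometheus Helm chart's Prometheus rule ConfigMaps (before.yaml vs. now.yaml). The changes are whitespace only: blank lines inside the rule files now carry a single trailing space, and an extra blank line appears before some of the `---` document separators. Judging by the `^I` tab markers and the `$` line endings, the diff was most likely piped through `cat -A` or similar to make the whitespace visible — a sketch of the assumed command is shown below (only the file names come from the diff header):

# show the diff with tabs as ^I and line ends as $, so trailing spaces are visible
diff -u before.yaml now.yaml | cat -A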
--- before.yaml^I2017-09-29 11:44:09.000000000 +0100$
+++ now.yaml^I2017-09-29 12:18:11.000000000 +0100$
@@ -13,7 +13,7 @@$
data:$
kube-api.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
ALERT K8SApiServerLatency$
IF histogram_quantile($
0.99,$
@@ -28,9 +28,9 @@$
summary = "Kubernetes apiserver latency is high",$
description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",$
}$
-$
+ $
### API latency ###$
-$
+ $
# Raw metrics are in microseconds. Convert to seconds.$
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} =$
histogram_quantile($
@@ -47,11 +47,11 @@$
0.5,$
sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)$
) / 1e6$
-$
+ $
### File descriptor alerts$
-$
+ $
instance:fd_utilization = process_open_fds / process_max_fds$
-$
+ $
# alert if file descriptors are likely to exhaust within the next 4 hours$
ALERT FdExhaustionClose$
IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1$
@@ -64,7 +64,7 @@$
summary = "file descriptors soon exhausted",$
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",$
}$
-$
+ $
# alert if file descriptors are likely to exhaust within the next hour$
ALERT FdExhaustionClose$
IF predict_linear(instance:fd_utilization[10m], 3600) > 1$
@@ -77,7 +77,7 @@$
summary = "file descriptors soon exhausted",$
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",$
}$
-$
+ $
ALERT K8STooManyOpenFiles$
IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 50$
FOR 10m$
@@ -89,7 +89,7 @@$
summary = "{{ $labels.job }} has too many open file descriptors",$
description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",$
}$
-$
+ $
ALERT K8STooManyOpenFiles$
IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 80$
FOR 10m$
@@ -173,7 +173,7 @@$
data:$
kube-controller-manager.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
ALERT K8SControllerManagerDown$
IF absent(up{job="RELEASE-NAME-exporter-kube-controller-manager"}) or (count by(cluster) (up{job="RELEASE-NAME-exporter-kube-controller-manager"} == 1) == 0)$
FOR 5m$
@@ -185,6 +185,7 @@$
summary = "Controller manager is down",$
description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",$
}$
+$
---$
# Source: kube-prometheus/charts/exporter-kube-controller-manager/templates/service.yaml$
apiVersion: v1$
@@ -312,9 +313,9 @@$
data:$
kube-etcd.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
### General cluster availability ###$
-$
+ $
# alert if another failed peer will result in an unavailable cluster$
ALERT InsufficientPeers$
IF count(up{job="RELEASE-NAME-exporter-kube-etcd"} == 0) > (count(up{job="RELEASE-NAME-exporter-kube-etcd"}) / 2 - 1)$
@@ -326,9 +327,9 @@$
summary = "Etcd cluster small",$
description = "If one more etcd peer goes down the cluster will be unavailable",$
}$
-$
+ $
### HTTP requests alerts ###$
-$
+ $
# alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response$
ALERT HighNumberOfFailedHTTPRequests$
IF sum by(method) (rate(etcd_http_failed_total{job="RELEASE-NAME-exporter-kube-etcd", code!~"4[0-9]{2}"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="RELEASE-NAME-exporter-kube-etcd"}[5m])) > 0.01$
@@ -340,7 +341,7 @@$
summary = "a high number of HTTP requests are failing",$
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",$
}$
-$
+ $
# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response$
ALERT HighNumberOfFailedHTTPRequests$
IF sum by(method) (rate(etcd_http_failed_total{job="RELEASE-NAME-exporter-kube-etcd", code!~"4[0-9]{2}"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="RELEASE-NAME-exporter-kube-etcd"}[5m])) > 0.05$
@@ -353,7 +354,7 @@$
summary = "a high number of HTTP requests are failing",$
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",$
}$
-$
+ $
# alert if 50% of requests get a 4xx response$
ALERT HighNumberOfFailedHTTPRequests$
IF sum by(method) (rate(etcd_http_failed_total{job="RELEASE-NAME-exporter-kube-etcd", code=~"4[0-9]{2}"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="RELEASE-NAME-exporter-kube-etcd"}[5m])) > 0.5$
@@ -366,7 +367,7 @@$
summary = "a high number of HTTP requests are failing",$
description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}",$
}$
-$
+ $
# alert if the 99th percentile of HTTP requests take more than 150ms$
ALERT HTTPRequestsSlow$
IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15$
@@ -379,7 +380,7 @@$
summary = "slow HTTP requests",$
description = "on ectd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",$
}$
-$
+ $
ALERT K8SApiServerEtcdAccessLatency$
IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0$
FOR 15m$
@@ -391,9 +392,9 @@$
summary = "Access to etcd is slow",$
description = "99th percentile latency for apiserver to access etcd is higher than 1s.",$
}$
-$
+ $
### etcd proposal alerts ###$
-$
+ $
# alert if there are several failed proposals within an hour$
ALERT HighNumberOfFailedProposals$
IF increase(etcd_server_proposal_failed_total{job="RELEASE-NAME-exporter-kube-etcd"}[1h]) > 5$
@@ -404,9 +405,9 @@$
summary = "a high number of failed proposals within the etcd cluster are happening",$
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",$
}$
-$
+ $
### etcd disk io latency alerts$
-$
+ $
# alert if 99th percentile of fsync durations is higher than 500ms$
ALERT HighFsyncDurations$
IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5$
@@ -484,30 +485,30 @@$
data:$
kube-controller-manager.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
### Scheduling latency ###$
-$
+ $
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} =$
histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6$
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} =$
histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6$
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} =$
histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6$
-$
+ $
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} =$
histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6$
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} =$
histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6$
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} =$
histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6$
-$
+ $
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} =$
histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6$
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} =$
histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6$
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} =$
histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6$
-$
+ $
ALERT K8SSchedulerDown$
IF absent(up{job="RELEASE-NAME-exporter-kube-scheduler"}) or (count by(cluster) (up{job="RELEASE-NAME-exporter-kube-scheduler"} == 1) == 0)$
FOR 5m$
@@ -519,6 +520,7 @@$
summary = "Scheduler is down",$
description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",$
}$
+$
---$
# Source: kube-prometheus/charts/exporter-kube-scheduler/templates/service.yaml$
apiVersion: v1$
@@ -585,7 +587,7 @@$
data:$
kube-state.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
ALERT K8SNodeNotReady$
IF kube_node_status_condition{condition="Ready", status="true"} == 0$
FOR 1h$
@@ -597,7 +599,7 @@$
summary = "Node status is NotReady",$
description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour",$
}$
-$
+ $
ALERT K8SManyNodesNotReady$
IF$
count by (cluster) (kube_node_status_condition{condition="Ready", status="true"} == 0) > 1$
@@ -743,9 +745,9 @@$
data:$
kubelets.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
### Container resources ###$
-$
+ $
cluster_namespace_controller_pod_container:spec_memory_limit_bytes =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -754,7 +756,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:spec_cpu_shares =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -763,7 +765,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:cpu_usage:rate =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -774,7 +776,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:memory_usage:bytes =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -783,7 +785,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:memory_working_set:bytes =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -792,7 +794,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:memory_rss:bytes =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -801,7 +803,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:memory_cache:bytes =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -810,7 +812,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:disk_usage:bytes =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -819,7 +821,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:memory_pagefaults:rate =$
sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ($
label_replace($
@@ -830,7 +832,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:memory_oom:rate =$
sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ($
label_replace($
@@ -841,30 +843,30 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
### Cluster resources ###$
-$
+ $
cluster:memory_allocation:percent =$
100 * sum by (cluster) ($
container_spec_memory_limit_bytes{pod_name!=""}$
) / sum by (cluster) ($
machine_memory_bytes$
)$
-$
+ $
cluster:memory_used:percent =$
100 * sum by (cluster) ($
container_memory_usage_bytes{pod_name!=""}$
) / sum by (cluster) ($
machine_memory_bytes$
)$
-$
+ $
cluster:cpu_allocation:percent =$
100 * sum by (cluster) ($
container_spec_cpu_shares{pod_name!=""}$
) / sum by (cluster) ($
container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores$
)$
-$
+ $
ALERT K8SNodeDown$
IF up{job="kubelet"} == 0$
FOR 1h$
@@ -876,7 +878,7 @@$
summary = "Kubelet cannot be scraped",$
description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour",$
}$
-$
+ $
ALERT K8SKubeletDown$
IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1$
FOR 1h$
@@ -888,10 +890,10 @@$
summary = "Many Kubelets cannot be scraped",$
description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.",$
}$
-$
+ $
# Some verbs excluded because they are expected to be long-lasting:$
# WATCHLIST is long-poll, CONNECT is `kubectl exec`.$
-$
+ $
ALERT K8SKubeletTooManyPods$
IF kubelet_running_pod_count > 100$
LABELS {$
@@ -902,6 +904,7 @@$
summary = "Kubelet is close to pod limit",$
description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",$
}$
+$
---$
# Source: kube-prometheus/charts/exporter-kubelets/templates/servicemonitor.yaml$
apiVersion: monitoring.coreos.com/v1$
@@ -953,7 +956,7 @@$
data:$
kubernetes.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
ALERT K8SApiserverDown$
IF up{job="kubernetes"} == 0$
FOR 15m$
@@ -965,7 +968,7 @@$
summary = "API server unreachable",$
description = "An API server could not be scraped.",$
}$
-$
+ $
# Disable for non HA kubernetes setups.$
ALERT K8SApiserverDown$
IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"}))$
@@ -978,6 +981,7 @@$
summary = "API server unreachable",$
description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.",$
}$
+$
---$
# Source: kube-prometheus/charts/exporter-kubernetes/templates/servicemonitor.yaml$
apiVersion: monitoring.coreos.com/v1$
@@ -1025,14 +1029,14 @@$
data:$
node.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
cluster:node_cpu_use:percent =$
100 * sum by (cluster) ($
rate(node_cpu{mode!="idle"}[5m])$
) / sum by (cluster) ($
machine_cpu_cores$
)$
-$
+ $
ALERT K8SKubeletNodeExporterDown$
IF up{job="RELEASE-NAME-exporter-node"} == 0$
FOR 15m$
@@ -1044,7 +1048,7 @@$
summary = "Kubelet node_exporter cannot be scraped",$
description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.",$
}$
-$
+ $
ALERT K8SConntrackTableFull$
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50$
FOR 10m$
@@ -1056,7 +1060,7 @@$
summary = "Number of tracked connections is near the limit",$
description = "The nf_conntrack table is {{ $value }}% full.",$
}$
-$
+ $
ALERT K8SConntrackTableFull$
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90$
LABELS {$
@@ -1067,7 +1071,7 @@$
summary = "Number of tracked connections is near the limit",$
description = "The nf_conntrack table is {{ $value }}% full.",$
}$
-$
+ $
# To catch the conntrack sysctl de-tuning when it happens$
ALERT K8SConntrackTuningMissing$
IF node_nf_conntrack_udp_timeout > 10$
@@ -1080,7 +1084,7 @@$
summary = "Node does not have the correct conntrack tunings",$
description = "Nodes keep un-setting the correct tunings, investigate when it happens.",$
}$
-$
+ $
ALERT K8SNodeOutOfDisk$
IF kube_node_status_condition{condition="OutOfDisk", status="true"} == 1$
LABELS {$
@@ -1091,7 +1095,7 @@$
summary = "Node ran out of disk space.",$
description = "{{ $labels.node }} has run out of disk space.",$
}$
-$
+ $
ALERT K8SNodeMemoryPressure$
IF kube_node_status_condition{condition="MemoryPressure", status="true"} == 1$
LABELS {$
@@ -1102,7 +1106,7 @@$
summary = "Node is under memory pressure.",$
description = "{{ $labels.node }} is under memory pressure.",$
}$
-$
+ $
ALERT K8SNodeDiskPressure$
IF kube_node_status_condition{condition="DiskPressure", status="true"} == 1$
LABELS {$
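To confirm which lines in the rendered output picked up trailing whitespace, a quick check along these lines should work (a sketch, assuming GNU grep; the file name now.yaml is taken from the diff header above):

# print line numbers of lines ending in one or more spaces
grep -nE ' +$' now.yaml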