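Whitespace-only diff between two renderings of the kube-prometheus Helm chart's Prometheus rule ConfigMaps (before.yaml vs. now.yaml), displayed with whitespace made visible: tabs appear as ^I and line ends as $. Each removed line (-$) is a blank line inside a |- rules block that the new rendering replaces with a whitespace-only line (+ $); a few hunks also append a trailing blank line (+$) before the next YAML document separator (---).
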
--- before.yaml^I2017-09-29 11:44:09.000000000 +0100$
+++ now.yaml^I2017-09-29 12:18:11.000000000 +0100$
@@ -13,7 +13,7 @@$
data:$
kube-api.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
ALERT K8SApiServerLatency$
IF histogram_quantile($
0.99,$
@@ -28,9 +28,9 @@$
summary = "Kubernetes apiserver latency is high",$
description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",$
}$
-$
+ $
### API latency ###$
-$
+ $
# Raw metrics are in microseconds. Convert to seconds.$
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} =$
histogram_quantile($
@@ -47,11 +47,11 @@$
0.5,$
sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)$
) / 1e6$
-$
+ $
### File descriptor alerts$
-$
+ $
instance:fd_utilization = process_open_fds / process_max_fds$
-$
+ $
# alert if file descriptors are likely to exhaust within the next 4 hours$
ALERT FdExhaustionClose$
IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1$
@@ -64,7 +64,7 @@$
summary = "file descriptors soon exhausted",$
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",$
}$
-$
+ $
# alert if file descriptors are likely to exhaust within the next hour$
ALERT FdExhaustionClose$
IF predict_linear(instance:fd_utilization[10m], 3600) > 1$
@@ -77,7 +77,7 @@$
summary = "file descriptors soon exhausted",$
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",$
}$
-$
+ $
ALERT K8STooManyOpenFiles$
IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 50$
FOR 10m$
@@ -89,7 +89,7 @@$
summary = "{{ $labels.job }} has too many open file descriptors",$
description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",$
}$
-$
+ $
ALERT K8STooManyOpenFiles$
IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 80$
FOR 10m$
@@ -173,7 +173,7 @@$
data:$
kube-controller-manager.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
ALERT K8SControllerManagerDown$
IF absent(up{job="RELEASE-NAME-exporter-kube-controller-manager"}) or (count by(cluster) (up{job="RELEASE-NAME-exporter-kube-controller-manager"} == 1) == 0)$
FOR 5m$
@@ -185,6 +185,7 @@$
summary = "Controller manager is down",$
description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",$
}$
+$
---$
# Source: kube-prometheus/charts/exporter-kube-controller-manager/templates/service.yaml$
apiVersion: v1$
@@ -312,9 +313,9 @@$
data:$
kube-etcd.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
### General cluster availability ###$
-$
+ $
# alert if another failed peer will result in an unavailable cluster$
ALERT InsufficientPeers$
IF count(up{job="RELEASE-NAME-exporter-kube-etcd"} == 0) > (count(up{job="RELEASE-NAME-exporter-kube-etcd"}) / 2 - 1)$
@@ -326,9 +327,9 @@$
summary = "Etcd cluster small",$
description = "If one more etcd peer goes down the cluster will be unavailable",$
}$
-$
+ $
### HTTP requests alerts ###$
-$
+ $
# alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response$
ALERT HighNumberOfFailedHTTPRequests$
IF sum by(method) (rate(etcd_http_failed_total{job="RELEASE-NAME-exporter-kube-etcd", code!~"4[0-9]{2}"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="RELEASE-NAME-exporter-kube-etcd"}[5m])) > 0.01$
@@ -340,7 +341,7 @@$
summary = "a high number of HTTP requests are failing",$
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",$
}$
-$
+ $
# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response$
ALERT HighNumberOfFailedHTTPRequests$
IF sum by(method) (rate(etcd_http_failed_total{job="RELEASE-NAME-exporter-kube-etcd", code!~"4[0-9]{2}"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="RELEASE-NAME-exporter-kube-etcd"}[5m])) > 0.05$
@@ -353,7 +354,7 @@$
summary = "a high number of HTTP requests are failing",$
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",$
}$
-$
+ $
# alert if 50% of requests get a 4xx response$
ALERT HighNumberOfFailedHTTPRequests$
IF sum by(method) (rate(etcd_http_failed_total{job="RELEASE-NAME-exporter-kube-etcd", code=~"4[0-9]{2}"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="RELEASE-NAME-exporter-kube-etcd"}[5m])) > 0.5$
@@ -366,7 +367,7 @@$
summary = "a high number of HTTP requests are failing",$
description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}",$
}$
-$
+ $
# alert if the 99th percentile of HTTP requests take more than 150ms$
ALERT HTTPRequestsSlow$
IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15$
@@ -379,7 +380,7 @@$
summary = "slow HTTP requests",$
description = "on ectd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",$
}$
-$
+ $
ALERT K8SApiServerEtcdAccessLatency$
IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0$
FOR 15m$
@@ -391,9 +392,9 @@$
summary = "Access to etcd is slow",$
description = "99th percentile latency for apiserver to access etcd is higher than 1s.",$
}$
-$
+ $
### etcd proposal alerts ###$
-$
+ $
# alert if there are several failed proposals within an hour$
ALERT HighNumberOfFailedProposals$
IF increase(etcd_server_proposal_failed_total{job="RELEASE-NAME-exporter-kube-etcd"}[1h]) > 5$
@@ -404,9 +405,9 @@$
summary = "a high number of failed proposals within the etcd cluster are happening",$
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",$
}$
-$
+ $
### etcd disk io latency alerts$
-$
+ $
# alert if 99th percentile of fsync durations is higher than 500ms$
ALERT HighFsyncDurations$
IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5$
@@ -484,30 +485,30 @@$
data:$
kube-controller-manager.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
### Scheduling latency ###$
-$
+ $
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} =$
histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6$
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} =$
histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6$
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} =$
histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6$
-$
+ $
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} =$
histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6$
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} =$
histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6$
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} =$
histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6$
-$
+ $
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} =$
histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6$
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} =$
histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6$
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} =$
histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6$
-$
+ $
ALERT K8SSchedulerDown$
IF absent(up{job="RELEASE-NAME-exporter-kube-scheduler"}) or (count by(cluster) (up{job="RELEASE-NAME-exporter-kube-scheduler"} == 1) == 0)$
FOR 5m$
@@ -519,6 +520,7 @@$
summary = "Scheduler is down",$
description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",$
}$
+$
---$
# Source: kube-prometheus/charts/exporter-kube-scheduler/templates/service.yaml$
apiVersion: v1$
@@ -585,7 +587,7 @@$
data:$
kube-state.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
ALERT K8SNodeNotReady$
IF kube_node_status_condition{condition="Ready", status="true"} == 0$
FOR 1h$
@@ -597,7 +599,7 @@$
summary = "Node status is NotReady",$
description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour",$
}$
-$
+ $
ALERT K8SManyNodesNotReady$
IF$
count by (cluster) (kube_node_status_condition{condition="Ready", status="true"} == 0) > 1$
@@ -743,9 +745,9 @@$
data:$
kubelets.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
### Container resources ###$
-$
+ $
cluster_namespace_controller_pod_container:spec_memory_limit_bytes =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -754,7 +756,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:spec_cpu_shares =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -763,7 +765,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:cpu_usage:rate =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -774,7 +776,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:memory_usage:bytes =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -783,7 +785,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:memory_working_set:bytes =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -792,7 +794,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:memory_rss:bytes =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -801,7 +803,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:memory_cache:bytes =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -810,7 +812,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:disk_usage:bytes =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -819,7 +821,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:memory_pagefaults:rate =$
sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ($
label_replace($
@@ -830,7 +832,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:memory_oom:rate =$
sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ($
label_replace($
@@ -841,30 +843,30 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
### Cluster resources ###$
-$
+ $
cluster:memory_allocation:percent =$
100 * sum by (cluster) ($
container_spec_memory_limit_bytes{pod_name!=""}$
) / sum by (cluster) ($
machine_memory_bytes$
)$
-$
+ $
cluster:memory_used:percent =$
100 * sum by (cluster) ($
container_memory_usage_bytes{pod_name!=""}$
) / sum by (cluster) ($
machine_memory_bytes$
)$
-$
+ $
cluster:cpu_allocation:percent =$
100 * sum by (cluster) ($
container_spec_cpu_shares{pod_name!=""}$
) / sum by (cluster) ($
container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores$
)$
-$
+ $
ALERT K8SNodeDown$
IF up{job="kubelet"} == 0$
FOR 1h$
@@ -876,7 +878,7 @@$
summary = "Kubelet cannot be scraped",$
description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour",$
}$
-$
+ $
ALERT K8SKubeletDown$
IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1$
FOR 1h$
@@ -888,10 +890,10 @@$
summary = "Many Kubelets cannot be scraped",$
description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.",$
}$
-$
+ $
# Some verbs excluded because they are expected to be long-lasting:$
# WATCHLIST is long-poll, CONNECT is `kubectl exec`.$
-$
+ $
ALERT K8SKubeletTooManyPods$
IF kubelet_running_pod_count > 100$
LABELS {$
@@ -902,6 +904,7 @@$
summary = "Kubelet is close to pod limit",$
description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",$
}$
+$
---$
# Source: kube-prometheus/charts/exporter-kubelets/templates/servicemonitor.yaml$
apiVersion: monitoring.coreos.com/v1$
@@ -953,7 +956,7 @@$
data:$
kubernetes.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
ALERT K8SApiserverDown$
IF up{job="kubernetes"} == 0$
FOR 15m$
@@ -965,7 +968,7 @@$
summary = "API server unreachable",$
description = "An API server could not be scraped.",$
}$
-$
+ $
# Disable for non HA kubernetes setups.$
ALERT K8SApiserverDown$
IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"}))$
@@ -978,6 +981,7 @@$
summary = "API server unreachable",$
description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.",$
}$
+$
---$
# Source: kube-prometheus/charts/exporter-kubernetes/templates/servicemonitor.yaml$
apiVersion: monitoring.coreos.com/v1$
@@ -1025,14 +1029,14 @@$
data:$
node.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
cluster:node_cpu_use:percent =$
100 * sum by (cluster) ($
rate(node_cpu{mode!="idle"}[5m])$
) / sum by (cluster) ($
machine_cpu_cores$
)$
-$
+ $
ALERT K8SKubeletNodeExporterDown$
IF up{job="RELEASE-NAME-exporter-node"} == 0$
FOR 15m$
@@ -1044,7 +1048,7 @@$
summary = "Kubelet node_exporter cannot be scraped",$
description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.",$
}$
-$
+ $
ALERT K8SConntrackTableFull$
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50$
FOR 10m$
@@ -1056,7 +1060,7 @@$
summary = "Number of tracked connections is near the limit",$
description = "The nf_conntrack table is {{ $value }}% full.",$
}$
-$
+ $
ALERT K8SConntrackTableFull$
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90$
LABELS {$
@@ -1067,7 +1071,7 @@$
summary = "Number of tracked connections is near the limit",$
description = "The nf_conntrack table is {{ $value }}% full.",$
}$
-$
+ $
# To catch the conntrack sysctl de-tuning when it happens$
ALERT K8SConntrackTuningMissing$
IF node_nf_conntrack_udp_timeout > 10$
@@ -1080,7 +1084,7 @@$
summary = "Node does not have the correct conntrack tunings",$
description = "Nodes keep un-setting the correct tunings, investigate when it happens.",$
}$
-$
+ $
ALERT K8SNodeOutOfDisk$
IF kube_node_status_condition{condition="OutOfDisk", status="true"} == 1$
LABELS {$
@@ -1091,7 +1095,7 @@$
summary = "Node ran out of disk space.",$
description = "{{ $labels.node }} has run out of disk space.",$
}$
-$
+ $
ALERT K8SNodeMemoryPressure$
IF kube_node_status_condition{condition="MemoryPressure", status="true"} == 1$
LABELS {$
@@ -1102,7 +1106,7 @@$
summary = "Node is under memory pressure.",$
description = "{{ $labels.node }} is under memory pressure.",$
}$
-$
+ $
ALERT K8SNodeDiskPressure$
IF kube_node_status_condition{condition="DiskPressure", status="true"} == 1$
LABELS {$
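
The commands that produced this diff are not recorded in the gist; as a sketch, assuming before.yaml and now.yaml are the two rendered charts, output in the format above can be reproduced with GNU diff and coreutils:

diff -u before.yaml now.yaml | cat -A    # cat -A marks tabs as ^I and line ends as $

To confirm the change is whitespace-only, ask diff to ignore it; for a purely cosmetic diff like this one, the following should print nothing:

diff -u -b -B before.yaml now.yaml    # -b ignores changes in the amount of whitespace, -B ignores added/removed blank lines

Since every changed line sits inside a |- block scalar in a ConfigMap, the added trailing whitespace at most turns blank lines of the rule text into whitespace-only lines, which the Prometheus rules parser should treat the same way; the rendered rules are semantically unchanged.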