@atopuzov
Last active September 29, 2017 11:36
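Unified diff between two renders of the kube-prometheus Helm chart's Prometheus rule ConfigMaps (before.yaml vs. now.yaml). The changes are whitespace only: blank lines inside the rule files now carry a single trailing space, and an extra blank line appears before some of the `---` document separators. Judging by the `^I` tab markers and the `$` line endings, the diff was most likely piped through `cat -A` or similar to make the whitespace visible — a sketch of the assumed command is shown below (only the file names come from the diff header):

# show the diff with tabs as ^I and line ends as $, so trailing spaces are visible
diff -u before.yaml now.yaml | cat -A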
--- before.yaml^I2017-09-29 11:44:09.000000000 +0100$
+++ now.yaml^I2017-09-29 12:18:11.000000000 +0100$
@@ -13,7 +13,7 @@$
data:$
kube-api.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
ALERT K8SApiServerLatency$
IF histogram_quantile($
0.99,$
@@ -28,9 +28,9 @@$
summary = "Kubernetes apiserver latency is high",$
description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",$
}$
-$
+ $
### API latency ###$
-$
+ $
# Raw metrics are in microseconds. Convert to seconds.$
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} =$
histogram_quantile($
@@ -47,11 +47,11 @@$
0.5,$
sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)$
) / 1e6$
-$
+ $
### File descriptor alerts$
-$
+ $
instance:fd_utilization = process_open_fds / process_max_fds$
-$
+ $
# alert if file descriptors are likely to exhaust within the next 4 hours$
ALERT FdExhaustionClose$
IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1$
@@ -64,7 +64,7 @@$
summary = "file descriptors soon exhausted",$
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",$
}$
-$
+ $
# alert if file descriptors are likely to exhaust within the next hour$
ALERT FdExhaustionClose$
IF predict_linear(instance:fd_utilization[10m], 3600) > 1$
@@ -77,7 +77,7 @@$
summary = "file descriptors soon exhausted",$
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",$
}$
-$
+ $
ALERT K8STooManyOpenFiles$
IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 50$
FOR 10m$
@@ -89,7 +89,7 @@$
summary = "{{ $labels.job }} has too many open file descriptors",$
description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",$
}$
-$
+ $
ALERT K8STooManyOpenFiles$
IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 80$
FOR 10m$
@@ -173,7 +173,7 @@$
data:$
kube-controller-manager.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
ALERT K8SControllerManagerDown$
IF absent(up{job="RELEASE-NAME-exporter-kube-controller-manager"}) or (count by(cluster) (up{job="RELEASE-NAME-exporter-kube-controller-manager"} == 1) == 0)$
FOR 5m$
@@ -185,6 +185,7 @@$
summary = "Controller manager is down",$
description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",$
}$
+$
---$
# Source: kube-prometheus/charts/exporter-kube-controller-manager/templates/service.yaml$
apiVersion: v1$
@@ -312,9 +313,9 @@$
data:$
kube-etcd.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
### General cluster availability ###$
-$
+ $
# alert if another failed peer will result in an unavailable cluster$
ALERT InsufficientPeers$
IF count(up{job="RELEASE-NAME-exporter-kube-etcd"} == 0) > (count(up{job="RELEASE-NAME-exporter-kube-etcd"}) / 2 - 1)$
@@ -326,9 +327,9 @@$
summary = "Etcd cluster small",$
description = "If one more etcd peer goes down the cluster will be unavailable",$
}$
-$
+ $
### HTTP requests alerts ###$
-$
+ $
# alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response$
ALERT HighNumberOfFailedHTTPRequests$
IF sum by(method) (rate(etcd_http_failed_total{job="RELEASE-NAME-exporter-kube-etcd", code!~"4[0-9]{2}"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="RELEASE-NAME-exporter-kube-etcd"}[5m])) > 0.01$
@@ -340,7 +341,7 @@$
summary = "a high number of HTTP requests are failing",$
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",$
}$
-$
+ $
# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response$
ALERT HighNumberOfFailedHTTPRequests$
IF sum by(method) (rate(etcd_http_failed_total{job="RELEASE-NAME-exporter-kube-etcd", code!~"4[0-9]{2}"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="RELEASE-NAME-exporter-kube-etcd"}[5m])) > 0.05$
@@ -353,7 +354,7 @@$
summary = "a high number of HTTP requests are failing",$
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",$
}$
-$
+ $
# alert if 50% of requests get a 4xx response$
ALERT HighNumberOfFailedHTTPRequests$
IF sum by(method) (rate(etcd_http_failed_total{job="RELEASE-NAME-exporter-kube-etcd", code=~"4[0-9]{2}"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="RELEASE-NAME-exporter-kube-etcd"}[5m])) > 0.5$
@@ -366,7 +367,7 @@$
summary = "a high number of HTTP requests are failing",$
description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}",$
}$
-$
+ $
# alert if the 99th percentile of HTTP requests take more than 150ms$
ALERT HTTPRequestsSlow$
IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15$
@@ -379,7 +380,7 @@$
summary = "slow HTTP requests",$
description = "on ectd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",$
}$
-$
+ $
ALERT K8SApiServerEtcdAccessLatency$
IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0$
FOR 15m$
@@ -391,9 +392,9 @@$
summary = "Access to etcd is slow",$
description = "99th percentile latency for apiserver to access etcd is higher than 1s.",$
}$
-$
+ $
### etcd proposal alerts ###$
-$
+ $
# alert if there are several failed proposals within an hour$
ALERT HighNumberOfFailedProposals$
IF increase(etcd_server_proposal_failed_total{job="RELEASE-NAME-exporter-kube-etcd"}[1h]) > 5$
@@ -404,9 +405,9 @@$
summary = "a high number of failed proposals within the etcd cluster are happening",$
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",$
}$
-$
+ $
### etcd disk io latency alerts$
-$
+ $
# alert if 99th percentile of fsync durations is higher than 500ms$
ALERT HighFsyncDurations$
IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5$
@@ -484,30 +485,30 @@$
data:$
kube-controller-manager.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
### Scheduling latency ###$
-$
+ $
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} =$
histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6$
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} =$
histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6$
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} =$
histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6$
-$
+ $
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} =$
histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6$
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} =$
histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6$
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} =$
histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6$
-$
+ $
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} =$
histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6$
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} =$
histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6$
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} =$
histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6$
-$
+ $
ALERT K8SSchedulerDown$
IF absent(up{job="RELEASE-NAME-exporter-kube-scheduler"}) or (count by(cluster) (up{job="RELEASE-NAME-exporter-kube-scheduler"} == 1) == 0)$
FOR 5m$
@@ -519,6 +520,7 @@$
summary = "Scheduler is down",$
description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",$
}$
+$
---$
# Source: kube-prometheus/charts/exporter-kube-scheduler/templates/service.yaml$
apiVersion: v1$
@@ -585,7 +587,7 @@$
data:$
kube-state.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
ALERT K8SNodeNotReady$
IF kube_node_status_condition{condition="Ready", status="true"} == 0$
FOR 1h$
@@ -597,7 +599,7 @@$
summary = "Node status is NotReady",$
description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour",$
}$
-$
+ $
ALERT K8SManyNodesNotReady$
IF$
count by (cluster) (kube_node_status_condition{condition="Ready", status="true"} == 0) > 1$
@@ -743,9 +745,9 @@$
data:$
kubelets.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
### Container resources ###$
-$
+ $
cluster_namespace_controller_pod_container:spec_memory_limit_bytes =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -754,7 +756,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:spec_cpu_shares =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -763,7 +765,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:cpu_usage:rate =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -774,7 +776,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:memory_usage:bytes =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -783,7 +785,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:memory_working_set:bytes =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -792,7 +794,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:memory_rss:bytes =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -801,7 +803,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:memory_cache:bytes =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -810,7 +812,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:disk_usage:bytes =$
sum by (cluster,namespace,controller,pod_name,container_name) ($
label_replace($
@@ -819,7 +821,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:memory_pagefaults:rate =$
sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ($
label_replace($
@@ -830,7 +832,7 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
cluster_namespace_controller_pod_container:memory_oom:rate =$
sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ($
label_replace($
@@ -841,30 +843,30 @@$
"pod_name", "^(.*)-[a-z0-9]+"$
)$
)$
-$
+ $
### Cluster resources ###$
-$
+ $
cluster:memory_allocation:percent =$
100 * sum by (cluster) ($
container_spec_memory_limit_bytes{pod_name!=""}$
) / sum by (cluster) ($
machine_memory_bytes$
)$
-$
+ $
cluster:memory_used:percent =$
100 * sum by (cluster) ($
container_memory_usage_bytes{pod_name!=""}$
) / sum by (cluster) ($
machine_memory_bytes$
)$
-$
+ $
cluster:cpu_allocation:percent =$
100 * sum by (cluster) ($
container_spec_cpu_shares{pod_name!=""}$
) / sum by (cluster) ($
container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores$
)$
-$
+ $
ALERT K8SNodeDown$
IF up{job="kubelet"} == 0$
FOR 1h$
@@ -876,7 +878,7 @@$
summary = "Kubelet cannot be scraped",$
description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour",$
}$
-$
+ $
ALERT K8SKubeletDown$
IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1$
FOR 1h$
@@ -888,10 +890,10 @@$
summary = "Many Kubelets cannot be scraped",$
description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.",$
}$
-$
+ $
# Some verbs excluded because they are expected to be long-lasting:$
# WATCHLIST is long-poll, CONNECT is `kubectl exec`.$
-$
+ $
ALERT K8SKubeletTooManyPods$
IF kubelet_running_pod_count > 100$
LABELS {$
@@ -902,6 +904,7 @@$
summary = "Kubelet is close to pod limit",$
description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",$
}$
+$
---$
# Source: kube-prometheus/charts/exporter-kubelets/templates/servicemonitor.yaml$
apiVersion: monitoring.coreos.com/v1$
@@ -953,7 +956,7 @@$
data:$
kubernetes.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
ALERT K8SApiserverDown$
IF up{job="kubernetes"} == 0$
FOR 15m$
@@ -965,7 +968,7 @@$
summary = "API server unreachable",$
description = "An API server could not be scraped.",$
}$
-$
+ $
# Disable for non HA kubernetes setups.$
ALERT K8SApiserverDown$
IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"}))$
@@ -978,6 +981,7 @@$
summary = "API server unreachable",$
description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.",$
}$
+$
---$
# Source: kube-prometheus/charts/exporter-kubernetes/templates/servicemonitor.yaml$
apiVersion: monitoring.coreos.com/v1$
@@ -1025,14 +1029,14 @@$
data:$
node.rules: |-$
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.$
-$
+ $
cluster:node_cpu_use:percent =$
100 * sum by (cluster) ($
rate(node_cpu{mode!="idle"}[5m])$
) / sum by (cluster) ($
machine_cpu_cores$
)$
-$
+ $
ALERT K8SKubeletNodeExporterDown$
IF up{job="RELEASE-NAME-exporter-node"} == 0$
FOR 15m$
@@ -1044,7 +1048,7 @@$
summary = "Kubelet node_exporter cannot be scraped",$
description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.",$
}$
-$
+ $
ALERT K8SConntrackTableFull$
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50$
FOR 10m$
@@ -1056,7 +1060,7 @@$
summary = "Number of tracked connections is near the limit",$
description = "The nf_conntrack table is {{ $value }}% full.",$
}$
-$
+ $
ALERT K8SConntrackTableFull$
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90$
LABELS {$
@@ -1067,7 +1071,7 @@$
summary = "Number of tracked connections is near the limit",$
description = "The nf_conntrack table is {{ $value }}% full.",$
}$
-$
+ $
# To catch the conntrack sysctl de-tuning when it happens$
ALERT K8SConntrackTuningMissing$
IF node_nf_conntrack_udp_timeout > 10$
@@ -1080,7 +1084,7 @@$
summary = "Node does not have the correct conntrack tunings",$
description = "Nodes keep un-setting the correct tunings, investigate when it happens.",$
}$
-$
+ $
ALERT K8SNodeOutOfDisk$
IF kube_node_status_condition{condition="OutOfDisk", status="true"} == 1$
LABELS {$
@@ -1091,7 +1095,7 @@$
summary = "Node ran out of disk space.",$
description = "{{ $labels.node }} has run out of disk space.",$
}$
-$
+ $
ALERT K8SNodeMemoryPressure$
IF kube_node_status_condition{condition="MemoryPressure", status="true"} == 1$
LABELS {$
@@ -1102,7 +1106,7 @@$
summary = "Node is under memory pressure.",$
description = "{{ $labels.node }} is under memory pressure.",$
}$
-$
+ $
ALERT K8SNodeDiskPressure$
IF kube_node_status_condition{condition="DiskPressure", status="true"} == 1$
LABELS {$
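To confirm which lines in the rendered output picked up trailing whitespace, a quick check along these lines should work (a sketch, assuming GNU grep; the file name now.yaml is taken from the diff header above):

# print line numbers of lines ending in one or more spaces
grep -nE ' +$' now.yaml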