Created
August 5, 2020 09:17
-
-
Save yangchuansheng/4310ae9f41513899dc5f0176cdf804b1 to your computer and use it in GitHub Desktop.
prometheus-rules-system.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# PrometheusRule object "prometheus-system-rules" in the "monitoring" namespace.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-system-rules
  namespace: monitoring
  # NOTE(review): presumably matched by a Prometheus ruleSelector - confirm
  # against the Prometheus resource that should load these rules.
  labels:
    prometheus: system
    role: alert-rules
spec: | |
groups: | |
# Recording rules for the API server: burn rates per time window,
# per-code/resource request rates, and request-duration aggregates.
- name: kube-apiserver.rules
  rules:
    # --- Read (LIST/GET) burn rates ------------------------------------
    # Each rule divides (requests outside the scope-specific latency bucket
    # + requests answered with a 5xx code) by all requests. Buckets used:
    # le="0.1" for scope "resource" or empty, le="0.5" for "namespace",
    # le="5" for "cluster". Only the range window differs between rules.

    # Window: 1d
    - record: apiserver_request:burnrate1d
      labels:
        verb: read
      expr: |
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d]))
            -
            (
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d])) +
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) +
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d]))
            )
          )
          +
          # errors
          sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
        )
        /
        sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d]))
    # Window: 1h
    - record: apiserver_request:burnrate1h
      labels:
        verb: read
      expr: |
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h]))
            -
            (
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h])) +
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) +
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h]))
            )
          )
          +
          # errors
          sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
        )
        /
        sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h]))
    # Window: 2h
    - record: apiserver_request:burnrate2h
      labels:
        verb: read
      expr: |
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h]))
            -
            (
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h])) +
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) +
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h]))
            )
          )
          +
          # errors
          sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
        )
        /
        sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h]))
    # Window: 30m
    - record: apiserver_request:burnrate30m
      labels:
        verb: read
      expr: |
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m]))
            -
            (
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m])) +
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) +
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m]))
            )
          )
          +
          # errors
          sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
        )
        /
        sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m]))
    # Window: 3d
    - record: apiserver_request:burnrate3d
      labels:
        verb: read
      expr: |
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d]))
            -
            (
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d])) +
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) +
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d]))
            )
          )
          +
          # errors
          sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
        )
        /
        sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d]))
    # Window: 5m
    - record: apiserver_request:burnrate5m
      labels:
        verb: read
      expr: |
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m]))
            -
            (
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m])) +
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) +
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m]))
            )
          )
          +
          # errors
          sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
        )
        /
        sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
    # Window: 6h
    - record: apiserver_request:burnrate6h
      labels:
        verb: read
      expr: |
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h]))
            -
            (
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h])) +
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) +
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h]))
            )
          )
          +
          # errors
          sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
        )
        /
        sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h]))

    # --- Write (POST/PUT/PATCH/DELETE) burn rates -----------------------
    # Same shape as the read rules, but a single le="1" bucket is used for
    # every write request regardless of scope.

    # Window: 1d
    - record: apiserver_request:burnrate1d
      labels:
        verb: write
      expr: |
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
            -
            sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d]))
          )
          +
          sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
        )
        /
        sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
    # Window: 1h
    - record: apiserver_request:burnrate1h
      labels:
        verb: write
      expr: |
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
            -
            sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h]))
          )
          +
          sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
        )
        /
        sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
    # Window: 2h
    - record: apiserver_request:burnrate2h
      labels:
        verb: write
      expr: |
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
            -
            sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h]))
          )
          +
          sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
        )
        /
        sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
    # Window: 30m
    - record: apiserver_request:burnrate30m
      labels:
        verb: write
      expr: |
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
            -
            sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m]))
          )
          +
          sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
        )
        /
        sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
    # Window: 3d
    - record: apiserver_request:burnrate3d
      labels:
        verb: write
      expr: |
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
            -
            sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d]))
          )
          +
          sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
        )
        /
        sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
    # Window: 5m
    - record: apiserver_request:burnrate5m
      labels:
        verb: write
      expr: |
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
            -
            sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m]))
          )
          +
          sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
        )
        /
        sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
    # Window: 6h
    - record: apiserver_request:burnrate6h
      labels:
        verb: write
      expr: |
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
            -
            sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h]))
          )
          +
          sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
        )
        /
        sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))

    # --- Request rate per response code and resource (5m) ---------------
    - record: code_resource:apiserver_request_total:rate5m
      labels:
        verb: read
      expr: |
        sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
    - record: code_resource:apiserver_request_total:rate5m
      labels:
        verb: write
      expr: |
        sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))

    # --- 0.99 quantile of request duration per resource ------------------
    # The trailing "> 0" keeps only strictly positive results.
    - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
      labels:
        quantile: '0.99'
        verb: read
      expr: |
        histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0
    - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
      labels:
        quantile: '0.99'
        verb: write
      expr: |
        histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0

    # --- Mean request duration over 5m -----------------------------------
    # rate(sum) / rate(count), aggregated over everything except the
    # instance and pod labels. The "log" subresource and the verbs
    # LIST, WATCH, WATCHLIST, DELETECOLLECTION, PROXY, CONNECT are excluded.
    # Unlike the neighbouring rules, these selectors carry no job matcher.
    - record: cluster:apiserver_request_duration_seconds:mean5m
      expr: |
        sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
        /
        sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)

    # --- Request-duration quantiles (0.99 / 0.9 / 0.5) ------------------
    # Same exclusions as the mean rule above, restricted to job="apiserver".
    - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
      labels:
        quantile: '0.99'
      expr: |
        histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
    - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
      labels:
        quantile: '0.9'
      expr: |
        histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
    - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
      labels:
        quantile: '0.5'
      expr: |
        histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
# 30-day availability of the API server, evaluated every 3 minutes.
#
# availability = 1 - (too-slow requests + 5xx requests) / all requests
#
# The request totals come from code:apiserver_request_total:increase30d,
# which is recorded at the bottom of this group from
# code_verb:apiserver_request_total:increase30d. Both are built exclusively
# from job="apiserver" series, so every duration selector below carries the
# same job="apiserver" matcher; otherwise slow requests reported by other
# jobs exposing apiserver_request_duration_seconds_* would be counted in the
# numerator but not in the denominator.
- name: kube-apiserver-availability.rules
  interval: 3m
  rules:
    # Availability across read and write requests.
    - record: apiserver_request:availability30d
      labels:
        verb: all
      expr: |
        1 - (
          (
            # write too slow
            sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30d]))
            -
            sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
          ) +
          (
            # read too slow
            sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d]))
            -
            (
              sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) +
              sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
              sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
            )
          ) +
          # errors
          sum(code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
        )
        /
        sum(code:apiserver_request_total:increase30d)
    # Availability of read (LIST/GET) requests.
    - record: apiserver_request:availability30d
      labels:
        verb: read
      expr: |
        1 - (
          sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d]))
          -
          (
            # too slow
            sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) +
            sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
            sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
          )
          +
          # errors
          sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
        )
        /
        sum(code:apiserver_request_total:increase30d{verb="read"})
    # Availability of write (POST/PUT/PATCH/DELETE) requests.
    - record: apiserver_request:availability30d
      labels:
        verb: write
      expr: |
        1 - (
          (
            # too slow
            sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30d]))
            -
            sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
          )
          +
          # errors
          sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
        )
        /
        sum(code:apiserver_request_total:increase30d{verb="write"})

    # 30-day request increase per (code, verb), one rule per verb and
    # response-code class.

    # 2xx responses
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[30d]))
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[30d]))
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[30d]))
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[30d]))
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[30d]))
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[30d]))
    # 3xx responses
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[30d]))
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[30d]))
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[30d]))
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[30d]))
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[30d]))
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[30d]))
    # 4xx responses
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[30d]))
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[30d]))
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[30d]))
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[30d]))
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[30d]))
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[30d]))
    # 5xx responses
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[30d]))
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[30d]))
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[30d]))
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[30d]))
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[30d]))
    - record: code_verb:apiserver_request_total:increase30d
      expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[30d]))

    # Roll the per-verb series up into verb="read" / verb="write" per code.
    - record: code:apiserver_request_total:increase30d
      labels:
        verb: read
      expr: |
        sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
    - record: code:apiserver_request_total:increase30d
      labels:
        verb: write
      expr: |
        sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
# Scheduler latency quantiles. For each of 0.99, 0.9 and 0.5 the group
# records the quantile of three job="kube-scheduler" histograms
# (e2e scheduling, scheduling algorithm, binding) over a 5m rate,
# aggregated over everything except the instance and pod labels.
- name: kube-scheduler.rules
  rules:
    # quantile 0.99
    - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
      labels:
        quantile: '0.99'
      expr: |
        histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
    - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
      labels:
        quantile: '0.99'
      expr: |
        histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
    - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
      labels:
        quantile: '0.99'
      expr: |
        histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
    # quantile 0.9
    - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
      labels:
        quantile: '0.9'
      expr: |
        histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
    - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
      labels:
        quantile: '0.9'
      expr: |
        histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
    - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
      labels:
        quantile: '0.9'
      expr: |
        histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
    # quantile 0.5
    - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
      labels:
        quantile: '0.5'
      expr: |
        histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
    - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
      labels:
        quantile: '0.5'
      expr: |
        histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
    - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
      labels:
        quantile: '0.5'
      expr: |
        histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
# Per-node quantiles (0.99 / 0.9 / 0.5) of the kubelet PLEG relist duration.
# The 5m bucket rate is summed by (instance, le) and joined on instance with
# kubelet_node_name so that the result also carries the node label.
- name: kubelet.rules
  rules:
    - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
      labels:
        quantile: '0.99'
      expr: |
        histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
    - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
      labels:
        quantile: '0.9'
      expr: |
        histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
    - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
      labels:
        quantile: '0.5'
      expr: |
        histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
# Alerts on persistent-volume capacity and persistent-volume phase.
- name: kubernetes-storage
  rules:
    # Critical: available/capacity ratio below 0.03 for 1 minute.
    - alert: KubePersistentVolumeFillingUp
      expr: |
        kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
          /
        kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
          < 0.03
      for: 1m
      labels:
        severity: critical
      annotations:
        message: >-
          The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }}
          in Namespace {{ $labels.namespace }} is only
          {{ $value | humanizePercentage }} free.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
    # Warning: ratio below 0.15 AND a linear prediction over the last 6h
    # of samples reaches zero available bytes within 4 * 24 * 3600 seconds.
    - alert: KubePersistentVolumeFillingUp
      expr: |
        (
          kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
            /
          kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
        ) < 0.15
        and
        predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
      for: 1h
      labels:
        severity: warning
      annotations:
        message: >-
          Based on recent sampling, the PersistentVolume claimed by
          {{ $labels.persistentvolumeclaim }} in Namespace
          {{ $labels.namespace }} is expected to fill up within four days.
          Currently {{ $value | humanizePercentage }} is available.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
    # Critical: a persistent volume reports phase Failed or Pending for 5m.
    - alert: KubePersistentVolumeErrors
      expr: |
        kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
      for: 5m
      labels:
        severity: critical
      annotations:
        message: >-
          The persistent volume {{ $labels.persistentvolume }} has status
          {{ $labels.phase }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
# Cluster-wide alerts: component version skew and API client error ratio.
- name: kubernetes-system
  rules:
    # Counts the distinct vMAJOR.MINOR.PATCH prefixes found in the gitVersion
    # label of kubernetes_build_info (kube-dns/coredns jobs excluded) and
    # fires when more than one is present for 15 minutes.
    # The dots in the regex are escaped so they only match a literal ".";
    # inside the double-quoted PromQL string the backslash itself has to be
    # written as "\\".
    - alert: KubeVersionMismatch
      expr: |
        count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*\\.[0-9]*\\.[0-9]*).*"))) > 1
      for: 15m
      labels:
        severity: warning
      annotations:
        message: >-
          There are {{ $value }} different semantic versions of Kubernetes
          components running.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
    # Fires when, per (instance, job), more than 1% of REST client requests
    # over 5m returned a 5xx code, sustained for 15 minutes.
    # The message no longer ends with an unbalanced single quote.
    - alert: KubeClientErrors
      expr: |
        (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
          /
        sum(rate(rest_client_requests_total[5m])) by (instance, job))
        > 0.01
      for: 15m
      labels:
        severity: warning
      annotations:
        message: >-
          Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}'
          is experiencing {{ $value | humanizePercentage }} errors.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
# Multi-window error-budget burn alerts built on the
# apiserver_request:burnrate<window> recording rules. Each alert requires
# both a long and a short window to exceed factor * 0.01000; the window pair
# is also exposed through the "long" and "short" labels.
- name: kube-apiserver-slos
  rules:
    # factor 14.40, windows 1h / 5m, held for 2m
    - alert: KubeAPIErrorBudgetBurn
      expr: |
        sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
        and
        sum(apiserver_request:burnrate5m) > (14.40 * 0.01000)
      for: 2m
      labels:
        severity: critical
        long: 1h
        short: 5m
      annotations:
        message: The API server is burning too much error budget
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
    # factor 6.00, windows 6h / 30m, held for 15m
    - alert: KubeAPIErrorBudgetBurn
      expr: |
        sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
        and
        sum(apiserver_request:burnrate30m) > (6.00 * 0.01000)
      for: 15m
      labels:
        severity: critical
        long: 6h
        short: 30m
      annotations:
        message: The API server is burning too much error budget
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
    # factor 3.00, windows 1d / 2h, held for 1h
    - alert: KubeAPIErrorBudgetBurn
      expr: |
        sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
        and
        sum(apiserver_request:burnrate2h) > (3.00 * 0.01000)
      for: 1h
      labels:
        severity: warning
        long: 1d
        short: 2h
      annotations:
        message: The API server is burning too much error budget
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
    # factor 1.00, windows 3d / 6h, held for 3h
    - alert: KubeAPIErrorBudgetBurn
      expr: |
        sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
        and
        sum(apiserver_request:burnrate6h) > (1.00 * 0.01000)
      for: 3h
      labels:
        severity: warning
        long: 3d
        short: 6h
      annotations:
        message: The API server is burning too much error budget
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
# Alerts about the API server itself: latency, error ratio, client
# certificate expiry, aggregated APIs and target presence.
- name: kubernetes-system-apiserver
  rules:
    # Fires when the recorded 0.99 quantile exceeds 1 and, for the same
    # (verb, resource), the 5m mean is above both avg + 2*stddev of its verb
    # and 1.2 * avg of its verb.
    - alert: KubeAPILatencyHigh
      expr: |
        cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"}
        >
        1
        and on (verb,resource)
        (
          cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"}
          >
          on (verb) group_left()
          (
            avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
            +
            2*stddev by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
          )
        ) > on (verb) group_left()
        1.2 * avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
      for: 5m
      labels:
        severity: warning
      annotations:
        message: >-
          The API server has an abnormal latency of {{ $value }} seconds for
          {{ $labels.verb }} {{ $labels.resource }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
    # Fires when more than 5% of requests per (resource, subresource, verb)
    # returned a 5xx code over 5m, sustained for 10 minutes.
    - alert: KubeAPIErrorsHigh
      expr: |
        sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m])) by (resource,subresource,verb)
          /
        sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) > 0.05
      for: 10m
      labels:
        severity: warning
      annotations:
        message: >-
          API server is returning errors for {{ $value | humanizePercentage }}
          of requests for {{ $labels.verb }} {{ $labels.resource }}
          {{ $labels.subresource }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
    # Warning threshold: 0.01 quantile of the expiration histogram below
    # 604800 seconds (7 days). No "for" clause is set.
    - alert: KubeClientCertificateExpiration
      expr: |
        apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
      labels:
        severity: warning
      annotations:
        message: >-
          A client certificate used to authenticate to the apiserver is
          expiring in less than 7.0 days.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
    # Critical threshold: same expression with 86400 seconds (24 hours).
    - alert: KubeClientCertificateExpiration
      expr: |
        apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
      labels:
        severity: critical
      annotations:
        message: >-
          A client certificate used to authenticate to the apiserver is
          expiring in less than 24.0 hours.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
    # Fires when aggregator_unavailable_apiservice_count increased by more
    # than 2 within 5m for a (name, namespace). No "for" clause is set.
    - alert: AggregatedAPIErrors
      expr: |
        sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
      labels:
        severity: warning
      annotations:
        message: >-
          An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has
          reported errors. The number of errors have increased for it in the
          past five minutes. High values indicate that the availability of the
          service changes too often.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
    # Fires when the 5m sum of aggregator_unavailable_apiservice per
    # (name, namespace) stays above 0 for 5 minutes.
    - alert: AggregatedAPIDown
      expr: |
        sum by(name, namespace)(sum_over_time(aggregator_unavailable_apiservice[5m])) > 0
      for: 5m
      labels:
        severity: warning
      annotations:
        message: >-
          An aggregated API {{ $labels.name }}/{{ $labels.namespace }} is down.
          It has not been available at least for the past five minutes.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
    # Fires when no up{job="apiserver"} series equals 1 for 15 minutes.
    - alert: KubeAPIDown
      expr: |
        absent(up{job="apiserver"} == 1)
      for: 15m
      labels:
        severity: critical
      annotations:
        message: KubeAPI has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
# Rule group covering node readiness and kubelet health alerts.
- name: kubernetes-system-kubelet
  rules:
    # Node whose Ready condition has reported status="true" == 0 for 15m.
    - alert: KubeNodeNotReady
      expr: |
        kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
      for: 15m
      labels:
        severity: warning
      annotations:
        message: >-
          {{ $labels.node }} has been unready for more than 15 minutes.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready

    # Node carrying the node.kubernetes.io/unreachable NoSchedule taint,
    # excluding (via `unless`) nodes that also carry the
    # ToBeDeletedByClusterAutoscaler taint.
    # NOTE(review): there is no `for:` clause, so this fires on the first
    # matching evaluation - confirm that is intended.
    - alert: KubeNodeUnreachable
      expr: |
        (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key="ToBeDeletedByClusterAutoscaler"}) == 1
      labels:
        severity: warning
      annotations:
        message: >-
          {{ $labels.node }} is unreachable and some workloads may be rescheduled.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable

    # Per-node ratio of running pods to pod capacity above 0.95 for 15m.
    # kubelet_running_pod_count is mapped from instance to node through
    # kubelet_node_name; capacity series equal to 1 are filtered out by
    # the `!= 1` comparison.
    - alert: KubeletTooManyPods
      expr: |
        max(max(kubelet_running_pod_count{job="kubelet", metrics_path="/metrics"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"} != 1) by(node) > 0.95
      for: 15m
      labels:
        severity: warning
      annotations:
        message: >-
          Kubelet '{{ $labels.node }}' is running at
          {{ $value | humanizePercentage }} of its Pod capacity.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods

    # More than 2 changes of the Ready/status="true" series per node within
    # a 15m window, sustained for 15m.
    - alert: KubeNodeReadinessFlapping
      expr: |
        sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
      for: 15m
      labels:
        severity: warning
      annotations:
        message: >-
          The readiness status of node {{ $labels.node }} has changed
          {{ $value }} times in the last 15 minutes.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping

    # Reads the quantile="0.99" series of the PLEG relist duration recording
    # rule and alerts when it stays at or above 10 for 5m.
    - alert: KubeletPlegDurationHigh
      expr: |
        node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
      for: 5m
      labels:
        severity: warning
      annotations:
        message: >-
          The Kubelet Pod Lifecycle Event Generator has a 99th percentile
          duration of {{ $value }} seconds on node {{ $labels.node }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh

    # 0.99 quantile of the pod worker duration histogram (5m rate, per
    # instance), labelled with the node name via kubelet_node_name, above 60
    # for 15m.
    - alert: KubeletPodStartUpLatencyHigh
      expr: |
        histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
      for: 15m
      labels:
        severity: warning
      annotations:
        message: >-
          Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
          on node {{ $labels.node }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh

    # No kubelet /metrics target has up == 1 for 15m.
    - alert: KubeletDown
      expr: |
        absent(up{job="kubelet", metrics_path="/metrics"} == 1)
      for: 15m
      labels:
        severity: critical
      annotations:
        message: >-
          Kubelet has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
# Rule group with the scheduler availability alert.
- name: kubernetes-system-scheduler
  rules:
    # absent() yields a result only when no `up` series for
    # job="kube-scheduler" equals 1; the condition must persist for 15m.
    - alert: KubeSchedulerDown
      expr: |
        absent(up{job="kube-scheduler"} == 1)
      for: 15m
      labels:
        severity: critical
      annotations:
        message: >-
          KubeScheduler has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
# Rule group with the controller-manager availability alert.
- name: kubernetes-system-controller-manager
  rules:
    # absent() yields a result only when no `up` series for
    # job="kube-controller-manager" equals 1; the condition must persist
    # for 15m.
    - alert: KubeControllerManagerDown
      expr: |
        absent(up{job="kube-controller-manager"} == 1)
      for: 15m
      labels:
        severity: critical
      annotations:
        message: >-
          KubeControllerManager has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
# Rule group with job-agnostic alerts: a target-availability check and an
# always-firing pipeline heartbeat.
- name: general.rules
  rules:
    # Percentage of targets with up == 0, grouped by job/namespace/service;
    # alerts when it stays above 10 for 10m.
    - alert: TargetDown
      expr: >-
        100 * (count(up == 0) BY (job, namespace, service)
        / count(up) BY (job, namespace, service)) > 10
      for: 10m
      labels:
        severity: warning
      annotations:
        message: >-
          {{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }}
          targets in {{ $labels.namespace }} namespace are down.

    # vector(1) always returns a sample, so this alert is permanently
    # firing; the message below explains how that is used.
    - alert: Watchdog
      expr: vector(1)
      labels:
        severity: none
      annotations:
        message: |
          This is an alert meant to ensure that the entire alerting pipeline is functional.
          This alert is always firing, therefore it should always be firing in Alertmanager
          and always fire against a receiver. There are integrations with various notification
          mechanisms that send a notification when this alert is not firing. For example the
          "DeadMansSnitch" integration in PagerDuty.
# Rule group with CoreDNS health alerts.
- name: CoreDNS
  rules:
    # Fires when the CoreDNS panic counter has increased within the last
    # 10 minutes and the condition holds for 5m.
    # CoreDNS 1.7.0 renamed coredns_panic_count_total to
    # coredns_panics_total. Querying only the old name silently never
    # matches on newer CoreDNS, so both names are checked; `or` keeps
    # whichever side has data.
    - alert: CorednsPanicCount
      expr: >-
        increase(coredns_panics_total[10m]) > 0
        or
        increase(coredns_panic_count_total[10m]) > 0
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "CoreDNS Panic Count (instance {{ $labels.instance }})"
        description: "Number of CoreDNS panics encountered\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment