Some Prometheus configuration we may find useful. The first file below contains the alerting rule groups.
groups: | |
- name: node-resources | |
rules: | |
- alert: HostCPUUtilisation | |
expr: 100 - (avg by(instance) (irate(node_cpu{mode="idle"}[5m])) * 100) > 70 | |
for: 20m | |
labels: | |
severity: warning | |
annotations: | |
description: 'High CPU utilisation detected for instance {{ $labels.instance_id | |
}} tagged as {{ $labels.instance_name_tag }}; current utilisation: | |
{{ $value }}%' | |
summary: CPU Utilisation Alert | |
- alert: InstanceLoad1Saturation | |
expr: | | |
100 * instance:instance_cpu_saturation_load1: > 50 | |
for: 15m | |
labels: | |
severity: warning | |
annotations: | |
description: 'Load1 Saturation > 50% for node {{ $labels.node }} / instance {{ $labels.instance }}, current saturation: {{ $value }}%' | |
- alert: InstanceLoad1Saturation | |
expr: | | |
100 * instance:instance_cpu_saturation_load1: > 70 | |
for: 15m | |
labels: | |
severity: critical | |
annotations: | |
description: 'Load1 Saturation > 70% for node {{ $labels.node }} / instance {{ $labels.instance }}, current saturation: {{ $value }}%' | |
- alert: GlobalLoad1Saturation | |
expr: | | |
100 * :instance_cpu_saturation_load1: > 50 | |
for: 15m | |
labels: | |
severity: warning | |
annotations: | |
description: 'Load1 Saturation > 50% for the full cluster, current saturation: {{ $value }}%' | |
- alert: GlobalLoad1Saturation | |
expr: | | |
100 * :instance_cpu_saturation_load1: > 70 | |
for: 15m | |
labels: | |
severity: critical | |
annotations: | |
description: 'Load1 Saturation > 70% for the full cluster, current saturation: {{ $value }}%' | |
- name: kubernetes-absent | |
rules: | |
- alert: KubeAPIDown | |
annotations: | |
message: KubeAPI has disappeared from Prometheus target discovery. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown | |
expr: | | |
absent(up{job="kubernetes-apiservers"} == 1) | |
for: 15m | |
labels: | |
severity: critical | |
- alert: KubeStateMetricsDown | |
annotations: | |
message: KubeStateMetrics has disappeared from Prometheus target discovery. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown | |
expr: | | |
absent(up{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} == 1) | |
for: 15m | |
labels: | |
severity: critical | |
- alert: KubeletDown | |
annotations: | |
message: Kubelet has disappeared from Prometheus target discovery. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown | |
expr: | | |
absent(up{job="kubernetes-nodes-cadvisor"} == 1) | |
for: 15m | |
labels: | |
severity: critical | |
- alert: NodeExporterDown | |
annotations: | |
message: NodeExporter has disappeared from Prometheus target discovery. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown | |
expr: | | |
absent(up{job="kubernetes-node-exporter"} == 1) | |
for: 15m | |
labels: | |
severity: critical | |
- alert: PrometheusDown | |
annotations: | |
message: Prometheus has disappeared from Prometheus target discovery. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown | |
expr: | | |
absent(up{job="prometheus"} == 1) | |
for: 15m | |
labels: | |
severity: critical | |
- name: kubernetes-apps | |
rules: | |
- alert: KubePodCrashLooping | |
annotations: | |
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container | |
}}) is restarting {{ printf "%.2f" $value }} times / second. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping | |
expr: | | |
rate(kube_pod_container_status_restarts_total{kubernetes_name="prometheus-kube-state-metrics"}[15m]) > 0 | |
for: 1h | |
labels: | |
severity: critical | |
- alert: KubePodNotReady | |
annotations: | |
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready | |
state for longer than an hour. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready | |
expr: | | |
sum by (namespace, pod) (kube_pod_status_phase{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics", phase=~"Pending|Unknown"}) > 0 | |
for: 1h | |
labels: | |
severity: critical | |
- alert: KubeDeploymentGenerationMismatch | |
annotations: | |
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment | |
}} does not match, this indicates that the Deployment has failed but has | |
not been rolled back. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch | |
expr: | | |
kube_deployment_status_observed_generation{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} | |
!= | |
kube_deployment_metadata_generation{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} | |
for: 15m | |
labels: | |
severity: critical | |
- alert: KubeDeploymentReplicasMismatch | |
annotations: | |
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not | |
matched the expected number of replicas for longer than an hour. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch | |
expr: | | |
kube_deployment_spec_replicas{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} | |
!= | |
kube_deployment_status_replicas_available{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} | |
for: 1h | |
labels: | |
severity: critical | |
- alert: KubeStatefulSetReplicasMismatch | |
annotations: | |
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has | |
not matched the expected number of replicas for longer than 15 minutes. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch | |
expr: | | |
kube_statefulset_status_replicas_ready{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} | |
!= | |
kube_statefulset_status_replicas{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} | |
for: 15m | |
labels: | |
severity: critical | |
- alert: KubeStatefulSetGenerationMismatch | |
annotations: | |
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset | |
}} does not match, this indicates that the StatefulSet has failed but has | |
not been rolled back. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch | |
expr: | | |
kube_statefulset_status_observed_generation{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} | |
!= | |
kube_statefulset_metadata_generation{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} | |
for: 15m | |
labels: | |
severity: critical | |
- alert: KubeStatefulSetUpdateNotRolledOut | |
annotations: | |
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update | |
has not been rolled out. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout | |
expr: | | |
max without (revision) ( | |
kube_statefulset_status_current_revision{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} | |
unless | |
kube_statefulset_status_update_revision{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} | |
) | |
* | |
( | |
kube_statefulset_replicas{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} | |
!= | |
kube_statefulset_status_replicas_updated{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} | |
) | |
for: 15m | |
labels: | |
severity: critical | |
- alert: KubeDaemonSetRolloutStuck | |
annotations: | |
message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace | |
}}/{{ $labels.daemonset }} are scheduled and ready. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck | |
expr: | | |
kube_daemonset_status_number_ready{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} | |
/ | |
kube_daemonset_status_desired_number_scheduled{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} * 100 < 100 | |
for: 15m | |
labels: | |
severity: critical | |
- alert: KubeDaemonSetNotScheduled | |
annotations: | |
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset | |
}} are not scheduled.' | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled | |
expr: | | |
kube_daemonset_status_desired_number_scheduled{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} | |
- | |
kube_daemonset_status_current_number_scheduled{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} > 0 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: KubeDaemonSetMisScheduled | |
annotations: | |
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset | |
}} are running where they are not supposed to run.' | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled | |
expr: | | |
kube_daemonset_status_number_misscheduled{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} > 0 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: KubeCronJobRunning | |
annotations: | |
message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking | |
more than 1h to complete. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning | |
expr: | | |
time() - kube_cronjob_next_schedule_time{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} > 3600 | |
for: 1h | |
labels: | |
severity: warning | |
- alert: KubeJobCompletion | |
annotations: | |
message: Job {{ $labels.namespace }}/{{ $labels.job }} is taking more than | |
one hour to complete. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion | |
expr: | | |
kube_job_spec_completions{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} - kube_job_status_succeeded{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} > 0 | |
for: 1h | |
labels: | |
severity: warning | |
- alert: KubeJobFailed | |
annotations: | |
message: Job {{ $labels.namespace }}/{{ $labels.job }} failed to complete. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed | |
expr: | | |
kube_job_status_failed{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} > 0 | |
for: 1h | |
labels: | |
severity: warning | |
- name: kubernetes-resources | |
rules: | |
- alert: KubeCPUOvercommit | |
annotations: | |
message: Cluster has overcommitted CPU resource requests for Pods and cannot | |
tolerate node failure. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit | |
expr: | | |
sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum) | |
/ | |
sum(node:node_num_cpu:sum) | |
> | |
(count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum) | |
for: 5m | |
labels: | |
severity: warning | |
- alert: KubeMemOvercommit | |
annotations: | |
message: Cluster has overcommitted memory resource requests for Pods and cannot | |
tolerate node failure. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit | |
expr: | | |
sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum) | |
/ | |
sum(node_memory_MemTotal) | |
> | |
(count(node:node_num_cpu:sum)-1) | |
/ | |
count(node:node_num_cpu:sum) | |
for: 5m | |
labels: | |
severity: warning | |
- alert: KubeCPUOvercommit | |
annotations: | |
message: Cluster has overcommitted CPU resource requests for Namespaces. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit | |
expr: | | |
sum(kube_resourcequota{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics", type="hard", resource="requests.cpu"}) | |
/ | |
sum(node:node_num_cpu:sum) | |
> 1.5 | |
for: 5m | |
labels: | |
severity: warning | |
- alert: KubeMemOvercommit | |
annotations: | |
message: Cluster has overcommitted memory resource requests for Namespaces. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit | |
expr: | | |
sum(kube_resourcequota{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics", type="hard", resource="requests.memory"}) | |
/ | |
sum(node_memory_MemTotal{job="kubernetes-node-exporter"}) | |
> 1.5 | |
for: 5m | |
labels: | |
severity: warning | |
- alert: KubeQuotaExceeded | |
annotations: | |
message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value | |
}}% of its {{ $labels.resource }} quota. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded | |
expr: | | |
100 * kube_resourcequota{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics", type="used"} | |
/ ignoring(instance, job, type) | |
(kube_resourcequota{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics", type="hard"} > 0) | |
> 90 | |
for: 15m | |
labels: | |
severity: warning | |
- name: kubernetes-storage | |
rules: | |
- alert: KubePersistentVolumeUsageCritical | |
annotations: | |
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim | |
}} in Namespace {{ $labels.namespace }} is only {{ printf "%0.0f" $value | |
}}% free. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical | |
expr: | | |
100 * kubelet_volume_stats_available_bytes{job="kubernetes-nodes-cadvisor"} | |
/ | |
kubelet_volume_stats_capacity_bytes{job="kubernetes-nodes-cadvisor"} | |
< 3 | |
for: 1m | |
labels: | |
severity: critical | |
- alert: KubePersistentVolumeFullInFourDays | |
annotations: | |
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim | |
}} in Namespace {{ $labels.namespace }} is expected to fill up within four | |
days. Currently {{ $value }} bytes are available. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays | |
expr: | | |
kubelet_volume_stats_available_bytes{job="kubernetes-nodes-cadvisor"} and predict_linear(kubelet_volume_stats_available_bytes{job="kubernetes-nodes-cadvisor"}[6h], 4 * 24 * 3600) < 0 | |
for: 5m | |
labels: | |
severity: critical | |
- name: kubernetes-system | |
rules: | |
- alert: KubeNodeNotReady | |
annotations: | |
message: '{{ $labels.node }} has been unready for more than an hour.' | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready | |
expr: | | |
kube_node_status_condition{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics",condition="Ready",status="true"} == 0 | |
for: 1h | |
labels: | |
severity: warning | |
- alert: KubeVersionMismatch | |
annotations: | |
message: There are {{ $value }} different versions of Kubernetes components | |
running. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch | |
expr: | | |
count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1 | |
for: 1h | |
labels: | |
severity: warning | |
- alert: KubeClientErrors | |
annotations: | |
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance | |
}}' is experiencing {{ printf "%0.0f" $value }}% errors. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors | |
expr: | | |
(sum(rate(rest_client_requests_total{code!~"2..|404"}[5m])) by (instance, job) | |
/ | |
sum(rate(rest_client_requests_total[5m])) by (instance, job)) | |
* 100 > 1 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: KubeClientErrors | |
annotations: | |
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance | |
}}' is experiencing {{ printf "%0.0f" $value }} errors / second. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors | |
expr: | | |
sum(rate(ksm_scrape_error_total{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"}[5m])) by (instance, job) > 0.1 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: KubeletTooManyPods | |
annotations: | |
message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close | |
to the limit of 110. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods | |
expr: | | |
kubelet_running_pod_count{job="kubernetes-nodes-cadvisor"} > 110 * 0.9 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: KubeAPILatencyHigh | |
annotations: | |
message: The API server has a 99th percentile latency of {{ $value }} seconds | |
for {{ $labels.verb }} {{ $labels.resource }}. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh | |
expr: | | |
cluster_quantile:apiserver_request_latencies:histogram_quantile{job="kubernetes-apiservers",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: KubeAPILatencyHigh | |
annotations: | |
message: The API server has a 99th percentile latency of {{ $value }} seconds | |
for {{ $labels.verb }} {{ $labels.resource }}. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh | |
expr: | | |
cluster_quantile:apiserver_request_latencies:histogram_quantile{job="kubernetes-apiservers",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 | |
for: 10m | |
labels: | |
severity: critical | |
- alert: KubeAPIErrorsHigh | |
annotations: | |
message: API server is returning errors for {{ $value }}% of requests. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh | |
expr: | | |
sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m])) without(instance, pod) | |
/ | |
sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) without(instance, pod) * 100 > 10 | |
for: 10m | |
labels: | |
severity: critical | |
- alert: KubeAPIErrorsHigh | |
annotations: | |
message: API server is returning errors for {{ $value }}% of requests. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh | |
expr: | | |
sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m])) without(instance, pod) | |
/ | |
sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) without(instance, pod) * 100 > 5 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: KubeClientCertificateExpiration | |
annotations: | |
message: Kubernetes API certificate is expiring in less than 7 days. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration | |
expr: | | |
histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kubernetes-apiservers"}[5m]))) < 604800 | |
labels: | |
severity: warning | |
- alert: KubeClientCertificateExpiration | |
annotations: | |
message: Kubernetes API certificate is expiring in less than 24 hours. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration | |
expr: | | |
histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kubernetes-apiservers"}[5m]))) < 86400 | |
labels: | |
severity: critical | |
- name: alertmanager.rules | |
rules: | |
- alert: AlertmanagerConfigInconsistent | |
annotations: | |
message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` | |
is out of sync. | |
expr: | | |
count_values("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas{job="{{ $operatorJob }}"}, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 | |
for: 5m | |
labels: | |
severity: critical | |
- alert: AlertmanagerFailedReload | |
annotations: | |
message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace | |
}}/{{ $labels.pod}}. | |
expr: | | |
alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}"} == 0 | |
for: 10m | |
labels: | |
severity: warning | |
- name: general.rules | |
rules: | |
- alert: TargetDown | |
annotations: | |
message: '{{ $value }}% of the {{ $labels.job }} targets are down.' | |
expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: DeadMansSwitch | |
annotations: | |
message: This is a DeadMansSwitch meant to ensure that the entire alerting | |
pipeline is functional. | |
expr: vector(1) | |
labels: | |
severity: none | |
- name: kube-prometheus-node-alerting.rules | |
rules: | |
- alert: NodeDiskRunningFull | |
annotations: | |
message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace | |
}}/{{ $labels.pod }} will be full within the next 24 hours. | |
expr: | | |
(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0) | |
for: 30m | |
labels: | |
severity: warning | |
- alert: NodeDiskRunningFull | |
annotations: | |
message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace | |
}}/{{ $labels.pod }} will be full within the next 2 hours. | |
expr: | | |
(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0) | |
for: 10m | |
labels: | |
severity: critical | |
- name: prometheus.rules | |
rules: | |
- alert: PrometheusConfigReloadFailed | |
annotations: | |
description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} | |
summary: Reloading Prometheus' configuration failed | |
expr: | | |
prometheus_config_last_reload_successful{job="prometheus"} == 0 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: PrometheusNotificationQueueRunningFull | |
annotations: | |
description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ | |
$labels.pod}} | |
summary: Prometheus' alert notification queue is running full | |
expr: | | |
predict_linear(prometheus_notifications_queue_length{job="prometheus"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus"} | |
for: 10m | |
labels: | |
severity: warning | |
- alert: PrometheusErrorSendingAlerts | |
annotations: | |
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ | |
$labels.pod}} to Alertmanager {{$labels.alertmanager}} | |
summary: Errors while sending alerts from Prometheus | |
expr: | | |
rate(prometheus_notifications_errors_total{job="prometheus"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus"}[5m]) > 0.01 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: PrometheusErrorSendingAlerts | |
annotations: | |
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ | |
$labels.pod}} to Alertmanager {{$labels.alertmanager}} | |
summary: Errors while sending alerts from Prometheus | |
expr: | | |
rate(prometheus_notifications_errors_total{job="prometheus"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus"}[5m]) > 0.03 | |
for: 10m | |
labels: | |
severity: critical | |
- alert: PrometheusNotConnectedToAlertmanagers | |
annotations: | |
description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected | |
to any Alertmanagers | |
summary: Prometheus is not connected to any Alertmanagers | |
expr: | | |
prometheus_notifications_alertmanagers_discovered{job="prometheus"} < 1 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: PrometheusTSDBReloadsFailing | |
annotations: | |
description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} | |
reload failures over the last two hours.' | |
summary: Prometheus has issues reloading data blocks from disk | |
expr: | | |
increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[2h]) > 0 | |
for: 12h | |
labels: | |
severity: warning | |
- alert: PrometheusTSDBCompactionsFailing | |
annotations: | |
description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} | |
compaction failures over the last two hours.' | |
summary: Prometheus has issues compacting sample blocks | |
expr: | | |
increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[2h]) > 0 | |
for: 12h | |
labels: | |
severity: warning | |
- alert: PrometheusTSDBWALCorruptions | |
annotations: | |
description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead | |
log (WAL).' | |
summary: Prometheus write-ahead log is corrupted | |
expr: | | |
tsdb_wal_corruptions_total{job="prometheus"} > 0 | |
for: 4h | |
labels: | |
severity: warning | |
- alert: PrometheusNotIngestingSamples | |
annotations: | |
description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting | |
samples. | |
summary: Prometheus isn't ingesting samples | |
expr: | | |
rate(prometheus_tsdb_head_samples_appended_total{job="prometheus"}[5m]) <= 0 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: PrometheusTargetScrapesDuplicate | |
annotations: | |
description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected | |
due to duplicate timestamps but different values' | |
summary: Prometheus has many samples rejected | |
expr: | | |
increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0 | |
for: 10m | |
labels: | |
severity: warning | |
- name: prometheus-operator | |
rules: | |
- alert: PrometheusOperatorAlertmanagerReconcileErrors | |
annotations: | |
message: Errors while reconciling Alertmanager in {{ $labels.namespace }} | |
Namespace. | |
expr: | | |
rate(prometheus_operator_reconcile_errors_total{job="{{ $operatorJob }}",controller="alertmanager"}[5m]) > 0.1 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: PrometheusOperatorPrometheusReconcileErrors | |
annotations: | |
message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. | |
expr: | | |
rate(prometheus_operator_reconcile_errors_total{job="{{ $operatorJob }}",controller="prometheus"}[5m]) > 0.1 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: PrometheusOperatorNodeLookupErrors | |
annotations: | |
message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. | |
expr: | | |
rate(prometheus_operator_node_address_lookup_errors_total{job="{{ $operatorJob }}",controller="prometheus"}[5m]) > 0.1 | |
for: 10m | |
labels: | |
severity: warning | |
- name: etcd3_alert.rules | |
rules: | |
- alert: InsufficientMembers | |
expr: count(up{job="kube-etcd"} == 0) > (count(up{job="kube-etcd"}) / 2 - 1) | |
for: 3m | |
labels: | |
severity: critical | |
annotations: | |
description: If one more etcd member goes down, the cluster will be unavailable | |
summary: etcd cluster insufficient members | |
- alert: NoLeader | |
expr: etcd_server_has_leader{job="kube-etcd"} == 0 | |
for: 1m | |
labels: | |
severity: critical | |
annotations: | |
description: etcd member {{ $labels.instance }} has no leader | |
summary: etcd member has no leader | |
- alert: HighNumberOfLeaderChanges | |
expr: increase(etcd_server_leader_changes_seen_total{job="kube-etcd"}[1h]) > 3 | |
labels: | |
severity: warning | |
annotations: | |
description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader | |
changes within the last hour | |
summary: a high number of leader changes within the etcd cluster are happening | |
- alert: HighNumberOfFailedGRPCRequests | |
expr: 100 * (sum(rate(grpc_server_handled_total{grpc_code!="OK",job="kube-etcd"}[5m])) BY (grpc_service, grpc_method) | |
/ sum(rate(grpc_server_handled_total{job="kube-etcd"}[5m])) BY (grpc_service, grpc_method)) > 1 | |
for: 10m | |
labels: | |
severity: warning | |
annotations: | |
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed | |
on etcd instance {{ $labels.instance }}' | |
summary: a high number of gRPC requests are failing | |
- alert: HighNumberOfFailedGRPCRequests | |
expr: 100 * (sum(rate(grpc_server_handled_total{grpc_code!="OK",job="kube-etcd"}[5m])) BY (grpc_service, grpc_method) | |
/ sum(rate(grpc_server_handled_total{job="kube-etcd"}[5m])) BY (grpc_service, grpc_method)) > 5 | |
for: 5m | |
labels: | |
severity: critical | |
annotations: | |
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed | |
on etcd instance {{ $labels.instance }}' | |
summary: a high number of gRPC requests are failing | |
- alert: GRPCRequestsSlow | |
expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="kube-etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le)) | |
> 0.15 | |
for: 10m | |
labels: | |
severity: critical | |
annotations: | |
description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method | |
}} are slow | |
summary: slow gRPC requests | |
- alert: FdExhaustionClose | |
expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 | |
for: 10m | |
labels: | |
severity: warning | |
annotations: | |
description: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust | |
its file descriptors soon' | |
summary: file descriptors soon exhausted | |
- alert: FdExhaustionClose | |
expr: predict_linear(instance:fd_utilization[10m], 3600) > 1 | |
for: 10m | |
labels: | |
severity: critical | |
annotations: | |
description: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust | |
its file descriptors soon' | |
summary: file descriptors soon exhausted | |
- alert: EtcdMemberCommunicationSlow | |
expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) | |
> 0.15 | |
for: 10m | |
labels: | |
severity: warning | |
annotations: | |
description: etcd instance {{ $labels.instance }} member communication with | |
{{ $labels.To }} is slow | |
summary: etcd member communication is slow | |
- alert: HighNumberOfFailedProposals | |
expr: increase(etcd_server_proposals_failed_total{job="kube-etcd"}[1h]) > 5 | |
labels: | |
severity: warning | |
annotations: | |
description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal | |
failures within the last hour | |
summary: a high number of proposals within the etcd cluster are failing | |
- alert: HighFsyncDurations | |
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) | |
> 0.5 | |
for: 10m | |
labels: | |
severity: warning | |
annotations: | |
description: etcd instance {{ $labels.instance }} fsync durations are high | |
summary: high fsync durations | |
- alert: HighCommitDurations | |
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) | |
> 0.25 | |
for: 10m | |
labels: | |
severity: warning | |
annotations: | |
description: etcd instance {{ $labels.instance }} commit durations are high | |
summary: high commit durations |
Next comes the Prometheus server configuration itself: the rule_files locations and the Kubernetes scrape jobs.
rule_files: | |
- /etc/config/rules | |
- /etc/config/alerts | |
scrape_configs: | |
- job_name: prometheus | |
static_configs: | |
- targets: | |
- localhost:9090 | |
# A scrape configuration for running Prometheus on a Kubernetes cluster. | |
# This uses separate scrape configs for cluster components (i.e. API server, node) | |
# and services to allow each to use different authentication configs. | |
# | |
# Kubernetes labels will be added as Prometheus labels on metrics via the | |
# `labelmap` relabeling action. | |
# Scrape config for API servers. | |
# | |
# Kubernetes exposes API servers as endpoints to the default/kubernetes | |
# service so this uses `endpoints` role and uses relabelling to only keep | |
# the endpoints associated with the default/kubernetes service using the | |
# default named port `https`. This works for single API server deployments as | |
# well as HA API server deployments. | |
- job_name: 'kubernetes-apiservers' | |
kubernetes_sd_configs: | |
- role: endpoints | |
# Default to scraping over https. If required, just disable this or change to | |
# `http`. | |
scheme: https | |
# This TLS & bearer token file config is used to connect to the actual scrape | |
# endpoints for cluster components. This is separate to discovery auth | |
# configuration because discovery & scraping are two separate concerns in | |
# Prometheus. The discovery auth config is automatic if Prometheus runs inside | |
# the cluster. Otherwise, more config options have to be provided within the | |
# <kubernetes_sd_config>. | |
tls_config: | |
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt | |
# If your node certificates are self-signed or use a different CA to the | |
# master CA, then disable certificate verification below. Note that | |
# certificate verification is an integral part of a secure infrastructure | |
# so this should only be disabled in a controlled environment. Certificate | |
# verification is disabled here via the insecure_skip_verify line below. | |
# | |
insecure_skip_verify: true | |
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token | |
# Keep only the default/kubernetes service endpoints for the https port. This | |
# will add targets for each API server which Kubernetes adds an endpoint to | |
# the default/kubernetes service. | |
relabel_configs: | |
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] | |
action: keep | |
regex: default;kubernetes;https | |
- job_name: 'kubernetes-nodes' | |
# Default to scraping over https. If required, just disable this or change to | |
# `http`. | |
scheme: https | |
# This TLS & bearer token file config is used to connect to the actual scrape | |
# endpoints for cluster components. This is separate to discovery auth | |
# configuration because discovery & scraping are two separate concerns in | |
# Prometheus. The discovery auth config is automatic if Prometheus runs inside | |
# the cluster. Otherwise, more config options have to be provided within the | |
# <kubernetes_sd_config>. | |
tls_config: | |
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt | |
# If your node certificates are self-signed or use a different CA to the | |
# master CA, then disable certificate verification below. Note that | |
# certificate verification is an integral part of a secure infrastructure | |
# so this should only be disabled in a controlled environment. Certificate | |
# verification is disabled here via the insecure_skip_verify line below. | |
# | |
insecure_skip_verify: true | |
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token | |
kubernetes_sd_configs: | |
- role: node | |
relabel_configs: | |
- action: labelmap | |
regex: __meta_kubernetes_node_label_(.+) | |
- target_label: __address__ | |
replacement: kubernetes.default.svc:443 | |
- source_labels: [__meta_kubernetes_node_name] | |
regex: (.+) | |
target_label: __metrics_path__ | |
replacement: /api/v1/nodes/${1}/proxy/metrics | |
- job_name: 'kubernetes-nodes-cadvisor' | |
# Default to scraping over https. If required, just disable this or change to | |
# `http`. | |
scheme: https | |
# This TLS & bearer token file config is used to connect to the actual scrape | |
# endpoints for cluster components. This is separate to discovery auth | |
# configuration because discovery & scraping are two separate concerns in | |
# Prometheus. The discovery auth config is automatic if Prometheus runs inside | |
# the cluster. Otherwise, more config options have to be provided within the | |
# <kubernetes_sd_config>. | |
tls_config: | |
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt | |
# If your node certificates are self-signed or use a different CA to the | |
# master CA, then disable certificate verification below. Note that | |
# certificate verification is an integral part of a secure infrastructure | |
# so this should only be disabled in a controlled environment. Certificate | |
# verification is disabled here via the insecure_skip_verify line below. | |
# | |
insecure_skip_verify: true | |
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token | |
kubernetes_sd_configs: | |
- role: node | |
# This configuration only works on kubelet 1.7.3+, where the cAdvisor scrape | |
# endpoints changed. If you are running an older version, change the replacement to | |
# replacement: /api/v1/nodes/${1}:4194/proxy/metrics | |
# More info here: https://github.com/coreos/prometheus-operator/issues/633 | |
relabel_configs: | |
- action: labelmap | |
regex: __meta_kubernetes_node_label_(.+) | |
- target_label: __address__ | |
replacement: kubernetes.default.svc:443 | |
- source_labels: [__meta_kubernetes_node_name] | |
regex: (.+) | |
target_label: __metrics_path__ | |
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor | |
# Scrape config for service endpoints. | |
# | |
# The relabeling allows the actual service scrape endpoint to be configured | |
# via the following annotations: | |
# | |
# * `prometheus.io/scrape`: Only scrape services that have a value of `true` | |
# * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need | |
# to set this to `https` & most likely set the `tls_config` of the scrape config. | |
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this. | |
# * `prometheus.io/port`: If the metrics are exposed on a different port to the | |
# service then set this appropriately. | |
- job_name: 'kubernetes-service-endpoints' | |
kubernetes_sd_configs: | |
- role: endpoints | |
relabel_configs: | |
- source_labels: [__meta_kubernetes_service_label_component] | |
action: drop | |
regex: node-exporter | |
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] | |
action: keep | |
regex: true | |
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] | |
action: replace | |
target_label: __scheme__ | |
regex: (https?) | |
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] | |
action: replace | |
target_label: __metrics_path__ | |
regex: (.+) | |
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] | |
action: replace | |
target_label: __address__ | |
regex: ([^:]+)(?::\d+)?;(\d+) | |
replacement: $1:$2 | |
- action: labelmap | |
regex: __meta_kubernetes_service_label_(.+) | |
- source_labels: [__meta_kubernetes_namespace] | |
action: replace | |
target_label: kubernetes_namespace | |
- source_labels: [__meta_kubernetes_service_name] | |
action: replace | |
target_label: kubernetes_name | |
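# For reference, a Service only needs the prometheus.io/* annotations described
# in the comment block above to be picked up by this 'kubernetes-service-endpoints'
# job. A minimal sketch of such a Service (name, port and path are illustrative,
# not part of this gist):
#
#   apiVersion: v1
#   kind: Service
#   metadata:
#     name: my-app
#     annotations:
#       prometheus.io/scrape: "true"   # required: only annotated Services are kept
#       prometheus.io/path: "/metrics" # optional: /metrics is the default
#       prometheus.io/port: "8080"     # optional: overrides the discovered port
#   spec:
#     selector:
#       app: my-app
#     ports:
#       - port: 8080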
- job_name: 'kubernetes-node-exporter' | |
kubernetes_sd_configs: | |
- role: endpoints | |
relabel_configs: | |
- source_labels: [__meta_kubernetes_service_label_component] | |
action: keep | |
regex: node-exporter | |
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] | |
action: keep | |
regex: true | |
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] | |
action: replace | |
target_label: __scheme__ | |
regex: (https?) | |
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] | |
action: replace | |
target_label: __metrics_path__ | |
regex: (.+) | |
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] | |
action: replace | |
target_label: __address__ | |
regex: ([^:]+)(?::\d+)?;(\d+) | |
replacement: $1:$2 | |
- action: labelmap | |
regex: __meta_kubernetes_service_label_(.+) | |
- source_labels: [__meta_kubernetes_namespace] | |
action: replace | |
target_label: namespace | |
- source_labels: [__meta_kubernetes_service_name] | |
action: replace | |
target_label: service | |
- source_labels: [__meta_kubernetes_pod_name] | |
action: replace | |
target_label: pod | |
- source_labels: [__meta_kubernetes_pod_node_name] | |
action: replace | |
target_label: node | |
- job_name: 'prometheus-pushgateway' | |
honor_labels: true | |
kubernetes_sd_configs: | |
- role: service | |
relabel_configs: | |
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] | |
action: keep | |
regex: pushgateway | |
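# The 'prometheus-pushgateway' job above keeps only Services whose
# prometheus.io/probe annotation equals "pushgateway", and honor_labels: true
# preserves the job/instance labels of the pushed metrics. A minimal sketch of
# such a Service (name and selector are assumptions; 9091 is the default
# Pushgateway port):
#
#   apiVersion: v1
#   kind: Service
#   metadata:
#     name: prometheus-pushgateway
#     annotations:
#       prometheus.io/probe: pushgateway
#   spec:
#     selector:
#       app: prometheus-pushgateway
#     ports:
#       - port: 9091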
# Example scrape config for probing services via the Blackbox Exporter. | |
# | |
# The relabeling allows the actual service scrape endpoint to be configured | |
# via the following annotations: | |
# | |
# * `prometheus.io/probe`: Only probe services that have a value of `true` | |
- job_name: 'kubernetes-services' | |
metrics_path: /probe | |
params: | |
module: [http_2xx] | |
kubernetes_sd_configs: | |
- role: service | |
relabel_configs: | |
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] | |
action: keep | |
regex: true | |
- source_labels: [__address__] | |
target_label: __param_target | |
- target_label: __address__ | |
replacement: blackbox | |
- source_labels: [__param_target] | |
target_label: instance | |
- action: labelmap | |
regex: __meta_kubernetes_service_label_(.+) | |
- source_labels: [__meta_kubernetes_namespace] | |
target_label: kubernetes_namespace | |
- source_labels: [__meta_kubernetes_service_name] | |
target_label: kubernetes_name | |
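# The 'kubernetes-services' job above sends each annotated Service to a
# Blackbox exporter (assumed reachable at the address "blackbox", see the
# __address__ replacement) using the http_2xx module. A Service opts in to
# probing like this (name and port are illustrative):
#
#   apiVersion: v1
#   kind: Service
#   metadata:
#     name: my-frontend
#     annotations:
#       prometheus.io/probe: "true"
#   spec:
#     selector:
#       app: my-frontend
#     ports:
#       - port: 80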
# Example scrape config for pods | |
# | |
# The relabeling allows the actual pod scrape endpoint to be configured via the | |
# following annotations: | |
# | |
# * `prometheus.io/scrape`: Only scrape pods that have a value of `true` | |
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this. | |
# * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. | |
- job_name: 'kubernetes-pods' | |
kubernetes_sd_configs: | |
- role: pod | |
relabel_configs: | |
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] | |
action: keep | |
regex: true | |
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] | |
action: replace | |
target_label: __metrics_path__ | |
regex: (.+) | |
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] | |
action: replace | |
regex: ([^:]+)(?::\d+)?;(\d+) | |
replacement: $1:$2 | |
target_label: __address__ | |
- action: labelmap | |
regex: __meta_kubernetes_pod_label_(.+) | |
- source_labels: [__meta_kubernetes_namespace] | |
action: replace | |
target_label: kubernetes_namespace | |
- source_labels: [__meta_kubernetes_pod_name] | |
action: replace | |
target_label: kubernetes_pod_name |
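# Pods are discovered the same way by the 'kubernetes-pods' job, using the
# pod-level annotations listed in the comment block above it. A minimal sketch
# of an opted-in Pod (name and image are illustrative):
#
#   apiVersion: v1
#   kind: Pod
#   metadata:
#     name: my-exporter
#     annotations:
#       prometheus.io/scrape: "true"   # required: only annotated Pods are kept
#       prometheus.io/port: "9102"     # optional: port to scrape
#   spec:
#     containers:
#       - name: exporter
#         image: example/exporter:latest
#         ports:
#           - containerPort: 9102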
The last file holds the recording rules used by the alert expressions above, including the node_exporter-16-* groups that map node_exporter 0.16+ metric names back to their pre-0.16 names.
groups: | |
- name: node-exporter.rules | |
rules: | |
# The count of CPUs per node, useful for getting CPU time as a percent of total. | |
- record: instance:node_cpus:count | |
expr: count(node_cpu_seconds_total{mode="idle"}) without (cpu,mode) | |
# CPU in use by CPU. | |
- record: instance_cpu:node_cpu_seconds_not_idle:rate5m | |
expr: sum(rate(node_cpu_seconds_total{mode!="idle"}[5m])) without (mode) | |
# CPU in use by mode. | |
- record: instance_mode:node_cpu_seconds:rate5m | |
expr: sum(rate(node_cpu_seconds_total[5m])) without (cpu) | |
# CPU in use ratio. | |
- record: instance:node_cpu_utilization:ratio | |
expr: sum(instance_mode:node_cpu_seconds:rate5m{mode!="idle"}) without (mode) / instance:node_cpus:count | |
- expr: | | |
sum(node_load1{job="kubernetes-node-exporter"}) | |
/ | |
sum(instance:node_cpus:count) | |
record: ':instance_cpu_saturation_load1:' | |
- expr: | | |
node_load1 | |
/ | |
instance:node_cpus:count | |
record: 'instance:instance_cpu_saturation_load1:' | |
- name: k8s.rules | |
rules: | |
- expr: | | |
sum(rate(container_cpu_usage_seconds_total{job="kubernetes-nodes-cadvisor", image!="", container_name!=""}[5m])) by (namespace) | |
record: namespace:container_cpu_usage_seconds_total:sum_rate | |
- expr: | | |
sum by (namespace, pod_name, container_name) ( | |
rate(container_cpu_usage_seconds_total{job="kubernetes-nodes-cadvisor", image!="", container_name!=""}[5m]) | |
) | |
record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate | |
- expr: | | |
sum(container_memory_usage_bytes{job="kubernetes-nodes-cadvisor", image!="", container_name!=""}) by (namespace) | |
record: namespace:container_memory_usage_bytes:sum | |
- expr: | | |
sum by (namespace, label_name) ( | |
sum(rate(container_cpu_usage_seconds_total{job="kubernetes-nodes-cadvisor", image!="", container_name!=""}[5m])) by (namespace, pod_name) | |
* on (namespace, pod_name) group_left(label_name) | |
label_replace(kube_pod_labels{}, "pod_name", "$1", "pod", "(.*)") | |
) | |
record: namespace_name:container_cpu_usage_seconds_total:sum_rate | |
- expr: | | |
sum by (namespace, label_name) ( | |
sum(container_memory_usage_bytes{job="kubernetes-nodes-cadvisor",image!="", container_name!=""}) by (pod_name, namespace) | |
* on (namespace, pod_name) group_left(label_name) | |
label_replace(kube_pod_labels{}, "pod_name", "$1", "pod", "(.*)") | |
) | |
record: namespace_name:container_memory_usage_bytes:sum | |
- expr: | | |
sum by (namespace, label_name) ( | |
sum(kube_pod_container_resource_requests_memory_bytes{}) by (namespace, pod) | |
* on (namespace, pod) group_left(label_name) | |
label_replace(kube_pod_labels{}, "pod_name", "$1", "pod", "(.*)") | |
) | |
record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum | |
- expr: | | |
sum by (namespace, label_name) ( | |
sum(kube_pod_container_resource_requests_cpu_cores{} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod) | |
* on (namespace, pod) group_left(label_name) | |
label_replace(kube_pod_labels{}, "pod_name", "$1", "pod", "(.*)") | |
) | |
record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum | |
- name: kube-apiserver.rules | |
rules: | |
- expr: | | |
histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="kubernetes-apiservers"}[5m])) without(instance, pod)) / 1e+06 | |
labels: | |
quantile: "0.99" | |
record: cluster_quantile:apiserver_request_latencies:histogram_quantile | |
- expr: | | |
histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="kubernetes-apiservers"}[5m])) without(instance, pod)) / 1e+06 | |
labels: | |
quantile: "0.9" | |
record: cluster_quantile:apiserver_request_latencies:histogram_quantile | |
- expr: | | |
histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="kubernetes-apiservers"}[5m])) without(instance, pod)) / 1e+06 | |
labels: | |
quantile: "0.5" | |
record: cluster_quantile:apiserver_request_latencies:histogram_quantile | |
- name: node.rules | |
rules: | |
- expr: sum(min(kube_pod_info) by (node)) | |
record: ':kube_pod_info_node_count:' | |
- expr: | | |
max(label_replace(kube_pod_info{}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod) | |
record: 'node_namespace_pod:kube_pod_info:' | |
- expr: | | |
count by (node) (sum by (node, cpu) ( | |
node_cpu{job="kubernetes-node-exporter"} | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info: | |
)) | |
record: node:node_num_cpu:sum | |
- expr: | | |
1 - avg(rate(node_cpu{job="kubernetes-node-exporter",mode="idle"}[1m])) | |
record: :node_cpu_utilisation:avg1m | |
- expr: | | |
1 - avg by (node) ( | |
rate(node_cpu{job="kubernetes-node-exporter",mode="idle"}[1m]) | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info:) | |
record: node:node_cpu_utilisation:avg1m | |
- expr: | | |
sum(node_load1{job="kubernetes-node-exporter"}) | |
/ | |
sum(node:node_num_cpu:sum) | |
record: ':node_cpu_saturation_load1:' | |
- expr: | | |
sum by (node) ( | |
node_load1{job="kubernetes-node-exporter"} | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info: | |
) | |
/ | |
node:node_num_cpu:sum | |
record: 'node:node_cpu_saturation_load1:' | |
- expr: | | |
1 - | |
sum(node_memory_MemFree{job="kubernetes-node-exporter"} + node_memory_Cached{job="kubernetes-node-exporter"} + node_memory_Buffers{job="kubernetes-node-exporter"}) | |
/ | |
sum(node_memory_MemTotal{job="kubernetes-node-exporter"}) | |
record: ':node_memory_utilisation:' | |
- expr: | | |
sum(node_memory_MemFree{job="kubernetes-node-exporter"} + node_memory_Cached{job="kubernetes-node-exporter"} + node_memory_Buffers{job="kubernetes-node-exporter"}) | |
record: :node_memory_MemFreeCachedBuffers:sum | |
- expr: | | |
sum(node_memory_MemTotal{job="kubernetes-node-exporter"}) | |
record: :node_memory_MemTotal:sum | |
- expr: | | |
sum by (node) ( | |
(node_memory_MemFree{job="kubernetes-node-exporter"} + node_memory_Cached{job="kubernetes-node-exporter"} + node_memory_Buffers{job="kubernetes-node-exporter"}) | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info: | |
) | |
record: node:node_memory_bytes_available:sum | |
- expr: | | |
sum by (node) ( | |
node_memory_MemTotal{job="kubernetes-node-exporter"} | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info: | |
) | |
record: node:node_memory_bytes_total:sum | |
- expr: | | |
(node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) | |
/ | |
scalar(sum(node:node_memory_bytes_total:sum)) | |
record: node:node_memory_utilisation:ratio | |
- expr: | | |
1e3 * sum( | |
(rate(node_vmstat_pgpgin{job="kubernetes-node-exporter"}[1m]) | |
+ rate(node_vmstat_pgpgout{job="kubernetes-node-exporter"}[1m])) | |
) | |
record: :node_memory_swap_io_bytes:sum_rate | |
- expr: | | |
1 - | |
sum by (node) ( | |
(node_memory_MemFree{job="kubernetes-node-exporter"} + node_memory_Cached{job="kubernetes-node-exporter"} + node_memory_Buffers{job="kubernetes-node-exporter"}) | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info: | |
) | |
/ | |
sum by (node) ( | |
node_memory_MemTotal{job="kubernetes-node-exporter"} | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info: | |
) | |
record: 'node:node_memory_utilisation:' | |
- expr: | | |
1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum) | |
record: 'node:node_memory_utilisation_2:' | |
- expr: | | |
1e3 * sum by (node) ( | |
(rate(node_vmstat_pgpgin{job="kubernetes-node-exporter"}[1m]) | |
+ rate(node_vmstat_pgpgout{job="kubernetes-node-exporter"}[1m])) | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info: | |
) | |
record: node:node_memory_swap_io_bytes:sum_rate | |
- expr: | | |
avg(irate(node_disk_io_time_ms{job="kubernetes-node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3) | |
record: :node_disk_utilisation:avg_irate | |
- expr: | | |
avg by (node) ( | |
irate(node_disk_io_time_ms{job="kubernetes-node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3 | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info: | |
) | |
record: node:node_disk_utilisation:avg_irate | |
- expr: | | |
avg(irate(node_disk_io_time_weighted{job="kubernetes-node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3) | |
record: :node_disk_saturation:avg_irate | |
- expr: | | |
avg by (node) ( | |
irate(node_disk_io_time_weighted{job="kubernetes-node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3 | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info: | |
) | |
record: node:node_disk_saturation:avg_irate | |
- expr: | | |
max by (namespace, pod, device) ((node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"} | |
- node_filesystem_avail{fstype=~"ext[234]|btrfs|xfs|zfs"}) | |
/ node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"}) | |
record: 'node:node_filesystem_usage:' | |
- expr: | | |
max by (namespace, pod, device) (node_filesystem_avail{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"}) | |
record: 'node:node_filesystem_avail:' | |
- expr: | | |
sum(irate(node_network_receive_bytes{job="kubernetes-node-exporter",device="eth0"}[1m])) + | |
sum(irate(node_network_transmit_bytes{job="kubernetes-node-exporter",device="eth0"}[1m])) | |
record: :node_net_utilisation:sum_irate | |
- expr: | | |
sum by (node) ( | |
(irate(node_network_receive_bytes{job="kubernetes-node-exporter",device="eth0"}[1m]) + | |
irate(node_network_transmit_bytes{job="kubernetes-node-exporter",device="eth0"}[1m])) | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info: | |
) | |
record: node:node_net_utilisation:sum_irate | |
- expr: | | |
sum(irate(node_network_receive_drop{job="kubernetes-node-exporter",device="eth0"}[1m])) + | |
sum(irate(node_network_transmit_drop{job="kubernetes-node-exporter",device="eth0"}[1m])) | |
record: :node_net_saturation:sum_irate | |
- expr: | | |
sum by (node) ( | |
(irate(node_network_receive_drop{job="kubernetes-node-exporter",device="eth0"}[1m]) + | |
irate(node_network_transmit_drop{job="kubernetes-node-exporter",device="eth0"}[1m])) | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info: | |
) | |
record: node:node_net_saturation:sum_irate | |
- name: kube-prometheus-node-recording.rules | |
rules: | |
- expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance) | |
record: instance:node_cpu:rate:sum | |
- expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) | |
BY (instance) | |
record: instance:node_filesystem_usage:sum | |
- expr: sum(rate(node_network_receive_bytes[3m])) BY (instance) | |
record: instance:node_network_receive_bytes:rate:sum | |
- expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) | |
record: instance:node_network_transmit_bytes:rate:sum | |
- expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) | |
/ ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) | |
record: instance:node_cpu:ratio | |
- expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) | |
record: cluster:node_cpu:sum_rate5m | |
- expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu)) | |
record: cluster:node_cpu:ratio | |
- name: node_exporter-16-bcache | |
rules: | |
- expr: node_bcache_cache_read_races | |
record: node_bcache_cache_read_races_total | |
- name: node_exporter-16-buddyinfo | |
rules: | |
- expr: node_buddyinfo_blocks | |
record: node_buddyinfo_count | |
- name: node_exporter-16-stat | |
rules: | |
- expr: node_boot_time_seconds | |
record: node_boot_time | |
- expr: node_context_switches_total | |
record: node_context_switches | |
- expr: node_forks_total | |
record: node_forks | |
- expr: node_intr_total | |
record: node_intr | |
- name: node_exporter-16-cpu | |
rules: | |
- expr: label_replace(node_cpu_seconds_total, "cpu", "$1", "cpu", "cpu(.+)") | |
record: node_cpu | |
- name: node_exporter-16-diskstats | |
rules: | |
- expr: node_disk_read_bytes_total | |
record: node_disk_bytes_read | |
- expr: node_disk_written_bytes_total | |
record: node_disk_bytes_written | |
- expr: node_disk_io_time_seconds_total * 1000 | |
record: node_disk_io_time_ms | |
- expr: node_disk_io_time_weighted_seconds_total * 1000 | |
record: node_disk_io_time_weighted | |
- expr: node_disk_reads_completed_total | |
record: node_disk_reads_completed | |
- expr: node_disk_reads_merged_total | |
record: node_disk_reads_merged | |
- expr: node_disk_read_time_seconds_total * 1000 | |
record: node_disk_read_time_ms | |
- expr: node_disk_writes_completed_total | |
record: node_disk_writes_completed | |
- expr: node_disk_writes_merged_total | |
record: node_disk_writes_merged | |
- expr: node_disk_write_time_seconds_total * 1000 | |
record: node_disk_write_time_ms | |
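# Units note: the '* 1000' factors above convert the node_exporter 0.16 '*_seconds_total'
# disk timing metrics back to the milliseconds that the old '_ms' and 'weighted' series
# used (which is why the disk saturation rules above divide by 1e3). Illustrative only,
# assumed expressions: per-device I/O utilisation as a 0-1 fraction can be computed
# equivalently from either name:
#
#   rate(node_disk_io_time_ms[5m]) / 1000
#   rate(node_disk_io_time_seconds_total[5m])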
- name: node_exporter-16-filesystem | |
rules: | |
- expr: node_filesystem_free_bytes | |
record: node_filesystem_free | |
- expr: node_filesystem_avail_bytes | |
record: node_filesystem_avail | |
- expr: node_filesystem_size_bytes | |
record: node_filesystem_size | |
- name: node_exporter-16-infiniband | |
rules: | |
- expr: node_infiniband_port_data_received_bytes_total | |
record: node_infiniband_port_data_received_bytes | |
- expr: node_infiniband_port_data_transmitted_bytes_total | |
record: node_infiniband_port_data_transmitted_bytes | |
- name: node_exporter-16-interrupts | |
rules: | |
- expr: node_interrupts_total | |
record: node_interrupts | |
- name: node_exporter-16-memory | |
rules: | |
- expr: node_memory_Active_bytes | |
record: node_memory_Active | |
- expr: node_memory_Active_anon_bytes | |
record: node_memory_Active_anon | |
- expr: node_memory_Active_file_bytes | |
record: node_memory_Active_file | |
- expr: node_memory_AnonHugePages_bytes | |
record: node_memory_AnonHugePages | |
- expr: node_memory_AnonPages_bytes | |
record: node_memory_AnonPages | |
- expr: node_memory_Bounce_bytes | |
record: node_memory_Bounce | |
- expr: node_memory_Buffers_bytes | |
record: node_memory_Buffers | |
- expr: node_memory_Cached_bytes | |
record: node_memory_Cached | |
- expr: node_memory_CommitLimit_bytes | |
record: node_memory_CommitLimit | |
- expr: node_memory_Committed_AS_bytes | |
record: node_memory_Committed_AS | |
- expr: node_memory_DirectMap2M_bytes | |
record: node_memory_DirectMap2M | |
- expr: node_memory_DirectMap4k_bytes | |
record: node_memory_DirectMap4k | |
- expr: node_memory_Dirty_bytes | |
record: node_memory_Dirty | |
- expr: node_memory_HardwareCorrupted_bytes | |
record: node_memory_HardwareCorrupted | |
- expr: node_memory_Hugepagesize_bytes | |
record: node_memory_Hugepagesize | |
- expr: node_memory_Inactive_bytes | |
record: node_memory_Inactive | |
- expr: node_memory_Inactive_anon_bytes | |
record: node_memory_Inactive_anon | |
- expr: node_memory_Inactive_file_bytes | |
record: node_memory_Inactive_file | |
- expr: node_memory_KernelStack_bytes | |
record: node_memory_KernelStack | |
- expr: node_memory_Mapped_bytes | |
record: node_memory_Mapped | |
- expr: node_memory_MemAvailable_bytes | |
record: node_memory_MemAvailable | |
- expr: node_memory_MemFree_bytes | |
record: node_memory_MemFree | |
- expr: node_memory_MemTotal_bytes | |
record: node_memory_MemTotal | |
- expr: node_memory_Mlocked_bytes | |
record: node_memory_Mlocked | |
- expr: node_memory_NFS_Unstable_bytes | |
record: node_memory_NFS_Unstable | |
- expr: node_memory_PageTables_bytes | |
record: node_memory_PageTables | |
- expr: node_memory_Shmem_bytes | |
record: node_memory_Shmem | |
- expr: node_memory_Slab_bytes | |
record: node_memory_Slab | |
- expr: node_memory_SReclaimable_bytes | |
record: node_memory_SReclaimable | |
- expr: node_memory_SUnreclaim_bytes | |
record: node_memory_SUnreclaim | |
- expr: node_memory_SwapCached_bytes | |
record: node_memory_SwapCached | |
- expr: node_memory_SwapFree_bytes | |
record: node_memory_SwapFree | |
- expr: node_memory_SwapTotal_bytes | |
record: node_memory_SwapTotal | |
- expr: node_memory_Unevictable_bytes | |
record: node_memory_Unevictable | |
- expr: node_memory_VmallocChunk_bytes | |
record: node_memory_VmallocChunk | |
- expr: node_memory_VmallocTotal_bytes | |
record: node_memory_VmallocTotal | |
- expr: node_memory_VmallocUsed_bytes | |
record: node_memory_VmallocUsed | |
- expr: node_memory_Writeback_bytes | |
record: node_memory_Writeback | |
- expr: node_memory_WritebackTmp_bytes | |
record: node_memory_WritebackTmp | |
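# Illustrative only, assumed query: with the memory aliases above in place, a pre-0.16
# style expression such as
#
#   100 * node_memory_MemAvailable / node_memory_MemTotal
#
# keeps reporting the percentage of memory available on each node.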
- name: node_exporter-16-network | |
rules: | |
- expr: node_network_receive_bytes_total | |
record: node_network_receive_bytes | |
- expr: node_network_receive_compressed_total | |
record: node_network_receive_compressed | |
- expr: node_network_receive_drop_total | |
record: node_network_receive_drop | |
- expr: node_network_receive_errs_total | |
record: node_network_receive_errs | |
- expr: node_network_receive_fifo_total | |
record: node_network_receive_fifo | |
- expr: node_network_receive_frame_total | |
record: node_network_receive_frame | |
- expr: node_network_receive_multicast_total | |
record: node_network_receive_multicast | |
- expr: node_network_receive_packets_total | |
record: node_network_receive_packets | |
- expr: node_network_transmit_bytes_total | |
record: node_network_transmit_bytes | |
- expr: node_network_transmit_compressed_total | |
record: node_network_transmit_compressed | |
- expr: node_network_transmit_drop_total | |
record: node_network_transmit_drop | |
- expr: node_network_transmit_errs_total | |
record: node_network_transmit_errs | |
- expr: node_network_transmit_fifo_total | |
record: node_network_transmit_fifo | |
- expr: node_network_transmit_frame_total | |
record: node_network_transmit_frame | |
- expr: node_network_transmit_multicast_total | |
record: node_network_transmit_multicast | |
- expr: node_network_transmit_packets_total | |
record: node_network_transmit_packets | |
- name: node_exporter-16-nfs | |
rules: | |
- expr: node_nfs_connections_total | |
record: node_nfs_net_connections | |
- expr: node_nfs_packets_total | |
record: node_nfs_net_reads | |
- expr: label_replace(label_replace(node_nfs_requests_total, "proto", "$1", "version", | |
"(.+)"), "method", "$1", "procedure", "(.+)") | |
record: node_nfs_procedures | |
- expr: node_nfs_rpc_authentication_refreshes_total | |
record: node_nfs_rpc_authentication_refreshes | |
- expr: node_nfs_rpcs_total | |
record: node_nfs_rpc_operations | |
- expr: node_nfs_rpc_retransmissions_total | |
record: node_nfs_rpc_retransmissions | |
- name: node_exporter-16-textfile | |
rules: | |
- expr: node_textfile_mtime_seconds | |
record: node_textfile_mtime | |
- name: etcd3_alert2.rules | |
rules: | |
- record: instance:fd_utilization | |
expr: process_open_fds / process_max_fds |
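# Illustrative only, assumed alert: the 'instance:fd_utilization' ratio recorded above
# is commonly paired with a file-descriptor exhaustion alert along these lines; the
# 4-hour horizon and 10m duration are assumptions:
#
#  - alert: FdExhaustionClose
#    expr: |
#      predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
#    for: 10m
#    labels:
#      severity: warning
#    annotations:
#      description: '{{ $labels.job }} instance {{ $labels.instance }} is predicted to exhaust its file descriptors within the next 4 hours'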