Some Prometheus configuration we may find useful
groups:
- name: node-resources
rules:
- alert: HostCPUUtilisation
expr: 100 - (avg by(instance) (irate(node_cpu{mode="idle"}[5m])) * 100) > 70
for: 20m
labels:
severity: warning
annotations:
description: 'High CPU utilisation detected for instance {{ $labels.instance_id }} tagged as {{ $labels.instance_name_tag }}; current utilisation: {{ $value }}%'
summary: CPU Utilisation Alert
- alert: InstanceLoad1Saturation
expr: |
100 * instance:instance_cpu_saturation_load1: > 50
for: 15m
labels:
severity: warning
annotations:
description: 'Load1 Saturation > 50% for node {{ $labels.node }} / instance {{ $labels.instance }}, current saturation: {{ $value }}%'
- alert: InstanceLoad1Saturation
expr: |
100 * instance:instance_cpu_saturation_load1: > 70
for: 15m
labels:
severity: critical
annotations:
description: 'Load1 Saturation > 70% for node {{ $labels.node }} / instance {{ $labels.instance }}, current saturation: {{ $value }}%'
- alert: GlobalLoad1Saturation
expr: |
100 * :instance_cpu_saturation_load1: > 50
for: 15m
labels:
severity: warning
annotations:
description: 'Load1 Saturation > 50% for the full cluster, current saturation: {{ $value }}%'
- alert: GlobalLoad1Saturation
expr: |
100 * :instance_cpu_saturation_load1: > 70
for: 15m
labels:
severity: critical
annotations:
description: 'Load1 Saturation > 70% for the full cluster, current saturation: {{ $value }}%'
- name: kubernetes-absent
rules:
- alert: KubeAPIDown
annotations:
message: KubeAPI has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
expr: |
absent(up{job="kubernetes-apiservers"} == 1)
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsDown
annotations:
message: KubeStateMetrics has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown
expr: |
absent(up{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} == 1)
for: 15m
labels:
severity: critical
- alert: KubeletDown
annotations:
message: Kubelet has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
expr: |
absent(up{job="kubernetes-nodes-cadvisor"} == 1)
for: 15m
labels:
severity: critical
- alert: NodeExporterDown
annotations:
message: NodeExporter has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown
expr: |
absent(up{job="kubernetes-node-exporter"} == 1)
for: 15m
labels:
severity: critical
- alert: PrometheusDown
annotations:
message: Prometheus has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown
expr: |
absent(up{job="prometheus"} == 1)
for: 15m
labels:
severity: critical
- name: kubernetes-apps
rules:
- alert: KubePodCrashLooping
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
}}) is restarting {{ printf "%.2f" $value }} times / second.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
expr: |
rate(kube_pod_container_status_restarts_total{kubernetes_name="prometheus-kube-state-metrics"}[15m]) > 0
for: 1h
labels:
severity: critical
- alert: KubePodNotReady
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
state for longer than an hour.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
expr: |
sum by (namespace, pod) (kube_pod_status_phase{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics", phase=~"Pending|Unknown"}) > 0
for: 1h
labels:
severity: critical
- alert: KubeDeploymentGenerationMismatch
annotations:
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
}} does not match; this indicates that the Deployment has failed but has
not been rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
expr: |
kube_deployment_status_observed_generation{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"}
!=
kube_deployment_metadata_generation{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"}
for: 15m
labels:
severity: critical
- alert: KubeDeploymentReplicasMismatch
annotations:
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
matched the expected number of replicas for longer than an hour.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
expr: |
kube_deployment_spec_replicas{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"}
!=
kube_deployment_status_replicas_available{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"}
for: 1h
labels:
severity: critical
- alert: KubeStatefulSetReplicasMismatch
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
expr: |
kube_statefulset_status_replicas_ready{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"}
!=
kube_statefulset_status_replicas{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"}
for: 15m
labels:
severity: critical
- alert: KubeStatefulSetGenerationMismatch
annotations:
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
}} does not match; this indicates that the StatefulSet has failed but has
not been rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
expr: |
kube_statefulset_status_observed_generation{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"}
!=
kube_statefulset_metadata_generation{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"}
for: 15m
labels:
severity: critical
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
has not been rolled out.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
expr: |
max without (revision) (
kube_statefulset_status_current_revision{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"}
unless
kube_statefulset_status_update_revision{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"}
)
*
(
kube_statefulset_replicas{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"}
!=
kube_statefulset_status_replicas_updated{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"}
)
for: 15m
labels:
severity: critical
- alert: KubeDaemonSetRolloutStuck
annotations:
message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace
}}/{{ $labels.daemonset }} are scheduled and ready.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
expr: |
kube_daemonset_status_number_ready{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"}
/
kube_daemonset_status_desired_number_scheduled{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} * 100 < 100
for: 15m
labels:
severity: critical
- alert: KubeDaemonSetNotScheduled
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are not scheduled.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
expr: |
kube_daemonset_status_desired_number_scheduled{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"}
-
kube_daemonset_status_current_number_scheduled{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} > 0
for: 10m
labels:
severity: warning
- alert: KubeDaemonSetMisScheduled
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are running where they are not supposed to run.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
expr: |
kube_daemonset_status_number_misscheduled{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} > 0
for: 10m
labels:
severity: warning
- alert: KubeCronJobRunning
annotations:
message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking
more than 1h to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning
expr: |
time() - kube_cronjob_next_schedule_time{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} > 3600
for: 1h
labels:
severity: warning
- alert: KubeJobCompletion
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than
one hour to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
expr: |
kube_job_spec_completions{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} - kube_job_status_succeeded{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
- alert: KubeJobFailed
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
expr: |
kube_job_status_failed{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
- name: kubernetes-resources
rules:
- alert: KubeCPUOvercommit
annotations:
message: Cluster has overcommitted CPU resource requests for Pods and cannot
tolerate node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
expr: |
sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)
/
sum(node:node_num_cpu:sum)
>
(count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum)
for: 5m
labels:
severity: warning
- alert: KubeMemOvercommit
annotations:
message: Cluster has overcommitted memory resource requests for Pods and cannot
tolerate node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
expr: |
sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
/
sum(node_memory_MemTotal)
>
(count(node:node_num_cpu:sum)-1)
/
count(node:node_num_cpu:sum)
for: 5m
labels:
severity: warning
- alert: KubeCPUOvercommit
annotations:
message: Cluster has overcommitted CPU resource requests for Namespaces.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
expr: |
sum(kube_resourcequota{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics", type="hard", resource="requests.cpu"})
/
sum(node:node_num_cpu:sum)
> 1.5
for: 5m
labels:
severity: warning
- alert: KubeMemOvercommit
annotations:
message: Cluster has overcommitted memory resource requests for Namespaces.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
expr: |
sum(kube_resourcequota{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics", type="hard", resource="requests.memory"})
/
sum(node_memory_MemTotal{job="kubernetes-node-exporter"})
> 1.5
for: 5m
labels:
severity: warning
- alert: KubeQuotaExceeded
annotations:
message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value
}}% of its {{ $labels.resource }} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
expr: |
100 * kube_resourcequota{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics", type="hard"} > 0)
> 90
for: 15m
labels:
severity: warning
- name: kubernetes-storage
rules:
- alert: KubePersistentVolumeUsageCritical
annotations:
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is only {{ printf "%0.0f" $value
}}% free.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
expr: |
100 * kubelet_volume_stats_available_bytes{job="kubernetes-nodes-cadvisor"}
/
kubelet_volume_stats_capacity_bytes{job="kubernetes-nodes-cadvisor"}
< 3
for: 1m
labels:
severity: critical
- alert: KubePersistentVolumeFullInFourDays
annotations:
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is expected to fill up within four
days. Currently {{ $value }} bytes are available.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
expr: |
kubelet_volume_stats_available_bytes{job="kubernetes-nodes-cadvisor"} and predict_linear(kubelet_volume_stats_available_bytes{job="kubernetes-nodes-cadvisor"}[6h], 4 * 24 * 3600) < 0
for: 5m
labels:
severity: critical
- name: kubernetes-system
rules:
- alert: KubeNodeNotReady
annotations:
message: '{{ $labels.node }} has been unready for more than an hour.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
expr: |
kube_node_status_condition{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics",condition="Ready",status="true"} == 0
for: 1h
labels:
severity: warning
- alert: KubeVersionMismatch
annotations:
message: There are {{ $value }} different versions of Kubernetes components
running.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
expr: |
count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1
for: 1h
labels:
severity: warning
- alert: KubeClientErrors
annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
}}' is experiencing {{ printf "%0.0f" $value }}% errors.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
expr: |
(sum(rate(rest_client_requests_total{code!~"2..|404"}[5m])) by (instance, job)
/
sum(rate(rest_client_requests_total[5m])) by (instance, job))
* 100 > 1
for: 15m
labels:
severity: warning
- alert: KubeClientErrors
annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
}}' is experiencing {{ printf "%0.0f" $value }} errors / second.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
expr: |
sum(rate(ksm_scrape_error_total{job="kubernetes-service-endpoints",kubernetes_name="prometheus-kube-state-metrics"}[5m])) by (instance, job) > 0.1
for: 15m
labels:
severity: warning
- alert: KubeletTooManyPods
annotations:
message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close
to the limit of 110.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
expr: |
kubelet_running_pod_count{job="kubernetes-nodes-cadvisor"} > 110 * 0.9
for: 15m
labels:
severity: warning
- alert: KubeAPILatencyHigh
annotations:
message: The API server has a 99th percentile latency of {{ $value }} seconds
for {{ $labels.verb }} {{ $labels.resource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
expr: |
cluster_quantile:apiserver_request_latencies:histogram_quantile{job="kubernetes-apiservers",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
for: 10m
labels:
severity: warning
- alert: KubeAPILatencyHigh
annotations:
message: The API server has a 99th percentile latency of {{ $value }} seconds
for {{ $labels.verb }} {{ $labels.resource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
expr: |
cluster_quantile:apiserver_request_latencies:histogram_quantile{job="kubernetes-apiservers",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
for: 10m
labels:
severity: critical
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{ $value }}% of requests.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
expr: |
sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m])) without(instance, pod)
/
sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) without(instance, pod) * 100 > 10
for: 10m
labels:
severity: critical
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{ $value }}% of requests.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
expr: |
sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m])) without(instance, pod)
/
sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) without(instance, pod) * 100 > 5
for: 10m
labels:
severity: warning
- alert: KubeClientCertificateExpiration
annotations:
message: Kubernetes API certificate is expiring in less than 7 days.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
expr: |
histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kubernetes-apiservers"}[5m]))) < 604800
labels:
severity: warning
- alert: KubeClientCertificateExpiration
annotations:
message: Kubernetes API certificate is expiring in less than 24 hours.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
expr: |
histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kubernetes-apiservers"}[5m]))) < 86400
labels:
severity: critical
- name: alertmanager.rules
rules:
- alert: AlertmanagerConfigInconsistent
annotations:
message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}`
is out of sync.
expr: |
count_values("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas{job="{{ $operatorJob }}"}, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
for: 5m
labels:
severity: critical
- alert: AlertmanagerFailedReload
annotations:
message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
}}/{{ $labels.pod}}.
expr: |
alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}"} == 0
for: 10m
labels:
severity: warning
- name: general.rules
rules:
- alert: TargetDown
annotations:
message: '{{ $value }}% of the {{ $labels.job }} targets are down.'
expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
for: 10m
labels:
severity: warning
- alert: DeadMansSwitch
annotations:
message: This is a DeadMansSwitch meant to ensure that the entire alerting
pipeline is functional.
expr: vector(1)
labels:
severity: none
- name: kube-prometheus-node-alerting.rules
rules:
- alert: NodeDiskRunningFull
annotations:
message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace
}}/{{ $labels.pod }} will be full within the next 24 hours.
expr: |
(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)
for: 30m
labels:
severity: warning
- alert: NodeDiskRunningFull
annotations:
message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace
}}/{{ $labels.pod }} will be full within the next 2 hours.
expr: |
(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)
for: 10m
labels:
severity: critical
- name: prometheus.rules
rules:
- alert: PrometheusConfigReloadFailed
annotations:
description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
summary: Reloading Prometheus' configuration failed
expr: |
prometheus_config_last_reload_successful{job="prometheus"} == 0
for: 10m
labels:
severity: warning
- alert: PrometheusNotificationQueueRunningFull
annotations:
description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
$labels.pod}}
summary: Prometheus' alert notification queue is running full
expr: |
predict_linear(prometheus_notifications_queue_length{job="prometheus"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus"}
for: 10m
labels:
severity: warning
- alert: PrometheusErrorSendingAlerts
annotations:
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
$labels.pod}} to Alertmanager {{$labels.alertmanager}}
summary: Errors while sending alerts from Prometheus
expr: |
rate(prometheus_notifications_errors_total{job="prometheus"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus"}[5m]) > 0.01
for: 10m
labels:
severity: warning
- alert: PrometheusErrorSendingAlerts
annotations:
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
$labels.pod}} to Alertmanager {{$labels.alertmanager}}
summary: Errors while sending alerts from Prometheus
expr: |
rate(prometheus_notifications_errors_total{job="prometheus"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus"}[5m]) > 0.03
for: 10m
labels:
severity: critical
- alert: PrometheusNotConnectedToAlertmanagers
annotations:
description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
to any Alertmanagers
summary: Prometheus is not connected to any Alertmanagers
expr: |
prometheus_notifications_alertmanagers_discovered{job="prometheus"} < 1
for: 10m
labels:
severity: warning
- alert: PrometheusTSDBReloadsFailing
annotations:
description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
reload failures over the last two hours.'
summary: Prometheus has issues reloading data blocks from disk
expr: |
increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[2h]) > 0
for: 12h
labels:
severity: warning
- alert: PrometheusTSDBCompactionsFailing
annotations:
description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
compaction failures over the last two hours.'
summary: Prometheus has issues compacting sample blocks
expr: |
increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[2h]) > 0
for: 12h
labels:
severity: warning
- alert: PrometheusTSDBWALCorruptions
annotations:
description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
log (WAL).'
summary: Prometheus write-ahead log is corrupted
expr: |
tsdb_wal_corruptions_total{job="prometheus"} > 0
for: 4h
labels:
severity: warning
- alert: PrometheusNotIngestingSamples
annotations:
description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting
samples.
summary: Prometheus isn't ingesting samples
expr: |
rate(prometheus_tsdb_head_samples_appended_total{job="prometheus"}[5m]) <= 0
for: 10m
labels:
severity: warning
- alert: PrometheusTargetScrapesDuplicate
annotations:
description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected
due to duplicate timestamps but different values'
summary: Prometheus has many samples rejected
expr: |
increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0
for: 10m
labels:
severity: warning
- name: prometheus-operator
rules:
- alert: PrometheusOperatorAlertmanagerReconcileErrors
annotations:
message: Errors while reconciling Alertmanager in {{ $labels.namespace }}
Namespace.
expr: |
rate(prometheus_operator_reconcile_errors_total{job="{{ $operatorJob }}",controller="alertmanager"}[5m]) > 0.1
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorPrometheusReconcileErrors
annotations:
message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
expr: |
rate(prometheus_operator_reconcile_errors_total{job="{{ $operatorJob }}",controller="prometheus"}[5m]) > 0.1
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorNodeLookupErrors
annotations:
message: Errors while looking up node addresses in {{ $labels.namespace }} Namespace.
expr: |
rate(prometheus_operator_node_address_lookup_errors_total{job="{{ $operatorJob }}",controller="prometheus"}[5m]) > 0.1
for: 10m
labels:
severity: warning
- name: etcd3_alert.rules
rules:
- alert: InsufficientMembers
expr: count(up{job="kube-etcd"} == 0) > (count(up{job="kube-etcd"}) / 2 - 1)
for: 3m
labels:
severity: critical
annotations:
description: If one more etcd member goes down, the cluster will be unavailable
summary: etcd cluster insufficient members
- alert: NoLeader
expr: etcd_server_has_leader{job="kube-etcd"} == 0
for: 1m
labels:
severity: critical
annotations:
description: etcd member {{ $labels.instance }} has no leader
summary: etcd member has no leader
- alert: HighNumberOfLeaderChanges
expr: increase(etcd_server_leader_changes_seen_total{job="kube-etcd"}[1h]) > 3
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
changes within the last hour
summary: a high number of leader changes within the etcd cluster are happening
- alert: HighNumberOfFailedGRPCRequests
expr: 100 * (sum(rate(grpc_server_handled_total{grpc_code!="OK",job="kube-etcd"}[5m])) BY (grpc_service, grpc_method)
/ sum(rate(grpc_server_handled_total{job="kube-etcd"}[5m])) BY (grpc_service, grpc_method)) > 1
for: 10m
labels:
severity: warning
annotations:
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
on etcd instance {{ $labels.instance }}'
summary: a high number of gRPC requests are failing
- alert: HighNumberOfFailedGRPCRequests
expr: 100 * (sum(rate(grpc_server_handled_total{grpc_code!="OK",job="kube-etcd"}[5m])) BY (grpc_service, grpc_method)
/ sum(rate(grpc_server_handled_total{job="kube-etcd"}[5m])) BY (grpc_service, grpc_method)) > 5
for: 5m
labels:
severity: critical
annotations:
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
on etcd instance {{ $labels.instance }}'
summary: a high number of gRPC requests are failing
- alert: GRPCRequestsSlow
expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="kube-etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
> 0.15
for: 10m
labels:
severity: critical
annotations:
description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
}} are slow
summary: slow gRPC requests
- alert: FdExhaustionClose
expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
for: 10m
labels:
severity: warning
annotations:
description: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust
its file descriptors soon'
summary: file descriptors soon exhausted
- alert: FdExhaustionClose
expr: predict_linear(instance:fd_utilization[10m], 3600) > 1
for: 10m
labels:
severity: critical
annotations:
description: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust
its file descriptors soon'
summary: file descriptors soon exhausted
- alert: EtcdMemberCommunicationSlow
expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
> 0.15
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} member communication with
{{ $labels.To }} is slow
summary: etcd member communication is slow
- alert: HighNumberOfFailedProposals
expr: increase(etcd_server_proposals_failed_total{job="kube-etcd"}[1h]) > 5
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
failures within the last hour
summary: a high number of proposals within the etcd cluster are failing
- alert: HighFsyncDurations
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
> 0.5
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} fsync durations are high
summary: high fsync durations
- alert: HighCommitDurations
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
> 0.25
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} commit durations are high
summary: high commit durations
rule_files:
- /etc/config/rules
- /etc/config/alerts
scrape_configs:
- job_name: prometheus
static_configs:
- targets:
- localhost:9090
# A scrape configuration for running Prometheus on a Kubernetes cluster.
# This uses separate scrape configs for cluster components (e.g. API server, node)
# and services to allow each to use different authentication configs.
#
# Kubernetes labels will be added as Prometheus labels on metrics via the
# `labelmap` relabeling action.
# Scrape config for API servers.
#
# Kubernetes exposes API servers as endpoints to the default/kubernetes
# service so this uses `endpoints` role and uses relabelling to only keep
# the endpoints associated with the default/kubernetes service using the
# default named port `https`. This works for single API server deployments as
# well as HA API server deployments.
- job_name: 'kubernetes-apiservers'
kubernetes_sd_configs:
- role: endpoints
# Default to scraping over https. If required, just disable this or change to
# `http`.
scheme: https
# This TLS & bearer token file config is used to connect to the actual scrape
# endpoints for cluster components. This is separate to discovery auth
# configuration because discovery & scraping are two separate concerns in
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
# the cluster. Otherwise, more config options have to be provided within the
# <kubernetes_sd_config>.
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
# If your node certificates are self-signed or use a different CA to the
# master CA, then disable certificate verification below. Note that
# certificate verification is an integral part of a secure infrastructure
# so this should only be disabled in a controlled environment. Certificate
# verification is disabled below via `insecure_skip_verify: true`.
#
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# Keep only the default/kubernetes service endpoints for the https port. This
# will add targets for each API server which Kubernetes adds an endpoint to
# the default/kubernetes service.
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
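# A minimal sketch of how the keep rule above resolves for the default/kubernetes
# service (the concrete values are illustrative, not taken from a real cluster):
#
#   __meta_kubernetes_namespace          = default
#   __meta_kubernetes_service_name       = kubernetes
#   __meta_kubernetes_endpoint_port_name = https
#
# The source labels are joined with the default ";" separator into
# "default;kubernetes;https", which matches the regex, so only the API server
# endpoints are kept. Once scraped, target health can be checked with the query
# used by the KubeAPIDown alert above: absent(up{job="kubernetes-apiservers"} == 1)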
- job_name: 'kubernetes-nodes'
# Default to scraping over https. If required, just disable this or change to
# `http`.
scheme: https
# This TLS & bearer token file config is used to connect to the actual scrape
# endpoints for cluster components. This is separate to discovery auth
# configuration because discovery & scraping are two separate concerns in
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
# the cluster. Otherwise, more config options have to be provided within the
# <kubernetes_sd_config>.
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
# If your node certificates are self-signed or use a different CA to the
# master CA, then disable certificate verification below. Note that
# certificate verification is an integral part of a secure infrastructure
# so this should only be disabled in a controlled environment. Certificate
# verification is disabled below via `insecure_skip_verify: true`.
#
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics
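# With the relabeling above, each node target is scraped through the API server
# proxy; assuming a node named "node-1" (hypothetical), the effective scrape URL
# would look like:
#
#   https://kubernetes.default.svc:443/api/v1/nodes/node-1/proxy/metrics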
- job_name: 'kubernetes-nodes-cadvisor'
# Default to scraping over https. If required, just disable this or change to
# `http`.
scheme: https
# This TLS & bearer token file config is used to connect to the actual scrape
# endpoints for cluster components. This is separate to discovery auth
# configuration because discovery & scraping are two separate concerns in
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
# the cluster. Otherwise, more config options have to be provided within the
# <kubernetes_sd_config>.
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
# If your node certificates are self-signed or use a different CA to the
# master CA, then disable certificate verification below. Note that
# certificate verification is an integral part of a secure infrastructure
# so this should only be disabled in a controlled environment. Certificate
# verification is disabled below via `insecure_skip_verify: true`.
#
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
# This configuration will work only on kubelet 1.7.3+,
# as the scrape endpoints for cAdvisor have changed.
# If you are using an older version, change the replacement to
# replacement: /api/v1/nodes/${1}:4194/proxy/metrics
# More info here: https://github.com/coreos/prometheus-operator/issues/633
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
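# Same proxy pattern as above, but pointing at the kubelet's cAdvisor endpoint;
# for the hypothetical node "node-1" the effective scrape URL would be:
#
#   https://kubernetes.default.svc:443/api/v1/nodes/node-1/proxy/metrics/cadvisor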
# Scrape config for service endpoints.
#
# The relabeling allows the actual service scrape endpoint to be configured
# via the following annotations:
#
# * `prometheus.io/scrape`: Only scrape services that have a value of `true`
# * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
# to set this to `https` & most likely set the `tls_config` of the scrape config.
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
# * `prometheus.io/port`: If the metrics are exposed on a different port to the
# service then set this appropriately.
- job_name: 'kubernetes-service-endpoints'
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_service_label_component]
action: drop
regex: node-exporter
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
action: replace
target_label: __scheme__
regex: (https?)
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: kubernetes_name
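# A hedged example of a Service this job would pick up (name, namespace and port
# are made up for illustration): the prometheus.io/* annotations described above
# enable scraping and override the port and path.
#
#   apiVersion: v1
#   kind: Service
#   metadata:
#     name: example-app          # hypothetical
#     namespace: default
#     annotations:
#       prometheus.io/scrape: "true"
#       prometheus.io/port: "8080"
#       prometheus.io/path: "/metrics"
#   spec:
#     ports:
#     - name: http
#       port: 8080
#     selector:
#       app: example-app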
- job_name: 'kubernetes-node-exporter'
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_service_label_component]
action: keep
regex: node-exporter
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
action: replace
target_label: __scheme__
regex: (https?)
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: namespace
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: service
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: pod
- source_labels: [__meta_kubernetes_pod_node_name]
action: replace
target_label: node
- job_name: 'prometheus-pushgateway'
honor_labels: true
kubernetes_sd_configs:
- role: service
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
action: keep
regex: pushgateway
# Example scrape config for probing services via the Blackbox Exporter.
#
# The relabeling allows the actual service scrape endpoint to be configured
# via the following annotations:
#
# * `prometheus.io/probe`: Only probe services that have a value of `true`
- job_name: 'kubernetes-services'
metrics_path: /probe
params:
module: [http_2xx]
kubernetes_sd_configs:
- role: service
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
action: keep
regex: true
- source_labels: [__address__]
target_label: __param_target
- target_label: __address__
replacement: blackbox
- source_labels: [__param_target]
target_label: instance
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
target_label: kubernetes_name
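# Sketch of what the job above produces for a Service annotated with
# prometheus.io/probe: "true" (service name and address are illustrative):
# Prometheus sends the request to the blackbox exporter at the address
# "blackbox", passing the discovered service address as the target parameter, e.g.
#
#   http://blackbox/probe?module=http_2xx&target=example-app.default.svc:80
#
# and the resulting series keep the original service address in the `instance` label.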
# Example scrape config for pods
#
# The relabeling allows the actual pod scrape endpoint to be configured via the
# following annotations:
#
# * `prometheus.io/scrape`: Only scrape pods that have a value of `true`
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
# * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`.
- job_name: 'kubernetes-pods'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: kubernetes_pod_name
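# Hedged example of Pod annotations this job reacts to (pod name and port are
# made up for illustration):
#
#   metadata:
#     name: example-pod          # hypothetical
#     annotations:
#       prometheus.io/scrape: "true"
#       prometheus.io/path: "/metrics"
#       prometheus.io/port: "9102"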
groups:
- name: node-exporter.rules
rules:
# The count of CPUs per node, useful for getting CPU time as a percent of total.
- record: instance:node_cpus:count
expr: count(node_cpu_seconds_total{mode="idle"}) without (cpu,mode)
# CPU in use by CPU.
- record: instance_cpu:node_cpu_seconds_not_idle:rate5m
expr: sum(rate(node_cpu_seconds_total{mode!="idle"}[5m])) without (mode)
# CPU in use by mode.
- record: instance_mode:node_cpu_seconds:rate5m
expr: sum(rate(node_cpu_seconds_total[5m])) without (cpu)
# CPU in use ratio.
- record: instance:node_cpu_utilization:ratio
expr: sum(instance_mode:node_cpu_seconds:rate5m{mode!="idle"}) without (mode) / instance:node_cpus:count
- expr: |
sum(node_load1{job="kubernetes-node-exporter"})
/
sum(instance:node_cpus:count)
record: ':instance_cpu_saturation_load1:'
- expr: |
node_load1
/
instance:node_cpus:count
record: 'instance:instance_cpu_saturation_load1:'
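# Usage sketch for the recording rules above (the queries are illustrative):
# per-instance CPU usage as a percentage can be read directly from
#   100 * instance:node_cpu_utilization:ratio
# and the load1 saturation series feed the InstanceLoad1Saturation and
# GlobalLoad1Saturation alerts in the node-resources group, e.g.
#   100 * instance:instance_cpu_saturation_load1: > 50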
- name: k8s.rules
rules:
- expr: |
sum(rate(container_cpu_usage_seconds_total{job="kubernetes-nodes-cadvisor", image!="", container_name!=""}[5m])) by (namespace)
record: namespace:container_cpu_usage_seconds_total:sum_rate
- expr: |
sum by (namespace, pod_name, container_name) (
rate(container_cpu_usage_seconds_total{job="kubernetes-nodes-cadvisor", image!="", container_name!=""}[5m])
)
record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate
- expr: |
sum(container_memory_usage_bytes{job="kubernetes-nodes-cadvisor", image!="", container_name!=""}) by (namespace)
record: namespace:container_memory_usage_bytes:sum
- expr: |
sum by (namespace, label_name) (
sum(rate(container_cpu_usage_seconds_total{job="kubernetes-nodes-cadvisor", image!="", container_name!=""}[5m])) by (namespace, pod_name)
* on (namespace, pod_name) group_left(label_name)
label_replace(kube_pod_labels{}, "pod_name", "$1", "pod", "(.*)")
)
record: namespace_name:container_cpu_usage_seconds_total:sum_rate
- expr: |
sum by (namespace, label_name) (
sum(container_memory_usage_bytes{job="kubernetes-nodes-cadvisor",image!="", container_name!=""}) by (pod_name, namespace)
* on (namespace, pod_name) group_left(label_name)
label_replace(kube_pod_labels{}, "pod_name", "$1", "pod", "(.*)")
)
record: namespace_name:container_memory_usage_bytes:sum
- expr: |
sum by (namespace, label_name) (
sum(kube_pod_container_resource_requests_memory_bytes{}) by (namespace, pod)
* on (namespace, pod) group_left(label_name)
label_replace(kube_pod_labels{}, "pod_name", "$1", "pod", "(.*)")
)
record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum
- expr: |
sum by (namespace, label_name) (
sum(kube_pod_container_resource_requests_cpu_cores{} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod)
* on (namespace, pod) group_left(label_name)
label_replace(kube_pod_labels{}, "pod_name", "$1", "pod", "(.*)")
)
record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
- name: kube-apiserver.rules
rules:
- expr: |
histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="kubernetes-apiservers"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.99"
record: cluster_quantile:apiserver_request_latencies:histogram_quantile
- expr: |
histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="kubernetes-apiservers"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.9"
record: cluster_quantile:apiserver_request_latencies:histogram_quantile
- expr: |
histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="kubernetes-apiservers"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.5"
record: cluster_quantile:apiserver_request_latencies:histogram_quantile
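# The three quantile rules above convert apiserver_request_latencies_bucket
# (exposed in microseconds by the API server) to seconds via the division by
# 1e+06; the KubeAPILatencyHigh alerts in kubernetes-system compare the 0.99
# quantile against 1s (warning) and 4s (critical).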
- name: node.rules
rules:
- expr: sum(min(kube_pod_info) by (node))
record: ':kube_pod_info_node_count:'
- expr: |
max(label_replace(kube_pod_info{}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
record: 'node_namespace_pod:kube_pod_info:'
- expr: |
count by (node) (sum by (node, cpu) (
node_cpu{job="kubernetes-node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
))
record: node:node_num_cpu:sum
- expr: |
1 - avg(rate(node_cpu{job="kubernetes-node-exporter",mode="idle"}[1m]))
record: :node_cpu_utilisation:avg1m
- expr: |
1 - avg by (node) (
rate(node_cpu{job="kubernetes-node-exporter",mode="idle"}[1m])
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:)
record: node:node_cpu_utilisation:avg1m
- expr: |
sum(node_load1{job="kubernetes-node-exporter"})
/
sum(node:node_num_cpu:sum)
record: ':node_cpu_saturation_load1:'
- expr: |
sum by (node) (
node_load1{job="kubernetes-node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
/
node:node_num_cpu:sum
record: 'node:node_cpu_saturation_load1:'
- expr: |
1 -
sum(node_memory_MemFree{job="kubernetes-node-exporter"} + node_memory_Cached{job="kubernetes-node-exporter"} + node_memory_Buffers{job="kubernetes-node-exporter"})
/
sum(node_memory_MemTotal{job="kubernetes-node-exporter"})
record: ':node_memory_utilisation:'
- expr: |
sum(node_memory_MemFree{job="kubernetes-node-exporter"} + node_memory_Cached{job="kubernetes-node-exporter"} + node_memory_Buffers{job="kubernetes-node-exporter"})
record: :node_memory_MemFreeCachedBuffers:sum
- expr: |
sum(node_memory_MemTotal{job="kubernetes-node-exporter"})
record: :node_memory_MemTotal:sum
- expr: |
sum by (node) (
(node_memory_MemFree{job="kubernetes-node-exporter"} + node_memory_Cached{job="kubernetes-node-exporter"} + node_memory_Buffers{job="kubernetes-node-exporter"})
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_memory_bytes_available:sum
- expr: |
sum by (node) (
node_memory_MemTotal{job="kubernetes-node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_memory_bytes_total:sum
- expr: |
(node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
/
scalar(sum(node:node_memory_bytes_total:sum))
record: node:node_memory_utilisation:ratio
- expr: |
1e3 * sum(
(rate(node_vmstat_pgpgin{job="kubernetes-node-exporter"}[1m])
+ rate(node_vmstat_pgpgout{job="kubernetes-node-exporter"}[1m]))
)
record: :node_memory_swap_io_bytes:sum_rate
- expr: |
1 -
sum by (node) (
(node_memory_MemFree{job="kubernetes-node-exporter"} + node_memory_Cached{job="kubernetes-node-exporter"} + node_memory_Buffers{job="kubernetes-node-exporter"})
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
/
sum by (node) (
node_memory_MemTotal{job="kubernetes-node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: 'node:node_memory_utilisation:'
- expr: |
1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
record: 'node:node_memory_utilisation_2:'
- expr: |
1e3 * sum by (node) (
(rate(node_vmstat_pgpgin{job="kubernetes-node-exporter"}[1m])
+ rate(node_vmstat_pgpgout{job="kubernetes-node-exporter"}[1m]))
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_memory_swap_io_bytes:sum_rate
- expr: |
avg(irate(node_disk_io_time_ms{job="kubernetes-node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3)
record: :node_disk_utilisation:avg_irate
- expr: |
avg by (node) (
irate(node_disk_io_time_ms{job="kubernetes-node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_disk_utilisation:avg_irate
- expr: |
avg(irate(node_disk_io_time_weighted{job="kubernetes-node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3)
record: :node_disk_saturation:avg_irate
- expr: |
avg by (node) (
irate(node_disk_io_time_weighted{job="kubernetes-node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_disk_saturation:avg_irate
- expr: |
max by (namespace, pod, device) ((node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"}
- node_filesystem_avail{fstype=~"ext[234]|btrfs|xfs|zfs"})
/ node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"})
record: 'node:node_filesystem_usage:'
- expr: |
max by (namespace, pod, device) (node_filesystem_avail{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"})
record: 'node:node_filesystem_avail:'
- expr: |
sum(irate(node_network_receive_bytes{job="kubernetes-node-exporter",device="eth0"}[1m])) +
sum(irate(node_network_transmit_bytes{job="kubernetes-node-exporter",device="eth0"}[1m]))
record: :node_net_utilisation:sum_irate
- expr: |
sum by (node) (
(irate(node_network_receive_bytes{job="kubernetes-node-exporter",device="eth0"}[1m]) +
irate(node_network_transmit_bytes{job="kubernetes-node-exporter",device="eth0"}[1m]))
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_net_utilisation:sum_irate
- expr: |
sum(irate(node_network_receive_drop{job="kubernetes-node-exporter",device="eth0"}[1m])) +
sum(irate(node_network_transmit_drop{job="kubernetes-node-exporter",device="eth0"}[1m]))
record: :node_net_saturation:sum_irate
- expr: |
sum by (node) (
(irate(node_network_receive_drop{job="kubernetes-node-exporter",device="eth0"}[1m]) +
irate(node_network_transmit_drop{job="kubernetes-node-exporter",device="eth0"}[1m]))
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_net_saturation:sum_irate
- name: kube-prometheus-node-recording.rules
rules:
- expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance)
record: instance:node_cpu:rate:sum
- expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
BY (instance)
record: instance:node_filesystem_usage:sum
- expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
record: instance:node_network_receive_bytes:rate:sum
- expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
record: instance:node_network_transmit_bytes:rate:sum
- expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode)
/ ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
record: instance:node_cpu:ratio
- expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m]))
record: cluster:node_cpu:sum_rate5m
- expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu))
record: cluster:node_cpu:ratio
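# The node_exporter-16-* groups below are the usual compatibility rules that
# re-record metrics exposed by node_exporter >= 0.16 under their pre-0.16
# names (for example node_cpu_seconds_total -> node_cpu), so the rules and
# alerts above that still use the old names keep working.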
- name: node_exporter-16-bcache
rules:
- expr: node_bcache_cache_read_races
record: node_bcache_cache_read_races_total
- name: node_exporter-16-buddyinfo
rules:
- expr: node_buddyinfo_blocks
record: node_buddyinfo_count
- name: node_exporter-16-stat
rules:
- expr: node_boot_time_seconds
record: node_boot_time
- expr: node_context_switches_total
record: node_context_switches
- expr: node_forks_total
record: node_forks
- expr: node_intr_total
record: node_intr
- name: node_exporter-16-cpu
rules:
- expr: label_replace(node_cpu_seconds_total, "cpu", "$1", "cpu", "cpu(.+)")
record: node_cpu
- name: node_exporter-16-diskstats
rules:
- expr: node_disk_read_bytes_total
record: node_disk_bytes_read
- expr: node_disk_written_bytes_total
record: node_disk_bytes_written
- expr: node_disk_io_time_seconds_total * 1000
record: node_disk_io_time_ms
- expr: node_disk_io_time_weighted_seconds_total
record: node_disk_io_time_weighted
- expr: node_disk_reads_completed_total
record: node_disk_reads_completed
- expr: node_disk_reads_merged_total
record: node_disk_reads_merged
- expr: node_disk_read_time_seconds_total * 1000
record: node_disk_read_time_ms
- expr: node_disk_writes_completed_total
record: node_disk_writes_completed
- expr: node_disk_writes_merged_total
record: node_disk_writes_merged
- expr: node_disk_write_time_seconds_total * 1000
record: node_disk_write_time_ms
- name: node_exporter-16-filesystem
rules:
- expr: node_filesystem_free_bytes
record: node_filesystem_free
- expr: node_filesystem_avail_bytes
record: node_filesystem_avail
- expr: node_filesystem_size_bytes
record: node_filesystem_size
- name: node_exporter-16-infiniband
rules:
- expr: node_infiniband_port_data_received_bytes_total
record: node_infiniband_port_data_received_bytes
- expr: node_infiniband_port_data_transmitted_bytes_total
record: node_infiniband_port_data_transmitted_bytes
- name: node_exporter-16-interrupts
rules:
- expr: node_interrupts_total
record: node_interrupts
- name: node_exporter-16-memory
rules:
- expr: node_memory_Active_bytes
record: node_memory_Active
- expr: node_memory_Active_anon_bytes
record: node_memory_Active_anon
- expr: node_memory_Active_file_bytes
record: node_memory_Active_file
- expr: node_memory_AnonHugePages_bytes
record: node_memory_AnonHugePages
- expr: node_memory_AnonPages_bytes
record: node_memory_AnonPages
- expr: node_memory_Bounce_bytes
record: node_memory_Bounce
- expr: node_memory_Buffers_bytes
record: node_memory_Buffers
- expr: node_memory_Cached_bytes
record: node_memory_Cached
- expr: node_memory_CommitLimit_bytes
record: node_memory_CommitLimit
- expr: node_memory_Committed_AS_bytes
record: node_memory_Committed_AS
- expr: node_memory_DirectMap2M_bytes
record: node_memory_DirectMap2M
- expr: node_memory_DirectMap4k_bytes
record: node_memory_DirectMap4k
- expr: node_memory_Dirty_bytes
record: node_memory_Dirty
- expr: node_memory_HardwareCorrupted_bytes
record: node_memory_HardwareCorrupted
- expr: node_memory_Hugepagesize_bytes
record: node_memory_Hugepagesize
- expr: node_memory_Inactive_bytes
record: node_memory_Inactive
- expr: node_memory_Inactive_anon_bytes
record: node_memory_Inactive_anon
- expr: node_memory_Inactive_file_bytes
record: node_memory_Inactive_file
- expr: node_memory_KernelStack_bytes
record: node_memory_KernelStack
- expr: node_memory_Mapped_bytes
record: node_memory_Mapped
- expr: node_memory_MemAvailable_bytes
record: node_memory_MemAvailable
- expr: node_memory_MemFree_bytes
record: node_memory_MemFree
- expr: node_memory_MemTotal_bytes
record: node_memory_MemTotal
- expr: node_memory_Mlocked_bytes
record: node_memory_Mlocked
- expr: node_memory_NFS_Unstable_bytes
record: node_memory_NFS_Unstable
- expr: node_memory_PageTables_bytes
record: node_memory_PageTables
- expr: node_memory_Shmem_bytes
record: node_memory_Shmem
- expr: node_memory_Slab_bytes
record: node_memory_Slab
- expr: node_memory_SReclaimable_bytes
record: node_memory_SReclaimable
- expr: node_memory_SUnreclaim_bytes
record: node_memory_SUnreclaim
- expr: node_memory_SwapCached_bytes
record: node_memory_SwapCached
- expr: node_memory_SwapFree_bytes
record: node_memory_SwapFree
- expr: node_memory_SwapTotal_bytes
record: node_memory_SwapTotal
- expr: node_memory_Unevictable_bytes
record: node_memory_Unevictable
- expr: node_memory_VmallocChunk_bytes
record: node_memory_VmallocChunk
- expr: node_memory_VmallocTotal_bytes
record: node_memory_VmallocTotal
- expr: node_memory_VmallocUsed_bytes
record: node_memory_VmallocUsed
- expr: node_memory_Writeback_bytes
record: node_memory_Writeback
- expr: node_memory_WritebackTmp_bytes
record: node_memory_WritebackTmp
- name: node_exporter-16-network
rules:
- expr: node_network_receive_bytes_total
record: node_network_receive_bytes
- expr: node_network_receive_compressed_total
record: node_network_receive_compressed
- expr: node_network_receive_drop_total
record: node_network_receive_drop
- expr: node_network_receive_errs_total
record: node_network_receive_errs
- expr: node_network_receive_fifo_total
record: node_network_receive_fifo
- expr: node_network_receive_frame_total
record: node_network_receive_frame
- expr: node_network_receive_multicast_total
record: node_network_receive_multicast
- expr: node_network_receive_packets_total
record: node_network_receive_packets
- expr: node_network_transmit_bytes_total
record: node_network_transmit_bytes
- expr: node_network_transmit_compressed_total
record: node_network_transmit_compressed
- expr: node_network_transmit_drop_total
record: node_network_transmit_drop
- expr: node_network_transmit_errs_total
record: node_network_transmit_errs
- expr: node_network_transmit_fifo_total
record: node_network_transmit_fifo
- expr: node_network_transmit_frame_total
record: node_network_transmit_frame
- expr: node_network_transmit_multicast_total
record: node_network_transmit_multicast
- expr: node_network_transmit_packets_total
record: node_network_transmit_packets
- name: node_exporter-16-nfs
rules:
- expr: node_nfs_connections_total
record: node_nfs_net_connections
- expr: node_nfs_packets_total
record: node_nfs_net_reads
- expr: label_replace(label_replace(node_nfs_requests_total, "proto", "$1", "version",
"(.+)"), "method", "$1", "procedure", "(.+)")
record: node_nfs_procedures
- expr: node_nfs_rpc_authentication_refreshes_total
record: node_nfs_rpc_authentication_refreshes
- expr: node_nfs_rpcs_total
record: node_nfs_rpc_operations
- expr: node_nfs_rpc_retransmissions_total
record: node_nfs_rpc_retransmissions
- name: node_exporter-16-textfile
rules:
- expr: node_textfile_mtime_seconds
record: node_textfile_mtime
- name: etcd3_alert2.rules
rules:
- record: instance:fd_utilization
expr: process_open_fds / process_max_fds
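# instance:fd_utilization is the ratio of open to maximum file descriptors per
# process; the FdExhaustionClose alerts in etcd3_alert.rules apply
# predict_linear to it to warn before descriptors are exhausted.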