Prometheus rules for v0.3.2 terraform provider
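The first document below contains alerting rules, the second recording rules. A minimal sketch of how they might be loaded and validated, assuming the two documents are saved as alerts.yaml and recording.yaml (hypothetical file names, not part of the gist):

    # prometheus.yml fragment: load both rule files
    rule_files:
      - /etc/prometheus/rules/alerts.yaml      # alerting rules (first document below)
      - /etc/prometheus/rules/recording.yaml   # recording rules (second document below)

Syntax can be checked before reloading with `promtool check rules /etc/prometheus/rules/*.yaml`. If the rules are managed through the prometheus-operator instead (several groups here scrape job="prometheus-operator"), the same groups would typically be wrapped in PrometheusRule resources.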
groups: | |
- name: alertmanager.rules | |
rules: | |
- alert: AlertmanagerFailedReload | |
annotations: | |
description: Configuration has failed to load for {{$labels.instance}}. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-alertmanagerfailedreload | |
summary: Reloading an Alertmanager configuration has failed. | |
expr: | | |
# Without max_over_time, failed scrapes could create false negatives, see | |
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. | |
max_over_time(alertmanager_config_last_reload_successful{job="alertmanager"}[5m]) == 0 | |
for: 10m | |
labels: | |
severity: critical | |
- alert: AlertmanagerMembersInconsistent | |
annotations: | |
description: Alertmanager {{$labels.instance}} has only found {{ $value }} members of the {{$labels.job}} cluster. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-alertmanagermembersinconsistent | |
summary: A member of an Alertmanager cluster has not found all other cluster members. | |
expr: | | |
# Without max_over_time, failed scrapes could create false negatives, see | |
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. | |
max_over_time(alertmanager_cluster_members{job="alertmanager"}[5m]) | |
< on (job) group_left | |
count by (job) (max_over_time(alertmanager_cluster_members{job="alertmanager"}[5m])) | |
for: 10m | |
labels: | |
severity: critical | |
- alert: AlertmanagerFailedToSendAlerts | |
annotations: | |
description: Alertmanager {{$labels.instance}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-alertmanagerfailedtosendalerts | |
summary: An Alertmanager instance failed to send notifications. | |
expr: | | |
( | |
rate(alertmanager_notifications_failed_total{job="alertmanager"}[5m]) | |
/ | |
rate(alertmanager_notifications_total{job="alertmanager"}[5m]) | |
) | |
> 0.01 | |
for: 5m | |
labels: | |
severity: warning | |
- alert: AlertmanagerClusterFailedToSendAlerts | |
annotations: | |
description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-alertmanagerclusterfailedtosendalerts | |
summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration. | |
expr: | | |
min by (job, integration) ( | |
rate(alertmanager_notifications_failed_total{job="alertmanager", integration=~`.*`}[5m]) | |
/ | |
rate(alertmanager_notifications_total{job="alertmanager", integration=~`.*`}[5m]) | |
) | |
> 0.01 | |
for: 5m | |
labels: | |
severity: critical | |
- alert: AlertmanagerClusterFailedToSendAlerts | |
annotations: | |
description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-alertmanagerclusterfailedtosendalerts | |
summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration. | |
expr: | | |
min by (job, integration) ( | |
rate(alertmanager_notifications_failed_total{job="alertmanager", integration!~`.*`}[5m]) | |
/ | |
rate(alertmanager_notifications_total{job="alertmanager", integration!~`.*`}[5m]) | |
) | |
> 0.01 | |
for: 5m | |
labels: | |
severity: warning | |
- alert: AlertmanagerConfigInconsistent | |
annotations: | |
description: Alertmanager instances within the {{$labels.job}} cluster have different configurations. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-alertmanagerconfiginconsistent | |
summary: Alertmanager instances within the same cluster have different configurations. | |
expr: | | |
count by (job) ( | |
count_values by (job) ("config_hash", alertmanager_config_hash{job="alertmanager"}) | |
) | |
!= 1 | |
for: 20m | |
labels: | |
severity: critical | |
- alert: AlertmanagerClusterDown | |
annotations: | |
description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.' | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-alertmanagerclusterdown | |
summary: Half or more of the Alertmanager instances within the same cluster are down. | |
expr: | | |
( | |
count by (job) ( | |
avg_over_time(up{job="alertmanager"}[5m]) < 0.5 | |
) | |
/ | |
count by (job) ( | |
up{job="alertmanager"} | |
) | |
) | |
>= 0.5 | |
for: 5m | |
labels: | |
severity: critical | |
- alert: AlertmanagerClusterCrashlooping | |
annotations: | |
description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.' | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-alertmanagerclustercrashlooping | |
summary: Half or more of the Alertmanager instances within the same cluster are crashlooping. | |
expr: | | |
( | |
count by (job) ( | |
changes(process_start_time_seconds{job="alertmanager"}[10m]) > 4 | |
) | |
/ | |
count by (job) ( | |
up{job="alertmanager"} | |
) | |
) | |
>= 0.5 | |
for: 5m | |
labels: | |
severity: critical | |
- name: cert-manager | |
rules: | |
- alert: CertManagerAbsent | |
annotations: | |
description: New certificates cannot be minted, and existing ones can't be renewed, until cert-manager is back. | |
runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerabsent | |
summary: Cert Manager has disappeared from Prometheus service discovery. | |
expr: absent(up{job="cert-manager"}) | |
for: 10m | |
labels: | |
severity: critical | |
- name: certificates | |
rules: | |
- alert: CertManagerCertExpirySoon | |
annotations: | |
dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager | |
description: The domain that this cert covers will be unavailable after {{ $value | humanizeDuration }}. Clients using endpoints that this cert protects will start to fail in {{ $value | humanizeDuration }}. | |
runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertexpirysoon | |
summary: The cert `{{ $labels.name }}` is {{ $value | humanizeDuration }} from expiry; it should have been renewed over a week ago. | |
expr: | | |
avg by (exported_namespace, namespace, name) ( | |
certmanager_certificate_expiration_timestamp_seconds - time() | |
) < (21 * 24 * 3600) # 21 days in seconds | |
for: 1h | |
labels: | |
severity: warning | |
- alert: CertManagerCertNotReady | |
annotations: | |
dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager | |
description: This certificate has not been ready to serve traffic for at least 10m. If the cert is being renewed or there is another valid cert, the ingress controller _may_ be able to serve that instead. | |
runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertnotready | |
summary: The cert `{{ $labels.name }}` is not ready to serve traffic. | |
expr: | | |
max by (name, exported_namespace, namespace, condition) ( | |
certmanager_certificate_ready_status{condition!="True"} == 1 | |
) | |
for: 10m | |
labels: | |
severity: critical | |
- alert: CertManagerHittingRateLimits | |
annotations: | |
dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager | |
description: Depending on the rate limit, cert-manager may be unable to generate certificates for up to a week. | |
runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerhittingratelimits | |
summary: Cert manager hitting LetsEncrypt rate limits. | |
expr: | | |
sum by (host) ( | |
rate(certmanager_http_acme_client_request_count{status="429"}[5m]) | |
) > 0 | |
for: 5m | |
labels: | |
severity: critical | |
- name: coredns | |
rules: | |
- alert: CoreDNSDown | |
annotations: | |
message: CoreDNS has disappeared from Prometheus target discovery. | |
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsdown | |
expr: | | |
absent(up{job="coredns"} == 1) | |
for: 15m | |
labels: | |
severity: critical | |
- alert: CoreDNSLatencyHigh | |
annotations: | |
message: CoreDNS has 99th percentile latency of {{ $value }} seconds for server {{ $labels.server }} zone {{ $labels.zone }}. | |
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednslatencyhigh | |
expr: | | |
histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="coredns"}[5m])) by(server, zone, le)) > 4 | |
for: 10m | |
labels: | |
severity: critical | |
- alert: CoreDNSErrorsHigh | |
annotations: | |
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of requests. | |
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh | |
expr: | | |
sum(rate(coredns_dns_response_rcode_count_total{job="coredns",rcode="SERVFAIL"}[5m])) | |
/ | |
sum(rate(coredns_dns_response_rcode_count_total{job="coredns"}[5m])) > 0.03 | |
for: 10m | |
labels: | |
severity: critical | |
- alert: CoreDNSErrorsHigh | |
annotations: | |
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of requests. | |
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh | |
expr: | | |
sum(rate(coredns_dns_response_rcode_count_total{job="coredns",rcode="SERVFAIL"}[5m])) | |
/ | |
sum(rate(coredns_dns_response_rcode_count_total{job="coredns"}[5m])) > 0.01 | |
for: 10m | |
labels: | |
severity: warning | |
- name: coredns_forward | |
rules: | |
- alert: CoreDNSForwardLatencyHigh | |
annotations: | |
message: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding requests to {{ $labels.to }}. | |
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwardlatencyhigh | |
expr: | | |
histogram_quantile(0.99, sum(rate(coredns_forward_request_duration_seconds_bucket{job="coredns"}[5m])) by(to, le)) > 4 | |
for: 10m | |
labels: | |
severity: critical | |
- alert: CoreDNSForwardErrorsHigh | |
annotations: | |
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of forward requests to {{ $labels.to }}. | |
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh | |
expr: | | |
sum(rate(coredns_forward_response_rcode_count_total{job="coredns",rcode="SERVFAIL"}[5m])) | |
/ | |
sum(rate(coredns_forward_response_rcode_count_total{job="coredns"}[5m])) > 0.03 | |
for: 10m | |
labels: | |
severity: critical | |
- alert: CoreDNSForwardErrorsHigh | |
annotations: | |
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of forward requests to {{ $labels.to }}. | |
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh | |
expr: | | |
sum(rate(coredns_forward_response_rcode_count_total{job="coredns",rcode="SERVFAIL"}[5m])) | |
/ | |
sum(rate(coredns_forward_response_rcode_count_total{job="coredns"}[5m])) > 0.01 | |
for: 10m | |
labels: | |
severity: warning | |
- name: prometheus-operator | |
rules: | |
- alert: PrometheusOperatorListErrors | |
annotations: | |
description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-prometheusoperatorlisterrors | |
summary: Errors while performing list operations in controller. | |
expr: | | |
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator"}[10m]))) > 0.4 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: PrometheusOperatorWatchErrors | |
annotations: | |
description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-prometheusoperatorwatcherrors | |
summary: Errors while performing watch operations in controller. | |
expr: | | |
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator"}[10m]))) > 0.4 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: PrometheusOperatorSyncFailed | |
annotations: | |
description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-prometheusoperatorsyncfailed | |
summary: Last controller reconciliation failed | |
expr: | | |
min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator"}[5m]) > 0 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: PrometheusOperatorReconcileErrors | |
annotations: | |
description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.' | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-prometheusoperatorreconcileerrors | |
summary: Errors while reconciling controller. | |
expr: | | |
(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator"}[5m]))) > 0.1 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: PrometheusOperatorNodeLookupErrors | |
annotations: | |
description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-prometheusoperatornodelookuperrors | |
summary: Errors while reconciling Prometheus. | |
expr: | | |
rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: PrometheusOperatorNotReady | |
annotations: | |
description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-prometheusoperatornotready | |
summary: Prometheus operator not ready | |
expr: | | |
min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator"}[5m]) == 0) | |
for: 5m | |
labels: | |
severity: warning | |
- alert: PrometheusOperatorRejectedResources | |
annotations: | |
description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-prometheusoperatorrejectedresources | |
summary: Resources rejected by Prometheus operator | |
expr: | | |
min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator"}[5m]) > 0 | |
for: 5m | |
labels: | |
severity: warning | |
- name: thanos-compact | |
rules: | |
- alert: ThanosCompactMultipleRunning | |
annotations: | |
description: No more than one Thanos Compact instance should be running at once. There are {{ $value }} running. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactmultiplerunning | |
summary: Thanos Compact has multiple instances running. | |
expr: sum(up{job=~"thanos-compact.*"}) > 1 | |
for: 5m | |
labels: | |
severity: warning | |
- alert: ThanosCompactHalted | |
annotations: | |
description: Thanos Compact {{$labels.job}} has failed to run and is now halted. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthalted | |
summary: Thanos Compact has failed to run and is now halted. | |
expr: thanos_compact_halted{job=~"thanos-compact.*"} == 1 | |
for: 5m | |
labels: | |
severity: warning | |
- alert: ThanosCompactHighCompactionFailures | |
annotations: | |
description: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize }}% of compactions. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthighcompactionfailures | |
summary: Thanos Compact is failing to execute compactions. | |
expr: | | |
( | |
sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compact.*"}[5m])) | |
/ | |
sum by (job) (rate(thanos_compact_group_compactions_total{job=~"thanos-compact.*"}[5m])) | |
* 100 > 5 | |
) | |
for: 15m | |
labels: | |
severity: warning | |
- alert: ThanosCompactBucketHighOperationFailures | |
annotations: | |
description: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactbuckethighoperationfailures | |
summary: Thanos Compact Bucket is having a high number of operation failures. | |
expr: | | |
( | |
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-compact.*"}[5m])) | |
/ | |
sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~"thanos-compact.*"}[5m])) | |
* 100 > 5 | |
) | |
for: 15m | |
labels: | |
severity: warning | |
- alert: ThanosCompactHasNotRun | |
annotations: | |
description: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthasnotrun | |
summary: Thanos Compact has not uploaded anything for the last 24 hours. | |
expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h]))) / 60 / 60 > 24 | |
labels: | |
severity: warning | |
- name: thanos-sidecar | |
rules: | |
- alert: ThanosSidecarPrometheusDown | |
annotations: | |
description: Thanos Sidecar {{$labels.job}} {{$labels.instance}} cannot connect to Prometheus. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarprometheusdown | |
summary: Thanos Sidecar cannot connect to Prometheus | |
expr: | | |
sum by (job, instance) (thanos_sidecar_prometheus_up{job=~"thanos-sidecar.*"} == 0) | |
for: 5m | |
labels: | |
severity: critical | |
- alert: ThanosSidecarBucketOperationsFailed | |
annotations: | |
description: Thanos Sidecar {{$labels.job}} {{$labels.instance}} bucket operations are failing | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarbucketoperationsfailed | |
summary: Thanos Sidecar bucket operations are failing | |
expr: | | |
rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-sidecar.*"}[5m]) > 0 | |
for: 5m | |
labels: | |
severity: critical | |
- alert: ThanosSidecarUnhealthy | |
annotations: | |
description: Thanos Sidecar {{$labels.job}} {{$labels.instance}} is unhealthy for more than {{$value}} seconds. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy | |
summary: Thanos Sidecar is unhealthy. | |
expr: | | |
time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"})) >= 240 | |
labels: | |
severity: critical | |
- name: thanos-store | |
rules: | |
- alert: ThanosStoreGrpcErrorRate | |
annotations: | |
description: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoregrpcerrorrate | |
summary: Thanos Store is failing to handle gRPC requests. | |
expr: | | |
( | |
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*"}[5m])) | |
/ | |
sum by (job) (rate(grpc_server_started_total{job=~"thanos-store.*"}[5m])) | |
* 100 > 5 | |
) | |
for: 5m | |
labels: | |
severity: warning | |
- alert: ThanosStoreSeriesGateLatencyHigh | |
annotations: | |
description: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for store series gate requests. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreseriesgatelatencyhigh | |
summary: Thanos Store has high latency for store series gate requests. | |
expr: | | |
( | |
histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2 | |
and | |
sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~"thanos-store.*"}[5m])) > 0 | |
) | |
for: 10m | |
labels: | |
severity: warning | |
- alert: ThanosStoreBucketHighOperationFailures | |
annotations: | |
description: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstorebuckethighoperationfailures | |
summary: Thanos Store Bucket is failing to execute operations. | |
expr: | | |
( | |
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m])) | |
/ | |
sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~"thanos-store.*"}[5m])) | |
* 100 > 5 | |
) | |
for: 15m | |
labels: | |
severity: warning | |
- alert: ThanosStoreObjstoreOperationLatencyHigh | |
annotations: | |
description: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{ $value }} seconds for the bucket operations. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreobjstoreoperationlatencyhigh | |
summary: Thanos Store is having high latency for bucket operations. | |
expr: | | |
( | |
histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2 | |
and | |
sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~"thanos-store.*"}[5m])) > 0 | |
) | |
for: 10m | |
labels: | |
severity: warning | |
- name: thanos-component-absent | |
rules: | |
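# absent(up{job=~"..."} == 1) returns a series only when no target for the job reports up,
# so each alert below fires when the component has vanished from service discovery entirely.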
- alert: ThanosCompactIsDown | |
annotations: | |
description: ThanosCompact has disappeared from Prometheus target discovery. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactisdown | |
summary: Thanos component has disappeared from Prometheus target discovery. | |
expr: | | |
absent(up{job=~"thanos-compact.*"} == 1) | |
for: 5m | |
labels: | |
severity: critical | |
- alert: ThanosSidecarIsDown | |
annotations: | |
description: ThanosSidecar has disappeared from Prometheus target discovery. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarisdown | |
summary: Thanos component has disappeared from Prometheus target discovery. | |
expr: | | |
absent(up{job=~"thanos-sidecar.*"} == 1) | |
for: 5m | |
labels: | |
severity: critical | |
- alert: ThanosStoreIsDown | |
annotations: | |
description: ThanosStore has disappeared from Prometheus target discovery. | |
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreisdown | |
summary: Thanos component has disappeared from Prometheus target discovery. | |
expr: | | |
absent(up{job=~"thanos-store.*"} == 1) | |
for: 5m | |
labels: | |
severity: critical | |
- name: jaeger_alerts | |
rules: | |
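# Most Jaeger alerts below share one shape: 100 * rate(errors) / rate(total), aggregated
# by (instance, job, namespace), firing when the ratio stays above 1% for 15m. The exceptions
# alert on an absolute UDP drop rate, collector queue length, and save latency instead.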
- alert: JaegerAgentUDPPacketsBeingDropped | |
annotations: | |
message: | | |
{{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }} UDP packets per second. | |
expr: rate(jaeger_agent_thrift_udp_server_packets_dropped_total[1m]) > 1 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: JaegerAgentHTTPServerErrs | |
annotations: | |
message: | | |
{{ $labels.job }} {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% HTTP errors. | |
expr: 100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace)> 1 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: JaegerClientSpansDropped | |
annotations: | |
message: | | |
service {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% of spans. | |
expr: 100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace)> 1 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: JaegerAgentSpansDropped | |
annotations: | |
message: | | |
agent {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% of spans. | |
expr: 100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace)> 1 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: JaegerCollectorQueueNotDraining | |
annotations: | |
message: | | |
collector {{ $labels.job }} {{ $labels.instance }} is not able to drain the queue. | |
expr: avg_over_time(jaeger_collector_queue_length[10m]) > 1000 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: JaegerCollectorDroppingSpans | |
annotations: | |
message: | | |
collector {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% of spans. | |
expr: 100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace)> 1 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: JaegerSamplingUpdateFailing | |
annotations: | |
message: | | |
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating sampling policies. | |
expr: 100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace)> 1 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: JaegerCollectorPersistenceSlow | |
annotations: | |
message: | | |
{{ $labels.job }} {{ $labels.instance }} is slow at persisting spans. | |
expr: histogram_quantile(0.99, sum by (le) (rate(jaeger_collector_save_latency_bucket[1m]))) > 0.5 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: JaegerThrottlingUpdateFailing | |
annotations: | |
message: | | |
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating throttling policies. | |
expr: 100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace)> 1 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: JaegerQueryReqsFailing | |
annotations: | |
message: | | |
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}. | |
expr: 100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace)> 1 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: JaegerCassandraWritesFailing | |
annotations: | |
message: | | |
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}. | |
expr: 100 * sum(rate(jaeger_cassandra_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_attempts_total[1m])) by (instance, job, namespace)> 1 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: JaegerCassandraReadsFailing | |
annotations: | |
message: | | |
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}. | |
expr: 100 * sum(rate(jaeger_cassandra_read_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_read_attempts_total[1m])) by (instance, job, namespace)> 1 | |
for: 15m | |
labels: | |
severity: warning | |
- name: loki_alerts | |
rules: | |
- alert: LokiRequestErrors | |
annotations: | |
message: | | |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. | |
expr: | | |
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) | |
/ | |
sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) | |
> 10 | |
for: 15m | |
labels: | |
severity: critical | |
- alert: LokiRequestPanics | |
annotations: | |
message: | | |
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. | |
expr: | | |
sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 | |
labels: | |
severity: critical | |
- alert: LokiRequestLatency | |
annotations: | |
message: | | |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. | |
expr: | | |
namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > 1 | |
for: 15m | |
labels: | |
severity: critical | |
- name: node-exporter | |
rules: | |
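# The NodeFilesystemSpaceFillingUp / NodeFilesystemFilesFillingUp alerts below pair a static
# free-space or inode threshold with predict_linear over a 6h window (warning if exhaustion is
# predicted within 24h, critical within 4h); the AlmostOutOf* alerts fire on static thresholds
# alone. Read-only filesystems are excluded in every filesystem alert.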
- alert: NodeFilesystemSpaceFillingUp | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up. | |
summary: Filesystem is predicted to run out of space within the next 24 hours. | |
expr: | | |
( | |
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40 | |
and | |
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 | |
and | |
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 | |
) | |
for: 1h | |
labels: | |
severity: warning | |
- alert: NodeFilesystemSpaceFillingUp | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast. | |
summary: Filesystem is predicted to run out of space within the next 4 hours. | |
expr: | | |
( | |
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 20 | |
and | |
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 | |
and | |
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 | |
) | |
for: 1h | |
labels: | |
severity: critical | |
- alert: NodeFilesystemAlmostOutOfSpace | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. | |
summary: Filesystem has less than 5% space left. | |
expr: | | |
( | |
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5 | |
and | |
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 | |
) | |
for: 1h | |
labels: | |
severity: warning | |
- alert: NodeFilesystemAlmostOutOfSpace | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. | |
summary: Filesystem has less than 3% space left. | |
expr: | | |
( | |
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3 | |
and | |
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 | |
) | |
for: 1h | |
labels: | |
severity: critical | |
- alert: NodeFilesystemFilesFillingUp | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up. | |
summary: Filesystem is predicted to run out of inodes within the next 24 hours. | |
expr: | | |
( | |
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40 | |
and | |
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 | |
and | |
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 | |
) | |
for: 1h | |
labels: | |
severity: warning | |
- alert: NodeFilesystemFilesFillingUp | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. | |
summary: Filesystem is predicted to run out of inodes within the next 4 hours. | |
expr: | | |
( | |
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20 | |
and | |
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 | |
and | |
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 | |
) | |
for: 1h | |
labels: | |
severity: critical | |
- alert: NodeFilesystemAlmostOutOfFiles | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. | |
summary: Filesystem has less than 5% inodes left. | |
expr: | | |
( | |
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5 | |
and | |
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 | |
) | |
for: 1h | |
labels: | |
severity: warning | |
- alert: NodeFilesystemAlmostOutOfFiles | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. | |
summary: Filesystem has less than 3% inodes left. | |
expr: | | |
( | |
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3 | |
and | |
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 | |
) | |
for: 1h | |
labels: | |
severity: critical | |
- alert: NodeNetworkReceiveErrs | |
annotations: | |
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.' | |
summary: Network interface is reporting many receive errors. | |
expr: | | |
rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 | |
for: 1h | |
labels: | |
severity: warning | |
- alert: NodeNetworkTransmitErrs | |
annotations: | |
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.' | |
summary: Network interface is reporting many transmit errors. | |
expr: | | |
rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 | |
for: 1h | |
labels: | |
severity: warning | |
- alert: NodeHighNumberConntrackEntriesUsed | |
annotations: | |
description: '{{ $value | humanizePercentage }} of conntrack entries are used.' | |
summary: Number of conntrack entries is getting close to the limit. | |
expr: | | |
(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 | |
labels: | |
severity: warning | |
- alert: NodeTextFileCollectorScrapeError | |
annotations: | |
description: Node Exporter text file collector failed to scrape. | |
summary: Node Exporter text file collector failed to scrape. | |
expr: | | |
node_textfile_scrape_error{job="node-exporter"} == 1 | |
labels: | |
severity: warning | |
- alert: NodeClockSkewDetected | |
annotations: | |
description: Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host. | |
summary: Clock skew detected. | |
expr: | | |
( | |
node_timex_offset_seconds > 0.05 | |
and | |
deriv(node_timex_offset_seconds[5m]) >= 0 | |
) | |
or | |
( | |
node_timex_offset_seconds < -0.05 | |
and | |
deriv(node_timex_offset_seconds[5m]) <= 0 | |
) | |
for: 10m | |
labels: | |
severity: warning | |
- alert: NodeClockNotSynchronising | |
annotations: | |
description: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host. | |
summary: Clock not synchronising. | |
expr: | | |
min_over_time(node_timex_sync_status[5m]) == 0 | |
and | |
node_timex_maxerror_seconds >= 16 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: NodeRAIDDegraded | |
annotations: | |
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically. | |
summary: RAID Array is degraded | |
expr: | | |
node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0 | |
for: 15m | |
labels: | |
severity: critical | |
- alert: NodeRAIDDiskFailure | |
annotations: | |
description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap. | |
summary: Failed device in RAID array | |
expr: | | |
node_md_disks{state="failed"} > 0 | |
labels: | |
severity: warning | |
- name: PostgreSQL | |
rules: | |
- alert: PostgreSQLMaxConnectionsReached | |
annotations: | |
description: '{{ $labels.instance }} is exceeding the currently configured maximum Postgres connection limit (current value: {{ $value }}s). Services may be degraded - please take immediate action (you probably need to increase max_connections in the Docker image and re-deploy).' | |
summary: '{{ $labels.instance }} has maxed out Postgres connections.' | |
expr: sum(pg_stat_activity_count) by (instance) >= sum(pg_settings_max_connections) by (instance) - sum(pg_settings_superuser_reserved_connections) by (instance) | |
for: 1m | |
labels: | |
severity: email | |
- alert: PostgreSQLHighConnections | |
annotations: | |
description: '{{ $labels.instance }} is exceeding 80% of the currently configured maximum Postgres connection limit (current value: {{ $value }}s). Please check utilization graphs and confirm if this is normal service growth, abuse or an otherwise temporary condition or if new resources need to be provisioned (or the limits increased, which is most likely).' | |
summary: '{{ $labels.instance }} is over 80% of max Postgres connections.' | |
expr: sum(pg_stat_activity_count) by (instance) > (sum(pg_settings_max_connections) by (instance) - sum(pg_settings_superuser_reserved_connections) by (instance)) * 0.8 | |
for: 10m | |
labels: | |
severity: email | |
- alert: PostgreSQLDown | |
annotations: | |
description: '{{ $labels.instance }} is rejecting query requests from the exporter, and thus probably not allowing DNS requests to work either. User services should not be affected provided at least 1 node is still alive.' | |
summary: 'PostgreSQL is not processing queries: {{ $labels.instance }}' | |
expr: pg_up != 1 | |
for: 1m | |
labels: | |
severity: email | |
- alert: PostgreSQLSlowQueries | |
annotations: | |
description: 'PostgreSQL high number of slow queries on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}' | |
summary: 'PostgreSQL high number of slow queries on {{ $labels.cluster }} for database {{ $labels.datname }}' | |
expr: avg(rate(pg_stat_activity_max_tx_duration{datname!~"template.*"}[2m])) by (datname) > 2 * 60 | |
for: 2m | |
labels: | |
severity: email | |
- alert: PostgreSQLQPS | |
annotations: | |
description: PostgreSQL high number of queries per second on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }} | |
summary: PostgreSQL high number of queries per second on {{ $labels.cluster }} for database {{ $labels.datname }} | |
expr: avg(irate(pg_stat_database_xact_commit{datname!~"template.*"}[5m]) + irate(pg_stat_database_xact_rollback{datname!~"template.*"}[5m])) by (datname) > 10000 | |
for: 5m | |
labels: | |
severity: email | |
- alert: PostgreSQLCacheHitRatio | |
annotations: | |
description: PostgreSQL low cache hit rate on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }} | |
summary: PostgreSQL low cache hit rate on {{ $labels.cluster }} for database {{ $labels.datname }} | |
expr: avg(rate(pg_stat_database_blks_hit{datname!~"template.*"}[5m]) / (rate(pg_stat_database_blks_hit{datname!~"template.*"}[5m]) + rate(pg_stat_database_blks_read{datname!~"template.*"}[5m]))) by (datname) < 0.98 | |
for: 5m | |
labels: | |
severity: email | |
- name: prometheus | |
rules: | |
- alert: PrometheusBadConfig | |
annotations: | |
description: Prometheus {{$labels.instance}} has failed to reload its configuration. | |
summary: Failed Prometheus configuration reload. | |
expr: | | |
# Without max_over_time, failed scrapes could create false negatives, see | |
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. | |
max_over_time(prometheus_config_last_reload_successful{job="prometheus"}[5m]) == 0 | |
for: 10m | |
labels: | |
severity: critical | |
- alert: PrometheusNotificationQueueRunningFull | |
annotations: | |
description: Alert notification queue of Prometheus {{$labels.instance}} is running full. | |
summary: Prometheus alert notification queue predicted to run full in less than 30m. | |
expr: | | |
# Without min_over_time, failed scrapes could create false negatives, see | |
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. | |
( | |
predict_linear(prometheus_notifications_queue_length{job="prometheus"}[5m], 60 * 30) | |
> | |
min_over_time(prometheus_notifications_queue_capacity{job="prometheus"}[5m]) | |
) | |
for: 15m | |
labels: | |
severity: warning | |
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers | |
annotations: | |
description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.' | |
summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. | |
expr: | | |
( | |
rate(prometheus_notifications_errors_total{job="prometheus"}[5m]) | |
/ | |
rate(prometheus_notifications_sent_total{job="prometheus"}[5m]) | |
) | |
* 100 | |
> 1 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: PrometheusNotConnectedToAlertmanagers | |
annotations: | |
description: Prometheus {{$labels.instance}} is not connected to any Alertmanagers. | |
summary: Prometheus is not connected to any Alertmanagers. | |
expr: | | |
# Without max_over_time, failed scrapes could create false negatives, see | |
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. | |
max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus"}[5m]) < 1 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: PrometheusTSDBReloadsFailing | |
annotations: | |
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} reload failures over the last 3h. | |
summary: Prometheus has issues reloading blocks from disk. | |
expr: | | |
increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[3h]) > 0 | |
for: 4h | |
labels: | |
severity: warning | |
- alert: PrometheusTSDBCompactionsFailing | |
annotations: | |
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} compaction failures over the last 3h. | |
summary: Prometheus has issues compacting blocks. | |
expr: | | |
increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[3h]) > 0 | |
for: 4h | |
labels: | |
severity: warning | |
- alert: PrometheusNotIngestingSamples | |
annotations: | |
description: Prometheus {{$labels.instance}} is not ingesting samples. | |
summary: Prometheus is not ingesting samples. | |
expr: | | |
( | |
rate(prometheus_tsdb_head_samples_appended_total{job="prometheus"}[5m]) <= 0 | |
and | |
( | |
sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus"}) > 0 | |
or | |
sum without(rule_group) (prometheus_rule_group_rules{job="prometheus"}) > 0 | |
) | |
) | |
for: 10m | |
labels: | |
severity: warning | |
- alert: PrometheusDuplicateTimestamps | |
annotations: | |
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp. | |
summary: Prometheus is dropping samples with duplicate timestamps. | |
expr: | | |
rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: PrometheusOutOfOrderTimestamps | |
annotations: | |
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order. | |
summary: Prometheus drops samples with out-of-order timestamps. | |
expr: | | |
rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus"}[5m]) > 0 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: PrometheusRemoteStorageFailures | |
annotations: | |
description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }} | |
summary: Prometheus fails to send samples to remote storage. | |
expr: | | |
( | |
(rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus"}[5m])) | |
/ | |
( | |
(rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus"}[5m])) | |
+ | |
(rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus"}[5m])) | |
) | |
) | |
* 100 | |
> 1 | |
for: 15m | |
labels: | |
severity: critical | |
- alert: PrometheusRemoteWriteBehind | |
annotations: | |
description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}. | |
summary: Prometheus remote write is behind. | |
expr: | | |
# Without max_over_time, failed scrapes could create false negatives, see | |
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. | |
( | |
max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus"}[5m]) | |
- ignoring(remote_name, url) group_right | |
max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus"}[5m]) | |
) | |
> 120 | |
for: 15m | |
labels: | |
severity: critical | |
- alert: PrometheusRemoteWriteDesiredShards | |
annotations: | |
description: Prometheus {{$labels.instance}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}` $labels.instance | query | first | value }}. | |
summary: Prometheus remote write desired shards calculation wants to run more than configured max shards. | |
expr: | | |
# Without max_over_time, failed scrapes could create false negatives, see | |
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. | |
( | |
max_over_time(prometheus_remote_storage_shards_desired{job="prometheus"}[5m]) | |
> | |
max_over_time(prometheus_remote_storage_shards_max{job="prometheus"}[5m]) | |
) | |
for: 15m | |
labels: | |
severity: warning | |
- alert: PrometheusRuleFailures | |
annotations: | |
description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m. | |
summary: Prometheus is failing rule evaluations. | |
expr: | | |
increase(prometheus_rule_evaluation_failures_total{job="prometheus"}[5m]) > 0 | |
for: 15m | |
labels: | |
severity: critical | |
- alert: PrometheusMissingRuleEvaluations | |
annotations: | |
description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m. | |
summary: Prometheus is missing rule evaluations due to slow rule group evaluation. | |
expr: | | |
increase(prometheus_rule_group_iterations_missed_total{job="prometheus"}[5m]) > 0 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: PrometheusTargetLimitHit | |
annotations: | |
description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit. | |
summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit. | |
expr: | | |
increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus"}[5m]) > 0 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager | |
annotations: | |
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.' | |
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. | |
expr: | | |
min without (alertmanager) ( | |
rate(prometheus_notifications_errors_total{job="prometheus",alertmanager!~``}[5m]) | |
/ | |
rate(prometheus_notifications_sent_total{job="prometheus",alertmanager!~``}[5m]) | |
) | |
* 100 | |
> 3 | |
for: 15m | |
labels: | |
severity: critical | |
- name: minio | |
rules: | |
- alert: minioDisksOffline | |
annotations: | |
message: MinIO '{{ $labels.instance }}' has disks offline | |
expr: | | |
minio_disks_offline != 0 | |
for: 1m | |
labels: | |
severity: critical | |
- alert: minioStorageUsed | |
annotations: | |
message: MinIO disk '{{ $labels.disk }}' has more than 80% of its storage used | |
expr: | | |
disk_storage_used / disk_storage_total > 0.8 | |
for: 1m | |
labels: | |
severity: warning | |
- name: promtail_alerts | |
rules: | |
- alert: PromtailRequestsErrors | |
annotations: | |
message: | | |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. | |
expr: | | |
100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) | |
/ | |
sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) | |
> 10 | |
for: 15m | |
labels: | |
severity: critical | |
- alert: PromtailRequestLatency | |
annotations: | |
message: | | |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. | |
expr: | | |
job_status_code_namespace:promtail_request_duration_seconds:99quantile > 1 | |
for: 15m | |
labels: | |
severity: critical | |
- alert: PromtailFileLagging | |
annotations: | |
message: | | |
{{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} has been lagging by more than 1MB for more than 15m. | |
expr: | | |
abs(promtail_file_bytes_total - promtail_read_bytes_total) > 1e6 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: PromtailFileMissing | |
annotations: | |
message: | | |
{{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} matches the glob but is not being tailed. | |
expr: | | |
promtail_file_bytes_total unless promtail_read_bytes_total | |
for: 15m | |
labels: | |
severity: critical |
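The second rule file below contains the recording rules that some of the alerts above consume; the rule names follow the usual level:metric:operations convention. For example, the LokiRequestLatency alert above reads a quantile that is precomputed in the loki_rules group below:

    # recording rule (loki_rules group, second file)
    - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job, route))
      record: namespace_job_route:loki_request_duration_seconds:99quantile

    # alert that consumes it (loki_alerts group, first file)
    - alert: LokiRequestLatency
      expr: namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > 1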
groups: | |
- name: thanos-query.rules | |
rules: | |
- expr: | | |
( | |
sum(rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*", grpc_type="unary"}[5m])) | |
/ | |
sum(rate(grpc_client_started_total{job=~"thanos-query.*", grpc_type="unary"}[5m])) | |
) | |
record: :grpc_client_failures_per_unary:sum_rate | |
- expr: | | |
( | |
sum(rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*", grpc_type="server_stream"}[5m])) | |
/ | |
sum(rate(grpc_client_started_total{job=~"thanos-query.*", grpc_type="server_stream"}[5m])) | |
) | |
record: :grpc_client_failures_per_stream:sum_rate | |
- expr: | | |
( | |
sum(rate(thanos_query_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m])) | |
/ | |
sum(rate(thanos_query_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m])) | |
) | |
record: :thanos_query_store_apis_dns_failures_per_lookup:sum_rate | |
- expr: | | |
histogram_quantile(0.99, | |
sum(rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m])) by (le) | |
) | |
labels: | |
quantile: "0.99" | |
record: :query_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.99, | |
sum(rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m])) by (le) | |
) | |
labels: | |
quantile: "0.99" | |
record: :api_range_query_duration_seconds:histogram_quantile | |
- name: thanos-receive.rules | |
rules: | |
- expr: | | |
sum( | |
rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-receive.*", grpc_type="unary"}[5m]) | |
/ | |
rate(grpc_server_started_total{job=~"thanos-receive.*", grpc_type="unary"}[5m]) | |
) | |
record: :grpc_server_failures_per_unary:sum_rate | |
- expr: | | |
sum( | |
rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-receive.*", grpc_type="server_stream"}[5m]) | |
/ | |
rate(grpc_server_started_total{job=~"thanos-receive.*", grpc_type="server_stream"}[5m]) | |
) | |
record: :grpc_server_failures_per_stream:sum_rate | |
- expr: | | |
sum( | |
rate(http_requests_total{handler="receive", job=~"thanos-receive.*", code!~"5.."}[5m]) | |
/ | |
rate(http_requests_total{handler="receive", job=~"thanos-receive.*"}[5m]) | |
) | |
record: :http_failure_per_request:sum_rate | |
- expr: | | |
histogram_quantile(0.99, | |
sum(rate(http_request_duration_seconds_bucket{handler="receive", job=~"thanos-receive.*"}[5m])) by (le) | |
) | |
labels: | |
quantile: "0.99" | |
record: :http_request_duration_seconds:histogram_quantile | |
- expr: | | |
( | |
sum(rate(thanos_receive_replications_total{result="error", job=~"thanos-receive.*"}[5m])) | |
/ | |
sum(rate(thanos_receive_replications_total{job=~"thanos-receive.*"}[5m])) | |
) | |
record: :thanos_receive_replication_failure_per_requests:sum_rate | |
- expr: | | |
( | |
sum(rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m])) | |
/ | |
sum(rate(thanos_receive_forward_requests_total{job=~"thanos-receive.*"}[5m])) | |
) | |
record: :thanos_receive_forward_failure_per_requests:sum_rate | |
- expr: | | |
( | |
sum(rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m])) | |
/ | |
sum(rate(thanos_receive_hashrings_file_refreshes_total{job=~"thanos-receive.*"}[5m])) | |
) | |
record: :thanos_receive_hashring_file_failure_per_refresh:sum_rate | |
- name: thanos-store.rules | |
rules: | |
- expr: | | |
( | |
sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*", grpc_type="unary"}[5m])) | |
/ | |
sum(rate(grpc_server_started_total{job=~"thanos-store.*", grpc_type="unary"}[5m])) | |
) | |
record: :grpc_server_failures_per_unary:sum_rate | |
- expr: | | |
( | |
sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*", grpc_type="server_stream"}[5m])) | |
/ | |
sum(rate(grpc_server_started_total{job=~"thanos-store.*", grpc_type="server_stream"}[5m])) | |
) | |
record: :grpc_server_failures_per_stream:sum_rate | |
- expr: | | |
( | |
sum(rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m])) | |
/ | |
sum(rate(thanos_objstore_bucket_operations_total{job=~"thanos-store.*"}[5m])) | |
) | |
record: :thanos_objstore_bucket_failures_per_operation:sum_rate | |
- expr: | | |
histogram_quantile(0.99, | |
sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m])) by (le) | |
) | |
labels: | |
quantile: "0.99" | |
record: :thanos_objstore_bucket_operation_duration_seconds:histogram_quantile | |
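# bucket-replicate defines no recording rules in this version; the group is empty. | |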
- name: thanos-bucket-replicate.rules | |
rules: [] | |
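# Loki request-duration aggregates: p99, p50, mean, and raw bucket/sum/count rates, | |
# rolled up per job, per job+route, and per namespace+job+route. Record names follow | |
# the <aggregation labels>:<metric>:<operation> convention. | |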
- name: loki_rules | |
rules: | |
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job)) | |
record: job:loki_request_duration_seconds:99quantile | |
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job)) | |
record: job:loki_request_duration_seconds:50quantile | |
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job) / sum(rate(loki_request_duration_seconds_count[1m])) by (job) | |
record: job:loki_request_duration_seconds:avg | |
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job) | |
record: job:loki_request_duration_seconds_bucket:sum_rate | |
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job) | |
record: job:loki_request_duration_seconds_sum:sum_rate | |
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (job) | |
record: job:loki_request_duration_seconds_count:sum_rate | |
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route)) | |
record: job_route:loki_request_duration_seconds:99quantile | |
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route)) | |
record: job_route:loki_request_duration_seconds:50quantile | |
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (job, route) | |
record: job_route:loki_request_duration_seconds:avg | |
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route) | |
record: job_route:loki_request_duration_seconds_bucket:sum_rate | |
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route) | |
record: job_route:loki_request_duration_seconds_sum:sum_rate | |
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (job, route) | |
record: job_route:loki_request_duration_seconds_count:sum_rate | |
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job, route)) | |
record: namespace_job_route:loki_request_duration_seconds:99quantile | |
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job, route)) | |
record: namespace_job_route:loki_request_duration_seconds:50quantile | |
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) | |
record: namespace_job_route:loki_request_duration_seconds:avg | |
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job, route) | |
record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate | |
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route) | |
record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate | |
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) | |
record: namespace_job_route:loki_request_duration_seconds_count:sum_rate | |
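# node-exporter: per-instance CPU count and utilisation, load per CPU, memory | |
# utilisation, major page-fault rate, disk IO time, and network throughput/drops | |
# excluding the loopback device. | |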
- name: node-exporter.rules | |
rules: | |
- expr: | | |
count without (cpu) ( | |
count without (mode) ( | |
node_cpu_seconds_total{job="node-exporter"} | |
) | |
) | |
record: instance:node_num_cpu:sum | |
- expr: | | |
1 - avg without (cpu, mode) ( | |
rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m]) | |
) | |
record: instance:node_cpu_utilisation:rate1m | |
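# 1-minute load average divided by the CPU count recorded above; sustained values | |
# above 1 generally indicate CPU saturation on that instance. | |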
- expr: | | |
( | |
node_load1{job="node-exporter"} | |
/ | |
instance:node_num_cpu:sum{job="node-exporter"} | |
) | |
record: instance:node_load1_per_cpu:ratio | |
- expr: | | |
1 - ( | |
node_memory_MemAvailable_bytes{job="node-exporter"} | |
/ | |
node_memory_MemTotal_bytes{job="node-exporter"} | |
) | |
record: instance:node_memory_utilisation:ratio | |
- expr: | | |
rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) | |
record: instance:node_vmstat_pgmajfault:rate1m | |
- expr: | | |
rate(node_disk_io_time_seconds_total{job="node-exporter", device!=""}[1m]) | |
record: instance_device:node_disk_io_time_seconds:rate1m | |
- expr: | | |
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device!=""}[1m]) | |
record: instance_device:node_disk_io_time_weighted_seconds:rate1m | |
- expr: | | |
sum without (device) ( | |
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m]) | |
) | |
record: instance:node_network_receive_bytes_excluding_lo:rate1m | |
- expr: | | |
sum without (device) ( | |
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m]) | |
) | |
record: instance:node_network_transmit_bytes_excluding_lo:rate1m | |
- expr: | | |
sum without (device) ( | |
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m]) | |
) | |
record: instance:node_network_receive_drop_excluding_lo:rate1m | |
- expr: | | |
sum without (device) ( | |
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m]) | |
) | |
record: instance:node_network_transmit_drop_excluding_lo:rate1m | |
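# promtail mirrors the Loki rules above for promtail_request_duration_seconds, | |
# aggregated per job, per job+namespace, and per job+status_code+namespace. | |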
- name: promtail_rules | |
rules: | |
- expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job)) | |
record: job:promtail_request_duration_seconds:99quantile | |
- expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job)) | |
record: job:promtail_request_duration_seconds:50quantile | |
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job) / sum(rate(promtail_request_duration_seconds_count[1m])) by (job) | |
record: job:promtail_request_duration_seconds:avg | |
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job) | |
record: job:promtail_request_duration_seconds_bucket:sum_rate | |
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job) | |
record: job:promtail_request_duration_seconds_sum:sum_rate | |
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job) | |
record: job:promtail_request_duration_seconds_count:sum_rate | |
- expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, namespace)) | |
record: job_namespace:promtail_request_duration_seconds:99quantile | |
- expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, namespace)) | |
record: job_namespace:promtail_request_duration_seconds:50quantile | |
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace) / sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace) | |
record: job_namespace:promtail_request_duration_seconds:avg | |
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, namespace) | |
record: job_namespace:promtail_request_duration_seconds_bucket:sum_rate | |
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace) | |
record: job_namespace:promtail_request_duration_seconds_sum:sum_rate | |
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace) | |
record: job_namespace:promtail_request_duration_seconds_count:sum_rate | |
- expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, status_code, namespace)) | |
record: job_status_code_namespace:promtail_request_duration_seconds:99quantile | |
- expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, status_code, namespace)) | |
record: job_status_code_namespace:promtail_request_duration_seconds:50quantile | |
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code, namespace) / sum(rate(promtail_request_duration_seconds_count[1m])) by (job, status_code, namespace) | |
record: job_status_code_namespace:promtail_request_duration_seconds:avg | |
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, status_code, namespace) | |
record: job_status_code_namespace:promtail_request_duration_seconds_bucket:sum_rate | |
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code, namespace) | |
record: job_status_code_namespace:promtail_request_duration_seconds_sum:sum_rate | |
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, status_code, namespace) | |
record: job_status_code_namespace:promtail_request_duration_seconds_count:sum_rate |