Prometheus rules for v0.3.2 terraform provider
groups:
- name: alertmanager.rules
rules:
- alert: AlertmanagerFailedReload
annotations:
description: Configuration has failed to load for {{$labels.instance}}.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-alertmanagerfailedreload
summary: Reloading an Alertmanager configuration has failed.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_config_last_reload_successful{job="alertmanager"}[5m]) == 0
for: 10m
labels:
severity: critical
- alert: AlertmanagerMembersInconsistent
annotations:
description: Alertmanager {{$labels.instance}} has only found {{ $value }} members of the {{$labels.job}} cluster.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-alertmanagermembersinconsistent
summary: A member of an Alertmanager cluster has not found all other cluster members.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
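# Each member reports how many peers it can see; the `on (job) group_left` join compares that
# against the total number of members scraped for the job, so a member that sees fewer peers
# than exist fires the alert.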
max_over_time(alertmanager_cluster_members{job="alertmanager"}[5m])
< on (job) group_left
count by (job) (max_over_time(alertmanager_cluster_members{job="alertmanager"}[5m]))
for: 10m
labels:
severity: critical
- alert: AlertmanagerFailedToSendAlerts
annotations:
description: Alertmanager {{$labels.instance}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-alertmanagerfailedtosendalerts
summary: An Alertmanager instance failed to send notifications.
expr: |
(
rate(alertmanager_notifications_failed_total{job="alertmanager"}[5m])
/
rate(alertmanager_notifications_total{job="alertmanager"}[5m])
)
> 0.01
for: 5m
labels:
severity: warning
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
expr: |
min by (job, integration) (
rate(alertmanager_notifications_failed_total{job="alertmanager", integration=~`.*`}[5m])
/
rate(alertmanager_notifications_total{job="alertmanager", integration=~`.*`}[5m])
)
> 0.01
for: 5m
labels:
severity: critical
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
expr: |
min by (job, integration) (
rate(alertmanager_notifications_failed_total{job="alertmanager", integration!~`.*`}[5m])
/
rate(alertmanager_notifications_total{job="alertmanager", integration!~`.*`}[5m])
)
> 0.01
for: 5m
labels:
severity: warning
- alert: AlertmanagerConfigInconsistent
annotations:
description: Alertmanager instances within the {{$labels.job}} cluster have different configurations.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-alertmanagerconfiginconsistent
summary: Alertmanager instances within the same cluster have different configurations.
expr: |
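# count_values emits one series per distinct config_hash; more than one distinct hash per job
# means the instances are running different configurations.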
count by (job) (
count_values by (job) ("config_hash", alertmanager_config_hash{job="alertmanager"})
)
!= 1
for: 20m
labels:
severity: critical
- alert: AlertmanagerClusterDown
annotations:
description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.'
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-alertmanagerclusterdown
summary: Half or more of the Alertmanager instances within the same cluster are down.
expr: |
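# avg_over_time(up[5m]) < 0.5 marks instances that were down for more than half of the window;
# the outer ratio checks whether that applies to at least half of the cluster.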
(
count by (job) (
avg_over_time(up{job="alertmanager"}[5m]) < 0.5
)
/
count by (job) (
up{job="alertmanager"}
)
)
>= 0.5
for: 5m
labels:
severity: critical
- alert: AlertmanagerClusterCrashlooping
annotations:
description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.'
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-alertmanagerclustercrashlooping
summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
expr: |
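# changes() on process_start_time_seconds counts restarts per instance; more than 4 restarts
# in 10m is treated as crashlooping.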
(
count by (job) (
changes(process_start_time_seconds{job="alertmanager"}[10m]) > 4
)
/
count by (job) (
up{job="alertmanager"}
)
)
>= 0.5
for: 5m
labels:
severity: critical
- name: cert-manager
rules:
- alert: CertManagerAbsent
annotations:
description: New certificates will not be able to be minted, and existing ones can't be renewed until cert-manager is back.
runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerabsent
summary: Cert Manager has disappeared from Prometheus service discovery.
expr: absent(up{job="cert-manager"})
for: 10m
labels:
severity: critical
- name: certificates
rules:
- alert: CertManagerCertExpirySoon
annotations:
dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
description: The domain that this cert covers will be unavailable after {{ $value | humanizeDuration }}. Clients using endpoints that this cert protects will start to fail in {{ $value | humanizeDuration }}.
runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertexpirysoon
summary: The cert `{{ $labels.name }}` is {{ $value | humanizeDuration }} from expiry; it should have renewed over a week ago.
expr: |
avg by (exported_namespace, namespace, name) (
certmanager_certificate_expiration_timestamp_seconds - time()
) < (21 * 24 * 3600) # 21 days in seconds
for: 1h
labels:
severity: warning
- alert: CertManagerCertNotReady
annotations:
dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
description: This certificate has not been ready to serve traffic for at least 10m. If the cert is being renewed or there is another valid cert, the ingress controller _may_ be able to serve that instead.
runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertnotready
summary: The cert `{{ $labels.name }}` is not ready to serve traffic.
expr: |
max by (name, exported_namespace, namespace, condition) (
certmanager_certificate_ready_status{condition!="True"} == 1
)
for: 10m
labels:
severity: critical
- alert: CertManagerHittingRateLimits
annotations:
dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
description: Depending on the rate limit, cert-manager may be unable to generate certificates for up to a week.
runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerhittingratelimits
summary: Cert manager hitting LetsEncrypt rate limits.
expr: |
sum by (host) (
rate(certmanager_http_acme_client_request_count{status="429"}[5m])
) > 0
for: 5m
labels:
severity: critical
- name: coredns
rules:
- alert: CoreDNSDown
annotations:
message: CoreDNS has disappeared from Prometheus target discovery.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsdown
expr: |
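# absent() returns 1 only when no series matching up{job="coredns"} == 1 exists, i.e. every
# CoreDNS target is down or has vanished from service discovery.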
absent(up{job="coredns"} == 1)
for: 15m
labels:
severity: critical
- alert: CoreDNSLatencyHigh
annotations:
message: CoreDNS has 99th percentile latency of {{ $value }} seconds for server {{ $labels.server }} zone {{ $labels.zone }}.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednslatencyhigh
expr: |
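# 99th percentile DNS request latency per server and zone, computed from the histogram buckets.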
histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="coredns"}[5m])) by(server, zone, le)) > 4
for: 10m
labels:
severity: critical
- alert: CoreDNSErrorsHigh
annotations:
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of requests.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh
expr: |
sum(rate(coredns_dns_response_rcode_count_total{job="coredns",rcode="SERVFAIL"}[5m]))
/
sum(rate(coredns_dns_response_rcode_count_total{job="coredns"}[5m])) > 0.03
for: 10m
labels:
severity: critical
- alert: CoreDNSErrorsHigh
annotations:
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of requests.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh
expr: |
sum(rate(coredns_dns_response_rcode_count_total{job="coredns",rcode="SERVFAIL"}[5m]))
/
sum(rate(coredns_dns_response_rcode_count_total{job="coredns"}[5m])) > 0.01
for: 10m
labels:
severity: warning
- name: coredns_forward
rules:
- alert: CoreDNSForwardLatencyHigh
annotations:
message: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding requests to {{ $labels.to }}.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwardlatencyhigh
expr: |
histogram_quantile(0.99, sum(rate(coredns_forward_request_duration_seconds_bucket{job="coredns"}[5m])) by(to, le)) > 4
for: 10m
labels:
severity: critical
- alert: CoreDNSForwardErrorsHigh
annotations:
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of forward requests to {{ $labels.to }}.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh
expr: |
sum(rate(coredns_forward_response_rcode_count_total{job="coredns",rcode="SERVFAIL"}[5m]))
/
sum(rate(coredns_forward_response_rcode_count_total{job="coredns"}[5m])) > 0.03
for: 10m
labels:
severity: critical
- alert: CoreDNSForwardErrorsHigh
annotations:
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of forward requests to {{ $labels.to }}.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh
expr: |
sum(rate(coredns_forward_response_rcode_count_total{job="coredns",rcode="SERVFAIL"}[5m]))
/
sum(rate(coredns_forward_response_rcode_count_total{job="coredns"}[5m])) > 0.01
for: 10m
labels:
severity: warning
- name: prometheus-operator
rules:
- alert: PrometheusOperatorListErrors
annotations:
description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-prometheusoperatorlisterrors
summary: Errors while performing list operations in controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator"}[10m]))) > 0.4
for: 15m
labels:
severity: warning
- alert: PrometheusOperatorWatchErrors
annotations:
description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-prometheusoperatorwatcherrors
summary: Errors while performing watch operations in controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator"}[10m]))) > 0.4
for: 15m
labels:
severity: warning
- alert: PrometheusOperatorSyncFailed
annotations:
description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-prometheusoperatorsyncfailed
summary: Last controller reconciliation failed
expr: |
min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator"}[5m]) > 0
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorReconcileErrors
annotations:
description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-prometheusoperatorreconcileerrors
summary: Errors while reconciling controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator"}[5m]))) > 0.1
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorNodeLookupErrors
annotations:
description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-prometheusoperatornodelookuperrors
summary: Errors while reconciling Prometheus.
expr: |
rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorNotReady
annotations:
description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-prometheusoperatornotready
summary: Prometheus operator not ready
expr: |
min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator"}[5m]) == 0)
for: 5m
labels:
severity: warning
- alert: PrometheusOperatorRejectedResources
annotations:
description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-prometheusoperatorrejectedresources
summary: Resources rejected by Prometheus operator
expr: |
min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator"}[5m]) > 0
for: 5m
labels:
severity: warning
- name: thanos-compact
rules:
- alert: ThanosCompactMultipleRunning
annotations:
description: No more than one Thanos Compact instance should be running at once. There are {{ $value }}.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactmultiplerunning
summary: Thanos Compact has multiple instances running.
expr: sum(up{job=~"thanos-compact.*"}) > 1
for: 5m
labels:
severity: warning
- alert: ThanosCompactHalted
annotations:
description: Thanos Compact {{$labels.job}} has failed to run and now is halted.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthalted
summary: Thanos Compact has failed to run and is now halted.
expr: thanos_compact_halted{job=~"thanos-compact.*"} == 1
for: 5m
labels:
severity: warning
- alert: ThanosCompactHighCompactionFailures
annotations:
description: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize }}% of compactions.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthighcompactionfailures
summary: Thanos Compact is failing to execute compactions.
expr: |
(
sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compact.*"}[5m]))
/
sum by (job) (rate(thanos_compact_group_compactions_total{job=~"thanos-compact.*"}[5m]))
* 100 > 5
)
for: 15m
labels:
severity: warning
- alert: ThanosCompactBucketHighOperationFailures
annotations:
description: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactbuckethighoperationfailures
summary: Thanos Compact Bucket is having a high number of operation failures.
expr: |
(
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-compact.*"}[5m]))
/
sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~"thanos-compact.*"}[5m]))
* 100 > 5
)
for: 15m
labels:
severity: warning
- alert: ThanosCompactHasNotRun
annotations:
description: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthasnotrun
summary: Thanos Compact has not uploaded anything for last 24 hours.
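# Hours since the last successful bucket upload; fires once nothing has been uploaded for more than 24 hours.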
expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h]))) / 60 / 60 > 24
labels:
severity: warning
- name: thanos-sidecar
rules:
- alert: ThanosSidecarPrometheusDown
annotations:
description: Thanos Sidecar {{$labels.job}} {{$labels.instance}} cannot connect to Prometheus.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarprometheusdown
summary: Thanos Sidecar cannot connect to Prometheus
expr: |
sum by (job, instance) (thanos_sidecar_prometheus_up{job=~"thanos-sidecar.*"} == 0)
for: 5m
labels:
severity: critical
- alert: ThanosSidecarBucketOperationsFailed
annotations:
description: Thanos Sidecar {{$labels.job}} {{$labels.instance}} bucket operations are failing
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarbucketoperationsfailed
summary: Thanos Sidecar bucket operations are failing
expr: |
rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-sidecar.*"}[5m]) > 0
for: 5m
labels:
severity: critical
- alert: ThanosSidecarUnhealthy
annotations:
description: Thanos Sidecar {{$labels.job}} {{$labels.instance}} is unhealthy for more than {{$value}} seconds.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy
summary: Thanos Sidecar is unhealthy.
expr: |
time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"})) >= 240
labels:
severity: critical
- name: thanos-store
rules:
- alert: ThanosStoreGrpcErrorRate
annotations:
description: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoregrpcerrorrate
summary: Thanos Store is failing to handle gRPC requests.
expr: |
(
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*"}[5m]))
/
sum by (job) (rate(grpc_server_started_total{job=~"thanos-store.*"}[5m]))
* 100 > 5
)
for: 5m
labels:
severity: warning
- alert: ThanosStoreSeriesGateLatencyHigh
annotations:
description: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for store series gate requests.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreseriesgatelatencyhigh
summary: Thanos Store has high latency for store series gate requests.
expr: |
(
histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
and
sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~"thanos-store.*"}[5m])) > 0
)
for: 10m
labels:
severity: warning
- alert: ThanosStoreBucketHighOperationFailures
annotations:
description: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstorebuckethighoperationfailures
summary: Thanos Store Bucket is failing to execute operations.
expr: |
(
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m]))
/
sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~"thanos-store.*"}[5m]))
* 100 > 5
)
for: 15m
labels:
severity: warning
- alert: ThanosStoreObjstoreOperationLatencyHigh
annotations:
description: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{ $value }} seconds for the bucket operations.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreobjstoreoperationlatencyhigh
summary: Thanos Store is having high latency for bucket operations.
expr: |
(
histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
and
sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~"thanos-store.*"}[5m])) > 0
)
for: 10m
labels:
severity: warning
- name: thanos-component-absent
rules:
- alert: ThanosCompactIsDown
annotations:
description: ThanosCompact has disappeared from Prometheus target discovery.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactisdown
summary: thanos component has disappeared from Prometheus target discovery.
expr: |
absent(up{job=~"thanos-compact.*"} == 1)
for: 5m
labels:
severity: critical
- alert: ThanosSidecarIsDown
annotations:
description: ThanosSidecar has disappeared from Prometheus target discovery.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarisdown
summary: thanos component has disappeared from Prometheus target discovery.
expr: |
absent(up{job=~"thanos-sidecar.*"} == 1)
for: 5m
labels:
severity: critical
- alert: ThanosStoreIsDown
annotations:
description: ThanosStore has disappeared from Prometheus target discovery.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreisdown
summary: thanos component has disappeared from Prometheus target discovery.
expr: |
absent(up{job=~"thanos-store.*"} == 1)
for: 5m
labels:
severity: critical
- name: jaeger_alerts
rules:
- alert: JaegerAgentUDPPacketsBeingDropped
annotations:
message: |
{{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }} UDP packets per second.
expr: rate(jaeger_agent_thrift_udp_server_packets_dropped_total[1m]) > 1
for: 15m
labels:
severity: warning
- alert: JaegerAgentHTTPServerErrs
annotations:
message: |
{{ $labels.job }} {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% HTTP errors.
expr: 100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
severity: warning
- alert: JaegerClientSpansDropped
annotations:
message: |
service {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
expr: 100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
severity: warning
- alert: JaegerAgentSpansDropped
annotations:
message: |
agent {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
expr: 100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
severity: warning
- alert: JaegerCollectorQueueNotDraining
annotations:
message: |
collector {{ $labels.job }} {{ $labels.instance }} is not able to drain the queue.
expr: avg_over_time(jaeger_collector_queue_length[10m]) > 1000
for: 15m
labels:
severity: warning
- alert: JaegerCollectorDroppingSpans
annotations:
message: |
collector {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
expr: 100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
severity: warning
- alert: JaegerSamplingUpdateFailing
annotations:
message: |
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating sampling policies.
expr: 100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
severity: warning
- alert: JaegerCollectorPersistenceSlow
annotations:
message: |
{{ $labels.job }} {{ $labels.instance }} is slow at persisting spans.
expr: histogram_quantile(0.99, sum by (le) (rate(jaeger_collector_save_latency_bucket[1m]))) > 0.5
for: 15m
labels:
severity: warning
- alert: JaegerThrottlingUpdateFailing
annotations:
message: |
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating throttling policies.
expr: 100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
severity: warning
- alert: JaegerQueryReqsFailing
annotations:
message: |
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
expr: 100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
severity: warning
- alert: JaegerCassandraWritesFailing
annotations:
message: |
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
expr: 100 * sum(rate(jaeger_cassandra_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_attempts_total[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
severity: warning
- alert: JaegerCassandraReadsFailing
annotations:
message: |
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
expr: 100 * sum(rate(jaeger_cassandra_read_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_read_attempts_total[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
severity: warning
- name: loki_alerts
rules:
- alert: LokiRequestErrors
annotations:
message: |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
expr: |
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
/
sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
> 10
for: 15m
labels:
severity: critical
- alert: LokiRequestPanics
annotations:
message: |
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
expr: |
sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
labels:
severity: critical
- alert: LokiRequestLatency
annotations:
message: |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
expr: |
namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > 1
for: 15m
labels:
severity: critical
- name: node-exporter
rules:
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: |
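# Fires when free space is already below 40% and predict_linear, extrapolating the last 6h of
# data, expects it to hit zero within 24h; read-only filesystems are excluded.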
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 20
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
summary: Filesystem has less than 5% space left.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
summary: Filesystem has less than 3% space left.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
summary: Filesystem has less than 5% inodes left.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
summary: Filesystem has less than 3% inodes left.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeNetworkReceiveErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
summary: Network interface is reporting many receive errors.
expr: |
rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: 1h
labels:
severity: warning
- alert: NodeNetworkTransmitErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
summary: Network interface is reporting many transmit errors.
expr: |
rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: 1h
labels:
severity: warning
- alert: NodeHighNumberConntrackEntriesUsed
annotations:
description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
summary: Number of conntrack entries is getting close to the limit.
expr: |
(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
labels:
severity: warning
- alert: NodeTextFileCollectorScrapeError
annotations:
description: Node Exporter text file collector failed to scrape.
summary: Node Exporter text file collector failed to scrape.
expr: |
node_textfile_scrape_error{job="node-exporter"} == 1
labels:
severity: warning
- alert: NodeClockSkewDetected
annotations:
description: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
summary: Clock skew detected.
expr: |
(
node_timex_offset_seconds > 0.05
and
deriv(node_timex_offset_seconds[5m]) >= 0
)
or
(
node_timex_offset_seconds < -0.05
and
deriv(node_timex_offset_seconds[5m]) <= 0
)
for: 10m
labels:
severity: warning
- alert: NodeClockNotSynchronising
annotations:
description: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
summary: Clock not synchronising.
expr: |
min_over_time(node_timex_sync_status[5m]) == 0
and
node_timex_maxerror_seconds >= 16
for: 10m
labels:
severity: warning
- alert: NodeRAIDDegraded
annotations:
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
summary: RAID Array is degraded
expr: |
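# Fires when the array has fewer disks in the "active" state than node_md_disks_required says it needs.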
node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
for: 15m
labels:
severity: critical
- alert: NodeRAIDDiskFailure
annotations:
description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
summary: Failed device in RAID array
expr: |
node_md_disks{state="failed"} > 0
labels:
severity: warning
- name: PostgreSQL
rules:
- alert: PostgreSQLMaxConnectionsReached
annotations:
description: '{{ $labels.instance }} is exceeding the currently configured maximum Postgres connection limit (current value: {{ $value }}). Services may be degraded - please take immediate action (you probably need to increase max_connections in the Docker image and re-deploy).'
summary: '{{ $labels.instance }} has maxed out Postgres connections.'
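# Compares current connections against max_connections minus superuser_reserved_connections, summed per instance.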
expr: sum(pg_stat_activity_count) by (instance) >= sum(pg_settings_max_connections) by (instance) - sum(pg_settings_superuser_reserved_connections) by (instance)
for: 1m
labels:
severity: email
- alert: PostgreSQLHighConnections
annotations:
description: '{{ $labels.instance }} is exceeding 80% of the currently configured maximum Postgres connection limit (current value: {{ $value }}). Please check utilization graphs and confirm if this is normal service growth, abuse or an otherwise temporary condition, or if new resources need to be provisioned (or the limits increased, which is most likely).'
summary: '{{ $labels.instance }} is over 80% of max Postgres connections.'
expr: sum(pg_stat_activity_count) by (instance) > (sum(pg_settings_max_connections) by (instance) - sum(pg_settings_superuser_reserved_connections) by (instance)) * 0.8
for: 10m
labels:
severity: email
- alert: PostgreSQLDown
annotations:
description: '{{ $labels.instance }} is rejecting query requests from the exporter, and thus probably not allowing DNS requests to work either. User services should not be affected provided at least 1 node is still alive.'
summary: 'PostgreSQL is not processing queries: {{ $labels.instance }}'
expr: pg_up != 1
for: 1m
labels:
severity: email
- alert: PostgreSQLSlowQueries
annotations:
description: 'PostgreSQL high number of slow queries on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}.'
summary: 'PostgreSQL high number of slow queries on {{ $labels.cluster }} for database {{ $labels.datname }}.'
expr: avg(rate(pg_stat_activity_max_tx_duration{datname!~"template.*"}[2m])) by (datname) > 2 * 60
for: 2m
labels:
severity: email
- alert: PostgreSQLQPS
annotations:
description: PostgreSQL high number of queries per second on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}
summary: PostgreSQL high number of queries per second {{ $labels.cluster }} for database {{ $labels.datname }}
expr: avg(irate(pg_stat_database_xact_commit{datname!~"template.*"}[5m]) + irate(pg_stat_database_xact_rollback{datname!~"template.*"}[5m])) by (datname) > 10000
for: 5m
labels:
severity: email
- alert: PostgreSQLCacheHitRatio
annotations:
description: PostgreSQL low on cache hit rate on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}
summary: PostgreSQL low cache hit rate on {{ $labels.cluster }} for database {{ $labels.datname }}
expr: avg(rate(pg_stat_database_blks_hit{datname!~"template.*"}[5m]) / (rate(pg_stat_database_blks_hit{datname!~"template.*"}[5m]) + rate(pg_stat_database_blks_read{datname!~"template.*"}[5m]))) by (datname) < 0.98
for: 5m
labels:
severity: email
- name: prometheus
rules:
- alert: PrometheusBadConfig
annotations:
description: Prometheus {{$labels.instance}} has failed to reload its configuration.
summary: Failed Prometheus configuration reload.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(prometheus_config_last_reload_successful{job="prometheus"}[5m]) == 0
for: 10m
labels:
severity: critical
- alert: PrometheusNotificationQueueRunningFull
annotations:
description: Alert notification queue of Prometheus {{$labels.instance}} is running full.
summary: Prometheus alert notification queue predicted to run full in less than 30m.
expr: |
# Without min_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
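# predict_linear extrapolates the queue length 30 minutes ahead and compares it against the queue capacity.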
(
predict_linear(prometheus_notifications_queue_length{job="prometheus"}[5m], 60 * 30)
>
min_over_time(prometheus_notifications_queue_capacity{job="prometheus"}[5m])
)
for: 15m
labels:
severity: warning
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
expr: |
(
rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
/
rate(prometheus_notifications_sent_total{job="prometheus"}[5m])
)
* 100
> 1
for: 15m
labels:
severity: warning
- alert: PrometheusNotConnectedToAlertmanagers
annotations:
description: Prometheus {{$labels.instance}} is not connected to any Alertmanagers.
summary: Prometheus is not connected to any Alertmanagers.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus"}[5m]) < 1
for: 10m
labels:
severity: warning
- alert: PrometheusTSDBReloadsFailing
annotations:
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} reload failures over the last 3h.
summary: Prometheus has issues reloading blocks from disk.
expr: |
increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[3h]) > 0
for: 4h
labels:
severity: warning
- alert: PrometheusTSDBCompactionsFailing
annotations:
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} compaction failures over the last 3h.
summary: Prometheus has issues compacting blocks.
expr: |
increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[3h]) > 0
for: 4h
labels:
severity: warning
- alert: PrometheusNotIngestingSamples
annotations:
description: Prometheus {{$labels.instance}} is not ingesting samples.
summary: Prometheus is not ingesting samples.
expr: |
(
rate(prometheus_tsdb_head_samples_appended_total{job="prometheus"}[5m]) <= 0
and
(
sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus"}) > 0
or
sum without(rule_group) (prometheus_rule_group_rules{job="prometheus"}) > 0
)
)
for: 10m
labels:
severity: warning
- alert: PrometheusDuplicateTimestamps
annotations:
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp.
summary: Prometheus is dropping samples with duplicate timestamps.
expr: |
rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0
for: 10m
labels:
severity: warning
- alert: PrometheusOutOfOrderTimestamps
annotations:
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
summary: Prometheus drops samples with out-of-order timestamps.
expr: |
rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus"}[5m]) > 0
for: 10m
labels:
severity: warning
- alert: PrometheusRemoteStorageFailures
annotations:
description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
summary: Prometheus fails to send samples to remote storage.
expr: |
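# The `or` selects whichever of the two remote-write counter names the running Prometheus
# version exposes (the counters were renamed between releases).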
(
(rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus"}[5m]))
/
(
(rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus"}[5m]))
+
(rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus"}[5m]))
)
)
* 100
> 1
for: 15m
labels:
severity: critical
- alert: PrometheusRemoteWriteBehind
annotations:
description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
summary: Prometheus remote write is behind.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
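# Compares the newest sample timestamp seen by Prometheus against the newest timestamp each
# remote-write queue has sent; `ignoring(remote_name, url) group_right` joins the single
# ingest series onto every queue.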
(
max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus"}[5m])
- ignoring(remote_name, url) group_right
max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus"}[5m])
)
> 120
for: 15m
labels:
severity: critical
- alert: PrometheusRemoteWriteDesiredShards
annotations:
description: Prometheus {{$labels.instance}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}` $labels.instance | query | first | value }}.
summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
max_over_time(prometheus_remote_storage_shards_desired{job="prometheus"}[5m])
>
max_over_time(prometheus_remote_storage_shards_max{job="prometheus"}[5m])
)
for: 15m
labels:
severity: warning
- alert: PrometheusRuleFailures
annotations:
description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
summary: Prometheus is failing rule evaluations.
expr: |
increase(prometheus_rule_evaluation_failures_total{job="prometheus"}[5m]) > 0
for: 15m
labels:
severity: critical
- alert: PrometheusMissingRuleEvaluations
annotations:
description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
expr: |
increase(prometheus_rule_group_iterations_missed_total{job="prometheus"}[5m]) > 0
for: 15m
labels:
severity: warning
- alert: PrometheusTargetLimitHit
annotations:
description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
expr: |
increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus"}[5m]) > 0
for: 15m
labels:
severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.'
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: |
min without (alertmanager) (
rate(prometheus_notifications_errors_total{job="prometheus",alertmanager!~``}[5m])
/
rate(prometheus_notifications_sent_total{job="prometheus",alertmanager!~``}[5m])
)
* 100
> 3
for: 15m
labels:
severity: critical
- name: minio
rules:
- alert: minioDisksOffline
annotations:
message: MinIO '{{ $labels.instance }}' has disks offline
expr: |
minio_disks_offline != 0
for: 1m
labels:
severity: critical
- alert: minioStorageUsed
annotations:
message: MinIO disk '{{ $labels.disk }}' has more than 80% of storage used
expr: |
disk_storage_used / disk_storage_total > 0.8
for: 1m
labels:
severity: warning
- name: promtail_alerts
rules:
- alert: PromtailRequestsErrors
annotations:
message: |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
expr: |
100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
/
sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
> 10
for: 15m
labels:
severity: critical
- alert: PromtailRequestLatency
annotations:
message: |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
expr: |
job_status_code_namespace:promtail_request_duration_seconds:99quantile > 1
for: 15m
labels:
severity: critical
- alert: PromtailFileLagging
annotations:
message: |
{{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} has been lagging by more than 1MB for more than 15m.
expr: |
abs(promtail_file_bytes_total - promtail_read_bytes_total) > 1e6
for: 15m
labels:
severity: warning
- alert: PromtailFileMissing
annotations:
message: |
{{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} matches the glob but is not being tailed.
expr: |
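# `unless` keeps series present on the left with no match on the right: files Promtail has
# discovered (bytes total) but is not actually reading.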
promtail_file_bytes_total unless promtail_read_bytes_total
for: 15m
labels:
severity: critical
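# Recording rules below; this second top-level `groups:` key suggests a separate rule file
# concatenated after the alerts, since duplicate keys are not valid in a single YAML document.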
groups:
- name: thanos-query.rules
rules:
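# Recording rule names follow the level:metric:operations convention; a leading colon marks an
# aggregation that keeps no instance-level labels.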
- expr: |
(
sum(rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*", grpc_type="unary"}[5m]))
/
sum(rate(grpc_client_started_total{job=~"thanos-query.*", grpc_type="unary"}[5m]))
)
record: :grpc_client_failures_per_unary:sum_rate
- expr: |
(
sum(rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*", grpc_type="server_stream"}[5m]))
/
sum(rate(grpc_client_started_total{job=~"thanos-query.*", grpc_type="server_stream"}[5m]))
)
record: :grpc_client_failures_per_stream:sum_rate
- expr: |
(
sum(rate(thanos_query_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
/
sum(rate(thanos_query_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
)
record: :thanos_query_store_apis_dns_failures_per_lookup:sum_rate
- expr: |
histogram_quantile(0.99,
sum(rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m])) by (le)
)
labels:
quantile: "0.99"
record: :query_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.99,
sum(rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m])) by (le)
)
labels:
quantile: "0.99"
record: :api_range_query_duration_seconds:histogram_quantile
- name: thanos-receive.rules
rules:
- expr: |
sum(
rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-receive.*", grpc_type="unary"}[5m])
/
rate(grpc_server_started_total{job=~"thanos-receive.*", grpc_type="unary"}[5m])
)
record: :grpc_server_failures_per_unary:sum_rate
- expr: |
sum(
rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-receive.*", grpc_type="server_stream"}[5m])
/
rate(grpc_server_started_total{job=~"thanos-receive.*", grpc_type="server_stream"}[5m])
)
record: :grpc_server_failures_per_stream:sum_rate
- expr: |
sum(
rate(http_requests_total{handler="receive", job=~"thanos-receive.*", code!~"5.."}[5m])
/
rate(http_requests_total{handler="receive", job=~"thanos-receive.*"}[5m])
)
record: :http_failure_per_request:sum_rate
- expr: |
histogram_quantile(0.99,
sum(rate(http_request_duration_seconds_bucket{handler="receive", job=~"thanos-receive.*"}[5m])) by (le)
)
labels:
quantile: "0.99"
record: :http_request_duration_seconds:histogram_quantile
- expr: |
(
sum(rate(thanos_receive_replications_total{result="error", job=~"thanos-receive.*"}[5m]))
/
sum(rate(thanos_receive_replications_total{job=~"thanos-receive.*"}[5m]))
)
record: :thanos_receive_replication_failure_per_requests:sum_rate
- expr: |
(
sum(rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
/
sum(rate(thanos_receive_forward_requests_total{job=~"thanos-receive.*"}[5m]))
)
record: :thanos_receive_forward_failure_per_requests:sum_rate
- expr: |
(
sum(rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m]))
/
sum(rate(thanos_receive_hashrings_file_refreshes_total{job=~"thanos-receive.*"}[5m]))
)
record: :thanos_receive_hashring_file_failure_per_refresh:sum_rate
- name: thanos-store.rules
rules:
- expr: |
(
sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*", grpc_type="unary"}[5m]))
/
sum(rate(grpc_server_started_total{job=~"thanos-store.*", grpc_type="unary"}[5m]))
)
record: :grpc_server_failures_per_unary:sum_rate
- expr: |
(
sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*", grpc_type="server_stream"}[5m]))
/
sum(rate(grpc_server_started_total{job=~"thanos-store.*", grpc_type="server_stream"}[5m]))
)
record: :grpc_server_failures_per_stream:sum_rate
- expr: |
(
sum(rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m]))
/
sum(rate(thanos_objstore_bucket_operations_total{job=~"thanos-store.*"}[5m]))
)
record: :thanos_objstore_bucket_failures_per_operation:sum_rate
- expr: |
histogram_quantile(0.99,
sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m])) by (le)
)
labels:
quantile: "0.99"
record: :thanos_objstore_bucket_operation_duration_seconds:histogram_quantile
- name: thanos-bucket-replicate.rules
rules: []
- name: loki_rules
rules:
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job))
record: job:loki_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job))
record: job:loki_request_duration_seconds:50quantile
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job) / sum(rate(loki_request_duration_seconds_count[1m])) by (job)
record: job:loki_request_duration_seconds:avg
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job)
record: job:loki_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
record: job:loki_request_duration_seconds_sum:sum_rate
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (job)
record: job:loki_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route))
record: job_route:loki_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route))
record: job_route:loki_request_duration_seconds:50quantile
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
record: job_route:loki_request_duration_seconds:avg
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route)
record: job_route:loki_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
record: job_route:loki_request_duration_seconds_sum:sum_rate
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
record: job_route:loki_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job, route))
record: namespace_job_route:loki_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job, route))
record: namespace_job_route:loki_request_duration_seconds:50quantile
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
record: namespace_job_route:loki_request_duration_seconds:avg
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job, route)
record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
record: namespace_job_route:loki_request_duration_seconds_count:sum_rate
- name: node-exporter.rules
rules:
- expr: |
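# The nested counts collapse the per-mode series first, then count the remaining per-cpu series,
# giving the number of CPUs per instance.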
count without (cpu) (
count without (mode) (
node_cpu_seconds_total{job="node-exporter"}
)
)
record: instance:node_num_cpu:sum
- expr: |
1 - avg without (cpu, mode) (
rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m])
)
record: instance:node_cpu_utilisation:rate1m
- expr: |
(
node_load1{job="node-exporter"}
/
instance:node_num_cpu:sum{job="node-exporter"}
)
record: instance:node_load1_per_cpu:ratio
- expr: |
1 - (
node_memory_MemAvailable_bytes{job="node-exporter"}
/
node_memory_MemTotal_bytes{job="node-exporter"}
)
record: instance:node_memory_utilisation:ratio
- expr: |
rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
record: instance:node_vmstat_pgmajfault:rate1m
- expr: |
rate(node_disk_io_time_seconds_total{job="node-exporter", device!=""}[1m])
record: instance_device:node_disk_io_time_seconds:rate1m
- expr: |
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device!=""}[1m])
record: instance_device:node_disk_io_time_weighted_seconds:rate1m
- expr: |
sum without (device) (
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_receive_bytes_excluding_lo:rate1m
- expr: |
sum without (device) (
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_transmit_bytes_excluding_lo:rate1m
- expr: |
sum without (device) (
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_receive_drop_excluding_lo:rate1m
- expr: |
sum without (device) (
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_transmit_drop_excluding_lo:rate1m
- name: promtail_rules
rules:
- expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job))
record: job:promtail_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job))
record: job:promtail_request_duration_seconds:50quantile
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job) / sum(rate(promtail_request_duration_seconds_count[1m])) by (job)
record: job:promtail_request_duration_seconds:avg
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job)
record: job:promtail_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job)
record: job:promtail_request_duration_seconds_sum:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job)
record: job:promtail_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, namespace))
record: job_namespace:promtail_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, namespace))
record: job_namespace:promtail_request_duration_seconds:50quantile
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace) / sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
record: job_namespace:promtail_request_duration_seconds:avg
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, namespace)
record: job_namespace:promtail_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
record: job_namespace:promtail_request_duration_seconds_sum:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
record: job_namespace:promtail_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, status_code, namespace))
record: job_status_code_namespace:promtail_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, status_code, namespace))
record: job_status_code_namespace:promtail_request_duration_seconds:50quantile
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code, namespace) / sum(rate(promtail_request_duration_seconds_count[1m])) by (job, status_code, namespace)
record: job_status_code_namespace:promtail_request_duration_seconds:avg
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, status_code, namespace)
record: job_status_code_namespace:promtail_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code, namespace)
record: job_status_code_namespace:promtail_request_duration_seconds_sum:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, status_code, namespace)
record: job_status_code_namespace:promtail_request_duration_seconds_count:sum_rate