-
-
Save brancz/eb25f016a663eebd1f44b0261e577874 to your computer and use it in GitHub Desktop.
prometheus standard alerts for etcd
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# general cluster availability
# ============================
# Alert if the loss of one more peer would cost the etcd cluster its quorum:
# more than (N/2 - 1) members are already down.
ALERT InsufficientPeers
  IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
  FOR 3m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "Etcd cluster small",
    description = "If one more etcd peer goes down the cluster will be unavailable",
  }
# etcd leader alerts
# ==================
# Alert if any etcd member reports that it currently has no leader
# (etcd_server_has_leader == 0) for a full minute.
ALERT EtcdNoLeader
  IF etcd_server_has_leader{job="etcd"} == 0
  FOR 1m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "etcd node has no leader",
    description = "etcd node {{ $labels.instance }} has no leader",
  }
# Alert if the cluster is churning leaders: more than 3 leader changes
# observed by an instance within the last hour. No FOR clause — the 1h
# increase() window already provides the damping.
ALERT HighNumberOfLeaderChanges
  IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of leader changes within the etcd cluster are happening",
    description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour",
  }
# gRPC request alerts
# ===================
# Alert if more than 1% of gRPC method calls have failed within the last 5 minutes.
# The ratio is multiplied by 100 so that {{ $value }} in the description is a
# percentage, as the "%" suffix there implies (100 * ratio > 1 <=> ratio > 0.01).
ALERT HighNumberOfFailedGRPCRequests
  IF 100 * (sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
    / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m]))) > 1
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of gRPC requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
  }
# Alert if more than 5% of gRPC method calls have failed within the last 5 minutes.
# Same expression as the warning-level rule above but with a higher threshold and
# shorter FOR; scaled by 100 so {{ $value }} matches the "%" in the description.
ALERT HighNumberOfFailedGRPCRequests
  IF 100 * (sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
    / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m]))) > 5
  FOR 5m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "a high number of gRPC requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
  }
# Alert if the 99th percentile of gRPC unary request durations exceeds 150ms.
# Fixed: the template variable is $labels (with an s), not $label, and
# "ectd" was a typo for "etcd".
ALERT GRPCRequestsSlow
  IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "slow gRPC requests",
    description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow",
  }
# HTTP requests alerts
# ====================
# Alert if more than 1% of requests to an HTTP endpoint have failed within the
# last 5 minutes. Scaled by 100 so {{ $value }} in the description is a
# percentage, matching its "%" suffix (100 * ratio > 1 <=> ratio > 0.01).
ALERT HighNumberOfFailedHTTPRequests
  IF 100 * (sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
    / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m]))) > 1
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
  }
# Alert if more than 5% of requests to an HTTP endpoint have failed within the
# last 5 minutes. Critical-level counterpart of the warning rule above;
# scaled by 100 so {{ $value }} matches the "%" in the description.
ALERT HighNumberOfFailedHTTPRequests
  IF 100 * (sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
    / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m]))) > 5
  FOR 5m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
  }
# Alert if the 99th percentile of successful HTTP request durations exceeds 150ms.
# Fixed: $label -> $labels (the template variable has an s) and "ectd" -> "etcd".
ALERT HTTPRequestsSlow
  IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "slow HTTP requests",
    description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
  }
# file descriptor alerts
# ======================
# Recording rule: per-instance fraction of open file descriptors vs the
# process limit, used by the FdExhaustionClose alerts below.
instance:fd_utilization = process_open_fds / process_max_fds
# Alert if, extrapolating the last hour's trend, fd utilization would pass
# 100% within the next 4 hours (3600 * 4 seconds).
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon",
  }
# Alert if, extrapolating the last 10 minutes' trend, fd utilization would
# pass 100% within the next hour — a faster, critical-level variant of the
# warning rule above.
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[10m], 3600) > 1
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon",
  }
# etcd peer communication alerts
# ==============================
# Alert if the 99th percentile of peer round-trip times exceeds 150ms.
# Fixed: $label.To -> $labels.To (template variable) and "ectd" -> "etcd".
ALERT EtcdPeerCommunicationSlow
  IF histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) > 0.15
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "etcd peer communication is slow",
    description = "etcd instance {{ $labels.instance }} peer communication with {{ $labels.To }} is slow",
  }
# etcd proposal alerts
# ====================
# Alert if more than 5 raft proposals failed within the last hour. No FOR
# clause — the 1h increase() window already provides the damping.
ALERT HighNumberOfFailedProposals
  IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of failed proposals within the etcd cluster are happening",
    description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
  }
# etcd disk io latency alerts
# ===========================
# Alert if the 99th percentile of WAL fsync durations exceeds 500ms.
# Fixed: "ectd" -> "etcd" and "fync" -> "fsync" in the description.
ALERT HighFsyncDurations
  IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "high fsync durations",
    description = "etcd instance {{ $labels.instance }} fsync durations are high",
  }
# Alert if the 99th percentile of backend commit durations exceeds 250ms.
# Fixed: "ectd" -> "etcd" in the description.
ALERT HighCommitDurations
  IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "high commit durations",
    description = "etcd instance {{ $labels.instance }} commit durations are high",
  }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
FYI: you use "ectd" (instead of "etcd") a few times in your descriptions.