Skip to content

Instantly share code, notes, and snippets.

@mattmattox
Created April 12, 2022 17:33
Show Gist options
  • Save mattmattox/9d442945a40e7c0b06de875a6e14cf6c to your computer and use it in GitHub Desktop.
Custom PrometheusRule to pageout to Slack
---
# Custom PrometheusRule that labels alerts with `severity: slack-alert`,
# which an Alertmanager route matches to page out to Slack.
# NOTE(review): indentation was reconstructed — the pasted source had all
# keys flush-left, which is not valid YAML for this document.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  annotations:
    meta.helm.sh/release-name: monitoring
    meta.helm.sh/release-namespace: monitoring
    prometheus-operator-validated: "true"
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/instance: monitoring
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/part-of: kube-prometheus-stack
    # Quoted so tooling never retypes the version as a number.
    app.kubernetes.io/version: "34.9.0"
    chart: kube-prometheus-stack-34.9.0
    cluster: a-rke2-devops
    heritage: Helm
    release: monitoring
  name: monitoring-kube-prometheus-slack.rules
  namespace: monitoring
spec:
  groups:
    - name: slack.rules
      rules:
        # ---- node_exporter host-level alerts ----
        - alert: HostOutOfMemory
          expr: 'node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10'
          for: 2m
          labels:
            severity: slack-alert
        - alert: HostMemoryUnderMemoryPressure
          expr: 'rate(node_vmstat_pgmajfault[1m]) > 1000'
          for: 2m
          labels:
            severity: slack-alert
        - alert: HostUnusualNetworkThroughputIn
          expr: 'sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100'
          for: 2m
          labels:
            severity: slack-alert
        - alert: HostUnusualNetworkThroughputOut
          expr: 'sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100'
          for: 2m
          labels:
            severity: slack-alert
        - alert: HostUnusualDiskReadRate
          expr: 'sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50'
          for: 5m
          labels:
            severity: slack-alert
        - alert: HostUnusualDiskWriteRate
          # FIX: node_exporter exposes node_disk_written_bytes_total;
          # the original node_disk_write_bytes_total does not exist.
          expr: 'sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50'
          for: 5m
          labels:
            severity: slack-alert
        - alert: HostOutOfDiskSpace
          expr: '(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
          for: 2m
          labels:
            severity: slack-alert
        - alert: HostDiskWillFillIn24Hours
          expr: '(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
          for: 2m
          labels:
            severity: slack-alert
        - alert: HostDiskWillFillIn48Hours
          expr: '(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 48 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
          for: 2m
          labels:
            severity: slack-alert
        - alert: HostOutOfInodes
          expr: 'node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0'
          for: 2m
          labels:
            severity: slack-alert
        - alert: HostUnusualDiskReadLatency
          expr: 'rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0'
          for: 2m
          labels:
            severity: slack-alert
        - alert: HostUnusualDiskWriteLatency
          expr: 'rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0'
          for: 2m
          labels:
            severity: slack-alert
        - alert: HostHighCpuLoad
          expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80'
          for: 2m
          labels:
            severity: slack-alert
        - alert: HostCpuStealNoisyNeighbor
          expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
          for: 2m
          labels:
            severity: slack-alert
        - alert: HostContextSwitching
          expr: '(rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000'
          for: 2m
          labels:
            severity: slack-alert
        # ---- kube-state-metrics node alerts ----
        - alert: KubernetesNodeNotReady
          # FIX: the original `{condition="Ready",status="false"} == 0` fires
          # for every HEALTHY node (the status="false" series is 0 when the
          # node is ready). A node is NotReady when the status="true" series
          # equals 0.
          expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesNodeNotSchedulable
          # FIX: there is no "Schedulable" node condition in
          # kube_node_status_condition; cordoned nodes are reported via
          # kube_node_spec_unschedulable.
          expr: 'kube_node_spec_unschedulable == 1'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesNodeMemoryPressure
          expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesNodeDiskPressure
          expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesNodeNetworkUnavailable
          expr: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesNodeOutOfDisk
          # NOTE(review): the OutOfDisk node condition was removed in
          # Kubernetes 1.12+ — this rule will never fire on modern clusters;
          # confirm whether it can be dropped.
          expr: 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesOutOfCapacity
          expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesContainerOomKiller
          # NOTE(review): kube_container_status_last_seen_seconds_ago is not a
          # standard kube-state-metrics series — verify the exporter that
          # provides it, or switch to a restarts + last_terminated_reason
          # based rule.
          expr: 'kube_container_status_last_seen_seconds_ago{state="OOMKilled",container=~"/kubelet.*"} > 10'
          for: 10m
          labels:
            severity: slack-alert
        # ---- PVC / PV alerts ----
        - alert: KubernetesPersistentVolumeClaimPending
          # FIX: kube_persistentvolumeclaim_info has no `status` label, so the
          # original selector matched nothing; phase is exposed via
          # kube_persistentvolumeclaim_status_phase.
          expr: 'kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesPersistentVolumeClaimLost
          expr: 'kube_persistentvolumeclaim_status_phase{phase="Lost"} == 1'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesPersistentVolumeClaimFailed
          # NOTE(review): PVCs only have phases Pending/Bound/Lost — "Failed"
          # never occurs; confirm intent or drop this rule.
          expr: 'kube_persistentvolumeclaim_status_phase{phase="Failed"} == 1'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesVolumeOutOfDiskSpace
          # NOTE(review): kube_persistentvolume_info has no `status` label and
          # "OutOfDisk" is not a PV phase — this rule can never fire; the
          # kubelet-stats rule below (KubernetesVolumeFillingUp) covers
          # volume capacity. Confirm whether this rule should be removed.
          expr: 'kube_persistentvolume_status_phase{phase="OutOfDisk"} == 1'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesVolumeFailed
          # FIX: phase is exposed via kube_persistentvolume_status_phase
          # (kube_persistentvolume_info has no `status` label).
          expr: 'kube_persistentvolume_status_phase{phase="Failed"} == 1'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesVolumeLost
          # NOTE(review): "Lost" is not a PV phase (Pending/Available/Bound/
          # Released/Failed) — confirm intent.
          expr: 'kube_persistentvolume_status_phase{phase="Lost"} == 1'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesVolumePending
          expr: 'kube_persistentvolume_status_phase{phase="Pending"} == 1'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesVolumeFillingUp
          # FIX: renamed from KubernetesVolumeOutOfDiskSpace — that name was
          # already used by an earlier rule in this group, making the two
          # alerts indistinguishable in Alertmanager.
          expr: 'kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10'
          for: 2m
          labels:
            severity: slack-alert
        # ---- workload alerts ----
        - alert: KubernetesStatefulSetDown
          # NOTE(review): kube_statefulset_status_replicas has no `status`
          # label, so this never matches; consider
          # kube_statefulset_status_replicas_ready < kube_statefulset_replicas.
          expr: 'kube_statefulset_status_replicas{status="Failed"} > 0'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesStatefulSetPending
          # NOTE(review): same missing-label issue as KubernetesStatefulSetDown.
          expr: 'kube_statefulset_status_replicas{status="Pending"} > 0'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesPodNotHealthy
          expr: 'min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[15m:1m]) > 0'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesPodCrashLooping
          # NOTE(review): kube_pod_container_status_restarts_total has no
          # `reason` label — this selector matches nothing; the usual form is
          # increase(kube_pod_container_status_restarts_total[...]) > N.
          expr: 'min_over_time(sum by (namespace, pod) (kube_pod_container_status_restarts_total{container=~"kube-.*",reason="CrashLooping"})[15m:1m]) > 0'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesPodRestarting
          expr: 'min_over_time(sum by (namespace, pod) (kube_pod_container_status_restarts_total{container=~"kube-.*"})[15m:1m]) > 0'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesReplicasSetMismatch
          # NOTE(review): name says ReplicaSet but the metrics are for
          # ReplicationControllers — confirm which resource is intended.
          expr: 'kube_replicationcontroller_status_replicas != kube_replicationcontroller_status_replicas_current'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesDeploymentReplicasMismatch
          expr: 'kube_deployment_status_replicas != kube_deployment_status_replicas_current'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesDeploymentFailed
          expr: 'kube_deployment_status_replicas != kube_deployment_status_replicas_current and kube_deployment_status_replicas_available == 0'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesStatefulSetReplicasMismatch
          expr: 'kube_statefulset_status_replicas != kube_statefulset_status_replicas_current'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesStatefulSetFailed
          # NOTE(review): kube_statefulset_status_replicas_available may not
          # exist on older kube-state-metrics releases — verify.
          expr: 'kube_statefulset_status_replicas != kube_statefulset_status_replicas_current and kube_statefulset_status_replicas_available == 0'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesDaemonSetReplicasMismatch
          # NOTE(review): kube-state-metrics exposes DaemonSet counts as
          # *_number_* series (e.g. kube_daemonset_status_desired_number_scheduled),
          # not kube_daemonset_status_replicas* — verify these series exist.
          expr: 'kube_daemonset_status_replicas != kube_daemonset_status_replicas_current'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesDaemonSetFailed
          expr: 'kube_daemonset_status_replicas != kube_daemonset_status_replicas_current and kube_daemonset_status_replicas_available == 0'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesJobFailed
          expr: 'kube_job_status_failed > 0'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesDeploymentGenerationMismatch
          # NOTE(review): the usual pair is kube_deployment_status_observed_generation
          # vs kube_deployment_metadata_generation — verify the series name here.
          expr: 'kube_deployment_status_observed_generation != kube_deployment_status_replicas_current_generation'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesStatefulSetUpdateNotRolledOut
          expr: 'max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesDaemonSetRolloutStuck
          expr: 'kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesDaemonSetMisscheduled
          expr: 'kube_daemonset_status_number_misscheduled > 0'
          for: 1m
          labels:
            severity: slack-alert
        - alert: KubernetesCronJobTooLong
          expr: 'time() - kube_cronjob_next_schedule_time > 3600'
          for: 1m
          labels:
            severity: slack-alert
        - alert: KubernetesJobSlowCompletion
          expr: 'kube_job_spec_completions - kube_job_status_succeeded > 0'
          for: 12h
          labels:
            severity: slack-alert
        # ---- API server alerts ----
        - alert: KubernetesApiServerErrors
          expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3'
          for: 2m
          labels:
            severity: slack-alert
        - alert: KubernetesApiClientErrors
          expr: '(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1'
          for: 2m
          labels:
            severity: slack-alert
        - alert: KubernetesClientCertificateExpiresNextWeek
          expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60'
          labels:
            severity: slack-alert
        - alert: KubernetesClientCertificateExpiresSoon
          expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60'
          labels:
            severity: slack-alert
        - alert: KubernetesClientCertificateExpired
          expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 0'
          labels:
            severity: slack-alert
        - alert: KubernetesApiServerLatency
          # FIX: alert name typo ("ApiSserver" -> "ApiServer").
          expr: 'apiserver_request_duration_seconds_sum{job="apiserver"} > 0 and histogram_quantile(0.99, sum by (job, le) (rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m]))) > 0.5'
          labels:
            severity: slack-alert
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment