Skip to content

Instantly share code, notes, and snippets.

@iamabhishek-dubey
Last active December 18, 2018 07:29
Show Gist options
  • Save iamabhishek-dubey/6029b2fbe319c333e668775a687aad8d to your computer and use it in GitHub Desktop.
Save iamabhishek-dubey/6029b2fbe319c333e668775a687aad8d to your computer and use it in GitHub Desktop.
# Prometheus alerting rules for the "shared" non-production cluster.
#
# Most checks come in three tiers that differ only in threshold and hold time:
#   low    -> 70 %, must hold for 10m
#   medium -> 80 %, must hold for 5m
#   high   -> 90 %, must hold for 2m
#
# NOTE(review): the "cluser_name" label looks like a typo of "cluster_name".
# It is kept unchanged because anything that matches on the label name
# (routing, silences, dashboards) would break if it were renamed - confirm first.
groups:
  - name: cluster-rules
    # Evaluation interval for this group; overrides the global evaluation_interval.
    interval: 30s
    rules:
      # ---- Cluster-wide memory -------------------------------------------
      # Working-set bytes of the id="/" series, summed over all nodes, as a
      # percentage of the summed machine memory.
      - alert: os_cluster_memory_low
        expr: sum (container_memory_working_set_bytes{id="/"}) / sum (machine_memory_bytes{}) *100 > 70
        for: 10m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "low"
          summary: "Cluster memory usage exceeding 70%"
          description: "Cluster memory usage exceeding 70% for more than 10 minutes."
      - alert: os_cluster_memory_medium
        expr: sum (container_memory_working_set_bytes{id="/"}) / sum (machine_memory_bytes{}) *100 > 80
        for: 5m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "medium"
          summary: "Cluster memory usage exceeding 80%"
          description: "Cluster memory usage exceeding 80% for more than 5 minutes."
      - alert: os_cluster_memory_high
        expr: sum (container_memory_working_set_bytes{id="/"}) / sum (machine_memory_bytes{}) *100 > 90
        for: 2m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "high"
          summary: "Cluster memory usage exceeding 90%"
          description: "Cluster memory usage exceeding 90% for more than 2 minutes."

      # ---- Cluster-wide CPU ----------------------------------------------
      # 1m rate of CPU seconds for the id="/" series, summed over all nodes,
      # as a percentage of the summed machine core count.
      - alert: os_cluster_cpu_low
        expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) / sum (machine_cpu_cores) * 100 >= 70
        for: 10m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "low"
          summary: "Cluster CPU usage exceeding 70%"
          description: "Cluster CPU usage exceeding 70% for more than 10 minutes."
      - alert: os_cluster_cpu_medium
        expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) / sum (machine_cpu_cores) * 100 >= 80
        for: 5m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "medium"
          summary: "Cluster CPU usage exceeding 80%"
          description: "Cluster CPU usage exceeding 80% for more than 5 minutes."
      - alert: os_cluster_cpu_high
        expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) / sum (machine_cpu_cores) * 100 >= 90
        for: 2m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "high"
          summary: "Cluster CPU usage exceeding 90%"
          description: "Cluster CPU usage exceeding 90% for more than 2 minutes."

      # ---- Cluster-wide filesystem ---------------------------------------
      # Used bytes over limit bytes, restricted to /dev/sdXN devices of the
      # id="/" series and summed over all nodes.
      - alert: os_cluster_filesystem_low
        expr: sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) / sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) * 100 >= 70
        for: 10m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "low"
          summary: "Cluster filesystem usage exceeding 70%"
          description: "Cluster filesystem usage exceeding 70% for more than 10 minutes."
      - alert: os_cluster_filesystem_medium
        expr: sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) / sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) * 100 >= 80
        for: 5m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "medium"
          summary: "Cluster filesystem usage exceeding 80%"
          description: "Cluster filesystem usage exceeding 80% for more than 5 minutes."
      - alert: os_cluster_filesystem_high
        expr: sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) / sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) * 100 >= 90
        for: 2m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "high"
          summary: "Cluster filesystem usage exceeding 90%"
          description: "Cluster filesystem usage exceeding 90% for more than 2 minutes."

      # ---- Node availability ---------------------------------------------
      # Fires when a target of the "kubernetes-nodes" job reports up == 0.
      - alert: os_node_down
        expr: up{job="kubernetes-nodes"} == 0
        for: 2m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "high"
          summary: "Instance: {{$labels.instance}} is down."
          description: "Instance: {{$labels.instance}} of job {{ $labels.job }} has been down for more than 2 minutes."

      # ---- Per-node memory -----------------------------------------------
      # Same ratio as the cluster memory rules, grouped by kubernetes_io_hostname.
      - alert: os_node_memory_low
        expr: sum (container_memory_working_set_bytes{id="/"}) by (kubernetes_io_hostname) / sum (machine_memory_bytes{}) by (kubernetes_io_hostname) * 100 > 70
        for: 10m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "low"
          summary: "Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 70%"
          description: "Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 70% for more than 10 minutes."
      - alert: os_node_memory_medium
        expr: sum (container_memory_working_set_bytes{id="/"}) by (kubernetes_io_hostname) / sum (machine_memory_bytes{}) by (kubernetes_io_hostname) * 100 > 80
        for: 5m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "medium"
          summary: "Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 80%"
          description: "Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 80% for more than 5 minutes."
      - alert: os_node_memory_high
        expr: sum (container_memory_working_set_bytes{id="/"}) by (kubernetes_io_hostname) / sum (machine_memory_bytes{}) by (kubernetes_io_hostname) * 100 > 90
        for: 2m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "high"
          summary: "Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 90%"
          # Duration text matches "for: 2m" (it previously said 5 minutes).
          description: "Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 90% for more than 2 minutes."

      # ---- Per-node CPU --------------------------------------------------
      # Same ratio as the cluster CPU rules, grouped by kubernetes_io_hostname.
      - alert: os_node_cpu_low
        expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) by (kubernetes_io_hostname) / sum (machine_cpu_cores) by (kubernetes_io_hostname) * 100 > 70
        for: 10m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "low"
          summary: "Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 70%"
          description: "Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 70% for more than 10 minutes."
      - alert: os_node_cpu_medium
        expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) by (kubernetes_io_hostname) / sum (machine_cpu_cores) by (kubernetes_io_hostname) * 100 > 80
        for: 5m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "medium"
          summary: "Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 80%"
          description: "Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 80% for more than 5 minutes."
      - alert: os_node_cpu_high
        expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) by (kubernetes_io_hostname) / sum (machine_cpu_cores) by (kubernetes_io_hostname) * 100 > 90
        for: 2m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "high"
          summary: "Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 90%"
          description: "Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 90% for more than 2 minutes."

      # ---- Per-node filesystem -------------------------------------------
      # Same ratio as the cluster filesystem rules, grouped by kubernetes_io_hostname.
      - alert: os_node_filesystem_low
        expr: sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname)/ sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname) * 100 > 70
        for: 10m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "low"
          summary: "Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 70%"
          description: "Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 70% for more than 10 minutes."
      - alert: os_node_filesystem_medium
        expr: sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname)/ sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname) * 100 > 80
        for: 5m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "medium"
          summary: "Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 80%"
          description: "Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 80% for more than 5 minutes."
      - alert: os_node_filesystem_high
        expr: sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname)/ sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname) * 100 > 90
        for: 2m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "high"
          summary: "Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 90%"
          description: "Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 90% for more than 2 minutes."

      # ---- Per-node thin pool --------------------------------------------
      # Used/limit ratio for the "docker--vg-docker--pool" device. The /1000 on
      # both sides cancels out in the ratio; it is kept to leave the expression
      # untouched. No aggregation, so the series' own labels are available.
      - alert: os_node_thinpool_low
        expr: (container_fs_usage_bytes{id="/",device="docker--vg-docker--pool"} /1000) / (container_fs_limit_bytes{id="/",device="docker--vg-docker--pool"} /1000 ) * 100 > 70
        for: 10m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "low"
          summary: "Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 70%"
          description: "Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 70% for more than 10 minutes."
      - alert: os_node_thinpool_medium
        expr: (container_fs_usage_bytes{id="/",device="docker--vg-docker--pool"} /1000) / (container_fs_limit_bytes{id="/",device="docker--vg-docker--pool"} /1000 ) * 100 > 80
        for: 5m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "medium"
          summary: "Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 80%"
          description: "Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 80% for more than 5 minutes."
      - alert: os_node_thinpool_high
        expr: (container_fs_usage_bytes{id="/",device="docker--vg-docker--pool"} /1000) / (container_fs_limit_bytes{id="/",device="docker--vg-docker--pool"} /1000 ) * 100 > 90
        for: 2m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "high"
          summary: "Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 90%"
          description: "Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 90% for more than 2 minutes."

      # ---- Per-container memory ------------------------------------------
      # Working set over the configured memory limit, grouped by
      # (container_name, role, namespace). The "> 0" filter on the limit side
      # drops groups whose summed limit is zero, so they never divide by zero.
      - alert: os_container_memory_low
        expr: (sum (container_memory_working_set_bytes{image!=""}) by (container_name,role,namespace) /1024/1024) /(sum (container_spec_memory_limit_bytes{image!=""}) by (container_name,role,namespace) /1024 /1024 >0) * 100 > 70
        for: 10m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "low"
          summary: "Container: {{$labels.container_name}} in node with role: {{$labels.role}} from namespace: {{$labels.namespace}} - memory usage exceeding 70%"
          description: "Container: {{$labels.container_name}} in node with role: {{$labels.role}} from namespace: {{$labels.namespace}} - memory usage exceeding 70% for more than 10 minutes."
      - alert: os_container_memory_medium
        # The limit selector uses image!="" like the numerator and the sibling
        # rules; it previously read image="" and so selected the wrong series.
        expr: (sum (container_memory_working_set_bytes{image!=""}) by (container_name,role,namespace) /1024 /1024) / (sum (container_spec_memory_limit_bytes{image!=""}) by (container_name,role,namespace) / 1024 / 1024 > 0 ) * 100 > 80
        for: 5m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "medium"
          # Namespace included (as in the "low" tier) because the expression groups by it.
          summary: "Container: {{$labels.container_name}} in node with role: {{$labels.role}} from namespace: {{$labels.namespace}} - memory usage exceeding 80%"
          description: "Container: {{$labels.container_name}} in node with role: {{$labels.role}} from namespace: {{$labels.namespace}} - memory usage exceeding 80% for more than 5 minutes."
      - alert: os_container_memory_high
        expr: (sum (container_memory_working_set_bytes{image!=""}) by (container_name,role,namespace) /1024 /1024) / (sum (container_spec_memory_limit_bytes{image!=""}) by (container_name,role,namespace) / 1024 / 1024 > 0 ) * 100 > 90
        for: 2m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "high"
          # Namespace included (as in the "low" tier); duration text matches "for: 2m".
          summary: "Container: {{$labels.container_name}} in node with role: {{$labels.role}} from namespace: {{$labels.namespace}} - memory usage exceeding 90%"
          description: "Container: {{$labels.container_name}} in node with role: {{$labels.role}} from namespace: {{$labels.namespace}} - memory usage exceeding 90% for more than 2 minutes."

      # ---- Per-container CPU ---------------------------------------------
      # 1m CPU rate over the container CPU quota, grouped by
      # (container_name, namespace). The "> 0" filter drops idle containers.
      # NOTE(review): the *1000 and /100 scaling presumably converts both sides
      # to millicores assuming a 100ms CFS period - confirm.
      - alert: os_container_cpu_low
        expr: ((sum(rate(container_cpu_usage_seconds_total [1m])) by (container_name,namespace) > 0) * 1000) / (sum(container_spec_cpu_quota) by (container_name,namespace) / 100) * 100 > 70
        for: 10m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "low"
          summary: "Container: {{$labels.container_name}} in namespace: {{$labels.namespace}} - CPU usage exceeding 70%"
          description: "Container: {{$labels.container_name}} in namespace: {{$labels.namespace}} - CPU usage exceeding 70% for more than 10 minutes."
      - alert: os_container_cpu_medium
        expr: ((sum(rate(container_cpu_usage_seconds_total [1m])) by (container_name,namespace) > 0) * 1000) / (sum(container_spec_cpu_quota) by (container_name,namespace) / 100) * 100 > 80
        for: 5m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "medium"
          summary: "Container: {{$labels.container_name}} in namespace: {{$labels.namespace}} - CPU usage exceeding 80%"
          description: "Container: {{$labels.container_name}} in namespace: {{$labels.namespace}} - CPU usage exceeding 80% for more than 5 minutes."
      - alert: os_container_cpu_high
        expr: ((sum(rate(container_cpu_usage_seconds_total [1m])) by (container_name,namespace) > 0) * 1000) / (sum(container_spec_cpu_quota) by (container_name,namespace) / 100) * 100 > 90
        for: 2m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "high"
          summary: "Container: {{$labels.container_name}} in namespace: {{$labels.namespace}} - CPU usage exceeding 90%"
          description: "Container: {{$labels.container_name}} in namespace: {{$labels.namespace}} - CPU usage exceeding 90% for more than 2 minutes."

      # ---- HAProxy -------------------------------------------------------
      # Fires when the "haproxy-scrape" job reports haproxy_up == 0.
      - alert: HaproxyDown
        expr: haproxy_up{job = "haproxy-scrape"} == 0
        for: 2m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "high"
          summary: "HAProxy Instance:{{ $labels.instance }} down"
          description: "HAProxy Instance :{{ $labels.instance }} could not be scraped for more than 2 minutes."
      # Fires while the backend connection-error series has a positive slope
      # over the last 5m.
      # NOTE(review): deriv() is documented for gauges; rate()/increase() is the
      # usual choice for *_total counters - confirm before changing.
      - alert: Haproxy_server_connection_error
        expr: sum(deriv(haproxy_server_connection_errors_total{namespace!=""}[5m])) by (instance,namespace,service,server,route)>0
        for: 2m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "high"
          summary: "HAProxy server:{{ $labels.server}} connection error"
          # Template variables need the leading "$"; a bare "labels" is read as
          # an undefined template function.
          description: "HAProxy server:{{ $labels.server}} with service: {{ $labels.service }}/route:{{ $labels.route }} in namespace:{{ $labels.namespace }} has connection errors."
      # Fires while the backend response-error series has a positive slope
      # over the last 5m (same deriv() caveat as the rule above).
      # The misspelled alert name is kept on purpose: it is an external
      # identifier that silences and routes may reference.
      - alert: Haproxy_server_responce_error
        expr: sum(deriv(haproxy_server_response_errors_total{namespace!=""}[5m])) by (instance,namespace,service,server,route)>0
        for: 2m
        labels:
          env: "non-production"
          cluser_name: "shared"
        annotations:
          priority: "high"
          summary: "HAProxy server:{{ $labels.server}} response error"
          # Template variables need the leading "$"; a bare "labels" is read as
          # an undefined template function.
          description: "HAProxy server:{{ $labels.server}} with service: {{ $labels.service }}/route:{{ $labels.route }} in namespace:{{ $labels.namespace }} has response errors."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment