Last active
December 18, 2018 07:29
-
-
Save iamabhishek-dubey/6029b2fbe319c333e668775a687aad8d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
groups: | |
- name: cluster-rules | |
interval: 30s # defaults to global interval testing threshold | |
rules: | |
- alert: os_cluster_memory_low | |
expr: sum (container_memory_working_set_bytes{id="/"}) / sum (machine_memory_bytes{}) *100 > 70 | |
for: 10m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "low" | |
summary: "Cluster memory usage exceeding 70%" | |
description: "Cluster memory usage exceeding 70% for more than 10 minutes." | |
- alert: os_cluster_memory_medium | |
expr: sum (container_memory_working_set_bytes{id="/"}) / sum (machine_memory_bytes{}) *100 > 80 | |
for: 5m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "medium" | |
summary: "Cluster memory usage exceeding 80%" | |
description: "Cluster memory usage exceeding 80% for more than 5 minutes." | |
- alert: os_cluster_memory_high | |
expr: sum (container_memory_working_set_bytes{id="/"}) / sum (machine_memory_bytes{}) *100 > 90 | |
for: 2m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "high" | |
summary: "Cluster memory usage exceeding 90%" | |
description: "Cluster memory usage exceeding 90% for more than 2 minutes." | |
- alert : os_cluster_cpu_low | |
expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) / sum (machine_cpu_cores) * 100 >= 70 | |
for: 10m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "low" | |
summary: "Cluster CPU usage exceeding 70%" | |
description: "Cluster CPU usage exceeding 70% for more than 10 minutes." | |
- alert : os_cluster_cpu_medium | |
expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) / sum (machine_cpu_cores) * 100 >= 80 | |
for: 5m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "medium" | |
summary: "Cluster CPU usage exceeding 80%" | |
description: "Cluster CPU usage exceeding 80% for more than 5 minutes." | |
- alert : os_cluster_cpu_high | |
expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) / sum (machine_cpu_cores) * 100 >= 90 | |
for: 2m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "high" | |
summary: "Cluster CPU usage exceeding 90%" | |
description: "Cluster CPU usage exceeding 90% for more than 2 minutes." | |
- alert: os_cluster_filesystem_low | |
expr: sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) / sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) * 100 >= 70 | |
for: 10m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "low" | |
summary: "Cluster filesystem usage exceeding 70%" | |
description: "Cluster filesystem usage exceeding 70% for more than 10 minutes." | |
- alert: os_cluster_filesystem_medium | |
expr: sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) / sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) * 100 >= 80 | |
for: 5m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "medium" | |
summary: "Cluster filesystem usage exceeding 80%" | |
description: "Cluster filesystem usage exceeding 80% for more than 5 minutes." | |
- alert: os_cluster_filesystem_high | |
expr: sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) / sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) * 100 >= 90 | |
for: 2m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "high" | |
summary: "Cluster filesystem usage exceeding 90%" | |
description: "Cluster filesystem usage exceeding 90% for more than 2 minutes." | |
- alert: os_node_down | |
expr: up{job="kubernetes-nodes"} == 0 | |
for: 2m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "high" | |
summary: "Instance: {{$labels.instance}} is down." | |
description: "Instance: {{$labels.instance}} of job {{ $labels.job }} has been down for more than 2 minutes." | |
- alert: os_node_memory_low | |
expr: sum (container_memory_working_set_bytes{id="/"}) by (kubernetes_io_hostname) / sum (machine_memory_bytes{}) by (kubernetes_io_hostname) * 100 > 70 | |
for: 10m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "low" | |
summary: "Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 70%" | |
description: "Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 70% for more than 10 minutes." | |
- alert: os_node_memory_medium | |
expr: sum (container_memory_working_set_bytes{id="/"}) by (kubernetes_io_hostname) / sum (machine_memory_bytes{}) by (kubernetes_io_hostname) * 100 > 80 | |
for: 5m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "medium" | |
summary: "Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 80%" | |
description: "Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 80% for more than 5 minutes." | |
# Per-node memory alert: root-cgroup working set as a percentage of machine memory, above 90% for 2 minutes. | |
- alert: os_node_memory_high | |
expr: sum (container_memory_working_set_bytes{id="/"}) by (kubernetes_io_hostname) / sum (machine_memory_bytes{}) by (kubernetes_io_hostname) * 100 > 90 | |
for: 2m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "high" | |
summary: "Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 90%" | |
description: "Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 90% for more than 2 minutes." | |
- alert : os_node_cpu_low | |
expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) by (kubernetes_io_hostname) / sum (machine_cpu_cores) by (kubernetes_io_hostname) * 100 > 70 | |
for: 10m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "low" | |
summary: "Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 70%" | |
description: "Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 70% for more than 10 minutes." | |
- alert : os_node_cpu_medium | |
expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) by (kubernetes_io_hostname) / sum (machine_cpu_cores) by (kubernetes_io_hostname) * 100 > 80 | |
for: 5m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "medium" | |
summary: "Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 80%" | |
description: "Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 80% for more than 5 minutes." | |
- alert : os_node_cpu_high | |
expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) by (kubernetes_io_hostname) / sum (machine_cpu_cores) by (kubernetes_io_hostname) * 100 > 90 | |
for: 2m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "high" | |
summary: "Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 90%" | |
description: "Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 90% for more than 2 minutes." | |
- alert: os_node_filesystem_low | |
expr: sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname)/ sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname) * 100 > 70 | |
for: 10m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "low" | |
summary: "Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 70%" | |
description: "Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 70% for more than 10 minutes." | |
- alert: os_node_filesystem_medium | |
expr: sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname)/ sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname) * 100 > 80 | |
for: 5m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "medium" | |
summary: "Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 80%" | |
description: "Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 80% for more than 5 minutes." | |
- alert: os_node_filesystem_high | |
expr: sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname)/ sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname) * 100 > 90 | |
for: 2m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "high" | |
summary: "Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 90%" | |
description: "Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 90% for more than 2 minutes." | |
- alert: os_node_thinpool_low | |
expr: (container_fs_usage_bytes{id="/",device="docker--vg-docker--pool"} /1000) / (container_fs_limit_bytes{id="/",device="docker--vg-docker--pool"} /1000 ) * 100 > 70 | |
for: 10m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "low" | |
summary: "Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 70%" | |
description: "Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 70% for more than 10 minutes." | |
- alert: os_node_thinpool_medium | |
expr: (container_fs_usage_bytes{id="/",device="docker--vg-docker--pool"} /1000) / (container_fs_limit_bytes{id="/",device="docker--vg-docker--pool"} /1000 ) * 100 > 80 | |
for: 5m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "medium" | |
summary: "Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 80%" | |
description: "Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 80% for more than 5 minutes." | |
- alert: os_node_thinpool_high | |
expr: (container_fs_usage_bytes{id="/",device="docker--vg-docker--pool"} /1000) / (container_fs_limit_bytes{id="/",device="docker--vg-docker--pool"} /1000 ) * 100 > 90 | |
for: 2m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "high" | |
summary: "Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 90%" | |
description: "Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 90% for more than 2 minutes." | |
- alert: os_container_memory_low | |
expr: (sum (container_memory_working_set_bytes{image!=""}) by (container_name,role,namespace) /1024/1024) /(sum (container_spec_memory_limit_bytes{image!=""}) by (container_name,role,namespace) /1024 /1024 >0) * 100 > 70 | |
for: 10m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "low" | |
summary: "Container: {{$labels.container_name}} in node with role: {{$labels.role}} from namespace: {{$labels.namespace}} - memory usage exceeding 70%" | |
description: "Container: {{$labels.container_name}} in node with role: {{$labels.role}} from namespace: {{$labels.namespace}} - memory usage exceeding 70% for more than 10 minutes." | |
# Per-container memory alert: working set as a percentage of the container memory limit, above 80% for 5 minutes. | |
# Both sides select image!="" so the limit series match the usage series; the "> 0" filter drops containers with no limit. | |
- alert: os_container_memory_medium | |
expr: (sum (container_memory_working_set_bytes{image!=""}) by (container_name,role,namespace) /1024 /1024) / (sum (container_spec_memory_limit_bytes{image!=""}) by (container_name,role,namespace) / 1024 / 1024 > 0 ) * 100 > 80 | |
for: 5m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "medium" | |
summary: "Container: {{$labels.container_name}} in node with role: {{$labels.role}} - memory usage exceeding 80%" | |
description: "Container: {{$labels.container_name}} in node with role: {{$labels.role}} - memory usage exceeding 80% for more than 5 minutes." | |
# Per-container memory alert: working set as a percentage of the container memory limit, above 90% for 2 minutes. | |
- alert: os_container_memory_high | |
expr: (sum (container_memory_working_set_bytes{image!=""}) by (container_name,role,namespace) /1024 /1024) / (sum (container_spec_memory_limit_bytes{image!=""}) by (container_name,role,namespace) / 1024 / 1024 > 0 ) * 100 > 90 | |
for: 2m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "high" | |
summary: "Container: {{$labels.container_name}} in node with role: {{$labels.role}} - memory usage exceeding 90%" | |
description: "Container: {{$labels.container_name}} in node with role: {{$labels.role}} - memory usage exceeding 90% for more than 2 minutes." | |
- alert : os_container_cpu_low | |
expr: ((sum(rate(container_cpu_usage_seconds_total [1m])) by (container_name,namespace) > 0) * 1000) / (sum(container_spec_cpu_quota) by (container_name,namespace) / 100) * 100 > 70 | |
for: 10m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "low" | |
summary: "Container: {{$labels.container_name}} in namespace: {{$labels.namespace}} - CPU usage exceeding 70%" | |
description: "Container: {{$labels.container_name}} in namespace: {{$labels.namespace}} - CPU usage exceeding 70% for more than 10 minutes." | |
- alert : os_container_cpu_medium | |
expr: ((sum(rate(container_cpu_usage_seconds_total [1m])) by (container_name,namespace) > 0) * 1000) / (sum(container_spec_cpu_quota) by (container_name,namespace) / 100) * 100 > 80 | |
for: 5m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "medium" | |
summary: "Container: {{$labels.container_name}} in namespace: {{$labels.namespace}} - CPU usage exceeding 80%" | |
description: "Container: {{$labels.container_name}} in namespace: {{$labels.namespace}} - CPU usage exceeding 80% for more than 5 minutes." | |
- alert : os_container_cpu_high | |
expr: ((sum(rate(container_cpu_usage_seconds_total [1m])) by (container_name,namespace) > 0) * 1000) / (sum(container_spec_cpu_quota) by (container_name,namespace) / 100) * 100 > 90 | |
for: 2m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "high" | |
summary: "Container: {{$labels.container_name}} in namespace: {{$labels.namespace}} - CPU usage exceeding 90%" | |
description: "Container: {{$labels.container_name}} in namespace: {{$labels.namespace}} - CPU usage exceeding 90% for more than 2 minutes." | |
- alert : HaproxyDown | |
expr: haproxy_up{job = "haproxy-scrape"} == 0 | |
for: 2m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "high" | |
summary: "HAProxy Instance:{{ $labels.instance }} down" | |
description: "HAProxy Instance :{{ $labels.instance }} could not be scraped for more than 2 minutes." | |
# Fires when the 5m derivative of HAProxy server connection errors stays positive for 2 minutes. | |
# Annotation templates must reference alert labels through $labels; a bare "labels" is parsed as an undefined function. | |
- alert: Haproxy_server_connection_error | |
expr: sum(deriv(haproxy_server_connection_errors_total{namespace!=""}[5m])) by (instance,namespace,service,server,route)>0 | |
for: 2m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "high" | |
summary: "HAProxy server:{{ $labels.server}} connection error" | |
description: "HAProxy server:{{ $labels.server}} with service: {{ $labels.service }}/route:{{ $labels.route }} in namespace:{{ $labels.namespace }} has connection errors." | |
# Fires when the 5m derivative of HAProxy server response errors stays positive for 2 minutes. | |
# The alert name keeps its original spelling because external routing or silences may match on it. | |
- alert : Haproxy_server_responce_error | |
expr: sum(deriv(haproxy_server_response_errors_total{namespace!=""}[5m])) by (instance,namespace,service,server,route)>0 | |
for: 2m | |
labels: | |
env: "non-production" | |
cluser_name: "shared" | |
annotations: | |
priority: "high" | |
summary: "HAProxy server:{{ $labels.server}} response error" | |
description: "HAProxy server:{{ $labels.server}} with service: {{ $labels.service }}/route:{{ $labels.route }} in namespace:{{ $labels.namespace }} has response errors." |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment