# iamabhishek-dubey/prometheus.alerts

## prometheus.alerts
# Single rule group holding the cluster-, node-, container- and HAProxy-level alerts.
groups:
- name: cluster-rules
  # Evaluate the rules of this group every 30 seconds. If `interval` were
  # omitted, the group would use the global evaluation interval instead.
  interval: 30s
  rules:
  # Cluster memory: root-cgroup (id="/") working-set bytes summed across all
  # matching series, as a percentage of the summed machine memory. Three tiers;
  # the higher the threshold, the shorter the `for` window.
  # NOTE(review): the label key `cluser_name` looks like a typo of
  # `cluster_name`; it is kept unchanged because alert routing may match on
  # it - confirm before renaming.
  - alert: os_cluster_memory_low
    expr: 'sum (container_memory_working_set_bytes{id="/"}) / sum (machine_memory_bytes{}) *100 > 70'
    for: 10m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'low'
      summary: 'Cluster memory usage exceeding 70%'
      description: 'Cluster memory usage exceeding 70% for more than 10 minutes.'
  - alert: os_cluster_memory_medium
    expr: 'sum (container_memory_working_set_bytes{id="/"}) / sum (machine_memory_bytes{}) *100 > 80'
    for: 5m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'medium'
      summary: 'Cluster memory usage exceeding 80%'
      description: 'Cluster memory usage exceeding 80% for more than 5 minutes.'
  - alert: os_cluster_memory_high
    expr: 'sum (container_memory_working_set_bytes{id="/"}) / sum (machine_memory_bytes{}) *100 > 90'
    for: 2m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'high'
      summary: 'Cluster memory usage exceeding 90%'
      description: 'Cluster memory usage exceeding 90% for more than 2 minutes.'
  # Cluster CPU: 1-minute rate of root-cgroup (id="/") CPU seconds summed
  # across all matching series, as a percentage of the summed machine cores.
  # Fires at >= 70 / 80 / 90 % after 10 / 5 / 2 minutes respectively.
  - alert: os_cluster_cpu_low
    expr: 'sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) / sum (machine_cpu_cores) * 100 >= 70'
    for: 10m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'low'
      summary: 'Cluster CPU usage exceeding 70%'
      description: 'Cluster CPU usage exceeding 70% for more than 10 minutes.'
  - alert: os_cluster_cpu_medium
    expr: 'sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) / sum (machine_cpu_cores) * 100 >= 80'
    for: 5m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'medium'
      summary: 'Cluster CPU usage exceeding 80%'
      description: 'Cluster CPU usage exceeding 80% for more than 5 minutes.'
  - alert: os_cluster_cpu_high
    expr: 'sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) / sum (machine_cpu_cores) * 100 >= 90'
    for: 2m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'high'
      summary: 'Cluster CPU usage exceeding 90%'
      description: 'Cluster CPU usage exceeding 90% for more than 2 minutes.'
   - alert: os_cluster_filesystem_low
    expr: sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) / sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) * 100 >= 70
    for: 10m
    labels:
      env: "non-production"
      cluser_name: "shared"
    annotations:
      priority: "low"
      summary: "Cluster filesystem usage exceeding 70%"
      description: "Cluster filesystem usage exceeding 70% for more than 10 minutes."
  - alert: os_cluster_filesystem_medium
    expr: sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) / sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) * 100 >= 80
    for: 5m
    labels:
      env: "non-production"
      cluser_name: "shared"
    annotations:
      priority: "medium"
      summary: "Cluster filesystem usage exceeding 80%"
      description: "Cluster filesystem usage exceeding 80% for more than 5 minutes."
  - alert: os_cluster_filesystem_high
    expr: sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) / sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) * 100 >= 90
    for: 2m
    labels:
      env: "non-production"
      cluser_name: "shared"
    annotations:
      priority: "high"
      summary: "Cluster filesystem usage exceeding 90%"
      description: "Cluster filesystem usage exceeding 90% for more than 2 minutes."
  # Node availability: fires when the `up` series of a target in the
  # "kubernetes-nodes" job has been 0 for 2 minutes.
  - alert: os_node_down
    expr: 'up{job="kubernetes-nodes"} == 0'
    for: 2m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'high'
      summary: 'Instance: {{$labels.instance}} is down.'
      description: 'Instance: {{$labels.instance}} of job {{ $labels.job }} has been down for more than 2 minutes.'
  # Per-node memory: root-cgroup (id="/") working-set bytes over machine
  # memory, both grouped by kubernetes_io_hostname. Fires at > 70 / 80 / 90 %
  # after 10 / 5 / 2 minutes; each description states the tier's own `for`
  # window.
  - alert: os_node_memory_low
    expr: 'sum (container_memory_working_set_bytes{id="/"}) by (kubernetes_io_hostname) / sum (machine_memory_bytes{}) by (kubernetes_io_hostname)  * 100 > 70'
    for: 10m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'low'
      summary: 'Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 70%'
      description: 'Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 70% for more than 10 minutes.'
  - alert: os_node_memory_medium
    expr: 'sum (container_memory_working_set_bytes{id="/"}) by (kubernetes_io_hostname) / sum (machine_memory_bytes{}) by (kubernetes_io_hostname)  * 100 > 80'
    for: 5m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'medium'
      summary: 'Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 80%'
      description: 'Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 80% for more than 5 minutes.'
  - alert: os_node_memory_high
    expr: 'sum (container_memory_working_set_bytes{id="/"}) by (kubernetes_io_hostname) / sum (machine_memory_bytes{}) by (kubernetes_io_hostname)  * 100 > 90'
    for: 2m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'high'
      summary: 'Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 90%'
      description: 'Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 90% for more than 2 minutes.'
  # Per-node CPU: 1-minute rate of root-cgroup (id="/") CPU seconds over
  # machine cores, both grouped by kubernetes_io_hostname. Fires at
  # > 70 / 80 / 90 % after 10 / 5 / 2 minutes.
  - alert: os_node_cpu_low
    expr: 'sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) by (kubernetes_io_hostname) / sum (machine_cpu_cores)  by (kubernetes_io_hostname) * 100 > 70'
    for: 10m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'low'
      summary: 'Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 70%'
      description: 'Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 70% for more than 10 minutes.'
  - alert: os_node_cpu_medium
    expr: 'sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) by (kubernetes_io_hostname) / sum (machine_cpu_cores)  by (kubernetes_io_hostname) * 100 > 80'
    for: 5m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'medium'
      summary: 'Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 80%'
      description: 'Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 80% for more than 5 minutes.'
  - alert: os_node_cpu_high
    expr: 'sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) by (kubernetes_io_hostname) / sum (machine_cpu_cores)  by (kubernetes_io_hostname) * 100 > 90'
    for: 2m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'high'
      summary: 'Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 90%'
      description: 'Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 90% for more than 2 minutes.'
  # Per-node filesystem: used bytes over limit bytes for root-cgroup (id="/")
  # series whose device matches /dev/sd<letter><digit 1-9>, grouped by
  # kubernetes_io_hostname. Fires at > 70 / 80 / 90 % after 10 / 5 / 2 minutes.
  - alert: os_node_filesystem_low
    expr: 'sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname)/ sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname) * 100  > 70'
    for: 10m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'low'
      summary: 'Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 70%'
      description: 'Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 70% for more than 10 minutes.'
  - alert: os_node_filesystem_medium
    expr: 'sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname)/ sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname) * 100  > 80'
    for: 5m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'medium'
      summary: 'Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 80%'
      description: 'Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 80% for more than 5 minutes.'
  - alert: os_node_filesystem_high
    expr: 'sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname)/ sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname) * 100  > 90'
    for: 2m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'high'
      summary: 'Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 90%'
      description: 'Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 90% for more than 2 minutes.'
  # Docker thin-pool usage: used bytes over limit bytes for the
  # "docker--vg-docker--pool" device of the root cgroup (id="/"). The /1000 on
  # both sides cancels out, so the result is a plain percentage.
  # The expression is not aggregated, so each alert carries the labels of the
  # underlying series; the templates assume kubernetes_io_hostname is one of
  # them - TODO confirm.
  - alert: os_node_thinpool_low
    expr: '(container_fs_usage_bytes{id="/",device="docker--vg-docker--pool"} /1000) /  (container_fs_limit_bytes{id="/",device="docker--vg-docker--pool"} /1000 ) * 100  > 70'
    for: 10m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'low'
      summary: 'Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 70%'
      description: 'Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 70% for more than 10 minutes.'
  - alert: os_node_thinpool_medium
    expr: '(container_fs_usage_bytes{id="/",device="docker--vg-docker--pool"} /1000) /  (container_fs_limit_bytes{id="/",device="docker--vg-docker--pool"} /1000 ) * 100  > 80'
    for: 5m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'medium'
      summary: 'Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 80%'
      description: 'Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 80% for more than 5 minutes.'
  - alert: os_node_thinpool_high
    expr: '(container_fs_usage_bytes{id="/",device="docker--vg-docker--pool"} /1000) /  (container_fs_limit_bytes{id="/",device="docker--vg-docker--pool"} /1000 ) * 100  > 90'
    for: 2m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'high'
      summary: 'Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 90%'
      description: 'Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 90% for more than 2 minutes.'
  # Per-container memory: working-set bytes over the configured memory limit,
  # grouped by (container_name, role, namespace). Only series with a non-empty
  # image label are considered on both sides of the division, and the `> 0`
  # filter on the denominator skips groups whose summed limit is zero.
  # Fires at > 70 / 80 / 90 % after 10 / 5 / 2 minutes. Every tier names the
  # namespace, because the expression groups by it.
  - alert: os_container_memory_low
    expr: '(sum (container_memory_working_set_bytes{image!=""}) by (container_name,role,namespace) /1024/1024) /(sum (container_spec_memory_limit_bytes{image!=""}) by (container_name,role,namespace) /1024 /1024 >0) * 100 > 70'
    for: 10m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'low'
      summary: 'Container: {{$labels.container_name}} in node with role: {{$labels.role}} from namespace: {{$labels.namespace}} - memory usage exceeding 70%'
      description: 'Container: {{$labels.container_name}} in node with role: {{$labels.role}} from namespace: {{$labels.namespace}} - memory usage exceeding 70% for more than 10 minutes.'
  - alert: os_container_memory_medium
    # The limit selector must use image!="" like the numerator and the other
    # tiers; image="" would select a different set of series.
    expr: '(sum (container_memory_working_set_bytes{image!=""}) by (container_name,role,namespace) /1024 /1024) / (sum (container_spec_memory_limit_bytes{image!=""}) by (container_name,role,namespace) / 1024 / 1024 > 0 ) * 100 > 80'
    for: 5m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'medium'
      summary: 'Container: {{$labels.container_name}} in node with role: {{$labels.role}} from namespace: {{$labels.namespace}} - memory usage exceeding 80%'
      description: 'Container: {{$labels.container_name}} in node with role: {{$labels.role}} from namespace: {{$labels.namespace}} - memory usage exceeding 80% for more than 5 minutes.'
  - alert: os_container_memory_high
    expr: '(sum (container_memory_working_set_bytes{image!=""}) by (container_name,role,namespace) /1024 /1024) / (sum (container_spec_memory_limit_bytes{image!=""}) by (container_name,role,namespace) / 1024 / 1024 > 0 ) * 100 > 90'
    for: 2m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'high'
      summary: 'Container: {{$labels.container_name}} in node with role: {{$labels.role}} from namespace: {{$labels.namespace}} - memory usage exceeding 90%'
      description: 'Container: {{$labels.container_name}} in node with role: {{$labels.role}} from namespace: {{$labels.namespace}} - memory usage exceeding 90% for more than 2 minutes.'
  # Per-container CPU: 1-minute CPU-seconds rate (scaled by 1000, groups with
  # zero usage dropped by `> 0`) over the summed CPU quota divided by 100,
  # grouped by (container_name, namespace).
  # NOTE(review): the scaling presumably assumes a 100 ms CFS period so that
  # both sides are in millicores - confirm.
  # Fires at > 70 / 80 / 90 % after 10 / 5 / 2 minutes.
  - alert: os_container_cpu_low
    expr: '((sum(rate(container_cpu_usage_seconds_total [1m]))  by (container_name,namespace) > 0) * 1000) / (sum(container_spec_cpu_quota) by (container_name,namespace) / 100) * 100 > 70'
    for: 10m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'low'
      summary: 'Container: {{$labels.container_name}} in  namespace: {{$labels.namespace}} - CPU usage exceeding 70%'
      description: 'Container: {{$labels.container_name}} in  namespace: {{$labels.namespace}} - CPU usage exceeding 70% for more than 10 minutes.'
  - alert: os_container_cpu_medium
    expr: '((sum(rate(container_cpu_usage_seconds_total [1m]))  by (container_name,namespace) > 0) * 1000) / (sum(container_spec_cpu_quota) by (container_name,namespace) / 100) * 100 > 80'
    for: 5m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'medium'
      summary: 'Container: {{$labels.container_name}} in  namespace: {{$labels.namespace}} - CPU usage exceeding 80%'
      description: 'Container: {{$labels.container_name}} in  namespace: {{$labels.namespace}} - CPU usage exceeding 80% for more than 5 minutes.'
  - alert: os_container_cpu_high
    expr: '((sum(rate(container_cpu_usage_seconds_total [1m]))  by (container_name,namespace) > 0) * 1000) / (sum(container_spec_cpu_quota) by (container_name,namespace) / 100) * 100 > 90'
    for: 2m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'high'
      summary: 'Container: {{$labels.container_name}} in  namespace: {{$labels.namespace}} - CPU usage exceeding 90%'
      description: 'Container: {{$labels.container_name}} in namespace: {{$labels.namespace}} - CPU usage exceeding 90% for more than 2 minutes.'
  # HAProxy alerts: exporter reachability plus per-backend-server connection
  # and response error growth. The error alerts fire when the 5-minute
  # derivative of the error counters, grouped by
  # (instance, namespace, service, server, route), stays above 0 for 2 minutes.
  # Label values are referenced through `$labels` in every template.
  - alert: HaproxyDown
    expr: 'haproxy_up{job = "haproxy-scrape"} == 0'
    for: 2m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'high'
      summary: 'HAProxy Instance:{{ $labels.instance }} down'
      description: 'HAProxy Instance :{{ $labels.instance }} could not be scraped for more than 2 minutes.'
  - alert: Haproxy_server_connection_error
    expr: 'sum(deriv(haproxy_server_connection_errors_total{namespace!=""}[5m])) by (instance,namespace,service,server,route)>0'
    for: 2m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'high'
      summary: 'HAProxy server:{{ $labels.server}} connection error'
      description: 'HAProxy server:{{ $labels.server}} with service: {{ $labels.service }}/route:{{ $labels.route }} in namespace:{{ $labels.namespace }} has connection errors.'
  # NOTE(review): "responce" in the alert name is a misspelling of "response";
  # the name is kept because routing or silences may reference it.
  - alert: Haproxy_server_responce_error
    expr: 'sum(deriv(haproxy_server_response_errors_total{namespace!=""}[5m])) by (instance,namespace,service,server,route)>0'
    for: 2m
    labels:
      env: 'non-production'
      cluser_name: 'shared'
    annotations:
      priority: 'high'
      summary: 'HAProxy server:{{ $labels.server}} response error'
      description: 'HAProxy server:{{ $labels.server}} with service: {{ $labels.service }}/route:{{ $labels.route }} in namespace:{{ $labels.namespace }} has response errors.'
	groups:
	- name: cluster-rules
	interval: 30s # defaults to global interval testing threshold
	rules:
	- alert: os_cluster_memory_low
	expr: sum (container_memory_working_set_bytes{id="/"}) / sum (machine_memory_bytes{}) *100 > 70
	for: 10m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "low"
	summary: "Cluster memory usage exceeding 70%"
	description: "Cluster memory usage exceeding 70% for more than 10 minutes."
	- alert: os_cluster_memory_medium
	expr: sum (container_memory_working_set_bytes{id="/"}) / sum (machine_memory_bytes{}) *100 > 80
	for: 5m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "medium"
	summary: "Cluster memory usage exceeding 80%"
	description: "Cluster memory usage exceeding 80% for more than 5 minutes."
	- alert: os_cluster_memory_high
	expr: sum (container_memory_working_set_bytes{id="/"}) / sum (machine_memory_bytes{}) *100 > 90
	for: 2m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "high"
	summary: "Cluster memory usage exceeding 90%"
	description: "Cluster memory usage exceeding 90% for more than 2 minutes."
	- alert : os_cluster_cpu_low
	expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) / sum (machine_cpu_cores) * 100 >= 70
	for: 10m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "low"
	summary: "Cluster CPU usage exceeding 70%"
	description: "Cluster CPU usage exceeding 70% for more than 10 minutes."
	- alert : os_cluster_cpu_medium
	expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) / sum (machine_cpu_cores) * 100 >= 80
	for: 5m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "medium"
	summary: "Cluster CPU usage exceeding 80%"
	description: "Cluster CPU usage exceeding 80% for more than 5 minutes."
	- alert : os_cluster_cpu_high
	expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) / sum (machine_cpu_cores) * 100 >= 90
	for: 2m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "high"
	summary: "Cluster CPU usage exceeding 90%"
	description: "Cluster CPU usage exceeding 90% for more than 2 minutes."
	- alert: os_cluster_filesystem_low
	expr: sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) / sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) * 100 >= 70
	for: 10m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "low"
	summary: "Cluster filesystem usage exceeding 70%"
	description: "Cluster filesystem usage exceeding 70% for more than 10 minutes."
	- alert: os_cluster_filesystem_medium
	expr: sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) / sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) * 100 >= 80
	for: 5m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "medium"
	summary: "Cluster filesystem usage exceeding 80%"
	description: "Cluster filesystem usage exceeding 80% for more than 5 minutes."
	- alert: os_cluster_filesystem_high
	expr: sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) / sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) * 100 >= 90
	for: 2m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "high"
	summary: "Cluster filesystem usage exceeding 90%"
	description: "Cluster filesystem usage exceeding 90% for more than 2 minutes."
	- alert: os_node_down
	expr: up{job="kubernetes-nodes"} == 0
	for: 2m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "high"
	summary: "Instance: {{$labels.instance}} is down."
	description: "Instance: {{$labels.instance}} of job {{ $labels.job }} has been down for more than 2 minutes."
	- alert: os_node_memory_low
	expr: sum (container_memory_working_set_bytes{id="/"}) by (kubernetes_io_hostname) / sum (machine_memory_bytes{}) by (kubernetes_io_hostname) * 100 > 70
	for: 10m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "low"
	summary: "Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 70%"
	description: "Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 70% for more than 10 minutes."
	- alert: os_node_memory_medium
	expr: sum (container_memory_working_set_bytes{id="/"}) by (kubernetes_io_hostname) / sum (machine_memory_bytes{}) by (kubernetes_io_hostname) * 100 > 80
	for: 5m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "medium"
	summary: "Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 80%"
	description: "Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 80% for more than 5 minutes."
	- alert: os_node_memory_high
	expr: sum (container_memory_working_set_bytes{id="/"}) by (kubernetes_io_hostname) / sum (machine_memory_bytes{}) by (kubernetes_io_hostname) * 100 > 90
	for: 2m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "high"
	summary: "Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 90%"
	description: "Instance: {{$labels.kubernetes_io_hostname}} - memory usage exceeding 90% for more than 5 minutes."
	- alert : os_node_cpu_low
	expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) by (kubernetes_io_hostname) / sum (machine_cpu_cores) by (kubernetes_io_hostname) * 100 > 70
	for: 10m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "low"
	summary: "Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 70%"
	description: "Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 70% for more than 10 minutes."
	- alert : os_node_cpu_medium
	expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) by (kubernetes_io_hostname) / sum (machine_cpu_cores) by (kubernetes_io_hostname) * 100 > 80
	for: 5m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "medium"
	summary: "Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 80%"
	description: "Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 80% for more than 5 minutes."
	- alert : os_node_cpu_high
	expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[1m])) by (kubernetes_io_hostname) / sum (machine_cpu_cores) by (kubernetes_io_hostname) * 100 > 90
	for: 2m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "high"
	summary: "Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 90%"
	description: "Instance: {{$labels.kubernetes_io_hostname}} - CPU usage exceeding 90% for more than 2 minutes."
	- alert: os_node_filesystem_low
	expr: sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname)/ sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname) * 100 > 70
	for: 10m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "low"
	summary: "Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 70%"
	description: "Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 70% for more than 10 minutes."
	- alert: os_node_filesystem_medium
	expr: sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname)/ sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname) * 100 > 80
	for: 5m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "medium"
	summary: "Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 80%"
	description: "Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 80% for more than 5 minutes."
	- alert: os_node_filesystem_high
	expr: sum (container_fs_usage_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname)/ sum (container_fs_limit_bytes{device=~"^/dev/sd[a-z][1-9]",id="/"}) by (kubernetes_io_hostname) * 100 > 90
	for: 2m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "high"
	summary: "Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 90%"
	description: "Instance: {{$labels.kubernetes_io_hostname}} - filesystem usage exceeding 90% for more than 2 minutes."
	- alert: os_node_thinpool_low
	expr: (container_fs_usage_bytes{id="/",device="docker--vg-docker--pool"} /1000) / (container_fs_limit_bytes{id="/",device="docker--vg-docker--pool"} /1000 ) * 100 > 70
	for: 10m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "low"
	summary: "Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 70%"
	description: "Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 70% for more than 10 minutes."
	- alert: os_node_thinpool_medium
	expr: (container_fs_usage_bytes{id="/",device="docker--vg-docker--pool"} /1000) / (container_fs_limit_bytes{id="/",device="docker--vg-docker--pool"} /1000 ) * 100 > 80
	for: 5m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "medium"
	summary: "Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 80%"
	description: "Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 80% for more than 5 minutes."
	- alert: os_node_thinpool_high
	expr: (container_fs_usage_bytes{id="/",device="docker--vg-docker--pool"} /1000) / (container_fs_limit_bytes{id="/",device="docker--vg-docker--pool"} /1000 ) * 100 > 90
	for: 2m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "high"
	summary: "Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 90%"
	description: "Instance: {{$labels.kubernetes_io_hostname}} - thinpool usage exceeding 90% for more than 2 minutes."
	- alert: os_container_memory_low
	expr: (sum (container_memory_working_set_bytes{image!=""}) by (container_name,role,namespace) /1024/1024) /(sum (container_spec_memory_limit_bytes{image!=""}) by (container_name,role,namespace) /1024 /1024 >0) * 100 > 70
	for: 10m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "low"
	summary: "Container: {{$labels.container_name}} in node with role: {{$labels.role}} from namespace: {{$labels.namespace}} - memory usage exceeding 70%"
	description: "Container: {{$labels.container_name}} in node with role: {{$labels.role}} from namespace: {{$labels.namespace}} - memory usage exceeding 70% for more than 10 minutes."
	- alert: os_container_memory_medium
	expr: (sum (container_memory_working_set_bytes{image!=""}) by (container_name,role,namespace) /1024 /1024) / (sum (container_spec_memory_limit_bytes{image=""}) by (container_name,role,namespace) / 1024 / 1024 > 0 ) * 100 > 80
	for: 5m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "medium"
	summary: "Container: {{$labels.container_name}} in node with role: {{$labels.role}} - memory usage exceeding 80%"
	description: "Container: {{$labels.container_name}} in node with role: {{$labels.role}} - memory usage exceeding 80% for more than 5 minutes."
	- alert: os_container_memory_high
	expr: (sum (container_memory_working_set_bytes{image!=""}) by (container_name,role,namespace) /1024 /1024) / (sum (container_spec_memory_limit_bytes{image!=""}) by (container_name,role,namespace) / 1024 / 1024 > 0 ) * 100 > 90
	for: 2m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "high"
	summary: "Container: {{$labels.container_name}} in node with role: {{$labels.role}} - memory usage exceeding 90%"
	description: "Container: {{$labels.container_name}} in node with role: {{$labels.role}} - memory usage exceeding 90% for more than 5 minutes."
	- alert : os_container_cpu_low
	expr: ((sum(rate(container_cpu_usage_seconds_total [1m])) by (container_name,namespace) > 0) * 1000) / (sum(container_spec_cpu_quota) by (container_name,namespace) / 100) * 100 > 70
	for: 10m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "low"
	summary: "Container: {{$labels.container_name}} in namespace: {{$labels.namespace}} - CPU usage exceeding 70%"
	description: "Container: {{$labels.container_name}} in namespace: {{$labels.namespace}} - CPU usage exceeding 70% for more than 10 minutes."
	- alert : os_container_cpu_medium
	expr: ((sum(rate(container_cpu_usage_seconds_total [1m])) by (container_name,namespace) > 0) * 1000) / (sum(container_spec_cpu_quota) by (container_name,namespace) / 100) * 100 > 80
	for: 5m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "medium"
	summary: "Container: {{$labels.container_name}} in namespace: {{$labels.namespace}} - CPU usage exceeding 80%"
	description: "Container: {{$labels.container_name}} in namespace: {{$labels.namespace}} - CPU usage exceeding 80% for more than 5 minutes."
	- alert : os_container_cpu_high
	expr: ((sum(rate(container_cpu_usage_seconds_total [1m])) by (container_name,namespace) > 0) * 1000) / (sum(container_spec_cpu_quota) by (container_name,namespace) / 100) * 100 > 90
	for: 2m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "high"
	summary: "Container: {{$labels.container_name}} in namespace: {{$labels.namespace}} - CPU usage exceeding 90%"
	description: "Container: {{$labels.container_name}} in namespace: {{$labels.namespace}} - CPU usage exceeding 90% for more than 2 minutes."
	- alert : HaproxyDown
	expr: haproxy_up{job = "haproxy-scrape"} == 0
	for: 2m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "high"
	summary: "HAProxy Instance:{{ $labels.instance }} down"
	description: "HAProxy Instance :{{ $labels.instance }} could not be scraped for more than 2 minutes."
	- alert: Haproxy_server_connection_error
	expr: sum(deriv(haproxy_server_connection_errors_total{namespace!=""}[5m])) by (instance,namespace,service,server,route)>0
	for: 2m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "high"
	summary: "HAProxy server:{{ $labels.server}} connection error"
	description: "HAProxy server:{{ $labels.server}} with service: {{labels.service}}/route:{{labels.route}} in namespace:{{labels.namespace}} has connection errors."
	- alert : Haproxy_server_responce_error
	expr: sum(deriv(haproxy_server_response_errors_total{namespace!=""}[5m])) by (instance,namespace,service,server,route)>0
	for: 2m
	labels:
	env: "non-production"
	cluser_name: "shared"
	annotations:
	priority: "high"
	summary: "HAProxy server:{{ $labels.server}} responce"
	description: "HAProxy server:{{ $labels.server}} with service: {{labels.service}}/route:{{labels.route}} in namespace:{{labels.namespace}} has response errors."