Skip to content

Instantly share code, notes, and snippets.

@neillturner
Last active August 6, 2021 08:19
Show Gist options
  • Save neillturner/45915fdbfb3359d7d98b97fee281eadb to your computer and use it in GitHub Desktop.
Save neillturner/45915fdbfb3359d7d98b97fee281eadb to your computer and use it in GitHub Desktop.
# alertmanager.yml
# Alertmanager configuration: routes all alerts to a single Slack receiver.
# (Indentation reconstructed — the pasted source had lost all YAML nesting.)
---
global:
  # How long to wait before declaring a previously firing alert resolved.
  resolve_timeout: 1m
  # Slack incoming-webhook URL (placeholder — replace before use).
  slack_api_url: 'https://hooks.slack.com/services/XXXXXXXXXXXXXXXXXXXXXX'

route:
  # Default receiver for anything not matched by a child route.
  receiver: 'slack-notifications'
  repeat_interval: 1h
  routes:
    # Heartbeat-style alerts repeat only once a day instead of hourly.
    - match:
        context: 'Monitoring-check'
      receiver: 'slack-notifications'
      repeat_interval: 24h

receivers:
  - name: 'slack-notifications'
    slack_configs:
      - channel: '#my-slack-alerts'
        # Also notify when the alert resolves, not only when it fires.
        send_resolved: true
        icon_url: 'https://avatars3.githubusercontent.com/u/3380462'
        # Go-template message title: "[FIRING:<n>] <alertname> for <job> (extra labels)".
        title: |-
          [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
          {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
          {{" "}}(
          {{- with .CommonLabels.Remove .GroupLabels.Names }}
          {{- range $index, $label := .SortedPairs -}}
          {{ if $index }}, {{ end }}
          {{- $label.Name }}="{{ $label.Value -}}"
          {{- end }}
          {{- end -}}
          )
          {{- end }}
        # Message body: one section per alert with title, description and all labels.
        text: >-
          {{ range .Alerts -}}
          *Alert:* {{ .Annotations.title }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}

          *Description:* {{ .Annotations.description }}

          *Details:*
          {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
          {{ end }}
          {{ end }}
# alerting_rules.yml
# Prometheus alerting rules for an EKS cluster, grouped by subsystem.
# (Indentation reconstructed — the pasted source had lost all YAML nesting.)
---
groups:
  - name: Monitoring-check
    rules:
      # Heartbeat alert: fires while Prometheus itself is up, proving the
      # alerting pipeline works end-to-end.
      # NOTE(review): alert names containing spaces are rejected by current
      # Prometheus rule validation (names must match the metric-name charset)
      # — confirm the deployed Prometheus version accepts this.
      - alert: Monitoring is Working in DEVELOPMENT cluster
        expr: prometheus_build_info{instance="localhost:9090"} == 1
        for: 5m
        labels:
          env: DEVELOPMENT
          service: prometheus
          severity: info
          context: Monitoring-check
        annotations:
          description: Prometheus is responding - No Further Action Required
          summary: Prometheus is responding

  - name: EKS-control-plane
    rules:
      - alert: KubernetesApiServerEtcdAccessLatency
        annotations:
          description: Latency for apiserver to access etcd is higher than 2 sec
          summary: Access to etcd is slow
        expr: histogram_quantile(0.99, rate(etcd_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          env: Development
          service: etcd
          severity: warning
          context: EKS-control-plane
      # p99 latency of apiserver requests, excluding long-running verbs
      # (watches, proxies, etc.) that would skew the histogram.
      - alert: KubernetesApiServerLatency
        expr: histogram_quantile(0.99, rate(apiserver_request_duration_seconds_bucket{verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) > 2
        for: 5m
        labels:
          env: Development
          service: apiserver
          severity: warning
          context: EKS-control-plane
        annotations:
          description: ApiServer requests taking longer than 2 secs
          summary: ApiServers requests are slow
      - alert: KubernetesWorkQueuesTimes
        expr: histogram_quantile(0.99, rate(workqueue_queue_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          env: Development
          service: controller-manager
          severity: warning
          context: EKS-control-plane
        annotations:
          description: Duration for controller-manager workqueues is higher than 2 sec
          summary: Controller Manager is slow
      - alert: KubernetesControllerManagerAWSRequests
        expr: histogram_quantile(0.99, rate(cloudprovider_aws_api_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          env: Development
          service: controller-manager
          severity: warning
          context: EKS-control-plane
        annotations:
          description: Duration for controller-manager AWS requests greater than 1 sec
          summary: Controller Manager AWS requests are slow

  - name: cluster-state-node
    rules:
      # absent() fires when no kube-state-metrics target is up at all.
      - alert: KubernetesKubeStateMetricsScrapeFailed
        expr: absent(up{app_kubernetes_io_name="kube-state-metrics"})
        for: 1h
        labels:
          env: Development
          service: k8s
          severity: warning
          context: kube-state-metrics
        annotations:
          description: Failed to scrape kube-state-metrics. Metrics on the cluster state might be outdated. Check the prometheus-kube-state-metrics deployment via flux.
          summary: Kube state metrics scrape failed
      # Commented because currently firing because kube-state-metrics is getting disk and memory pressure statuses
      # - alert: KubernetesNodeNotReady
      #   annotations:
      #     description: "{{ $value }} Nodes are in NotReady status for more than an hour"
      #     summary: Node status is NotReady
      #   expr: count(kube_node_status_condition{condition="Ready",status="false"} == 0)
      #   for: 1h
      #   labels:
      #     env: Development
      #     service: kube-state-metrics
      #     severity: warning
      #     context: node
      - alert: KubernetesNodeUnschedulable
        annotations:
          description: "{{ $value }} nodes are Unschedulable for more than 5 minutes"
          # Fixed summary: previously copy-pasted "Node status is NotReady"
          # from the NotReady alert above, which does not describe this rule.
          summary: Node is Unschedulable
        expr: kube_node_spec_unschedulable > 0
        for: 5m
        labels:
          env: Development
          service: kube-state-metrics
          severity: warning
          context: node
      - alert: KubernetesNodeDaemonsetUnavailable
        annotations:
          description: "{{ $value }} nodes are unable to run Non-user pods for more than 5 minutes"
          summary: Nodes unable to run pod
        expr: kube_daemonset_status_number_unavailable{namespace!~"user-.*"} > 0
        for: 5m
        labels:
          env: Development
          service: kube-state-metrics
          severity: warning
          context: node

  - name: cluster-state-deployment
    rules:
      - alert: KubernetesPodMaxUnavailableDeployment
        annotations:
          description: Maximum number of unavailable pods during a Non-user rolling update for more than 5 minutes
          summary: Unavailable pods during a rolling update
        expr: count(kube_deployment_spec_strategy_rollingupdate_max_unavailable{namespace!~"user-.*"}) > 100
        for: 5m
        labels:
          env: Development
          service: kube-state-metrics
          severity: warning
          context: deployment
      - alert: KubernetesPodUnavailableDeployment
        annotations:
          description: Pods unavailable for a Non-user deployment for more than 5 minutes
          summary: Pods unavailable for a deployment
        expr: kube_deployment_status_replicas_unavailable{namespace!~"user-.*"} > 0
        for: 5m
        labels:
          env: Development
          service: kube-state-metrics
          severity: warning
          context: deployment

  - name: cluster-state-pod
    rules:
      # Joins pod info with readiness so Job-created pods (expected to
      # terminate) are excluded from the NotReady check.
      - alert: KubernetesPodNotReady
        annotations:
          description: Non-user Pod status is NotReady for more than 5 minutes
          summary: Pod status is NotReady
        expr: kube_pod_info{created_by_kind!="Job"} AND ON (pod, namespace) kube_pod_status_ready{condition="false",namespace!~"user-.*"} == 1
        for: 5m
        labels:
          env: Development
          service: kube-state-metrics
          severity: warning
          context: pod
      - alert: KubernetesPodFailures
        annotations:
          description: Non-user Pods have more than 500 failures for more than an hour
          summary: Pod has large number of failures
        expr: count(kube_pod_status_phase{phase="Failed",namespace!~"user-.*"}) > 500
        for: 1h
        labels:
          env: Development
          service: kube-state-metrics
          severity: warning
          context: pod

  - name: past-issues
    rules:
      - alert: KubernetesEtcObjectsTooMany
        annotations:
          description: Number of objects in EKS Cluster greater than 31000
          summary: Too many Etcd objects
        expr: sum(etcd_object_counts) > 31000
        for: 5m
        labels:
          env: Development
          service: etcd
          severity: warning
          context: EKS-control-plane
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment