-
-
Save neillturner/45915fdbfb3359d7d98b97fee281eadb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Alertmanager configuration: every alert is delivered to one Slack receiver.
# Alerts labelled context=Monitoring-check use a longer repeat interval than
# the default route.
alertmanager.yml:
  global:
    resolve_timeout: 1m
    # Placeholder webhook. A real Slack webhook URL is a credential; keep it
    # out of version control.
    slack_api_url: 'https://hooks.slack.com/services/XXXXXXXXXXXXXXXXXXXXXX'
  route:
    # Default route: repeat the notification hourly while an alert keeps firing.
    receiver: 'slack-notifications'
    repeat_interval: 1h
    routes:
      # Alerts carrying context=Monitoring-check are repeated once per day.
      - match:
          context: 'Monitoring-check'
        receiver: 'slack-notifications'
        repeat_interval: 24h
  receivers:
    - name: 'slack-notifications'
      slack_configs:
        - channel: '#my-slack-alerts'
          send_resolved: true
          icon_url: https://avatars3.githubusercontent.com/u/3380462
          # Title: "[STATUS:<firing count>] <alertname> for <job>", followed by
          # the common labels that are not group labels, in parentheses.
          # The {{- / -}} trim markers remove the template's own line breaks
          # and indentation, so the rendered title is a single line.
          title: |-
            [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
            {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
              {{" "}}(
              {{- with .CommonLabels.Remove .GroupLabels.Names }}
                {{- range $index, $label := .SortedPairs -}}
                  {{ if $index }}, {{ end }}
                  {{- $label.Name }}="{{ $label.Value -}}"
                {{- end }}
              {{- end -}}
              )
            {{- end }}
          # Body: one entry per alert. The rules in this file set `summary`
          # and `description` annotations (there is no `title` annotation),
          # so the headline is taken from `summary`.
          # This is a folded scalar: the blank lines below are what produce
          # line breaks between the fields, and the more-indented label lines
          # keep their own line breaks.
          text: >-
            {{ range .Alerts -}}
            *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}

            *Description:* {{ .Annotations.description }}

            *Details:*
              {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
              {{ end }}
            {{ end }}
# Prometheus alerting rules. Every rule sets the labels env, service, severity
# and context, plus `summary` and `description` annotations.
# Keys are written in the order alert, expr, for, labels, annotations.
# NOTE(review): `env` is spelled DEVELOPMENT on the first rule and Development
# on all others; confirm whether anything matches on the exact value.
alerting_rules.yml:
  groups:
    # Always-firing informational alert: proves the alerting pipeline works.
    - name: Monitoring-check
      rules:
        - alert: Monitoring is Working in DEVELOPMENT cluster
          expr: prometheus_build_info{instance="localhost:9090"} == 1
          for: 5m
          labels:
            env: DEVELOPMENT
            service: prometheus
            severity: info
            context: Monitoring-check
          annotations:
            description: Prometheus is responding - No Further Action Required
            summary: Prometheus is responding

    # 99th-percentile latency checks on control-plane request histograms.
    - name: EKS-control-plane
      rules:
        - alert: KubernetesApiServerEtcdAccessLatency
          expr: histogram_quantile(0.99, rate(etcd_request_duration_seconds_bucket[5m])) > 2
          for: 5m
          labels:
            env: Development
            service: etcd
            severity: warning
            context: EKS-control-plane
          annotations:
            description: Latency for apiserver to access etcd is higher than 2 sec
            summary: Access to etcd is slow
        # Long-running verbs are excluded so they do not skew the quantile.
        - alert: KubernetesApiServerLatency
          expr: histogram_quantile(0.99, rate(apiserver_request_duration_seconds_bucket{verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) > 2
          for: 5m
          labels:
            env: Development
            service: apiserver
            severity: warning
            context: EKS-control-plane
          annotations:
            description: ApiServer requests taking longer than 2 secs
            summary: ApiServers requests are slow
        - alert: KubernetesWorkQueuesTimes
          expr: histogram_quantile(0.99, rate(workqueue_queue_duration_seconds_bucket[5m])) > 2
          for: 5m
          labels:
            env: Development
            service: controller-manager
            severity: warning
            context: EKS-control-plane
          annotations:
            description: Duration for controller-manager workqueues is higher than 2 sec
            summary: Controller Manager is slow
        - alert: KubernetesControllerManagerAWSRequests
          expr: histogram_quantile(0.99, rate(cloudprovider_aws_api_request_duration_seconds_bucket[5m])) > 1
          for: 5m
          labels:
            env: Development
            service: controller-manager
            severity: warning
            context: EKS-control-plane
          annotations:
            description: Duration for controller-manager AWS requests greater than 1 sec
            summary: Controller Manager AWS requests are slow

    - name: cluster-state-node
      rules:
        # `== 1` inside absent() makes the alert fire both when the target is
        # missing and when it exists but the scrape fails (up == 0); a bare
        # absent(up{...}) only covers the missing-target case.
        - alert: KubernetesKubeStateMetricsScrapeFailed
          expr: absent(up{app_kubernetes_io_name="kube-state-metrics"} == 1)
          for: 1h
          labels:
            env: Development
            service: k8s
            severity: warning
            context: kube-state-metrics
          annotations:
            description: Failed to scrape kube-state-metrics. Metrics on the cluster state might be outdated. Check the prometheus-kube-state-metrics deployment via flux.
            summary: Kube state metrics scrape failed
        # Disabled because it is currently firing: kube-state-metrics is
        # reporting disk and memory pressure statuses.
        # - alert: KubernetesNodeNotReady
        #   annotations:
        #     description: "{{ $value }} Nodes are in NotReady status for more than an hour"
        #     summary: Node status is NotReady
        #   expr: count(kube_node_status_condition{condition="Ready",status="false"} == 0)
        #   for: 1h
        #   labels:
        #     env: Development
        #     service: kube-state-metrics
        #     severity: warning
        #     context: node
        # The expression yields one series per node, so the annotation names
        # the node through its label instead of printing $value.
        - alert: KubernetesNodeUnschedulable
          expr: kube_node_spec_unschedulable > 0
          for: 5m
          labels:
            env: Development
            service: kube-state-metrics
            severity: warning
            context: node
          annotations:
            description: "Node {{ $labels.node }} has been unschedulable for more than 5 mins"
            summary: Node is unschedulable
        - alert: KubernetesNodeDaemonsetUnavailable
          expr: kube_daemonset_status_number_unavailable{namespace!~"user-.*"} > 0
          for: 5m
          labels:
            env: Development
            service: kube-state-metrics
            severity: warning
            context: node
          annotations:
            description: "{{ $value }} nodes are unable to run Non-user pods for more than an 5 mins"
            summary: Nodes unable to run pod

    - name: cluster-state-deployment
      rules:
        # NOTE(review): count() here returns the number of matching series,
        # i.e. how many non-user Deployments export a rolling-update
        # maxUnavailable setting, not how many pods are unavailable.
        # Confirm this is the intended condition.
        - alert: KubernetesPodMaxUnavailableDeployment
          expr: count(kube_deployment_spec_strategy_rollingupdate_max_unavailable{namespace!~"user-.*"}) > 100
          for: 5m
          labels:
            env: Development
            service: kube-state-metrics
            severity: warning
            context: deployment
          annotations:
            description: Maximum number of unavailable pods during a Non-user rolling update for more than an 5 mins
            summary: Unavailable pods during a rolling update
        - alert: KubernetesPodUnavailableDeployment
          expr: kube_deployment_status_replicas_unavailable{namespace!~"user-.*"} > 0
          for: 5m
          labels:
            env: Development
            service: kube-state-metrics
            severity: warning
            context: deployment
          annotations:
            description: Pods unavailable for a Non-user deployment for more than an 5 mins
            summary: Pods unavailable for a deployment

    - name: cluster-state-pod
      rules:
        # Pods not created by a Job whose Ready condition is false, outside
        # the user-* namespaces.
        - alert: KubernetesPodNotReady
          expr: kube_pod_info{created_by_kind!="Job"} AND ON (pod, namespace) kube_pod_status_ready{condition="false",namespace!~"user-.*"} == 1
          for: 5m
          labels:
            env: Development
            service: kube-state-metrics
            severity: warning
            context: pod
          annotations:
            description: Non-user Pod status is NotReady for more than an 5 mins
            summary: Pod status is NotReady
        # sum(), not count(): kube_pod_status_phase is a 0/1 gauge per pod and
        # phase, so counting series would count every pod, failed or not.
        - alert: KubernetesPodFailures
          expr: sum(kube_pod_status_phase{phase="Failed",namespace!~"user-.*"}) > 500
          for: 1h
          labels:
            env: Development
            service: kube-state-metrics
            severity: warning
            context: pod
          annotations:
            description: Non-user Pods have more than 500 failures for more than an hour
            summary: Pod has large number of failures

    - name: past-issues
      rules:
        - alert: KubernetesEtcObjectsTooMany
          expr: sum(etcd_object_counts) > 31000
          for: 5m
          labels:
            env: Development
            service: etcd
            severity: warning
            context: EKS-control-plane
          annotations:
            description: Number of objects in EKS Cluster greater than 31000
            summary: Too many Etcd objects
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment