Skip to content

Instantly share code, notes, and snippets.

@anton-johansson
Forked from amimof/prometheus.yaml
Created May 30, 2019 09:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anton-johansson/7ccd9043889b1531b7cd7032b09f3f81 to your computer and use it in GitHub Desktop.
Save anton-johansson/7ccd9043889b1531b7cd7032b09f3f81 to your computer and use it in GitHub Desktop.
Prometheus Kubernetes Deployment
---
# Cluster-wide read permissions for the Prometheus server: the objects it
# lists/watches for service discovery plus the metrics endpoints it scrapes.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  # Core objects used for discovery, and the node proxy/metrics subresources.
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/proxy
      - nodes/metrics
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  - apiGroups: [""]
    resources:
      - configmaps
    verbs: ["get"]
  # Non-resource endpoint: the API server's own /metrics path.
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
  # OpenShift-only API groups (router and registry metrics).
  - apiGroups:
      - route.openshift.io
    resources:
      - routers/metrics
    verbs:
      - get
  - apiGroups:
      - image.openshift.io
    resources:
      - registry/metrics
    verbs:
      - get
---
# Lets the node-exporter service account create TokenReviews and
# SubjectAccessReviews (delegated authentication/authorization checks).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus-node-exporter
rules:
  - apiGroups:
      - authentication.k8s.io
    resources:
      - tokenreviews
    verbs:
      - create
  - apiGroups:
      - authorization.k8s.io
    resources:
      - subjectaccessreviews
    verbs:
      - create
---
# Read-only (list/watch) access to every object kind that kube-state-metrics
# turns into metrics.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: kube-state-metrics
rules:
  # Core API group.
  - apiGroups:
      - ""
    resources:
      - configmaps
      - endpoints
      - limitranges
      - namespaces
      - nodes
      - persistentvolumeclaims
      - persistentvolumes
      - pods
      - replicationcontrollers
      - resourcequotas
      - secrets
      - services
    verbs:
      - list
      - watch
  # Workload controllers served from the legacy "extensions" group.
  - apiGroups:
      - extensions
    resources:
      - daemonsets
      - deployments
      - replicasets
    verbs:
      - list
      - watch
  - apiGroups:
      - apps
    resources:
      - statefulsets
    verbs:
      - list
      - watch
  - apiGroups:
      - batch
    resources:
      - cronjobs
      - jobs
    verbs:
      - list
      - watch
  - apiGroups:
      - autoscaling
    resources:
      - horizontalpodautoscalers
    verbs:
      - list
      - watch
  - apiGroups:
      - policy
    resources:
      - poddisruptionbudgets
    verbs:
      - list
      - watch
---
# Grants the "prometheus" ClusterRole to the prometheus service account
# in the prometheus namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: prometheus
---
# Grants the "prometheus-node-exporter" ClusterRole to the node-exporter
# service account in the prometheus namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus-node-exporter
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus-node-exporter
subjects:
  - kind: ServiceAccount
    name: prometheus-node-exporter
    namespace: prometheus
---
# Grants the "kube-state-metrics" ClusterRole to the kube-state-metrics
# service account in the prometheus namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: kube-state-metrics
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-state-metrics
subjects:
  - kind: ServiceAccount
    name: kube-state-metrics
    namespace: prometheus
---
# Prometheus server configuration. Holds two files as literal block scalars:
#   prometheus.rules - alerting rules (picked up via rule_files "*.rules")
#   prometheus.yaml  - global settings, Alertmanager target and scrape jobs
# Everything indented under a "|" is file content handed to Prometheus verbatim.
kind: ConfigMap
apiVersion: v1
metadata:
  creationTimestamp: null
  name: prometheus-config
  namespace: prometheus
data:
  prometheus.rules: |
    groups:
      - name: cluster
        rules:
          # Fires when a kubernetes-nodes target has reported up == 0 for 5 minutes.
          - alert: NodeDown
            annotations:
              description: "Node {{ $labels.instance }} has been down for more than 5 minutes."
              identifier: "{{ $labels.instance }}"
              summary: "Node {{ $labels.instance }} down"
            expr: "up{job=\"kubernetes-nodes\"} == 0"
            for: 5m
            labels:
              severity: critical
          # Requested CPU as a percentage of allocatable CPU, alerting above 80%.
          - alert: CPURequestsHigh
            annotations:
              description: "Cluster container CPU requests is above 80% (current value: {{ $value }}%)"
              identifier: "{{ $labels.instance }}"
              summary: "High container CPU requests"
            expr: "(max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100) > 80"
            for: 5m
            labels:
              severity: critical
          # Requested memory as a percentage of allocatable memory, alerting above 80%.
          - alert: MemoryRequestsHigh
            annotations:
              identifier: "{{ $labels.instance }}"
              summary: "High container memory requests"
              description: "Cluster container memory requests is above 80% (current value: {{ $value }}%)"
            expr: "(max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100) > 80"
            for: 5m
            labels:
              severity: critical
  prometheus.yaml: |
    alerting:
      alertmanagers:
        - scheme: http
          static_configs:
            - targets:
                - "alertmanager:9093"
    global:
      evaluation_interval: 5s
      external_labels:
        cluster: prod-cluster
      scrape_interval: 60s
    rule_files:
      - "*.rules"
    scrape_configs:
      # Prometheus scraping itself.
      - job_name: prometheus
        metrics_path: /metrics
        scheme: http
        static_configs:
          - targets:
              - "localhost:9090"
      # API servers: keep only the default/kubernetes service's https endpoints.
      - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        job_name: kubernetes-apiservers
        kubernetes_sd_configs:
          - role: endpoints
        relabel_configs:
          - action: keep
            regex: default;kubernetes;https
            source_labels:
              - __meta_kubernetes_namespace
              - __meta_kubernetes_service_name
              - __meta_kubernetes_endpoint_port_name
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      # Node targets over https; node labels are copied onto the series.
      - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        job_name: kubernetes-nodes
        kubernetes_sd_configs:
          - role: node
        metric_relabel_configs:
          - action: drop
            regex: openshift_sdn_pod_(setup|teardown)_latency(.*)
            source_labels:
              - __name__
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      # node-exporter on every node: discovered node addresses are rewritten
      # from port 10250 to port 9100.
      - job_name: kubernetes-nodes-exporter
        kubernetes_sd_configs:
          - role: node
        metric_relabel_configs:
          - action: drop
            regex: node_cpu|node_(disk|scrape_collector)_.+
            source_labels:
              - __name__
          # Allow-list in three steps: tag the wanted series with a "renamed_"
          # prefix, drop the rest of those families, then strip the prefix again.
          - action: replace
            regex: (node_(netstat_Ip_.+|vmstat_(nr|thp)_.+|filesystem_(free|size|device_error)|network_(transmit|receive)_(drop|errs)))
            replacement: renamed_$1
            source_labels:
              - __name__
            target_label: __name__
          - action: drop
            regex: node_(netstat|vmstat|filesystem|network)_.+
            source_labels:
              - __name__
          - action: replace
            regex: renamed_(.+)
            replacement: $1
            source_labels:
              - __name__
            target_label: __name__
          # Drop network series for veth* devices.
          - action: drop
            regex: node_network_.+;veth.+
            source_labels:
              - __name__
              - device
          # Drop filesystem series for every mountpoint except "/".
          - action: drop
            regex: "node_filesystem_(free|size|device_error);([^/].*|/.+)"
            source_labels:
              - __name__
              - mountpoint
        relabel_configs:
          - regex: "(.*):10250"
            replacement: "${1}:9100"
            source_labels:
              - __address__
            target_label: __address__
          # NOTE(review): labels starting with "__" are discarded after target
          # relabeling, so this rule looks like a no-op; "instance" may have
          # been intended - confirm before changing.
          - source_labels:
              - __meta_kubernetes_node_label_kubernetes_io_hostname
            target_label: __instance__
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      # cAdvisor metrics from each node's /metrics/cadvisor path, minus a
      # deny-list of container_* series.
      - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        job_name: kubernetes-cadvisor
        kubernetes_sd_configs:
          - role: node
        metric_relabel_configs:
          - action: drop
            regex: container_(cpu_user_seconds_total|cpu_cfs_periods_total|memory_swap|memory_working_set_bytes|memory_cache|last_seen|fs_(read_seconds_total|write_seconds_total|sector_(.*)|io_(.*)|reads_merged_total|writes_merged_total)|tasks_state|memory_failcnt|memory_failures_total|spec_memory_swap_limit_bytes|fs_(.*)_bytes_total)
            source_labels:
              - __name__
        metrics_path: /metrics/cadvisor
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      # Pods annotated prometheus.io/scrape=true. The path and port annotations
      # only take effect when written to the double-underscore labels
      # __metrics_path__ and __address__.
      - job_name: kubernetes-pods
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          - action: keep
            regex: true
            source_labels:
              - __meta_kubernetes_pod_annotation_prometheus_io_scrape
          - action: replace
            regex: (.+)
            source_labels:
              - __meta_kubernetes_pod_annotation_prometheus_io_path
            target_label: __metrics_path__
          - action: replace
            regex: "([^:]+)(?::\\d+)?;(\\d+)"
            replacement: "$1:$2"
            source_labels:
              - __address__
              - __meta_kubernetes_pod_annotation_prometheus_io_port
            target_label: __address__
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - action: replace
            source_labels:
              - __meta_kubernetes_namespace
            target_label: kubernetes_namespace
          - action: replace
            source_labels:
              - __meta_kubernetes_pod_name
            target_label: kubernetes_pod_name
      # Service endpoints annotated prometheus.io/scrape=true. The scheme, path
      # and port annotations are applied through __scheme__, __metrics_path__
      # and __address__.
      - job_name: kubernetes-service-endpoints
        kubernetes_sd_configs:
          - role: endpoints
        relabel_configs:
          - action: keep
            regex: true
            source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_scrape
          - action: replace
            regex: (https?)
            source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_scheme
            target_label: __scheme__
          - action: replace
            regex: (.+)
            source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_path
            target_label: __metrics_path__
          - action: replace
            regex: "([^:]+)(?::\\d+)?;(\\d+)"
            replacement: "$1:$2"
            source_labels:
              - __address__
              - __meta_kubernetes_service_annotation_prometheus_io_port
            target_label: __address__
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - action: replace
            source_labels:
              - __meta_kubernetes_namespace
            target_label: kubernetes_namespace
          - action: replace
            source_labels:
              - __meta_kubernetes_service_name
            target_label: kubernetes_name
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          insecure_skip_verify: true
      # Controller metrics, selected with the same default/kubernetes https
      # endpoint filter as the kubernetes-apiservers job.
      - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        job_name: kubernetes-controllers
        kubernetes_sd_configs:
          - role: endpoints
        relabel_configs:
          - action: keep
            regex: default;kubernetes;https
            source_labels:
              - __meta_kubernetes_namespace
              - __meta_kubernetes_service_name
              - __meta_kubernetes_endpoint_port_name
          # Uncomment if deploying on OpenShift. Controller metrics is exposed on port 8444
          # - action: replace
          #   regex: "(.+)(?::\\d+)"
          #   replacement: "$1:8444"
          #   source_labels:
          #     - __address__
          #   target_label: __address__
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      # Annotated pods in the ingress-nginx namespace.
      - job_name: ingress-nginx-endpoints
        kubernetes_sd_configs:
          - namespaces:
              names:
                - ingress-nginx
            role: pod
        relabel_configs:
          - action: keep
            regex: true
            source_labels:
              - __meta_kubernetes_pod_annotation_prometheus_io_scrape
          - action: replace
            regex: (https?)
            source_labels:
              - __meta_kubernetes_pod_annotation_prometheus_io_scheme
            target_label: __scheme__
          - action: replace
            regex: (.+)
            source_labels:
              - __meta_kubernetes_pod_annotation_prometheus_io_path
            target_label: __metrics_path__
          - action: replace
            regex: "([^:]+)(?::\\d+)?;(\\d+)"
            replacement: "$1:$2"
            source_labels:
              - __address__
              - __meta_kubernetes_pod_annotation_prometheus_io_port
            target_label: __address__
          - action: drop
            regex: prometheus-server
            source_labels:
              - __meta_kubernetes_service_name
      # Uncomment to enable OpenShift haproxy (router) metrics
      # - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      #   job_name: openshift-router
      #   kubernetes_sd_configs:
      #     - namespaces:
      #         names:
      #           - default
      #       role: endpoints
      #   relabel_configs:
      #     - action: keep
      #       regex: router;1936-tcp
      #       source_labels:
      #         - __meta_kubernetes_service_name
      #         - __meta_kubernetes_endpoint_port_name
      #   scheme: http
---
# Alertmanager configuration: a single route that sends every alert, grouped
# by alertname/cluster/service, to one Slack receiver.
# Replace <KEY> in api_url with the real Slack webhook path before deploying.
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: prometheus
data:
  alertmanager.yml: |
    global:
    route:
      receiver: slack_alerts
      group_by: ['alertname', 'cluster', 'service']
    receivers:
      - name: slack_alerts
        slack_configs:
          - api_url: 'https://hooks.slack.com/services/<KEY>'
            channel: '#general'
            send_resolved: true
            # Title: status (plus firing count), and the alert name when the
            # notification carries exactly one alert.
            title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ if or (and (eq (len .Alerts.Firing) 1) (eq (len .Alerts.Resolved) 0)) (and (eq (len .Alerts.Firing) 0) (eq (len .Alerts.Resolved) 1)) }}{{ range .Alerts.Firing }}{{ .Labels.alertname }} @ {{ .Labels.cluster }}{{ end }}{{ range .Alerts.Resolved }}{{ .Labels.alertname }} @ {{ .Labels.cluster }}{{ end }}{{ end }}'
            # Body: the description for a single alert, otherwise separate
            # "firing" and "resolved" lists. The firing list iterates
            # .Alerts.Firing so resolved alerts are not repeated under it.
            text: |-
              *Cluster:* `{{ .CommonLabels.cluster }}`
              {{ if or (and (eq (len .Alerts.Firing) 1) (eq (len .Alerts.Resolved) 0)) (and (eq (len .Alerts.Firing) 0) (eq (len .Alerts.Resolved) 1)) }}
              {{ range .Alerts.Firing }}{{ .Annotations.description }}{{ end }}{{ range .Alerts.Resolved }}{{ .Annotations.description }}{{ end }}
              {{ else }}
              {{ if gt (len .Alerts.Firing) 0 }}
              *Alerts Firing:*
              {{ range .Alerts.Firing }}
              • *{{ .Labels.alertname }}:* {{ .Annotations.summary }} `{{ .Labels.severity | toUpper }}`
              {{ end }}
              {{ end }}{{ end }}
              {{ if gt (len .Alerts.Resolved) 0 }}
              *Alerts Resolved:*
              {{ range .Alerts.Resolved }}
              • *{{ .Labels.alertname }}:* {{ .Annotations.summary }}
              {{ end }}
              {{ end }}
---
# Service account for the Prometheus server pods.
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: prometheus
  name: prometheus
---
# Service account for the node-exporter DaemonSet pods.
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: prometheus
  name: prometheus-node-exporter
---
# Service account for the kube-state-metrics pods.
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: prometheus
  name: kube-state-metrics
---
# Service account for the Alertmanager pods.
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: prometheus
  name: alertmanager
---
# Exposes the Prometheus server (pods labelled app=prometheus) on port 9090.
# The prometheus.io/scrape annotation opts the service into annotation-based
# scraping.
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: prometheus
  labels:
    app: prometheus
  annotations:
    prometheus.io/scrape: "true"
spec:
  selector:
    app: prometheus
  ports:
    - name: prometheus-http
      protocol: TCP
      port: 9090
---
# Headless service (clusterIP: None) in front of the node-exporter pods on
# port 9100. Label and selector use app=prometheus-node-exporter, the label the
# node-exporter DaemonSet puts on its pods, so the endpoints are the exporter
# pods rather than the Prometheus server pods (which carry app=prometheus).
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: "true"
  name: prometheus-node-exporter
  namespace: prometheus
  labels:
    app: prometheus-node-exporter
spec:
  clusterIP: None
  ports:
    - name: prometheus-node-exporter
      port: 9100
      protocol: TCP
  selector:
    app: prometheus-node-exporter
  type: ClusterIP
---
# ClusterIP service for kube-state-metrics. Both ports forward to the
# container ports of the same name (http-metrics, telemetry).
kind: Service
apiVersion: v1
metadata:
  name: kube-state-metrics
  namespace: prometheus
  labels:
    k8s-app: kube-state-metrics
  annotations:
    prometheus.io/scrape: "true"
spec:
  type: ClusterIP
  sessionAffinity: None
  selector:
    k8s-app: kube-state-metrics
  ports:
    - name: http-metrics
      protocol: TCP
      port: 8080
      targetPort: http-metrics
    - name: telemetry
      protocol: TCP
      port: 8081
      targetPort: telemetry
---
# Exposes Alertmanager (pods labelled app=alertmanager) on port 9093.
kind: Service
apiVersion: v1
metadata:
  name: alertmanager
  namespace: prometheus
  labels:
    app: alertmanager
  annotations:
    prometheus.io/scrape: "true"
spec:
  selector:
    app: alertmanager
  ports:
    - name: alertmanager-http
      protocol: TCP
      port: 9093
---
# Single-replica Prometheus server.
# - Configuration comes from the prometheus-config ConfigMap mounted at
#   /etc/prometheus.
# - TSDB data lives on an emptyDir volume, so it does not outlive the pod.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: prometheus
  labels:
    app: prometheus
spec:
  replicas: 1
  # apps/v1 requires an explicit selector; it must match the template labels.
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      name: prometheus
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      containers:
        - name: prometheus
          # NOTE(review): floating "latest" tag; consider pinning a release.
          image: prom/prometheus:latest
          args:
            - '--config.file=/etc/prometheus/prometheus.yaml'
            - '--storage.tsdb.path=/prometheus/'
          ports:
            - name: http
              containerPort: 9090
              protocol: TCP
          # Requests equal limits for both CPU and memory.
          resources:
            requests:
              cpu: 500m
              memory: 1024M
            limits:
              cpu: 500m
              memory: 1024M
          volumeMounts:
            - name: prometheus-config
              mountPath: /etc/prometheus
            - mountPath: /prometheus
              name: prometheus-data
      volumes:
        - name: prometheus-config
          configMap:
            name: prometheus-config
        - name: prometheus-data
          emptyDir: {}
---
# Single-replica kube-state-metrics. Serves metrics on 8080 (http-metrics) and
# telemetry on 8081; readiness is probed via /healthz on port 8080.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    k8s-app: kube-state-metrics
  name: kube-state-metrics
  namespace: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
      k8s-app: kube-state-metrics
  strategy:
    rollingUpdate:
      maxSurge: 25%
      maxUnavailable: 25%
    type: RollingUpdate
  template:
    metadata:
      labels:
        k8s-app: kube-state-metrics
    spec:
      containers:
        - image: quay.io/coreos/kube-state-metrics:v1.4.0
          imagePullPolicy: IfNotPresent
          name: kube-state-metrics
          ports:
            - containerPort: 8080
              name: http-metrics
              protocol: TCP
            - containerPort: 8081
              name: telemetry
              protocol: TCP
          readinessProbe:
            failureThreshold: 3
            httpGet:
              path: /healthz
              port: 8080
              scheme: HTTP
            initialDelaySeconds: 5
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          # No requests/limits are set for this container.
          resources: {}
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
      serviceAccountName: kube-state-metrics
---
# Single-replica Alertmanager.
# - Configuration comes from the alertmanager-config ConfigMap mounted at
#   /etc/alertmanager.
# - State under /alertmanager is on an emptyDir volume, so it does not outlive
#   the pod.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: alertmanager
  namespace: prometheus
  labels:
    app: alertmanager
spec:
  replicas: 1
  # apps/v1 requires an explicit selector; it must match the template labels.
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      name: alertmanager
      labels:
        app: alertmanager
    spec:
      serviceAccountName: alertmanager
      containers:
        - name: alertmanager
          # NOTE(review): floating "master" tag; consider pinning a release.
          image: prom/alertmanager:master
          args:
            - '--config.file=/etc/alertmanager/alertmanager.yml'
            - '--storage.path=/alertmanager'
          ports:
            - name: http
              containerPort: 9093
              protocol: TCP
          # Requests equal limits for both CPU and memory.
          resources:
            requests:
              cpu: 500m
              memory: 500M
            limits:
              cpu: 500m
              memory: 500M
          volumeMounts:
            - name: alertmanager-config
              mountPath: /etc/alertmanager
            - name: alertmanager-data
              mountPath: /alertmanager
      volumes:
        - name: alertmanager-config
          configMap:
            name: alertmanager-config
        - name: alertmanager-data
          emptyDir: {}
---
# Runs node-exporter on every node, using the host network and PID namespaces
# and exposing port 9100 on the host. The host's /proc and /sys are mounted
# read-only under /host and passed to the exporter via --path.* flags.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  labels:
    app: prometheus-node-exporter
  name: prometheus-node-exporter
  namespace: prometheus
spec:
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app: prometheus-node-exporter
  template:
    metadata:
      labels:
        app: prometheus-node-exporter
      name: prometheus-node-exporter
    spec:
      containers:
        - name: prometheus-node-exporter
          # NOTE(review): floating "latest" tag; consider pinning a release.
          image: prom/node-exporter:latest
          imagePullPolicy: IfNotPresent
          # Point the collectors at the host mounts declared below; without
          # these flags the /host/proc and /host/sys mounts are never read.
          args:
            - '--path.procfs=/host/proc'
            - '--path.sysfs=/host/sys'
          ports:
            - containerPort: 9100
              hostPort: 9100
              name: scrape
              protocol: TCP
          resources:
            limits:
              cpu: 200m
              memory: 50Mi
            requests:
              cpu: 100m
              memory: 30Mi
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          volumeMounts:
            - mountPath: /host/proc
              name: proc
              readOnly: true
            - mountPath: /host/sys
              name: sys
              readOnly: true
      hostNetwork: true
      hostPID: true
      serviceAccountName: prometheus-node-exporter
      volumes:
        - hostPath:
            path: /proc
            type: ""
          name: proc
        - hostPath:
            path: /sys
            type: ""
          name: sys
---
# Routes prometheus.mdlwr.se (all paths) to the prometheus Service on port 9090.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: prometheus
  namespace: prometheus
spec:
  rules:
    - host: prometheus.mdlwr.se
      http:
        paths:
          - path: /
            # networking.k8s.io/v1 requires an explicit pathType.
            pathType: Prefix
            backend:
              service:
                name: prometheus
                port:
                  number: 9090
---
# Routes alertmanager.mdlwr.se (all paths) to the alertmanager Service on
# port 9093. The "---" above starts a new YAML document so this Ingress is not
# merged into the preceding one.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: alertmanager
  namespace: prometheus
spec:
  rules:
    - host: alertmanager.mdlwr.se
      http:
        paths:
          - path: /
            # networking.k8s.io/v1 requires an explicit pathType.
            pathType: Prefix
            backend:
              service:
                name: alertmanager
                port:
                  number: 9093
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment