Skip to content

Instantly share code, notes, and snippets.

@anilsakr
Created September 1, 2020 09:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anilsakr/18ff74f85cd6594fef41d1c5d491b6ce to your computer and use it in GitHub Desktop.
Save anilsakr/18ff74f85cd6594fef41d1c5d491b6ce to your computer and use it in GitHub Desktop.
Kubernetes Setup for Prometheus and Grafana
# One-shot Job that loads Grafana datasources and dashboards through the
# Grafana HTTP API once Grafana answers on http://grafana:3000.
# The JSON files are read from the "grafana-import-dashboards" ConfigMap, which
# is mounted as the container's working directory; the admin credentials are
# read from the "grafana" Secret.
apiVersion: batch/v1
kind: Job
metadata:
  name: grafana-import-dashboards
  namespace: monitoring
  labels:
    app: grafana
    component: import-dashboards
spec:
  template:
    metadata:
      name: grafana-import-dashboards
      labels:
        app: grafana
        component: import-dashboards
    spec:
      serviceAccountName: prometheus-k8s
      initContainers:
        # Holds back the import container until Grafana returns HTTP 200,
        # polling every 15 seconds.
        - name: wait-for-grafana
          image: giantswarm/tiny-tools
          args:
            - /bin/sh
            - -c
            # Literal block scalar (|): every shell line keeps its own newline,
            # independent of how the continuation lines are indented.
            - |
              set -x
              while [ "$(curl -Lsw '%{http_code}' "http://grafana:3000" -o /dev/null)" -ne 200 ]; do
                echo '.'
                sleep 15
              done
      containers:
        - name: grafana-import-dashboards
          image: giantswarm/tiny-tools
          command: ["/bin/sh", "-c"]
          workingDir: /opt/grafana-import-dashboards
          args:
            # Credentials are passed with --user instead of being spliced into
            # the URL, so passwords containing '@', '/', ':' or '#' still work.
            # Each dashboard file is wrapped into the import payload
            # ({"dashboard": ..., "overwrite": true, "inputs": [...]}) before
            # it is POSTed.
            - |
              for file in *-datasource.json ; do
                if [ -e "$file" ] ; then
                  echo "importing $file" &&
                  curl --silent --fail --show-error \
                    --user "${GF_ADMIN_USER}:${GF_ADMIN_PASSWORD}" \
                    --request POST http://grafana:3000/api/datasources \
                    --header "Content-Type: application/json" \
                    --data-binary "@$file" ;
                  echo "" ;
                fi
              done ;
              for file in *-dashboard.json ; do
                if [ -e "$file" ] ; then
                  echo "importing $file" &&
                  ( echo '{"dashboard":'; \
                    cat "$file"; \
                    echo ',"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]}' ) \
                  | jq -c '.' \
                  | curl --silent --fail --show-error \
                    --user "${GF_ADMIN_USER}:${GF_ADMIN_PASSWORD}" \
                    --request POST http://grafana:3000/api/dashboards/import \
                    --header "Content-Type: application/json" \
                    --data-binary "@-" ;
                  echo "" ;
                fi
              done
          env:
            - name: GF_ADMIN_USER
              valueFrom:
                secretKeyRef:
                  name: grafana
                  key: admin-username
            - name: GF_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: grafana
                  key: admin-password
          volumeMounts:
            - name: config-volume
              mountPath: /opt/grafana-import-dashboards
      restartPolicy: Never
      volumes:
        - name: config-volume
          configMap:
            name: grafana-import-dashboards
# Derived from ./manifests
---
# Namespace referenced by the namespaced resources in this file.
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
---
# Grants the cluster-wide "prometheus" ClusterRole to the "prometheus-k8s"
# ServiceAccount in the "monitoring" namespace.
# Uses the stable rbac.authorization.k8s.io/v1 API: the v1beta1 version is no
# longer served as of Kubernetes 1.22.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus-k8s
    namespace: monitoring
---
# Read-only cluster permissions bound to the "prometheus-k8s" ServiceAccount:
# get/list/watch on nodes, node proxies, services, endpoints and pods, get on
# configmaps, and get on the non-resource /metrics URL.
# Uses the stable rbac.authorization.k8s.io/v1 API: the v1beta1 version is no
# longer served as of Kubernetes 1.22.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/proxy
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  - apiGroups: [""]
    resources:
      - configmaps
    verbs: ["get"]
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
---
# ServiceAccount named as the subject of the "prometheus" ClusterRoleBinding
# and as serviceAccountName of the grafana-import-dashboards Job.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus-k8s
  namespace: monitoring
---
# ConfigMap "alertmanager-templates" (namespace "monitoring"): notification
# template files for Alertmanager.
#   default.tmpl - Go-template definitions named for the slack, hipchat,
#                  pagerduty, opsgenie, victorops, email and pushover
#                  notifiers, built on the shared "__alertmanager",
#                  "__alertmanagerURL", "__subject" and "__text_alert_list"
#                  helper templates defined at the top of the file.
#   slack.tmpl   - defines "slack.devops.text", which emits the DESCRIPTION
#                  annotation of every alert in the notification; the
#                  'default' receiver in the alertmanager config references it.
# The alertmanager Deployment mounts this ConfigMap at
# /etc/alertmanager-templates, and the alertmanager config loads
# '/etc/alertmanager-templates/*.tmpl'.
# NOTE(review): the indentation of the block scalars below appears to have been
# lost in this copy - TODO restore it from the original manifest before applying.
apiVersion: v1
data:
default.tmpl: |
{{ define "__alertmanager" }}AlertManager{{ end }}
{{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }}
{{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }}
{{ define "__description" }}{{ end }}
{{ define "__text_alert_list" }}{{ range . }}Labels:
{{ range .Labels.SortedPairs }} - {{ .Name }} = {{ .Value }}
{{ end }}Annotations:
{{ range .Annotations.SortedPairs }} - {{ .Name }} = {{ .Value }}
{{ end }}Source: {{ .GeneratorURL }}
{{ end }}{{ end }}
{{ define "slack.default.title" }}{{ template "__subject" . }}{{ end }}
{{ define "slack.default.username" }}{{ template "__alertmanager" . }}{{ end }}
{{ define "slack.default.fallback" }}{{ template "slack.default.title" . }} | {{ template "slack.default.titlelink" . }}{{ end }}
{{ define "slack.default.pretext" }}{{ end }}
{{ define "slack.default.titlelink" }}{{ template "__alertmanagerURL" . }}{{ end }}
{{ define "slack.default.iconemoji" }}{{ end }}
{{ define "slack.default.iconurl" }}{{ end }}
{{ define "slack.default.text" }}{{ end }}
{{ define "hipchat.default.from" }}{{ template "__alertmanager" . }}{{ end }}
{{ define "hipchat.default.message" }}{{ template "__subject" . }}{{ end }}
{{ define "pagerduty.default.description" }}{{ template "__subject" . }}{{ end }}
{{ define "pagerduty.default.client" }}{{ template "__alertmanager" . }}{{ end }}
{{ define "pagerduty.default.clientURL" }}{{ template "__alertmanagerURL" . }}{{ end }}
{{ define "pagerduty.default.instances" }}{{ template "__text_alert_list" . }}{{ end }}
{{ define "opsgenie.default.message" }}{{ template "__subject" . }}{{ end }}
{{ define "opsgenie.default.description" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }}
{{ if gt (len .Alerts.Firing) 0 -}}
Alerts Firing:
{{ template "__text_alert_list" .Alerts.Firing }}
{{- end }}
{{ if gt (len .Alerts.Resolved) 0 -}}
Alerts Resolved:
{{ template "__text_alert_list" .Alerts.Resolved }}
{{- end }}
{{- end }}
{{ define "opsgenie.default.source" }}{{ template "__alertmanagerURL" . }}{{ end }}
{{ define "victorops.default.message" }}{{ template "__subject" . }} | {{ template "__alertmanagerURL" . }}{{ end }}
{{ define "victorops.default.from" }}{{ template "__alertmanager" . }}{{ end }}
{{ define "email.default.subject" }}{{ template "__subject" . }}{{ end }}
{{ define "email.default.html" }}
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!--
Style and HTML derived from https://github.com/mailgun/transactional-email-templates
The MIT License (MIT)
Copyright (c) 2014 Mailgun
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
-->
<html xmlns="http://www.w3.org/1999/xhtml" xmlns="http://www.w3.org/1999/xhtml" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<head style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<meta name="viewport" content="width=device-width" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;" />
<title style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">{{ template "__subject" . }}</title>
</head>
<body itemscope="" itemtype="http://schema.org/EmailMessage" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; -webkit-font-smoothing: antialiased; -webkit-text-size-adjust: none; height: 100%; line-height: 1.6em; width: 100% !important; background-color: #f6f6f6; margin: 0; padding: 0;" bgcolor="#f6f6f6">
<table style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; width: 100%; background-color: #f6f6f6; margin: 0;" bgcolor="#f6f6f6">
<tr style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0;" valign="top"></td>
<td width="600" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; display: block !important; max-width: 600px !important; clear: both !important; width: 100% !important; margin: 0 auto; padding: 0;" valign="top">
<div style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; max-width: 600px; display: block; margin: 0 auto; padding: 0;">
<table width="100%" cellpadding="0" cellspacing="0" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; border-radius: 3px; background-color: #fff; margin: 0; border: 1px solid #e9e9e9;" bgcolor="#fff">
<tr style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 16px; vertical-align: top; color: #fff; font-weight: 500; text-align: center; border-radius: 3px 3px 0 0; background-color: #E6522C; margin: 0; padding: 20px;" align="center" bgcolor="#E6522C" valign="top">
{{ .Alerts | len }} alert{{ if gt (len .Alerts) 1 }}s{{ end }} for {{ range .GroupLabels.SortedPairs }}
{{ .Name }}={{ .Value }}
{{ end }}
</td>
</tr>
<tr style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 10px;" valign="top">
<table width="100%" cellpadding="0" cellspacing="0" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<tr style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 0 0 20px;" valign="top">
<a href="{{ template "__alertmanagerURL" . }}" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; color: #FFF; text-decoration: none; line-height: 2em; font-weight: bold; text-align: center; cursor: pointer; display: inline-block; border-radius: 5px; text-transform: capitalize; background-color: #348eda; margin: 0; border-color: #348eda; border-style: solid; border-width: 10px 20px;">View in {{ template "__alertmanager" . }}</a>
</td>
</tr>
{{ if gt (len .Alerts.Firing) 0 }}
<tr style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 0 0 20px;" valign="top">
<strong style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">[{{ .Alerts.Firing | len }}] Firing</strong>
</td>
</tr>
{{ end }}
{{ range .Alerts.Firing }}
<tr style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 0 0 20px;" valign="top">
<strong style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">Labels</strong><br style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;" />
{{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}<br style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;" />{{ end }}
{{ if gt (len .Annotations) 0 }}<strong style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">Annotations</strong><br style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;" />{{ end }}
{{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}<br style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;" />{{ end }}
<a href="{{ .GeneratorURL }}" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; color: #348eda; text-decoration: underline; margin: 0;">Source</a><br style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;" />
</td>
</tr>
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}
{{ if gt (len .Alerts.Firing) 0 }}
<tr style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 0 0 20px;" valign="top">
<br style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;" />
<hr style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;" />
<br style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;" />
</td>
</tr>
{{ end }}
<tr style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 0 0 20px;" valign="top">
<strong style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">[{{ .Alerts.Resolved | len }}] Resolved</strong>
</td>
</tr>
{{ end }}
{{ range .Alerts.Resolved }}
<tr style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 0 0 20px;" valign="top">
<strong style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">Labels</strong><br style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;" />
{{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}<br style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;" />{{ end }}
{{ if gt (len .Annotations) 0 }}<strong style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">Annotations</strong><br style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;" />{{ end }}
{{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}<br style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;" />{{ end }}
<a href="{{ .GeneratorURL }}" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; color: #348eda; text-decoration: underline; margin: 0;">Source</a><br style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;" />
</td>
</tr>
{{ end }}
</table>
</td>
</tr>
</table>
<div style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; width: 100%; clear: both; color: #999; margin: 0; padding: 20px;">
<table width="100%" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<tr style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 12px; vertical-align: top; text-align: center; color: #999; margin: 0; padding: 0 0 20px;" align="center" valign="top"><a href="{{ .ExternalURL }}" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 12px; color: #999; text-decoration: underline; margin: 0;">Sent by {{ template "__alertmanager" . }}</a></td>
</tr>
</table>
</div></div>
</td>
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0;" valign="top"></td>
</tr>
</table>
</body>
</html>
{{ end }}
{{ define "pushover.default.title" }}{{ template "__subject" . }}{{ end }}
{{ define "pushover.default.message" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }}
{{ if gt (len .Alerts.Firing) 0 }}
Alerts Firing:
{{ template "__text_alert_list" .Alerts.Firing }}
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}
Alerts Resolved:
{{ template "__text_alert_list" .Alerts.Resolved }}
{{ end }}
{{ end }}
{{ define "pushover.default.url" }}{{ template "__alertmanagerURL" . }}{{ end }}
slack.tmpl: |
{{ define "slack.devops.text" }}
{{range .Alerts}}{{.Annotations.DESCRIPTION}}
{{end}}
{{ end }}
kind: ConfigMap
metadata:
creationTimestamp: null
name: alertmanager-templates
namespace: monitoring
---
# Alertmanager main configuration, delivered as the single file "config.yml".
# The alertmanager Deployment mounts this ConfigMap at /etc/alertmanager and
# starts with -config.file=/etc/alertmanager/config.yml.
# NOTE(review): the SMTP password and Slack webhook URL below look like
# placeholders; real credentials would be readable by anyone who can read this
# ConfigMap - consider sourcing them from a Secret instead.
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager
  namespace: monitoring
data:
  config.yml: |-
    global:
      # ResolveTimeout is the time after which an alert is declared resolved
      # if it has not been updated.
      resolve_timeout: 5m
      # The smarthost and SMTP sender used for mail notifications.
      smtp_smarthost: 'smtp.gmail.com:587'
      smtp_from: 'foo@bar.com'
      smtp_auth_username: 'foo@bar.com'
      smtp_auth_password: 'barfoo'
      # The API URL to use for Slack notifications.
      slack_api_url: 'https://hooks.slack.com/services/some/api/token'
    # # The directory from which notification templates are read.
    templates:
      - '/etc/alertmanager-templates/*.tmpl'
    # The root route on which each incoming alert enters.
    route:
      # The labels by which incoming alerts are grouped together. For example,
      # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
      # be batched into a single group.
      group_by: ['alertname', 'cluster', 'service']
      # When a new group of alerts is created by an incoming alert, wait at
      # least 'group_wait' to send the initial notification.
      # This way ensures that you get multiple alerts for the same group that start
      # firing shortly after another are batched together on the first
      # notification.
      group_wait: 30s
      # When the first notification was sent, wait 'group_interval' to send a batch
      # of new alerts that started firing for that group.
      group_interval: 5m
      # If an alert has successfully been sent, wait 'repeat_interval' to
      # resend them.
      #repeat_interval: 1m
      repeat_interval: 15m
      # A default receiver
      # If an alert isn't caught by a route, send it to default.
      receiver: default
      # All the above attributes are inherited by all child routes and can
      # overwritten on each.
      # The child route trees.
      routes:
        # Send severity=slack alerts to slack.
        - match:
            severity: slack
          receiver: slack_alert
        # - match:
        # severity: email
        # receiver: email_alert
    receivers:
      - name: 'default'
        slack_configs:
          - channel: '#alertmanager-test'
            text: '<!channel>{{ template "slack.devops.text" . }}'
            send_resolved: true
      - name: 'slack_alert'
        slack_configs:
          - channel: '#alertmanager-test'
            send_resolved: true
---
# Single-replica Alertmanager. The configuration and notification templates
# are mounted from the "alertmanager" and "alertmanager-templates" ConfigMaps;
# the storage path /alertmanager is backed by an emptyDir volume.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: alertmanager
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      name: alertmanager
      labels:
        app: alertmanager
    spec:
      volumes:
        - name: config-volume
          configMap:
            name: alertmanager
        - name: templates-volume
          configMap:
            name: alertmanager-templates
        - name: alertmanager
          emptyDir: {}
      containers:
        - name: alertmanager
          image: quay.io/prometheus/alertmanager:v0.7.1
          args:
            - '-config.file=/etc/alertmanager/config.yml'
            - '-storage.path=/alertmanager'
          ports:
            - name: alertmanager
              containerPort: 9093
          volumeMounts:
            # config.yml from the "alertmanager" ConfigMap
            - name: config-volume
              mountPath: /etc/alertmanager
            # *.tmpl files from the "alertmanager-templates" ConfigMap
            - name: templates-volume
              mountPath: /etc/alertmanager-templates
            # target of -storage.path
            - name: alertmanager
              mountPath: /alertmanager
---
# NodePort Service in front of the pods labelled app=alertmanager, forwarding
# TCP port 9093 to container port 9093. The prometheus.io/* annotations carry
# the scrape flag and metrics path.
apiVersion: v1
kind: Service
metadata:
  name: alertmanager
  namespace: monitoring
  labels:
    name: alertmanager
  annotations:
    prometheus.io/scrape: 'true'
    prometheus.io/path: '/metrics'
spec:
  type: NodePort
  selector:
    app: alertmanager
  ports:
    - name: alertmanager
      protocol: TCP
      port: 9093
      targetPort: 9093
---
# Single-replica Grafana server. The admin user and password are read from the
# "grafana" Secret, anonymous access is switched off, and /var/lib/grafana is
# backed by an emptyDir volume.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana-core
  namespace: monitoring
  labels:
    app: grafana
    component: core
spec:
  replicas: 1
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
        component: core
    spec:
      volumes:
        - name: grafana-persistent-storage
          emptyDir: {}
      containers:
        - name: grafana-core
          image: grafana/grafana:4.2.0
          imagePullPolicy: IfNotPresent
          env:
            # Basic auth with the admin user and password taken from the
            # "grafana" Secret.
            - name: GF_AUTH_BASIC_ENABLED
              value: "true"
            - name: GF_SECURITY_ADMIN_USER
              valueFrom:
                secretKeyRef:
                  name: grafana
                  key: admin-username
            - name: GF_SECURITY_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: grafana
                  key: admin-password
            - name: GF_AUTH_ANONYMOUS_ENABLED
              value: "false"
            # Disabled options, kept for reference:
            # - name: GF_AUTH_ANONYMOUS_ORG_ROLE
            #   value: Admin
            # Does not really work, because of template variables in exported
            # dashboards:
            # - name: GF_DASHBOARDS_JSON_ENABLED
            #   value: "true"
          resources:
            # Requests equal limits, which keeps this container in the
            # guaranteed QoS class.
            requests:
              cpu: 100m
              memory: 100Mi
            limits:
              cpu: 100m
              memory: 100Mi
          readinessProbe:
            httpGet:
              path: /login
              port: 3000
            # initialDelaySeconds: 30
            # timeoutSeconds: 1
          volumeMounts:
            - name: grafana-persistent-storage
              mountPath: /var/lib/grafana
---
apiVersion: v1
data:
grafana-net-2-dashboard.json: |
{
"__inputs": [{
"name": "DS_PROMETHEUS",
"label": "Prometheus",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}],
"__requires": [{
"type": "panel",
"id": "singlestat",
"name": "Singlestat",
"version": ""
}, {
"type": "panel",
"id": "text",
"name": "Text",
"version": ""
}, {
"type": "panel",
"id": "graph",
"name": "Graph",
"version": ""
}, {
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "3.1.0"
}, {
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
}],
"id": null,
"title": "Prometheus Stats",
"tags": [],
"style": "dark",
"timezone": "browser",
"editable": true,
"hideControls": true,
"sharedCrosshair": false,
"rows": [{
"collapse": false,
"editable": true,
"height": 178,
"panels": [{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"],
"datasource": "${DS_PROMETHEUS}",
"decimals": 1,
"editable": true,
"error": false,
"format": "s",
"id": 5,
"interval": null,
"links": [],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [{
"expr": "(time() - container_start_time_seconds{container_name=\"kube-apiserver\"})",
"intervalFactor": 2,
"refId": "A",
"step": 4
}],
"thresholds": "",
"title": "Uptime",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [{
"op": "=",
"text": "N/A",
"value": "null"
}],
"valueName": "current",
"mappingTypes": [{
"name": "value to text",
"value": 1
}, {
"name": "range to text",
"value": 2
}],
"rangeMaps": [{
"from": "null",
"to": "null",
"text": "N/A"
}],
"mappingType": 1,
"gauge": {
"show": false,
"minValue": 0,
"maxValue": 100,
"thresholdMarkers": true,
"thresholdLabels": false
}
}, {
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": ["rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)"],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "none",
"id": 6,
"interval": null,
"links": [],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": true
},
"targets": [{
"expr": "prometheus_local_storage_memory_series",
"intervalFactor": 2,
"refId": "A",
"step": 4
}],
"thresholds": "1,5",
"title": "Local Storage Memory Series",
"type": "singlestat",
"valueFontSize": "70%",
"valueMaps": [],
"valueName": "current",
"mappingTypes": [{
"name": "value to text",
"value": 1
}, {
"name": "range to text",
"value": 2
}],
"rangeMaps": [{
"from": "null",
"to": "null",
"text": "N/A"
}],
"mappingType": 1,
"gauge": {
"show": false,
"minValue": 0,
"maxValue": 100,
"thresholdMarkers": true,
"thresholdLabels": false
}
}, {
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": ["rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)"],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "none",
"id": 7,
"interval": null,
"links": [],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": true
},
"targets": [{
"expr": "prometheus_local_storage_indexing_queue_length",
"intervalFactor": 2,
"refId": "A",
"step": 4
}],
"thresholds": "500,4000",
"title": "Internal Storage Queue Length",
"type": "singlestat",
"valueFontSize": "70%",
"valueMaps": [{
"op": "=",
"text": "Empty",
"value": "0"
}],
"valueName": "current",
"mappingTypes": [{
"name": "value to text",
"value": 1
}, {
"name": "range to text",
"value": 2
}],
"rangeMaps": [{
"from": "null",
"to": "null",
"text": "N/A"
}],
"mappingType": 1,
"gauge": {
"show": false,
"minValue": 0,
"maxValue": 100,
"thresholdMarkers": true,
"thresholdLabels": false
}
}, {
"content": "<img src=\"http://prometheus.io/assets/prometheus_logo_grey.svg\" alt=\"Prometheus logo\" style=\"height: 40px;\">\n<span style=\"font-family: 'Open Sans', 'Helvetica Neue', Helvetica; font-size: 25px;vertical-align: text-top;color: #bbbfc2;margin-left: 10px;\">Prometheus</span>\n\n<p style=\"margin-top: 10px;\">You're using Prometheus, an open-source systems monitoring and alerting toolkit originally built at SoundCloud. For more information, check out the <a href=\"http://www.grafana.org/\">Grafana</a> and <a href=\"http://prometheus.io/\">Prometheus</a> projects.</p>",
"editable": true,
"error": false,
"id": 9,
"links": [],
"mode": "html",
"span": 3,
"style": {},
"title": "",
"transparent": true,
"type": "text"
}],
"title": "New row"
}, {
"collapse": false,
"editable": true,
"height": 227,
"panels": [{
"aliasColors": {
"prometheus": "#C15C17",
"{instance=\"localhost:9090\",job=\"prometheus\"}": "#C15C17"
},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 3,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 9,
"stack": false,
"steppedLine": false,
"targets": [{
"expr": "rate(prometheus_local_storage_ingested_samples_total[5m])",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{job}}",
"metric": "",
"refId": "A",
"step": 2
}],
"timeFrom": null,
"timeShift": null,
"title": "Samples ingested (rate-5m)",
"tooltip": {
"shared": true,
"value_type": "cumulative",
"ordering": "alphabetical",
"msResolution": false
},
"type": "graph",
"yaxes": [{
"show": true,
"min": null,
"max": null,
"logBase": 1,
"format": "short"
}, {
"show": true,
"min": null,
"max": null,
"logBase": 1,
"format": "short"
}],
"xaxis": {
"show": true
}
}, {
"content": "#### Samples Ingested\nThis graph displays the count of samples ingested by the Prometheus server, as measured over the last 5 minutes, per time series in the range vector. When troubleshooting an issue on IRC or Github, this is often the first stat requested by the Prometheus team. ",
"editable": true,
"error": false,
"id": 8,
"links": [],
"mode": "markdown",
"span": 2.995914043583536,
"style": {},
"title": "",
"transparent": true,
"type": "text"
}],
"title": "New row"
}, {
"collapse": false,
"editable": true,
"height": "250px",
"panels": [{
"aliasColors": {
"prometheus": "#F9BA8F",
"{instance=\"localhost:9090\",interval=\"5s\",job=\"prometheus\"}": "#F9BA8F"
},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 2,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 5,
"stack": false,
"steppedLine": false,
"targets": [{
"expr": "rate(prometheus_target_interval_length_seconds_count[5m])",
"intervalFactor": 2,
"legendFormat": "{{job}}",
"refId": "A",
"step": 2
}],
"timeFrom": null,
"timeShift": null,
"title": "Target Scrapes (last 5m)",
"tooltip": {
"shared": true,
"value_type": "cumulative",
"ordering": "alphabetical",
"msResolution": false
},
"type": "graph",
"yaxes": [{
"show": true,
"min": null,
"max": null,
"logBase": 1,
"format": "short"
}, {
"show": true,
"min": null,
"max": null,
"logBase": 1,
"format": "short"
}],
"xaxis": {
"show": true
}
}, {
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 14,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [{
"expr": "prometheus_target_interval_length_seconds{quantile!=\"0.01\", quantile!=\"0.05\"}",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{quantile}} ({{interval}})",
"metric": "",
"refId": "A",
"step": 2
}],
"timeFrom": null,
"timeShift": null,
"title": "Scrape Duration",
"tooltip": {
"shared": true,
"value_type": "cumulative",
"ordering": "alphabetical",
"msResolution": false
},
"type": "graph",
"yaxes": [{
"show": true,
"min": null,
"max": null,
"logBase": 1,
"format": "short"
}, {
"show": true,
"min": null,
"max": null,
"logBase": 1,
"format": "short"
}],
"xaxis": {
"show": true
}
}, {
"content": "#### Scrapes\nPrometheus scrapes metrics from instrumented jobs, either directly or via an intermediary push gateway for short-lived jobs. Target scrapes will show how frequently targets are scraped, as measured over the last 5 minutes, per time series in the range vector. Scrape Duration will show how long the scrapes are taking, with percentiles available as series. ",
"editable": true,
"error": false,
"id": 11,
"links": [],
"mode": "markdown",
"span": 3,
"style": {},
"title": "",
"transparent": true,
"type": "text"
}],
"title": "New row"
}, {
"collapse": false,
"editable": true,
"height": "250px",
"panels": [{
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"decimals": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 12,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"hideEmpty": true,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 9,
"stack": false,
"steppedLine": false,
"targets": [{
"expr": "prometheus_evaluator_duration_seconds{quantile!=\"0.01\", quantile!=\"0.05\"}",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{quantile}}",
"refId": "A",
"step": 2
}],
"timeFrom": null,
"timeShift": null,
"title": "Rule Eval Duration",
"tooltip": {
"shared": true,
"value_type": "cumulative",
"ordering": "alphabetical",
"msResolution": false
},
"type": "graph",
"yaxes": [{
"show": true,
"min": null,
"max": null,
"logBase": 1,
"format": "percentunit",
"label": ""
}, {
"show": true,
"min": null,
"max": null,
"logBase": 1,
"format": "short"
}],
"xaxis": {
"show": true
}
}, {
"content": "#### Rule Evaluation Duration\nThis graph panel plots the duration for all evaluations to execute. The 50th percentile, 90th percentile and 99th percentile are shown as three separate series to help identify outliers that may be skewing the data.",
"editable": true,
"error": false,
"id": 15,
"links": [],
"mode": "markdown",
"span": 3,
"style": {},
"title": "",
"transparent": true,
"type": "text"
}],
"title": "New row"
}],
"time": {
"from": "now-5m",
"to": "now"
},
"timepicker": {
"now": true,
"refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"],
"time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
},
"templating": {
"list": []
},
"annotations": {
"list": []
},
"refresh": false,
"schemaVersion": 12,
"version": 0,
"links": [{
"icon": "info",
"tags": [],
"targetBlank": true,
"title": "Grafana Docs",
"tooltip": "",
"type": "link",
"url": "http://www.grafana.org/docs"
}, {
"icon": "info",
"tags": [],
"targetBlank": true,
"title": "Prometheus Docs",
"type": "link",
"url": "http://prometheus.io/docs/introduction/overview/"
}],
"gnetId": 2,
"description": "The official, pre-built Prometheus Stats Dashboard."
}
grafana-net-737-dashboard.json: |
{
"__inputs": [{
"name": "DS_PROMETHEUS",
"label": "prometheus",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}],
"__requires": [{
"type": "panel",
"id": "singlestat",
"name": "Singlestat",
"version": ""
}, {
"type": "panel",
"id": "graph",
"name": "Graph",
"version": ""
}, {
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "3.1.0"
}, {
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
}],
"id": null,
"title": "Kubernetes Pod Resources",
"description": "Shows resource usage of Kubernetes pods.",
"tags": [
"kubernetes"
],
"style": "dark",
"timezone": "browser",
"editable": true,
"hideControls": false,
"sharedCrosshair": false,
"rows": [{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "percent",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
},
"height": "180px",
"id": 4,
"interval": null,
"isNew": true,
"links": [],
"mappingType": 1,
"mappingTypes": [{
"name": "value to text",
"value": 1
}, {
"name": "range to text",
"value": 2
}],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [{
"from": "null",
"text": "N/A",
"to": "null"
}],
"span": 4,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [{
"expr": "sum (container_memory_working_set_bytes{id=\"/\",instance=~\"^$instance$\"}) / sum (machine_memory_bytes{instance=~\"^$instance$\"}) * 100",
"interval": "",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A",
"step": 2
}],
"thresholds": "65, 90",
"timeFrom": "1m",
"timeShift": null,
"title": "Memory Working Set",
"transparent": false,
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [{
"op": "=",
"text": "N/A",
"value": "null"
}],
"valueName": "current"
}, {
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"decimals": 2,
"editable": true,
"error": false,
"format": "percent",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
},
"height": "180px",
"id": 6,
"interval": null,
"isNew": true,
"links": [],
"mappingType": 1,
"mappingTypes": [{
"name": "value to text",
"value": 1
}, {
"name": "range to text",
"value": 2
}],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [{
"from": "null",
"text": "N/A",
"to": "null"
}],
"span": 4,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [{
"expr": "sum(rate(container_cpu_usage_seconds_total{id=\"/\",instance=~\"^$instance$\"}[1m])) / sum (machine_cpu_cores{instance=~\"^$instance$\"}) * 100",
"interval": "10s",
"intervalFactor": 1,
"refId": "A",
"step": 10
}],
"thresholds": "65, 90",
"timeFrom": "1m",
"timeShift": null,
"title": "Cpu Usage",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [{
"op": "=",
"text": "N/A",
"value": "null"
}],
"valueName": "current"
}, {
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"decimals": 2,
"editable": true,
"error": false,
"format": "percent",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
},
"height": "180px",
"id": 7,
"interval": null,
"isNew": true,
"links": [],
"mappingType": 1,
"mappingTypes": [{
"name": "value to text",
"value": 1
}, {
"name": "range to text",
"value": 2
}],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [{
"from": "null",
"text": "N/A",
"to": "null"
}],
"span": 4,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [{
"expr": "sum(container_fs_usage_bytes{id=\"/\",instance=~\"^$instance$\"}) / sum(container_fs_limit_bytes{id=\"/\",instance=~\"^$instance$\"}) * 100",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "",
"metric": "",
"refId": "A",
"step": 10
}],
"thresholds": "65, 90",
"timeFrom": "1m",
"timeShift": null,
"title": "Filesystem Usage",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [{
"op": "=",
"text": "N/A",
"value": "null"
}],
"valueName": "current"
}, {
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"decimals": 2,
"editable": true,
"error": false,
"format": "bytes",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"height": "1px",
"hideTimeOverride": true,
"id": 9,
"interval": null,
"isNew": true,
"links": [],
"mappingType": 1,
"mappingTypes": [{
"name": "value to text",
"value": 1
}, {
"name": "range to text",
"value": 2
}],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "20%",
"prefix": "",
"prefixFontSize": "20%",
"rangeMaps": [{
"from": "null",
"text": "N/A",
"to": "null"
}],
"span": 2,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [{
"expr": "sum(container_memory_working_set_bytes{id=\"/\",instance=~\"^$instance$\"})",
"interval": "10s",
"intervalFactor": 1,
"refId": "A",
"step": 10
}],
"thresholds": "",
"timeFrom": "1m",
"title": "Used",
"type": "singlestat",
"valueFontSize": "50%",
"valueMaps": [{
"op": "=",
"text": "N/A",
"value": "null"
}],
"valueName": "current"
}, {
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"decimals": 2,
"editable": true,
"error": false,
"format": "bytes",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"height": "1px",
"hideTimeOverride": true,
"id": 10,
"interval": null,
"isNew": true,
"links": [],
"mappingType": 1,
"mappingTypes": [{
"name": "value to text",
"value": 1
}, {
"name": "range to text",
"value": 2
}],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [{
"from": "null",
"text": "N/A",
"to": "null"
}],
"span": 2,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [{
"expr": "sum (machine_memory_bytes{instance=~\"^$instance$\"})",
"interval": "10s",
"intervalFactor": 1,
"refId": "A",
"step": 10
}],
"thresholds": "",
"timeFrom": "1m",
"title": "Total",
"type": "singlestat",
"valueFontSize": "50%",
"valueMaps": [{
"op": "=",
"text": "N/A",
"value": "null"
}],
"valueName": "current"
}, {
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"decimals": 2,
"editable": true,
"error": false,
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"height": "1px",
"hideTimeOverride": true,
"id": 11,
"interval": null,
"isNew": true,
"links": [],
"mappingType": 1,
"mappingTypes": [{
"name": "value to text",
"value": 1
}, {
"name": "range to text",
"value": 2
}],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": " cores",
"postfixFontSize": "30%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [{
"from": "null",
"text": "N/A",
"to": "null"
}],
"span": 2,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [{
"expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",instance=~\"^$instance$\"}[1m]))",
"interval": "10s",
"intervalFactor": 1,
"refId": "A",
"step": 10
}],
"thresholds": "",
"timeFrom": "1m",
"timeShift": null,
"title": "Used",
"type": "singlestat",
"valueFontSize": "50%",
"valueMaps": [{
"op": "=",
"text": "N/A",
"value": "null"
}],
"valueName": "current"
}, {
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"decimals": 2,
"editable": true,
"error": false,
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"height": "1px",
"hideTimeOverride": true,
"id": 12,
"interval": null,
"isNew": true,
"links": [],
"mappingType": 1,
"mappingTypes": [{
"name": "value to text",
"value": 1
}, {
"name": "range to text",
"value": 2
}],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": " cores",
"postfixFontSize": "30%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [{
"from": "null",
"text": "N/A",
"to": "null"
}],
"span": 2,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [{
"expr": "sum (machine_cpu_cores{instance=~\"^$instance$\"})",
"interval": "10s",
"intervalFactor": 1,
"refId": "A",
"step": 10
}],
"thresholds": "",
"timeFrom": "1m",
"title": "Total",
"type": "singlestat",
"valueFontSize": "50%",
"valueMaps": [{
"op": "=",
"text": "N/A",
"value": "null"
}],
"valueName": "current"
}, {
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"decimals": 2,
"editable": true,
"error": false,
"format": "bytes",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"height": "1px",
"hideTimeOverride": true,
"id": 13,
"interval": null,
"isNew": true,
"links": [],
"mappingType": 1,
"mappingTypes": [{
"name": "value to text",
"value": 1
}, {
"name": "range to text",
"value": 2
}],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [{
"from": "null",
"text": "N/A",
"to": "null"
}],
"span": 2,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [{
"expr": "sum(container_fs_usage_bytes{id=\"/\",instance=~\"^$instance$\"})",
"interval": "10s",
"intervalFactor": 1,
"refId": "A",
"step": 10
}],
"thresholds": "",
"timeFrom": "1m",
"title": "Used",
"type": "singlestat",
"valueFontSize": "50%",
"valueMaps": [{
"op": "=",
"text": "N/A",
"value": "null"
}],
"valueName": "current"
}, {
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"decimals": 2,
"editable": true,
"error": false,
"format": "bytes",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"height": "1px",
"hideTimeOverride": true,
"id": 14,
"interval": null,
"isNew": true,
"links": [],
"mappingType": 1,
"mappingTypes": [{
"name": "value to text",
"value": 1
}, {
"name": "range to text",
"value": 2
}],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [{
"from": "null",
"text": "N/A",
"to": "null"
}],
"span": 2,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [{
"expr": "sum (container_fs_limit_bytes{id=\"/\",instance=~\"^$instance$\"})",
"interval": "10s",
"intervalFactor": 1,
"refId": "A",
"step": 10
}],
"thresholds": "",
"timeFrom": "1m",
"title": "Total",
"type": "singlestat",
"valueFontSize": "50%",
"valueMaps": [{
"op": "=",
"text": "N/A",
"value": "null"
}],
"valueName": "current"
}, {
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"decimals": 2,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)",
"thresholdLine": false
},
"height": "200px",
"id": 32,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"sideWidth": 200,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [{
"expr": "sum(rate(container_network_receive_bytes_total{instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m]))",
"interval": "",
"intervalFactor": 2,
"legendFormat": "receive",
"metric": "network",
"refId": "A",
"step": 240
}, {
"expr": "- sum(rate(container_network_transmit_bytes_total{instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m]))",
"interval": "",
"intervalFactor": 2,
"legendFormat": "transmit",
"metric": "network",
"refId": "B",
"step": 240
}],
"timeFrom": null,
"timeShift": null,
"title": "Network",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"transparent": false,
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [{
"format": "Bps",
"label": "transmit / receive",
"logBase": 1,
"max": null,
"min": null,
"show": true
}, {
"format": "Bps",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}]
}],
"showTitle": true,
"title": "all pods"
}, {
"collapse": false,
"editable": true,
"height": "250px",
"panels": [{
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"decimals": 3,
"editable": true,
"error": false,
"fill": 0,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"height": "",
"id": 17,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"hideEmpty": true,
"hideZero": true,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [{
"expr": "sum(rate(container_cpu_usage_seconds_total{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m])) by (pod_name)",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{ pod_name }}",
"metric": "container_cpu",
"refId": "A",
"step": 240
}],
"timeFrom": null,
"timeShift": null,
"title": "Cpu Usage",
"tooltip": {
"msResolution": true,
"shared": false,
"sort": 2,
"value_type": "cumulative"
},
"transparent": false,
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [{
"format": "none",
"label": "cores",
"logBase": 1,
"max": null,
"min": null,
"show": true
}, {
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}]
}, {
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"decimals": 2,
"editable": true,
"error": false,
"fill": 0,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 33,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"hideEmpty": true,
"hideZero": true,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [{
"expr": "sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}) by (pod_name)",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{ pod_name }}",
"metric": "",
"refId": "A",
"step": 240
}],
"timeFrom": null,
"timeShift": null,
"title": "Memory Working Set",
"tooltip": {
"msResolution": false,
"shared": false,
"sort": 2,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [{
"format": "bytes",
"label": "used",
"logBase": 1,
"max": null,
"min": null,
"show": true
}, {
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}]
}, {
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"decimals": 2,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 16,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"hideEmpty": true,
"hideZero": true,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"sideWidth": 200,
"sort": "avg",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [{
"expr": "sum (rate (container_network_receive_bytes_total{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m])) by (pod_name)",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{ pod_name }} < in",
"metric": "network",
"refId": "A",
"step": 240
}, {
"expr": "- sum (rate (container_network_transmit_bytes_total{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m])) by (pod_name)",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{ pod_name }} > out",
"metric": "network",
"refId": "B",
"step": 240
}],
"timeFrom": null,
"timeShift": null,
"title": "Network",
"tooltip": {
"msResolution": false,
"shared": false,
"sort": 2,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [{
"format": "Bps",
"label": "transmit / receive",
"logBase": 1,
"max": null,
"min": null,
"show": true
}, {
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}]
}, {
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"decimals": 2,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 34,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"hideEmpty": true,
"hideZero": true,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"sideWidth": 200,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [{
"expr": "sum(container_fs_usage_bytes{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}) by (pod_name)",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{ pod_name }}",
"metric": "network",
"refId": "A",
"step": 240
}],
"timeFrom": null,
"timeShift": null,
"title": "Filesystem",
"tooltip": {
"msResolution": false,
"shared": false,
"sort": 2,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [{
"format": "bytes",
"label": "used",
"logBase": 1,
"max": null,
"min": null,
"show": true
}, {
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}]
}],
"showTitle": true,
"title": "each pod"
}],
"time": {
"from": "now-3d",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"templating": {
"list": [{
"allValue": ".*",
"current": {},
"datasource": "${DS_PROMETHEUS}",
"hide": 0,
"includeAll": true,
"label": "Instance",
"multi": false,
"name": "instance",
"options": [],
"query": "label_values(instance)",
"refresh": 1,
"regex": "",
"type": "query"
}, {
"current": {},
"datasource": "${DS_PROMETHEUS}",
"hide": 0,
"includeAll": true,
"label": "Namespace",
"multi": true,
"name": "namespace",
"options": [],
"query": "label_values(namespace)",
"refresh": 1,
"regex": "",
"type": "query"
}]
},
"annotations": {
"list": []
},
"refresh": false,
"schemaVersion": 12,
"version": 8,
"links": [],
"gnetId": 737
}
prometheus-datasource.json: |
{
"name": "prometheus",
"type": "prometheus",
"url": "http://prometheus:9090",
"access": "proxy",
"basicAuth": false
}
kind: ConfigMap
metadata:
creationTimestamp: null
name: grafana-import-dashboards
namespace: monitoring
---
# One-shot Job that provisions Grafana over its HTTP API.
#
# 1. The init container polls http://grafana:3000 every 15 seconds until it
#    answers with HTTP 200.
# 2. The main container POSTs every `*-datasource.json` file to
#    /api/datasources, then wraps every `*-dashboard.json` file in an import
#    envelope (overwrite=true, DS_PROMETHEUS -> "prometheus") and POSTs it to
#    /api/dashboards/import.
#
# The JSON files come from the `grafana-import-dashboards` ConfigMap mounted at
# /opt/grafana-import-dashboards; admin credentials come from the `grafana`
# Secret.
apiVersion: batch/v1
kind: Job
metadata:
  name: grafana-import-dashboards
  namespace: monitoring
  labels:
    app: grafana
    component: import-dashboards
spec:
  template:
    metadata:
      name: grafana-import-dashboards
      labels:
        app: grafana
        component: import-dashboards
    spec:
      serviceAccountName: prometheus-k8s
      initContainers:
        - name: wait-for-grafana
          # NOTE(review): no image tag is given, so the default tag is pulled;
          # consider pinning a version.
          image: giantswarm/tiny-tools
          args:
            - /bin/sh
            - -c
            # Literal block scalar (`|`) keeps every newline, so the script
            # does not depend on YAML line-folding rules to stay valid shell.
            - |
              set -x
              while [ "$(curl -Lsw '%{http_code}' "http://grafana:3000" -o /dev/null)" -ne 200 ]; do
                echo '.'
                sleep 15
              done
      containers:
        - name: grafana-import-dashboards
          image: giantswarm/tiny-tools
          command: ["/bin/sh", "-c"]
          workingDir: /opt/grafana-import-dashboards
          args:
            # Credentials are passed with `--user` rather than embedded in the
            # URL, so passwords containing URL-reserved characters (`@`, `:`,
            # `/`, `#`, ...) still authenticate correctly.
            #
            # A failed request is reported by curl (`--fail --show-error`) but
            # does not abort the loop; the remaining files are still imported.
            - |
              for file in *-datasource.json; do
                if [ -e "$file" ]; then
                  echo "importing $file" &&
                  curl --silent --fail --show-error \
                    --user "${GF_ADMIN_USER}:${GF_ADMIN_PASSWORD}" \
                    --request POST http://grafana:3000/api/datasources \
                    --header "Content-Type: application/json" \
                    --data-binary "@$file"
                  echo ""
                fi
              done
              for file in *-dashboard.json; do
                if [ -e "$file" ]; then
                  echo "importing $file" &&
                  ( echo '{"dashboard":'
                    cat "$file"
                    echo ',"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]}' ) \
                  | jq -c '.' \
                  | curl --silent --fail --show-error \
                    --user "${GF_ADMIN_USER}:${GF_ADMIN_PASSWORD}" \
                    --request POST http://grafana:3000/api/dashboards/import \
                    --header "Content-Type: application/json" \
                    --data-binary "@-"
                  echo ""
                fi
              done
          env:
            - name: GF_ADMIN_USER
              valueFrom:
                secretKeyRef:
                  name: grafana
                  key: admin-username
            - name: GF_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: grafana
                  key: admin-password
          volumeMounts:
            - name: config-volume
              mountPath: /opt/grafana-import-dashboards
      restartPolicy: Never
      volumes:
        - name: config-volume
          configMap:
            name: grafana-import-dashboards
---
# apiVersion: extensions/v1beta1
# kind: Ingress
# metadata:
# name: grafana
# namespace: monitoring
# spec:
# rules:
# - host: <yourchoice>.<cluster-id>.k8s.gigantic.io
# http:
# paths:
# - path: /
# backend:
# serviceName: grafana
# servicePort: 3000
---
# Grafana admin credentials, read by the import Job through secretKeyRef
# (keys `admin-username` and `admin-password`).
#
# NOTE(review): both values are the base64 encoding of "admin". These are
# placeholder credentials committed in plain sight; replace them before
# deploying anywhere that matters.
apiVersion: v1
kind: Secret
metadata:
  namespace: monitoring
  name: grafana
type: Opaque
data:
  admin-username: YWRtaW4=
  admin-password: YWRtaW4=
---
# Exposes the Grafana pods (labels app=grafana, component=core) on port 3000.
# The Service is of type NodePort and does not pin a nodePort value.
apiVersion: v1
kind: Service
metadata:
  namespace: monitoring
  name: grafana
  labels:
    component: core
    app: grafana
spec:
  type: NodePort
  selector:
    component: core
    app: grafana
  ports:
    - port: 3000
---
# Prometheus server configuration. The `prometheus.yaml` key is mounted by the
# prometheus-core Deployment at /etc/prometheus and passed via `-config.file`.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-core
  namespace: monitoring
data:
  prometheus.yaml: |
    global:
      scrape_interval: 10s
      scrape_timeout: 10s
      evaluation_interval: 10s
    rule_files:
      - "/etc/prometheus-rules/*.rules"
    scrape_configs:
      # Kubelets, discovered per node. The discovered address on port 10250
      # is rewritten to port 10255 before scraping.
      # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L37
      - job_name: 'kubernetes-nodes'
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - source_labels: [__address__]
            regex: '(.*):10250'
            replacement: '${1}:10255'
            target_label: __address__
      # Service endpoints that opt in with the annotation
      # `prometheus.io/scrape: 'true'`. The optional annotations
      # `prometheus.io/scheme`, `prometheus.io/path` and `prometheus.io/port`
      # override the scrape scheme, metrics path and port.
      # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L79
      - job_name: 'kubernetes-endpoints'
        kubernetes_sd_configs:
          - role: endpoints
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
            action: replace
            target_label: __scheme__
            regex: (https?)
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
            action: replace
            target_label: __address__
            regex: (.+)(?::\d+);(\d+)
            replacement: $1:$2
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: kubernetes_name
      # Services that opt in with the annotation `prometheus.io/probe: 'true'`.
      # The service address becomes the `target` URL parameter and the request
      # itself is sent to the host `blackbox` on /probe (module http_2xx).
      # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L119
      - job_name: 'kubernetes-services'
        metrics_path: /probe
        params:
          module: [http_2xx]
        kubernetes_sd_configs:
          - role: service
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
            action: keep
            regex: true
          - source_labels: [__address__]
            target_label: __param_target
          - target_label: __address__
            replacement: blackbox
          - source_labels: [__param_target]
            target_label: instance
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            target_label: kubernetes_name
      # Pods that opt in with the annotation `prometheus.io/scrape: 'true'`.
      # The final rule keeps only targets whose container port number matches
      # 9\d{3}.
      # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L156
      - job_name: 'kubernetes-pods'
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: (.+):(?:\d+);(\d+)
            replacement: ${1}:${2}
            target_label: __address__
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name
          - source_labels: [__meta_kubernetes_pod_container_port_number]
            action: keep
            regex: 9\d{3}
      # cAdvisor metrics for every node, fetched through the API server proxy
      # at /api/v1/nodes/<node>/proxy/metrics/cadvisor.
      - job_name: 'kubernetes-cadvisor'
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          # A single labelmap rule copies node labels onto the target; a second
          # labelmap entry without a regex would only map labels onto
          # themselves.
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
---
# Single-replica Prometheus server.
#
# Configuration comes from the `prometheus-core` ConfigMap (mounted at
# /etc/prometheus) and rule files from the `prometheus-rules` ConfigMap
# (mounted at /etc/prometheus-rules). No other volume is declared in this
# manifest.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: monitoring
  name: prometheus-core
  labels:
    component: core
    app: prometheus
spec:
  replicas: 1
  # NOTE(review): the selector matches on `app` only, while the pod template
  # also carries `component: core`. Confirm no other workload in this
  # namespace uses `app: prometheus`.
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      name: prometheus-main
      labels:
        component: core
        app: prometheus
    spec:
      serviceAccountName: prometheus-k8s
      volumes:
        - name: config-volume
          configMap:
            name: prometheus-core
        - name: rules-volume
          configMap:
            name: prometheus-rules
      containers:
        - name: prometheus
          image: prom/prometheus:v1.7.0
          args:
            - "-storage.local.retention=12h"
            - "-storage.local.memory-chunks=500000"
            - "-config.file=/etc/prometheus/prometheus.yaml"
            - "-alertmanager.url=http://alertmanager:9093/"
          ports:
            - containerPort: 9090
              name: webui
          # Requests equal limits for both CPU and memory.
          # NOTE(review): verify that a 500M memory limit is enough for the
          # configured storage flags.
          resources:
            limits:
              cpu: 500m
              memory: 500M
            requests:
              cpu: 500m
              memory: 500M
          volumeMounts:
            - mountPath: /etc/prometheus
              name: config-volume
            - mountPath: /etc/prometheus-rules
              name: rules-volume
---
# kube-state-metrics: one replica, container port 8080.
# NOTE(review): the ClusterRole / ClusterRoleBinding for the
# kube-state-metrics service account are commented out in this file —
# confirm the required list/watch permissions are granted elsewhere.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: monitoring
  name: kube-state-metrics
  labels:
    app: kube-state-metrics
spec:
  selector:
    matchLabels:
      app: kube-state-metrics
  replicas: 1
  template:
    metadata:
      labels:
        app: kube-state-metrics
    spec:
      serviceAccountName: kube-state-metrics
      containers:
        - image: gcr.io/google_containers/kube-state-metrics:v0.5.0
          name: kube-state-metrics
          ports:
            - containerPort: 8080
---
# ---
# apiVersion: rbac.authorization.k8s.io/v1beta1
# kind: ClusterRoleBinding
# metadata:
# name: kube-state-metrics
# roleRef:
# apiGroup: rbac.authorization.k8s.io
# kind: ClusterRole
# name: kube-state-metrics
# subjects:
# - kind: ServiceAccount
# name: kube-state-metrics
# namespace: monitoring
# ---
# apiVersion: rbac.authorization.k8s.io/v1beta1
# kind: ClusterRole
# metadata:
# name: kube-state-metrics
# rules:
# - apiGroups: [""]
# resources:
# - nodes
# - pods
# - services
# - resourcequotas
# - replicationcontrollers
# - limitranges
# verbs: ["list", "watch"]
# - apiGroups: ["apps"]
# resources:
# - daemonsets
# - deployments
# - replicasets
# verbs: ["list", "watch"]
# ---
# Service account referenced by serviceAccountName on the
# kube-state-metrics Deployment.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: monitoring
  name: kube-state-metrics
---
# Service in front of the kube-state-metrics pods (TCP 8080).
apiVersion: v1
kind: Service
metadata:
  name: kube-state-metrics
  namespace: monitoring
  labels:
    app: kube-state-metrics
  annotations:
    # Quoted so the value stays a string; annotation values must be strings.
    # Presumably consumed by a Prometheus service-discovery relabel rule —
    # verify against the scrape config.
    prometheus.io/scrape: 'true'
spec:
  selector:
    app: kube-state-metrics
  ports:
    - name: kube-state-metrics
      protocol: TCP
      port: 8080
---
# Per-node directory size exporter: `read-du` writes du results as
# Prometheus text to a shared in-memory volume and `caddy` serves that
# volume over HTTP on port 9102.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-directory-size-metrics
  namespace: monitoring
  labels:
    app: node-directory-size-metrics
  annotations:
    description: |
      This `DaemonSet` provides metrics in Prometheus format about disk usage on the nodes.
      The container `read-du` reads in sizes of all directories below /mnt and writes that to `/tmp/metrics`. It only reports directories larger then `100M` for now.
      The other container `caddy` just hands out the contents of that file on request via `http` on `/metrics` at port `9102` which are the defaults for Prometheus.
      These are scheduled on every node in the Kubernetes cluster.
      To choose directories from the node to check, just mount them on the `read-du` container below `/mnt`.
spec:
  selector:
    matchLabels:
      app: node-directory-size-metrics
  template:
    metadata:
      labels:
        app: node-directory-size-metrics
      annotations:
        prometheus.io/scrape: 'true'
        prometheus.io/port: '9102'
        description: |
          This `Pod` provides metrics in Prometheus format about disk usage on the node.
          The container `read-du` reads in sizes of all directories below /mnt and writes that to `/tmp/metrics`. It only reports directories larger then `100M` for now.
          The other container `caddy` just hands out the contents of that file on request on `/metrics` at port `9102` which are the defaults for Prometheus.
          This `Pod` is scheduled on every node in the Kubernetes cluster.
          To choose directories from the node to check just mount them on `read-du` below `/mnt`.
    spec:
      containers:
        - name: read-du
          image: giantswarm/tiny-tools
          imagePullPolicy: Always
          # FIXME threshold via env var
          # Every 300 s: write one node_directory_size_bytes sample per
          # directory reported by du into a temp file, then move that file
          # over /tmp/metrics in a single step.
          # The temp file is (re)created at the top of every pass. mv removes
          # it, so creating it only once before the loop made mv fail — and
          # left stale samples in /tmp/metrics — whenever a later pass found
          # no directory above the threshold.
          command:
            - fish
            - --command
            - |
              while true
                touch /tmp/metrics-temp
                for directory in (du --bytes --separate-dirs --threshold=100M /mnt)
                  echo $directory | read size path
                  echo "node_directory_size_bytes{path=\"$path\"} $size" \
                    >> /tmp/metrics-temp
                end
                mv /tmp/metrics-temp /tmp/metrics
                sleep 300
              end
          volumeMounts:
            # Host /var, read-only, scanned by du as /mnt/var.
            - name: host-fs-var
              mountPath: /mnt/var
              readOnly: true
            # Shared with the caddy container, which serves it.
            - name: metrics
              mountPath: /tmp
        - name: caddy
          image: dockermuenster/caddy:0.9.3
          command:
            - "caddy"
            - "-port=9102"
            - "-root=/var/www"
          ports:
            - containerPort: 9102
          volumeMounts:
            - name: metrics
              mountPath: /var/www
      volumes:
        - name: host-fs-var
          hostPath:
            path: /var
        - name: metrics
          emptyDir:
            medium: Memory
---
# node-exporter on every node, using the host network and host PID
# namespace and listening on port 9100.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: prometheus-node-exporter
  namespace: monitoring
  labels:
    app: prometheus
    component: node-exporter
spec:
  selector:
    # Select on component as well as app: the prometheus-core pods in this
    # namespace are also labelled app=prometheus, so matching on app alone
    # makes this selector overlap with pods owned by another controller.
    # NOTE: spec.selector is immutable in apps/v1; an already-deployed
    # DaemonSet has to be deleted and re-created to pick this up.
    matchLabels:
      app: prometheus
      component: node-exporter
  template:
    metadata:
      name: prometheus-node-exporter
      labels:
        app: prometheus
        component: node-exporter
    spec:
      containers:
        - image: prom/node-exporter:v0.14.0
          name: prometheus-node-exporter
          ports:
            # Port name must be an IANA_SVC_NAME (at most 15 characters, ..)
            - name: prom-node-exp
              containerPort: 9100
              hostPort: 9100
      hostNetwork: true
      hostPID: true
---
# Headless Service (clusterIP: None) over the node-exporter pods, TCP 9100.
apiVersion: v1
kind: Service
metadata:
  name: prometheus-node-exporter
  namespace: monitoring
  labels:
    app: prometheus
    component: node-exporter
  annotations:
    # Quoted so the value stays a string.
    prometheus.io/scrape: 'true'
spec:
  type: ClusterIP
  clusterIP: None
  selector:
    app: prometheus
    component: node-exporter
  ports:
    - name: prometheus-node-exporter
      protocol: TCP
      port: 9100
---
# Alert rules mounted into the Prometheus container at /etc/prometheus-rules.
# Every alert below carries severity "page".
# NOTE(review): annotation keys are upper-case (SUMMARY/DESCRIPTION) in most
# rules but lower-case in instance-availability.rules — confirm which
# spelling the alert templates expect.
apiVersion: v1
kind: ConfigMap
metadata:
  creationTimestamp: null
  name: prometheus-rules
  namespace: monitoring
data:
  # CPU busy percentage per instance, derived from the idle-mode rate.
  # NOTE(review): the matcher name="node-exporter" only selects series that
  # carry a `name` label — verify the scrape relabeling actually sets one.
  cpu-usage.rules: |
    ALERT NodeCPUUsage
      IF (100 - (avg by (instance) (irate(node_cpu{name="node-exporter",mode="idle"}[5m])) * 100)) > 75
      FOR 2m
      LABELS {
        severity="page"
      }
      ANNOTATIONS {
        SUMMARY = "{{$labels.instance}}: High CPU usage detected",
        DESCRIPTION = "{{$labels.instance}}: CPU usage is above 75% (current value is: {{ $value }})"
      }
  # Fires for any target whose `up` series has been 0 for one minute.
  instance-availability.rules: |
    ALERT InstanceDown
      IF up == 0
      FOR 1m
      LABELS { severity = "page" }
      ANNOTATIONS {
        summary = "Instance {{ $labels.instance }} down",
        description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.",
      }
  # Used-space percentage for the /root-disk and /data-disk mountpoints.
  low-disk-space.rules: |
    ALERT NodeLowRootDisk
      IF ((node_filesystem_size{mountpoint="/root-disk"} - node_filesystem_free{mountpoint="/root-disk"} ) / node_filesystem_size{mountpoint="/root-disk"} * 100) > 75
      FOR 2m
      LABELS {
        severity="page"
      }
      ANNOTATIONS {
        SUMMARY = "{{$labels.instance}}: Low root disk space",
        DESCRIPTION = "{{$labels.instance}}: Root disk usage is above 75% (current value is: {{ $value }})"
      }
    ALERT NodeLowDataDisk
      IF ((node_filesystem_size{mountpoint="/data-disk"} - node_filesystem_free{mountpoint="/data-disk"} ) / node_filesystem_size{mountpoint="/data-disk"} * 100) > 75
      FOR 2m
      LABELS {
        severity="page"
      }
      ANNOTATIONS {
        SUMMARY = "{{$labels.instance}}: Low data disk space",
        DESCRIPTION = "{{$labels.instance}}: Data disk usage is above 75% (current value is: {{ $value }})"
      }
  # Swap and memory usage percentages.
  mem-usage.rules: |
    ALERT NodeSwapUsage
      IF (((node_memory_SwapTotal-node_memory_SwapFree)/node_memory_SwapTotal)*100) > 75
      FOR 2m
      LABELS {
        severity="page"
      }
      ANNOTATIONS {
        SUMMARY = "{{$labels.instance}}: Swap usage detected",
        DESCRIPTION = "{{$labels.instance}}: Swap usage usage is above 75% (current value is: {{ $value }})"
      }
    ALERT NodeMemoryUsage
      IF (((node_memory_MemTotal-node_memory_MemAvailable)/(node_memory_MemTotal)*100)) > 75
      FOR 2m
      LABELS {
        severity="page"
      }
      ANNOTATIONS {
        SUMMARY = "{{$labels.instance}}: High memory usage detected",
        DESCRIPTION = "{{$labels.instance}}: Memory usage is above 75% (current value is: {{ $value }})"
      }
---
# NodePort Service for the Prometheus web UI (TCP 9090); routes to pods
# labelled app=prometheus, component=core.
apiVersion: v1
kind: Service
metadata:
  namespace: monitoring
  name: prometheus
  annotations:
    # Quoted so the value stays a string.
    prometheus.io/scrape: 'true'
  labels:
    app: prometheus
    component: core
spec:
  type: NodePort
  selector:
    app: prometheus
    component: core
  ports:
    - name: webui
      protocol: TCP
      port: 9090
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment