Prometheus/Grafana setup using Joyride and Traefik
# docker-compose.yml
version: '2.4'
services:
  grafana:
    container_name: grafana
    image: grafana/grafana
    restart: unless-stopped
    networks:
      - traefik
    env_file:
      - .env
    volumes:
      - ./etc/grafana/provisioning:/etc/grafana/provisioning
      - ./etc/grafana/dashboards:/etc/grafana/dashboards
      - grafana:/var/lib/grafana
    labels:
      - autoheal=true
      - joyride.host.name=grafana.ilude.com
      - com.centurylinklabs.watchtower.enable=true
      - traefik.enable=true
      - traefik.http.routers.grafana.rule=Host(`grafana.ilude.com`)
      - traefik.http.routers.grafana.entrypoints=websecure
      - traefik.http.services.grafana.loadbalancer.server.scheme=http
      - traefik.http.services.grafana.loadbalancer.server.port=3000

  prometheus:
    container_name: prometheus
    image: prom/prometheus:latest
    restart: unless-stopped
    networks:
      - traefik
    # links is legacy compose syntax; services on the shared traefik network
    # already resolve each other by name, so these entries are redundant
    links:
      - prometheus-alertmanager
      - prometheus-blackbox
    command:
      - '--config.file=/etc/prometheus/conf/prometheus.yml'
      - '--storage.tsdb.path=/prometheus/data'
      - '--log.level=debug'
      - '--web.enable-admin-api'
      # - '--web.external-url=http://prometheus:9090/'
      # - '--web.route-prefix=/'
    # ports:
    #   - 9090:9090
    volumes:
      - ./etc/prometheus/conf:/etc/prometheus/conf:ro
      - prometheus:/prometheus
    labels:
      - autoheal=true
      - joyride.host.name=prometheus.ilude.com
      - com.centurylinklabs.watchtower.enable=true
      - traefik.enable=true
      - traefik.http.routers.prometheus.rule=Host(`prometheus.ilude.com`)
      - traefik.http.routers.prometheus.entrypoints=websecure
      - traefik.http.services.prometheus.loadbalancer.server.scheme=http
      - traefik.http.services.prometheus.loadbalancer.server.port=9090

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    container_name: cadvisor
    restart: unless-stopped
    networks:
      - traefik
    # ports:
    #   - 8080:8080
    privileged: true
    devices:
      - /dev/kmsg:/dev/kmsg
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /cgroup:/cgroup:ro
    labels:
      - autoheal=true
      - org.label-schema.group=monitoring
      - traefik.enable=false

  proxmox-exporter:
    container_name: proxmox-exporter
    image: ghcr.io/ilude/prometheus-pve-exporter
    restart: unless-stopped
    networks:
      - traefik
    env_file:
      - .env
    # ports:
    #   - 127.0.0.1:9221:9221
    labels:
      - autoheal=true
      - traefik.enable=false

  prometheus-blackbox:
    container_name: prometheus-blackbox
    image: prom/blackbox-exporter:latest
    restart: unless-stopped
    networks:
      - traefik
    command:
      - '--config.file=/etc/prometheus/conf/blackbox.yml'
    # privileged already grants every capability, so the cap_add below is
    # redundant; either one on its own is enough for ICMP probes
    privileged: true
    cap_add:
      - CAP_NET_RAW
    # ports:
    #   - 9115:9115
    volumes:
      - ./etc/prometheus/conf:/etc/prometheus/conf:ro
    labels:
      - autoheal=true
      - joyride.host.name=blackbox.ilude.com
      - com.centurylinklabs.watchtower.enable=true
      - traefik.enable=true
      - traefik.http.routers.blackbox.rule=Host(`blackbox.ilude.com`)
      - traefik.http.routers.blackbox.entrypoints=websecure
      - traefik.http.routers.blackbox.tls=true
      - traefik.http.routers.blackbox.tls.certresolver=letsencrypt
      - traefik.http.services.blackbox.loadbalancer.server.scheme=http
      - traefik.http.services.blackbox.loadbalancer.server.port=9115

  prometheus-alertmanager:
    container_name: prometheus-alertmanager
    image: prom/alertmanager:latest
    restart: unless-stopped
    networks:
      - traefik
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--log.level=debug'
    # ports:
    #   - 9093:9093
    volumes:
      - ./etc/prometheus/conf:/etc/alertmanager:rw
      - prometheus:/alertmanager:rw
    labels:
      - autoheal=true
      - joyride.host.name=alertmanager.ilude.com
      - com.centurylinklabs.watchtower.enable=true
      - traefik.enable=true
      - traefik.http.routers.alertmanager.rule=Host(`alertmanager.ilude.com`)
      - traefik.http.routers.alertmanager.entrypoints=websecure
      - traefik.http.routers.alertmanager.tls=true
      - traefik.http.routers.alertmanager.tls.certresolver=letsencrypt
      - traefik.http.services.alertmanager.loadbalancer.server.scheme=http
      - traefik.http.services.alertmanager.loadbalancer.server.port=9093

volumes:
  prometheus:
    external: false
  grafana:
    external: false

networks:
  traefik:
    external: true
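
To sanity-check the stack before bringing it up, something like the following should work from the directory holding this compose file (a sketch; Compose v2 also accepts this 2.4-format file):

# Validate the compose file, then start everything detached
docker compose config -q
docker compose up -d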
# .env (shared by the grafana and proxmox-exporter services)
CF_API_EMAIL=
CF_API_KEY=
GF_SECURITY_ADMIN_USER=
GF_SECURITY_ADMIN_PASSWORD=
GF_USERS_ALLOW_SIGN_UP=false
GF_SMTP_ENABLED=true
GF_SMTP_HOST=smtp.example.com:25
# Grafana maps env vars as GF_<SECTION>_<KEY>, so the from-address and EHLO
# identity need the GF_SMTP_ prefix to take effect
GF_SMTP_FROM_ADDRESS=grafana@example.com
GF_SMTP_EHLO_IDENTITY=grafana.example.com
#GF_PANELS_DISABLE_SANITIZE_HTML=true
PVE_USER=
PVE_TOKEN_NAME=
PVE_TOKEN_VALUE=
PVE_VERIFY_SSL=false
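
For reference, the GF_SMTP_* variables above correspond to this grafana.ini section (the values shown are the example placeholders from the .env file, not required settings):

[smtp]
enabled = true
host = smtp.example.com:25
from_address = grafana@example.com
ehlo_identity = grafana.example.com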
# alerts.yml
groups:
  - name: infrastructure
    rules:
      - alert: node-down
        expr: probe_success{job=~"blackbox"} == 0
        for: 30s
        annotations:
          identifier: "{{ $labels.instance }}.{{ $labels.group }}"
          description: "_{{ $labels.job }}_ is alerting on _{{ $labels.instance }}_"
          fail_msg: "is down. "
          restore_msg: "is back up. "

  - name: targets
    rules:
      - alert: monitor_service_down
        expr: up == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Monitor service non-operational"
          description: "Service {{ $labels.instance }} is down."

  - name: host
    rules:
      - alert: high_cpu_load
        expr: node_load1 > 1.5
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: "Server under high load"
          description: "Docker host is under high load; the 1m load average is {{ $value }}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
      - alert: high_memory_load
        expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / sum(node_memory_MemTotal_bytes) * 100 > 85
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: "Server memory is almost full"
          description: "Docker host memory usage is {{ humanize $value }}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
      - alert: high_storage_load
        expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: "Server storage is almost full"
          description: "Docker host storage usage is {{ humanize $value }}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
# alertmanager.yml

##################
# Notif to email #
##################
global:
  smtp_smarthost: 'smtp.example.com:25'
  smtp_from: 'alertmanager-<hostname>@example.com'
  smtp_hello: 'alertmanager.example.com'

route:
  group_by: ['service']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: devops

receivers:
  - name: 'devops'
    email_configs:
      - to: '<email>'
        send_resolved: true
  - name: pushover-receiver
    pushover_configs:
      - token: <token>
        user_key: <key>

##################
# Notif to slack #
##################
#receivers:
#  - name: 'devops'
#    slack_configs:
#      - channel: alerts
#        send_resolved: true
#        api_url: https://hooks.slack.com/services/<token-webhook-api-slack>
#        title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] Monitoring Notification'
#        text: >-
#          {{ range .Alerts }}
#          *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}`
#          *Description:* {{ .Annotations.description }}
#          *Graph:* <{{ .GeneratorURL }}|:chart_with_upwards_trend:> *Runbook:* <{{ .Annotations.runbook }}|:spiral_note_pad:>
#          *Details:*
#          {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
#          {{ end }}
#          {{ end }}
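
amtool, bundled in the prom/alertmanager image, can validate this file before a reload (a sketch; the container path matches the compose mount of ./etc/prometheus/conf onto /etc/alertmanager):

# Validate the Alertmanager configuration
docker exec prometheus-alertmanager amtool check-config /etc/alertmanager/alertmanager.yml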
# blackbox.yml
modules:
  icmp_ipv4:
    prober: icmp
    icmp:
      preferred_ip_protocol: ip4
  # icmp_example:
  #   prober: icmp
  #   timeout: 5s
  #   icmp:
  #     preferred_ip_protocol: 'ip4'
  #     source_ip_address: '192.168.16.67'
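
A module can be exercised by hand against the exporter's /probe endpoint; through the Traefik router defined above that would look roughly like this (1.1.1.1 is just a stand-in target, and the hostname assumes the joyride DNS entry resolves):

# Ask blackbox to ICMP-probe a target using the icmp_ipv4 module
curl 'https://blackbox.ilude.com/probe?module=icmp_ipv4&target=1.1.1.1'

The camera targets themselves live in cameras.json, the file_sd file referenced by prometheus.yml below: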
[
  {
    "Targets": [
      "<ip address>"
    ],
    "Labels": {
      "Name": "Camera 1",
      "Type": "amcrest",
      "Ignore": "false"
    }
  },
  {
    "Targets": [
      "<ip address>"
    ],
    "Labels": {
      "Name": "Camera 2",
      "Type": "reolink",
      "Ignore": "false"
    }
  },
  {
    "Targets": [
      "<ip address>"
    ],
    "Labels": {
      "Name": "<pingable computer name>",
      "Type": "computer",
      "Ignore": "true"
    }
  }
]
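
Prometheus re-reads file_sd files on change, so cameras can be added without a restart; a quick syntax check with jq catches malformed entries before Prometheus sees them (assuming jq is installed on the host and the file sits in ./etc/prometheus/conf alongside prometheus.yml):

# Fail with a parse error if cameras.json is not valid JSON
jq empty etc/prometheus/conf/cameras.json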
# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  scrape_timeout: 10s

# Alertmanager configuration
alerting:
  alertmanagers:
    - scheme: http
      static_configs:
        - targets:
            - prometheus-alertmanager:9093

rule_files:
  - 'alerts.yml'

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'alertmanager'
    static_configs:
      - targets: ['prometheus-alertmanager:9093']

  - job_name: 'traefik'
    static_configs:
      - targets: ['traefik:8082']

  - job_name: 'pve'
    static_configs:
      - targets:
          - pve-1.example.com  # Proxmox VE node.
          - pve-2.example.com
    metrics_path: /pve
    params:
      module: [default]
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: proxmox-exporter:9221  # PVE exporter.

  #################
  # Node Exporter #
  #################
  - job_name: 'node'
    static_configs:
      - targets: ['pve-1.example.com:9100', 'pve-2.example.com:9100']
        labels:
          group: 'node_exporter'

  - job_name: 'cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']

  - job_name: 'cameras'
    metrics_path: /probe
    params:
      module: [icmp_ipv4]
    file_sd_configs:
      - files:
          - cameras.json
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: prometheus-blackbox:9115  # This is your blackbox exporter.
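
The whole configuration, including the referenced rule file, can be checked and then reloaded without restarting the container, since Prometheus reloads its config on SIGHUP (a sketch using the container name from the compose file):

# Validate prometheus.yml plus the rule files it references, then reload
docker exec prometheus promtool check config /etc/prometheus/conf/prometheus.yml
docker kill --signal=HUP prometheus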