Skip to content

Instantly share code, notes, and snippets.

@proffalken
Created January 18, 2022 12:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save proffalken/4b3500151c71d4a9e67530f97e316bc4 to your computer and use it in GitHub Desktop.
# Nomad system job running the per-node platform stack:
#   - group "monitoring": cAdvisor (container metrics) + Vector (log/metric shipping)
#   - group "proxies":    Traefik edge proxy
job "platform-jobs" {
datacenters = ["dc1"]
# system job, runs on all nodes
type = "system"
# Rolling-update policy: automatically revert to the last good version
# when an allocation fails to become healthy within the deadlines below.
update {
min_healthy_time = "10s"
healthy_deadline = "5m"
progress_deadline = "10m"
auto_revert = true
}
group "monitoring" {
count = 1
# Give up (mode = "fail") after 3 restart attempts within a 10m window.
restart {
attempts = 3
interval = "10m"
delay = "30s"
mode = "fail"
}
network {
# Vector API (container port 8686; see [api] in the rendered config below).
port "api" {
to = 8686
}
# Vector's Prometheus exporter; static host port so scrapers can find it.
port "vstats" {
to = 9598
static = 9598
}
# cAdvisor web UI / metrics endpoint.
port "cadvisor" {
to = 8080
}
}
# sticky: keep the ephemeral disk on the same node across updates so
# Vector's data_dir under alloc/ survives a redeploy.
ephemeral_disk {
size = 256
sticky = true
}
# cAdvisor: per-container resource metrics. Requires read-only mounts of
# the host root, /sys and the Docker state directory.
task "cadvisor" {
driver = "docker"
config {
dns_servers = ["192.168.20.7", "192.168.1.21"]
# NOTE(review): "latest" is a mutable tag — consider pinning a cadvisor
# release for reproducible deployments.
image = "gcr.io/cadvisor/cadvisor:latest"
network_mode = "bridge"
volumes = [
"/:/rootfs:ro",
"/var/run:/var/run:rw",
"/sys:/sys:ro",
"/var/lib/docker/:/var/lib/docker:ro",
"/etc/localtime:/etc/localtime:ro"
]
ports = ["cadvisor"]
}
resources {
cpu = 500 # 500 MHz
memory = 256 # 256 MB
}
service {
name = "cadvisor"
tags = [
"platform",
"cadvisor",
"_app=cadvisor",
"_env=prod",
"traefik.http.routers.cadvisor.tls=true",
"traefik.http.routers.cadvisor.tls.domains[0].main=cadvisor.service.consul",
"traefik.http.routers.cadvisor.tls.certresolver=le"
]
port = "cadvisor"
check {
name = "Container Advisor"
type = "tcp"
interval = "10s"
timeout = "2s"
}
}
}
# Vector: tails Docker container logs, ships them to Loki, and exposes
# its own internal metrics on the "vstats" port.
task "vector" {
driver = "docker"
config {
dns_servers = ["192.168.20.7", "192.168.1.21"]
# NOTE(review): "0.14.X" is presumably the literal upstream tag tracking
# the latest 0.14 patch release — confirm against the registry.
image = "timberio/vector:0.14.X-alpine"
ports = ["api", "vstats"]
# Same host mounts as cadvisor so the docker_logs source can reach the
# Docker daemon socket/state.
volumes = [
"/:/rootfs:ro",
"/var/run:/var/run:rw",
"/sys:/sys:ro",
"/var/lib/docker/:/var/lib/docker:ro",
"/etc/localtime:/etc/localtime:ro"
]
}
# Vector won't start unless the sinks(backends) configured are healthy
env {
VECTOR_CONFIG = "local/vector.toml"
VECTOR_REQUIRE_HEALTHY = "true"
}
# resource limits are a good idea because you don't want your log collection to consume all resources available
resources {
cpu = 256 # 256 MHz
memory = 256 # 256MB
}
# template with Vector's configuration; change_mode "signal" + SIGHUP makes
# Vector reload in place instead of the task being restarted on re-render.
# NOTE(review): the [transforms.modify] output below is not referenced by
# any sink's inputs (they read "logs"/"int_logs"/"int_metrics" directly) —
# confirm whether the transform is meant to be wired in.
template {
destination = "local/vector.toml"
change_mode = "signal"
change_signal = "SIGHUP"
# overriding the delimiters to [[ ]] to avoid conflicts with Vector's native templating, which also uses {{ }}
left_delimiter = "[["
right_delimiter = "]]"
data=<<EOH
data_dir = "alloc/data/vector/"
[api]
enabled = true
address = "0.0.0.0:8686"
playground = true
[sources.logs]
type = "docker_logs"
[sources.int_logs]
type = "internal_logs"
[sources.int_metrics]
type = "internal_metrics"
scrape_interval_secs = 2
[transforms.modify]
type = "remap"
inputs = ["logs"]
source = '''
# Parse Syslog input. The "!" means that the script should abort on error.
. = parse_syslog!(.message)
'''
[sinks.out]
type = "console"
inputs = [ "logs" ]
encoding.codec = "json"
[sinks.prom]
type = "prometheus"
inputs = ["int_metrics"]
address = "0.0.0.0:9598"
default_namespace = "vector"
[sinks.loki]
type = "loki"
inputs = ["logs", "int_logs"]
endpoint = "https://loki.service.consul/"
encoding.codec = "json"
healthcheck.enabled = true
# since . is used by Vector to denote a parent-child relationship, and Nomad's Docker labels contain ".",
# we need to escape them twice, once for TOML, once for Vector
labels.job = "{{ label.com\\.hashicorp\\.nomad\\.job_name }}"
labels.task = "{{ label.com\\.hashicorp\\.nomad\\.task_name }}"
labels.group = "{{ label.com\\.hashicorp\\.nomad\\.task_group_name }}"
labels.namespace = "{{ label.com\\.hashicorp\\.nomad\\.namespace }}"
labels.node = "{{ label.com\\.hashicorp\\.nomad\\.node_name }}"
# remove fields that have been converted to labels to avoid having the field twice
remove_label_fields = true
EOH
}
service {
name = "vector"
tags = [
"platform",
"vector",
"_app=vector",
"_env=prod"
]
check {
port = "api"
type = "http"
path = "/health"
interval = "30s"
timeout = "5s"
}
}
# Separate service for the Prometheus exporter port so it can be
# discovered and scraped independently of the API.
service {
name = "vstats"
tags = [
"platform",
"vector",
"_app=vstats",
"_env=prod",
"_nomad_host=${node.unique.name}"
]
check {
port = "vstats"
type = "http"
path = "/metrics"
interval = "30s"
timeout = "5s"
}
}
# Allow extra time on shutdown so Vector can flush buffered events.
kill_timeout = "30s"
}
}
group "proxies" {
count = 1
restart {
attempts = 3
interval = "10m"
delay = "30s"
mode = "fail"
}
# Fixed host ports: this is the edge proxy, so 80/443 (and the dashboard
# on 8080) must be reachable at well-known addresses.
network {
port "ui" {
to = 8080
static = 8080
}
port "http" {
to = 80
static = 80
}
port "https" {
to = 443
static = 443
}
}
ephemeral_disk {
size = 256
sticky = true
}
# Traefik edge proxy. The "le" certresolver tags elsewhere in this job
# suggest Let's Encrypt issuance; DO_AUTH_TOKEN is presumably for a
# DigitalOcean DNS challenge — confirm against the Traefik config in
# /media/traefik/config.
task "traefik" {
# Render the token from Vault into the task environment
# (env = true loads the rendered file's KEY=value pairs as env vars).
template {
data = <<EOF
{{ with secret "secret/traefik" }}
DO_AUTH_TOKEN="{{ .Data.DO_AUTH_TOKEN }}"
{{ end }}
EOF
destination = "secrets/.env"
env = true
}
driver = "docker"
config {
dns_servers = ["192.168.20.7", "192.168.1.21"]
image = "traefik:v2.5"
network_mode = "bridge"
volumes = [
"/media/traefik/config:/etc/traefik:rw",
"/etc/localtime:/etc/localtime:ro"
]
ports = ["ui", "http", "https"]
}
resources {
cpu = 1024 # 1024 MHz
memory = 1024 # 1024 MB
}
service {
name = "traefik"
tags = [
"platform",
# NOTE(review): "cadvisor" looks like a copy-paste leftover from the
# cadvisor service's tags — confirm it is intentional.
"cadvisor",
"_app=traefik",
"_env=prod",
"_nomad_host=${node.unique.name}",
"traefik.http.routers.traefik.tls=true",
"traefik.http.routers.traefik.tls.domains[0].main=traefik.service.consul",
"traefik.http.routers.traefik.tls.certresolver=le"
]
port = "ui"
check {
name = "Traefik"
type = "tcp"
interval = "10s"
timeout = "2s"
}
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment