@Gurpartap
Last active March 24, 2022 03:07
Cadence job files for the HashiStack (consul and nomad)

use cadence-setup-schema.nomad for automatic schema setup and updates.

# submit the parameterized batch job to the cluster:
nomad job run ./cadence-setup-schema.nomad

# dispatch an instance of the job (it takes no payload or params):
nomad job dispatch cadence-setup-schema
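
to confirm the dispatched run finished cleanly, the usual nomad commands work (the alloc id below is a placeholder):

# list dispatched instances and their allocations
nomad job status cadence-setup-schema

# tail the setup script output (set -x logs to stderr)
nomad alloc logs -stderr <alloc-id>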

for a simple setup, cadence-server.nomad does it all: one job runs all four server services (frontend, history, matching, worker) in a single task.

nomad job run ./cadence-server.nomad
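
once the frontend is up, a quick sanity check is to register a domain with the cadence CLI against the consul DNS name of the frontend service. the domain name and retention below are just examples, and this assumes the cadence CLI is available somewhere .consul names resolve:

cadence --address cadence-frontend.service.consul:7933 --domain samples-domain domain register -rd 1
cadence --address cadence-frontend.service.consul:7933 --domain samples-domain domain describe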

use cadence-web.nomad for the web UI service.

nomad job run ./cadence-web.nomad
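
the web group binds port 8088 statically and registers a cadence-web service in consul, so the UI should be reachable at the allocated node. for example, if consul DNS is set up on your network:

curl -I http://cadence-web.service.consul:8088/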

run services individually

see files 4 to 7 to run each server component (frontend, history, matching, worker) individually. this comes in handy when you need to scale, allocate resources, or measure metrics for each service separately.

individual service jobs can be mixed with cadence-server.nomad on the same cluster without conflict.
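
for example (file names here are assumed to match the job names used below):

nomad job run ./cadence-frontend.nomad
nomad job run ./cadence-history.nomad
nomad job run ./cadence-matching.nomad
nomad job run ./cadence-worker.nomad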

job cadence-setup-schema {
  datacenters = ["dc1"]
  type = "batch"
  priority = 100

  # constraint {
  #   attribute = "${meta.tags}"
  #   operator = "set_contains"
  #   value = "cadence-services"
  # }

  parameterized {
    payload = "forbidden"
  }

  meta {
    keyspace = "cadence"
    visibility_keyspace = "cadence_visibility"
  }

  group cadence-setup-schema {
    task cadence-setup-schema {
      driver = "docker"
      kill_timeout = "45s"

      config {
        image = "ubercadence/server:0.11.0-auto-setup"
        command = "bash"
        args = ["/opt/cadence/bin/setup-schema.sh"]
        volumes = [
          "local/setup-schema.sh:/opt/cadence/bin/setup-schema.sh"
        ]
        network_mode = "host"
      }

      env {
        SKIP_SCHEMA_SETUP = false

        # change requires db reset
        NUM_HISTORY_SHARDS = 4

        LOG_LEVEL = "info"
        CASSANDRA_SEEDS = "cassandra-cluster1-node1.node.consul,cassandra-cluster1-node2.node.consul,cassandra-cluster1-node3.node.consul"
        DB = "cassandra"
        RF = 3
        KEYSPACE = "${NOMAD_META_keyspace}"
        VISIBILITY_KEYSPACE = "${NOMAD_META_visibility_keyspace}"
      }

      template {
        change_mode = "noop"
        destination = "local/setup-schema.sh"
        // language=sh
        data = <<EOH
#!/bin/bash

set -x

DB="${DB:-cassandra}"
RF=${RF:-3}

# cassandra env
export KEYSPACE="${KEYSPACE:-cadence}"
export VISIBILITY_KEYSPACE="${VISIBILITY_KEYSPACE:-cadence_visibility}"

setup_cassandra_schema() {
  SCHEMA_DIR=$CADENCE_HOME/schema/cassandra/cadence/versioned
  cadence-cassandra-tool --ep $CASSANDRA_SEEDS create -k $KEYSPACE --rf $RF
  cadence-cassandra-tool --ep $CASSANDRA_SEEDS -k $KEYSPACE setup-schema -v 0.0
  cadence-cassandra-tool --ep $CASSANDRA_SEEDS -k $KEYSPACE update-schema -d $SCHEMA_DIR

  VISIBILITY_SCHEMA_DIR=$CADENCE_HOME/schema/cassandra/visibility/versioned
  cadence-cassandra-tool --ep $CASSANDRA_SEEDS create -k $VISIBILITY_KEYSPACE --rf $RF
  cadence-cassandra-tool --ep $CASSANDRA_SEEDS -k $VISIBILITY_KEYSPACE setup-schema -v 0.0
  cadence-cassandra-tool --ep $CASSANDRA_SEEDS -k $VISIBILITY_KEYSPACE update-schema -d $VISIBILITY_SCHEMA_DIR
}

setup_schema() {
  if [ "$DB" == "cassandra" ]; then
    echo 'setup cassandra schema'
    setup_cassandra_schema
  fi
}

wait_for_cassandra() {
  server=`echo $CASSANDRA_SEEDS | awk -F ',' '{print $1}'`
  until cqlsh --cqlversion=3.4.4 $server < /dev/null; do
    echo 'waiting for cassandra to start up'
    sleep 1
  done
  echo 'cassandra started'
}

wait_for_db() {
  if [ "$DB" == "cassandra" ]; then
    wait_for_cassandra
  fi
}

wait_for_db
setup_schema
EOH
      }
    }
  }
}
job cadence-server {
  datacenters = ["dc1"]
  type = "service"
  priority = 75

  # constraint {
  #   attribute = "${meta.tags}"
  #   operator = "set_contains"
  #   value = "cadence-services"
  # }

  group cadence-server {
    count = 3

    constraint {
      distinct_hosts = true
    }

    task cadence-server {
      driver = "docker"
      kill_timeout = "45s"

      config {
        image = "ubercadence/server:0.11.0"
        network_mode = "host"
        volumes = [
          "local/dynamicconfig.yml:/etc/cadence/config/dynamicconfig/dynamicconfig.yml"
        ]
      }

      service {
        name = "cadence-frontend"
        port = "frontend"
        check {
          type = "tcp"
          interval = "5s"
          timeout = "15s"
          initial_status = "passing"
        }
      }

      service {
        name = "cadence-history"
        port = "history"
        check {
          type = "tcp"
          interval = "5s"
          timeout = "15s"
          initial_status = "passing"
        }
      }

      service {
        name = "cadence-matching"
        port = "matching"
        check {
          type = "tcp"
          interval = "5s"
          timeout = "15s"
          initial_status = "passing"
        }
      }

      service {
        name = "cadence-worker"
        port = "worker"
        check {
          type = "tcp"
          interval = "5s"
          timeout = "15s"
          initial_status = "passing"
        }
      }

      service {
        name = "cadence-server"
        tags = ["metrics", "metrics-port=${NOMAD_PORT_prometheus}"]
      }

      env {
        # change requires db reset
        NUM_HISTORY_SHARDS = 4

        LOG_LEVEL = "info"
        SERVICES = "frontend,history,matching,worker"
        BIND_ON_IP = "${NOMAD_IP_frontend}"
        CASSANDRA_SEEDS = "cassandra-cluster1-node1.node.consul,cassandra-cluster1-node2.node.consul,cassandra-cluster1-node3.node.consul"
        DB = "cassandra"
        RF = 3
        KEYSPACE = "cadence"
        VISIBILITY_KEYSPACE = "cadence_visibility"
        SKIP_SCHEMA_SETUP = true
        DYNAMIC_CONFIG_FILE_PATH = "/etc/cadence/config/dynamicconfig/dynamicconfig.yml"
        RINGPOP_BOOTSTRAP_MODE = "dns"
        RINGPOP_SEEDS = "cadence-frontend.service.consul:7933,cadence-history.service.consul:7934,cadence-matching.service.consul:7935,cadence-worker.service.consul:7939"
        PROMETHEUS_ENDPOINT = "${NOMAD_ADDR_prometheus}"
      }

      template {
        change_mode = "noop"
        destination = "local/dynamicconfig.yml"
        data = <<EOH
---
system.minRetentionDays:
  - value: 0
    constraints: {}
system.historyArchivalStatus:
  - value: "disabled"
    constraints: {}
system.visibilityArchivalStatus:
  - value: "disabled"
    constraints: {}
frontend.enableClientVersionCheck:
  - value: true
    constraints: {}
frontend.visibilityListMaxQPS:
  - value: 100
    constraints: {}
history.EnableConsistentQueryByDomain:
  - value: true
    constraints: {}
EOH
      }

      resources {
        cpu = 2999
        memory = 2048

        network {
          mbits = 100

          port frontend {
            static = 7933
          }
          port history {
            static = 7934
          }
          port matching {
            static = 7935
          }
          port worker {
            static = 7939
          }
          port prometheus {}
        }
      }

      meta {
        last_run_at = "Tue Apr 14 23:16:50 IST 2020"
      }
    }

    restart {
      attempts = 5
      delay = "5s"
      mode = "delay"
      interval = "1m"
    }
  }

  migrate {
    max_parallel = 1
    health_check = "checks"
    min_healthy_time = "15s"
    healthy_deadline = "60s"
  }

  update {
    max_parallel = 1
    min_healthy_time = "15s"
    healthy_deadline = "1m"
    progress_deadline = "2m"
    auto_revert = true
    auto_promote = true
    canary = 1
    stagger = "5s"
  }
}
job cadence-web {
  datacenters = ["dc1"]
  type = "service"
  priority = 25

  # constraint {
  #   attribute = "${meta.tags}"
  #   operator = "set_contains"
  #   value = "cadence-services"
  # }

  group cadence-web {
    count = 1

    task cadence-web {
      driver = "docker"
      kill_timeout = "45s"

      config {
        image = "ubercadence/web:latest"
        port_map = {
          http = 8088
        }
      }

      # restarts job when cadence-frontend service changes
      template {
        env = true
        destination = "${NOMAD_SECRETS_DIR}/env"
        data = <<EOF
CADENCE_TCHANNEL_PEERS={{range $index, $service := service "cadence-frontend" }}{{if ne $index 0}},{{end}}{{$service.Address}}:{{$service.Port}}{{end}}
EOF
      }

      service {
        name = "cadence-web"
        port = "http"
        check {
          type = "http"
          path = "/"
          interval = "5s"
          timeout = "3s"
        }
      }

      resources {
        cpu = 1000
        memory = 768

        network {
          mbits = 100

          port http {
            static = 8088
          }
        }
      }

      meta {
        last_run_at = "Tue Apr 14 23:16:50 IST 2020"
      }
    }

    restart {
      attempts = 3
      delay = "10s"
      interval = "1m"
      mode = "delay"
    }
  }

  migrate {
    max_parallel = 1
    health_check = "checks"
    min_healthy_time = "10s"
    healthy_deadline = "60s"
  }

  update {
    max_parallel = 1
    min_healthy_time = "15s"
    healthy_deadline = "2m"
    progress_deadline = "3m"
    auto_revert = true
    auto_promote = true
    canary = 1
    stagger = "5s"
  }
}
job cadence-frontend {
  datacenters = ["dc1"]
  type = "service"
  priority = 75

  # constraint {
  #   attribute = "${meta.tags}"
  #   operator = "set_contains"
  #   value = "cadence-services"
  # }

  group cadence-frontend {
    count = 3

    constraint {
      distinct_hosts = true
    }

    task cadence-frontend {
      driver = "docker"
      kill_timeout = "45s"

      config {
        image = "ubercadence/server:0.11.0"
        network_mode = "host"
        volumes = [
          "local/dynamicconfig.yml:/etc/cadence/config/dynamicconfig/dynamicconfig.yml"
        ]
      }

      service {
        name = "cadence-frontend"
        port = "frontend"
        tags = ["metrics", "metrics-port=${NOMAD_PORT_prometheus}"]
        check {
          type = "tcp"
          interval = "5s"
          timeout = "15s"
          initial_status = "passing"
        }
      }

      env {
        # change requires db reset
        NUM_HISTORY_SHARDS = 4

        LOG_LEVEL = "info"
        SERVICES = "frontend"
        BIND_ON_IP = "${NOMAD_IP_frontend}"
        CASSANDRA_SEEDS = "cassandra-cluster1-node1.node.consul,cassandra-cluster1-node2.node.consul,cassandra-cluster1-node3.node.consul"
        DB = "cassandra"
        RF = 3
        KEYSPACE = "cadence"
        VISIBILITY_KEYSPACE = "cadence_visibility"
        SKIP_SCHEMA_SETUP = true
        DYNAMIC_CONFIG_FILE_PATH = "/etc/cadence/config/dynamicconfig/dynamicconfig.yml"
        RINGPOP_BOOTSTRAP_MODE = "dns"
        RINGPOP_SEEDS = "cadence-frontend.service.consul:7933,cadence-history.service.consul:7934,cadence-matching.service.consul:7935,cadence-worker.service.consul:7939"
        PROMETHEUS_ENDPOINT = "${NOMAD_ADDR_prometheus}"
      }

      template {
        change_mode = "noop"
        destination = "local/dynamicconfig.yml"
        data = <<EOH
---
system.minRetentionDays:
  - value: 0
    constraints: {}
system.historyArchivalStatus:
  - value: "disabled"
    constraints: {}
system.visibilityArchivalStatus:
  - value: "disabled"
    constraints: {}
frontend.enableClientVersionCheck:
  - value: true
    constraints: {}
frontend.visibilityListMaxQPS:
  - value: 100
    constraints: {}
EOH
      }

      resources {
        cpu = 1500
        memory = 256

        network {
          mbits = 100

          port frontend {
            static = 7933
          }
          port prometheus {}
        }
      }

      meta {
        last_run_at = "Tue Apr 14 23:16:50 IST 2020"
      }
    }

    restart {
      attempts = 5
      delay = "5s"
      mode = "delay"
      interval = "1m"
    }
  }

  migrate {
    max_parallel = 1
    health_check = "checks"
    min_healthy_time = "15s"
    healthy_deadline = "60s"
  }

  update {
    max_parallel = 1
    min_healthy_time = "15s"
    healthy_deadline = "1m"
    progress_deadline = "2m"
    auto_revert = true
    auto_promote = true
    canary = 1
    stagger = "5s"
  }
}
job cadence-history {
  datacenters = ["dc1"]
  type = "service"
  priority = 75

  # constraint {
  #   attribute = "${meta.tags}"
  #   operator = "set_contains"
  #   value = "cadence-services"
  # }

  group cadence-history {
    count = 3

    constraint {
      distinct_hosts = true
    }

    task cadence-history {
      driver = "docker"
      kill_timeout = "45s"

      config {
        image = "ubercadence/server:0.11.0"
        network_mode = "host"
        volumes = [
          "local/dynamicconfig.yml:/etc/cadence/config/dynamicconfig/dynamicconfig.yml"
        ]
      }

      service {
        name = "cadence-history"
        port = "history"
        tags = ["metrics", "metrics-port=${NOMAD_PORT_prometheus}"]
        check {
          type = "tcp"
          interval = "5s"
          timeout = "15s"
          initial_status = "passing"
        }
      }

      env {
        # change requires db reset
        NUM_HISTORY_SHARDS = 4

        LOG_LEVEL = "info"
        SERVICES = "history"
        BIND_ON_IP = "${NOMAD_IP_history}"
        CASSANDRA_SEEDS = "cassandra-cluster1-node3.node.consul,cassandra-cluster1-node2.node.consul,cassandra-cluster1-node1.node.consul"
        DB = "cassandra"
        RF = 3
        KEYSPACE = "cadence"
        VISIBILITY_KEYSPACE = "cadence_visibility"
        SKIP_SCHEMA_SETUP = true
        DYNAMIC_CONFIG_FILE_PATH = "/etc/cadence/config/dynamicconfig/dynamicconfig.yml"
        RINGPOP_BOOTSTRAP_MODE = "dns"
        RINGPOP_SEEDS = "cadence-frontend.service.consul:7933,cadence-history.service.consul:7934,cadence-matching.service.consul:7935,cadence-worker.service.consul:7939"
        PROMETHEUS_ENDPOINT = "${NOMAD_ADDR_prometheus}"
      }

      template {
        change_mode = "noop"
        destination = "local/dynamicconfig.yml"
        data = <<EOH
---
system.minRetentionDays:
  - value: 0
    constraints: {}
system.historyArchivalStatus:
  - value: "disabled"
    constraints: {}
system.visibilityArchivalStatus:
  - value: "disabled"
    constraints: {}
history.EnableConsistentQueryByDomain:
  - value: true
    constraints: {}
EOH
      }

      resources {
        cpu = 1999
        memory = 1536

        network {
          mbits = 100

          port history {
            static = 7934
          }
          port prometheus {}
        }
      }

      meta {
        last_run_at = "Tue Apr 14 23:16:50 IST 2020"
      }
    }

    restart {
      attempts = 5
      delay = "5s"
      mode = "delay"
      interval = "1m"
    }
  }

  migrate {
    max_parallel = 1
    health_check = "checks"
    min_healthy_time = "15s"
    healthy_deadline = "60s"
  }

  update {
    max_parallel = 1
    min_healthy_time = "15s"
    healthy_deadline = "1m"
    progress_deadline = "2m"
    auto_revert = true
    auto_promote = true
    canary = 1
    stagger = "5s"
  }
}
job cadence-matching {
  datacenters = ["dc1"]
  type = "service"
  priority = 75

  # constraint {
  #   attribute = "${meta.tags}"
  #   operator = "set_contains"
  #   value = "cadence-services"
  # }

  group cadence-matching {
    count = 3

    constraint {
      distinct_hosts = true
    }

    task cadence-matching {
      driver = "docker"
      kill_timeout = "45s"

      config {
        image = "ubercadence/server:0.11.0"
        network_mode = "host"
        volumes = [
          "local/dynamicconfig.yml:/etc/cadence/config/dynamicconfig/dynamicconfig.yml"
        ]
      }

      service {
        name = "cadence-matching"
        port = "matching"
        tags = ["metrics", "metrics-port=${NOMAD_PORT_prometheus}"]
        check {
          type = "tcp"
          interval = "5s"
          timeout = "15s"
          initial_status = "passing"
        }
      }

      env {
        # change requires db reset
        NUM_HISTORY_SHARDS = 4

        LOG_LEVEL = "info"
        SERVICES = "matching"
        BIND_ON_IP = "${NOMAD_IP_matching}"
        CASSANDRA_SEEDS = "cassandra-cluster1-node1.node.consul,cassandra-cluster1-node2.node.consul,cassandra-cluster1-node3.node.consul"
        DB = "cassandra"
        RF = 3
        KEYSPACE = "cadence"
        VISIBILITY_KEYSPACE = "cadence_visibility"
        SKIP_SCHEMA_SETUP = true
        DYNAMIC_CONFIG_FILE_PATH = "/etc/cadence/config/dynamicconfig/dynamicconfig.yml"
        RINGPOP_BOOTSTRAP_MODE = "dns"
        RINGPOP_SEEDS = "cadence-frontend.service.consul:7933,cadence-history.service.consul:7934,cadence-matching.service.consul:7935,cadence-worker.service.consul:7939"
        PROMETHEUS_ENDPOINT = "${NOMAD_ADDR_prometheus}"
      }

      template {
        change_mode = "noop"
        destination = "local/dynamicconfig.yml"
        data = <<EOH
---
system.minRetentionDays:
  - value: 0
    constraints: {}
system.historyArchivalStatus:
  - value: "disabled"
    constraints: {}
system.visibilityArchivalStatus:
  - value: "disabled"
    constraints: {}
EOH
      }

      resources {
        cpu = 500
        memory = 256

        network {
          mbits = 100

          port matching {
            static = 7935
          }
          port prometheus {}
        }
      }

      meta {
        last_run_at = "Tue Apr 14 23:16:50 IST 2020"
      }
    }

    restart {
      attempts = 5
      delay = "5s"
      mode = "delay"
      interval = "1m"
    }
  }

  migrate {
    max_parallel = 1
    health_check = "checks"
    min_healthy_time = "15s"
    healthy_deadline = "60s"
  }

  update {
    max_parallel = 1
    min_healthy_time = "15s"
    healthy_deadline = "1m"
    progress_deadline = "2m"
    auto_revert = true
    auto_promote = true
    canary = 1
    stagger = "5s"
  }
}
job cadence-worker {
  datacenters = ["dc1"]
  type = "service"
  priority = 75

  # constraint {
  #   attribute = "${meta.tags}"
  #   operator = "set_contains"
  #   value = "cadence-services"
  # }

  group cadence-worker {
    count = 3

    constraint {
      distinct_hosts = true
    }

    task cadence-worker {
      driver = "docker"
      kill_timeout = "45s"

      config {
        image = "ubercadence/server:0.11.0"
        network_mode = "host"
        volumes = [
          "local/dynamicconfig.yml:/etc/cadence/config/dynamicconfig/dynamicconfig.yml"
        ]
      }

      service {
        name = "cadence-worker"
        port = "worker"
        tags = ["metrics", "metrics-port=${NOMAD_PORT_prometheus}"]
        check {
          type = "tcp"
          interval = "5s"
          timeout = "15s"
          initial_status = "passing"
        }
      }

      env {
        # change requires db reset
        NUM_HISTORY_SHARDS = 4

        LOG_LEVEL = "info"
        SERVICES = "worker"
        BIND_ON_IP = "${NOMAD_IP_worker}"
        CASSANDRA_SEEDS = "cassandra-cluster1-node1.node.consul,cassandra-cluster1-node2.node.consul,cassandra-cluster1-node3.node.consul"
        DB = "cassandra"
        RF = 3
        KEYSPACE = "cadence"
        VISIBILITY_KEYSPACE = "cadence_visibility"
        SKIP_SCHEMA_SETUP = true
        DYNAMIC_CONFIG_FILE_PATH = "/etc/cadence/config/dynamicconfig/dynamicconfig.yml"
        RINGPOP_BOOTSTRAP_MODE = "dns"
        RINGPOP_SEEDS = "cadence-frontend.service.consul:7933,cadence-history.service.consul:7934,cadence-matching.service.consul:7935,cadence-worker.service.consul:7939"
        PROMETHEUS_ENDPOINT = "${NOMAD_ADDR_prometheus}"
      }

      template {
        change_mode = "noop"
        destination = "local/dynamicconfig.yml"
        data = <<EOH
---
system.minRetentionDays:
  - value: 0
    constraints: {}
system.historyArchivalStatus:
  - value: "disabled"
    constraints: {}
system.visibilityArchivalStatus:
  - value: "disabled"
    constraints: {}
EOH
      }

      resources {
        cpu = 500
        memory = 256

        network {
          mbits = 100

          port worker {
            static = 7939
          }
          port prometheus {}
        }
      }

      meta {
        last_run_at = "Tue Apr 14 23:16:50 IST 2020"
      }
    }

    restart {
      attempts = 5
      delay = "5s"
      mode = "delay"
      interval = "1m"
    }
  }

  migrate {
    max_parallel = 1
    health_check = "checks"
    min_healthy_time = "15s"
    healthy_deadline = "60s"
  }

  update {
    max_parallel = 1
    min_healthy_time = "15s"
    healthy_deadline = "1m"
    progress_deadline = "2m"
    auto_revert = true
    auto_promote = true
    canary = 1
    stagger = "5s"
  }
}
Gurpartap commented Apr 14, 2020

To have prometheus collect metrics automatically for the above services, set this in your prometheus.yml:

scrape_configs:
  - job_name: 'consul'
    consul_sd_configs:
    - services: []
    relabel_configs:
      - source_labels: [__meta_consul_tags]
        regex: .*,metrics,.*
        action: keep
      - source_labels: [__meta_consul_service]
        target_label: job
      - source_labels: [__meta_consul_node]
        target_label: hostname
      - source_labels: ['__address__', '__meta_consul_tags']
        regex:         '(.*):.*;.*,metrics-port=(\d+),.*'
        target_label:  '__address__'
        replacement:   '$1:$2'
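
the relabel rules keep only consul services tagged with "metrics" and rewrite each target's port from its "metrics-port=<port>" tag. to double-check what prometheus will discover, you can inspect the tags registered in consul (addresses assume the default local agent; jq is optional, used only for readability):

# list consul services along with their tags
consul catalog services -tags

# show the address and tags registered for one of the cadence services
curl -s http://127.0.0.1:8500/v1/catalog/service/cadence-frontend | jq '.[0] | {ServiceAddress, ServicePort, ServiceTags}'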

If you also use nomad to run your prometheus instance, here's prometheus.nomad:

# not configured to persist metrics data

job prometheus {
  datacenters = ["dc1"]
  type        = "service"

  # constraint {
  #   attribute = "${meta.tags}"
  #   operator  = "set_contains"
  #   value     = "workerpool1"
  # }

  group prometheus {
    count = 1

    task prometheus {
      driver       = "docker"
      kill_timeout = "60s"

      config {
        image        = "prom/prometheus:latest"
        args         = [
          # defaults from https://github.com/prometheus/prometheus/blob/master/Dockerfile
          "--config.file=/etc/prometheus/prometheus.yml",
          "--storage.tsdb.path=/prometheus",
          "--web.console.libraries=/usr/share/prometheus/console_libraries",
          "--web.console.templates=/usr/share/prometheus/consoles",
          # custom overrides
          "--storage.tsdb.retention.size=1GB",
          "--storage.tsdb.wal-compression",
          "--web.enable-admin-api"
        ]
        volumes      = [
          "local/prometheus.yml:/etc/prometheus/prometheus.yml"
        ]
        network_mode = "host"
      }

      resources {
        cpu    = 350
        memory = 1536

        network {
          mbits = 10

          port prometheus_ui {
            static = 9090
          }
        }
      }

      service {
        name = "prometheus"
        port = "prometheus_ui"
        check {
          name     = "prometheus_ui port alive"
          type     = "http"
          path     = "/-/healthy"
          interval = "10s"
          timeout  = "2s"
        }
      }

      template {
        change_mode = "noop"
        destination = "local/prometheus.yml"
        # language=yml
        data        = <<EOH
---
global:
  scrape_interval:     15s
  evaluation_interval: 5s

scrape_configs:
  # …
  # other configs
  # …

  - job_name: 'consul'
    consul_sd_configs:
    - services: []
    relabel_configs:
      - source_labels: [__meta_consul_tags]
        regex: .*,metrics,.*
        action: keep
      - source_labels: [__meta_consul_service]
        target_label: job
      - source_labels: [__meta_consul_node]
        target_label: node
      - source_labels: ['__address__', '__meta_consul_tags']
        regex:         '(.*):.*;.*,metrics-port=(\d+),.*'
        target_label:  '__address__'
        replacement:   '$1:$2'

EOH
      }
    }

    restart {
      attempts = 3
      delay    = "10s"
      interval = "3m"
      mode     = "delay"
    }
  }
}
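
run it the same way as the other jobs:

nomad job run ./prometheus.nomad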
