Skip to content

Instantly share code, notes, and snippets.

@optiz0r
Created December 14, 2020 08:35
Show Gist options
  • Save optiz0r/38699ed3a8ec09aa945a3efa902efd5b to your computer and use it in GitHub Desktop.
Save optiz0r/38699ed3a8ec09aa945a3efa902efd5b to your computer and use it in GitHub Desktop.
Choria Playbook to upgrade a nomad cluster

Upgrade Nomad cluster

This Choria Playbook will automate the steps to do a simple version upgrade on a nomad cluster.

  • Upgrades servers first
  • Then upgrades clients
  • Sleeps in between each upgrade to allow things to settle
  • Aborts on any error

Dependencies

  • yum/dnf based distro (because the playbook issues a yum clean)
  • Custom facts (site specific):
    • `group`: the identifier of the cluster to upgrade
    • `hostname_environment`: a site-specific identifier for the environment, obtained by parsing the hostname

Usage

mco playbook run site_nomad::upgrade_cluster --modulepath site:modules --cluster foo --environment d --new_version 1.0.0

# modules/site_nomad/plans/upgrade_agent.pp
# @summary Upgrades the nomad package on a set of cluster agents
#
# Issues four mcollective tasks against `$nodes`, in order: expire the yum
# cache, install the requested nomad version, force a Puppet run, and finally
# poll the Puppet status until it reports `applying=false`.
#
# @param nodes
#   The nodes to upgrade
#
# @param new_version
#   The new version to use (must be an upgrade)
#
# @param sleep
#   Settle time in seconds. It is passed as `pre_sleep` to the first task
#   (the yum cache expiry), which is what spaces out consecutive node upgrades
#   when this plan is invoked once per node.
#
plan site_nomad::upgrade_agent (
  Choria::Nodes $nodes = [],
  String $new_version,
  Integer $sleep = 60,
) {
  # Inputs handed to the individual agent actions.
  $cache_expiry = { 'mode' => 'expire-cache' }
  $package_spec = {
    'package' => 'nomad',
    'version' => $new_version,
  }
  $forced_run = { 'force' => true }

  # Expire the yum caches so the new package version is visible. The settle
  # delay is attached here, i.e. it elapses before anything touches the node.
  choria::task('mcollective', {
    'action'     => 'package.yum_clean',
    'nodes'      => $nodes,
    'pre_sleep'  => $sleep,
    'properties' => $cache_expiry,
  })

  # Install the requested nomad version.
  choria::task('mcollective', {
    'action'     => 'package.install',
    'nodes'      => $nodes,
    'properties' => $package_spec,
  })

  # Trigger a forced Puppet run on the upgraded nodes.
  choria::task('mcollective', {
    'action'     => 'puppet.runonce',
    'nodes'      => $nodes,
    'properties' => $forced_run,
  })

  # Check that Puppet is no longer applying a catalog: 10s initial delay,
  # then up to 10 tries with a 30s try_sleep between them.
  choria::task('mcollective', {
    'action'    => 'puppet.status',
    'nodes'     => $nodes,
    'assert'    => 'applying=false',
    'pre_sleep' => 10,
    'tries'     => 10,
    'try_sleep' => 30,
    'silent'    => true,
  })
}
# site_nomad/plans/upgrade_cluster.pp
# @summary Upgrades a nomad cluster to a newer version of nomad in sequence
#
# Discovers the cluster's nomad servers and (non-server) clients via PQL,
# disables Puppet on all of them, then walks the servers followed by the
# clients one node at a time: re-enable Puppet on the node and run
# `site_nomad::upgrade_agent` against it.
#
# @param new_version
#   The RPM version string (EVR) for the new version to upgrade to, e.g. `1.0.0`
#
# @param cluster
#   The cluster name (which is typically the business group's `group` name, e.g. `foo`)
#
# @param environment
#   The environment to upgrade, either `d` for development, or `p` for production
#
# @param server_sleep
#   Settle time (in seconds) passed as `sleep` to each server upgrade
#
# @param client_sleep
#   Settle time (in seconds) passed as `sleep` to each client upgrade
#
# @return The list of discovered nodes (servers followed by clients)
#
plan site_nomad::upgrade_cluster (
  String $new_version,
  String $cluster,
  Enum['d', 'p'] $environment,
  Integer $server_sleep = 60,
  Integer $client_sleep = 120,
) {
  # NOTE(review): $cluster is interpolated into the PQL string literals below
  # without escaping; a value containing a single quote would break out of the
  # literal. Confirm callers only pass plain identifiers.

  # Query to find list of nomad servers (whether or not those servers are also clients)
  $server_query = @("EOF"/L)
    inventory[certname] {
    facts.group = '${cluster}'
    and facts.hostname_environment = '${environment}'
    and resources {
    type = 'Class'
    and title = 'Site_nomad'
    and parameters.use_server = true
    }
    }
    | EOF

  # Query to find list of nomad clients (which are not also servers)
  $client_query = @("EOF"/L)
    inventory[certname] {
    facts.group = '${cluster}'
    and facts.hostname_environment = '${environment}'
    and resources {
    type = 'Class'
    and title = 'Site_nomad'
    and parameters.use_server = false
    and parameters.use_client = true
    }
    }
    | EOF

  # At least one server is required; a cluster without dedicated clients is fine.
  $servers = choria::discover(
    'pql',
    'query'      => $server_query,
    'test'       => false,
    'at_least'   => 1,
    'when_empty' => 'Could not find any nomad servers to upgrade'
  )

  $clients = choria::discover(
    'pql',
    'query'    => $client_query,
    'test'     => false,
    'empty_ok' => true,
  )

  $all_nodes = [$servers, $clients].flatten

  # Disable puppet on all nodes before starting to prevent conflicts
  # with the package changes made during the upgrade.
  choria::task(
    'mcollective',
    'action'     => 'puppet.disable',
    'nodes'      => $all_nodes,
    'fail_ok'    => true,
    'silent'     => true,
    'properties' => {
      'message' => 'Upgrading nomad cluster via choria playbook'
    }
  )

  # Require that no node is still applying a catalog before touching packages
  # (up to 10 tries, with a 30s try_sleep).
  choria::task(
    'mcollective',
    'action'    => 'puppet.status',
    'nodes'     => $all_nodes,
    'assert'    => 'applying=false',
    'tries'     => 10,
    'try_sleep' => 30,
    'silent'    => true,
  )

  # Upgrade the servers one by one
  $servers.choria::in_groups_of(1) |$nodes| {
    # Re-enable puppet on this node; upgrade_agent triggers a puppet run.
    choria::task(
      'mcollective',
      'action'  => 'puppet.enable',
      'nodes'   => $nodes,
      'fail_ok' => true,
      'silent'  => true,
    )

    # Run the upgrade on this node
    choria::run_playbook(
      'site_nomad::upgrade_agent',
      'nodes'       => $nodes,
      'new_version' => $new_version,
      'sleep'       => $server_sleep,
    )
  }

  # Upgrade the clients one by one
  $clients.choria::in_groups_of(1) |$nodes| {
    # Re-enable puppet on this node; upgrade_agent triggers a puppet run.
    choria::task(
      'mcollective',
      'action'  => 'puppet.enable',
      'nodes'   => $nodes,
      'fail_ok' => true,
      'silent'  => true,
    )

    # Run the upgrade on this node
    choria::run_playbook(
      'site_nomad::upgrade_agent',
      'nodes'       => $nodes,
      'new_version' => $new_version,
      'sleep'       => $client_sleep,
    )
  }

  # Return the list of updated nodes
  $all_nodes
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment