Skip to content

Instantly share code, notes, and snippets.

@TJM
Created April 30, 2021 17:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TJM/d5d0f71bd5c23d6ad874b6cd68333151 to your computer and use it in GitHub Desktop.
Save TJM/d5d0f71bd5c23d6ad874b6cd68333151 to your computer and use it in GitHub Desktop.
An idea for cluster patching (one at a time) using pe_patch. The `patching.pp` is a copy of pe_patch::group_patching, except for a modification to take $targets directly. The `cluster_patching.pp` is my attempt at a "one at a time" wrapper.
# Wrapper around the patchy::patching plan that patches one target at a time,
# rather than all targets at once.
#
# $targets is resolved with get_targets() and patchy::patching is then invoked
# once per resolved target, in order. Every other parameter is forwarded
# unchanged to patchy::patching.
#
# Returns a Hash keyed by target name whose values are the result returned by
# patchy::patching for that single target.
plan patchy::cluster_patching (
  TargetSpec $targets,
  Optional[Enum['always', 'never', 'patched', 'smart']] $reboot = 'patched',
  Optional[String] $yum_params = undef,
  Optional[String] $dpkg_params = undef,
  Optional[String] $zypper_params = undef,
  Optional[Integer] $patch_task_timeout = 3600,
  Optional[Integer] $health_check_runinterval = 1800,
  Optional[Integer] $reboot_wait_time = 600,
  Optional[Boolean] $security_only = false,
  Optional[Boolean] $run_health_check = true,
  Optional[Boolean] $clean_cache = false,
  Optional[Boolean] $health_check_noop = false,
  Optional[Boolean] $health_check_use_cached_catalog = false,
  Optional[Boolean] $health_check_service_enabled = true,
  Optional[Boolean] $health_check_service_running = true,
  Optional[Pe_patch::Absolutepath] $post_reboot_scriptpath = undef,
) {
  # Get Target List
  $target_array = get_targets($targets)

  # Recreate the argument list so it can be handed to the wrapped plan as-is.
  $args = {
    'reboot'                          => $reboot,
    'yum_params'                      => $yum_params,
    'dpkg_params'                     => $dpkg_params,
    # Forward the caller's value; this was previously hard-coded to undef,
    # which silently discarded any zypper parameters passed to this plan.
    'zypper_params'                   => $zypper_params,
    'patch_task_timeout'              => $patch_task_timeout,
    'health_check_runinterval'        => $health_check_runinterval,
    'reboot_wait_time'                => $reboot_wait_time,
    'security_only'                   => $security_only,
    'run_health_check'                => $run_health_check,
    'clean_cache'                     => $clean_cache,
    'health_check_noop'               => $health_check_noop,
    'health_check_use_cached_catalog' => $health_check_use_cached_catalog,
    'health_check_service_enabled'    => $health_check_service_enabled,
    'health_check_service_running'    => $health_check_service_running,
    'post_reboot_scriptpath'          => $post_reboot_scriptpath,
  }

  # Run the patching plan once per target, accumulating each target's result
  # under its name. run_plan is called without '_catch_errors', so an error
  # raised by the wrapped plan is not caught here.
  $results = $target_array.reduce({}) |$memo, $target| {
    $result = run_plan('patchy::patching', $target, $args)
    $memo + { $target.name => $result }
  }

  # Output the results
  return($results)
}
# This is a copy of pe_patch::group_patching, but adapted to take $targets
# instead of a patch_group.
#
# Stages, in order (all skipped when $targets resolves to no targets):
#   1. Optional health check (pe_patch::agent_health, then a puppet run via
#      enterprise_tasks::run_puppet) when $run_health_check is true.
#   2. Patching via the pe_patch::patch_server task.
#   3. Waiting for targets that reported 'was_rebooted' to come back, bounded
#      by $reboot_wait_time seconds.
#   4. Optional post-reboot command ($post_reboot_scriptpath).
#   5. Optional post-patch puppet run when $run_health_check is true.
#
# Parameters:
#   $targets                  - targets to patch.
#   $reboot, $yum_params, $dpkg_params, $zypper_params, $security_only,
#   $clean_cache              - passed through to pe_patch::patch_server.
#   $patch_task_timeout       - passed to pe_patch::patch_server as 'timeout'.
#   $health_check_*           - passed to pe_patch::agent_health as 'target_*'.
#   $reboot_wait_time         - seconds to wait for rebooting targets.
#   $run_health_check         - enables the pre- and post-patch checks.
#   $post_reboot_scriptpath   - command run on targets that finished patching.
#
# Returns a Hash with the keys 'targets', 'patchable_nodes',
# 'puppet_health_check_failed', 'pre_patch_puppet_run_failed',
# 'patching_failed', 'post_patch_puppet_run_failed', 'reboot_timed_out',
# 'nodes_patched' and 'counts' (the length of each of those lists).
plan patchy::patching (
  TargetSpec $targets,
  Optional[Enum['always', 'never', 'patched', 'smart']] $reboot = 'patched',
  Optional[String] $yum_params = undef,
  Optional[String] $dpkg_params = undef,
  Optional[String] $zypper_params = undef,
  Optional[Integer] $patch_task_timeout = 3600,
  Optional[Integer] $health_check_runinterval = 1800,
  Optional[Integer] $reboot_wait_time = 600,
  Optional[Boolean] $security_only = false,
  Optional[Boolean] $run_health_check = true,
  Optional[Boolean] $clean_cache = false,
  Optional[Boolean] $health_check_noop = false,
  Optional[Boolean] $health_check_use_cached_catalog = false,
  Optional[Boolean] $health_check_service_enabled = true,
  Optional[Boolean] $health_check_service_running = true,
  Optional[Pe_patch::Absolutepath] $post_reboot_scriptpath = undef,
) {
  # Get Target List
  $target_array = get_targets($targets)

  unless $target_array.empty {
    ### Health Check, Input: $target_array, Output: $patch_ready ###
    ### Add'l result params: $puppet_not_healthy, $pre_patch_puppet_run_failed ###
    if $run_health_check {
      # Check the health of the puppet agent on all nodes
      # Ensure puppet configuration is as expected, agent hasn't been disabled
      # with puppet agent --disable, puppet ssl verify passes, the puppet
      # service is in the right state, all servers are reachable, and the
      # last puppet run didn't have failures.
      $agent_health = run_task('pe_patch::agent_health', $target_array,
        target_runinterval              => $health_check_runinterval,
        target_noop_state               => $health_check_noop,
        target_use_cached_catalog_state => $health_check_use_cached_catalog,
        target_service_enabled          => $health_check_service_enabled,
        target_service_running          => $health_check_service_running,
        '_catch_errors'                 => true)

      # Pull out list of those that are ok/in error
      $puppet_healthy = $agent_health.ok_set.names
      $puppet_not_healthy = $agent_health.error_set.results.map | $error | { $error.error.details }

      if $puppet_healthy.empty {
        $patch_ready = []
      } else {
        $pre_patch_run_puppet_check = run_task('enterprise_tasks::run_puppet', $puppet_healthy,
          max_timeout     => 256,
          '_catch_errors' => true)
        $patch_ready = $pre_patch_run_puppet_check.ok_set.names
        $pre_patch_puppet_run_failed = $pre_patch_run_puppet_check.error_set.names
      }
    } else {
      # No health check requested: every resolved target is ready to patch.
      # Use target names so $patch_ready has the same shape (array of names)
      # as on the health-check path. This previously referenced $certnames,
      # which is never defined in this plan.
      $patch_ready = $target_array.map |$t| { $t.name }
    }

    ### Patching, Input: $patch_ready, Output: $post_patch_ready ###
    ### Add'l result params: $not_patched, $reboot_timed_out ###
    if $patch_ready.empty {
      $post_patch_ready = []
    } else {
      # So we can detect when a node has rebooted
      $begin_boot_time_results = without_default_logging() || {
        run_task('pe_patch::last_boot_time', $patch_ready)
      }

      # Actually carry out the patching on all healthy nodes
      $patch_result = run_task('pe_patch::patch_server',
        $patch_ready,
        yum_params      => $yum_params,
        dpkg_params     => $dpkg_params,
        zypper_params   => $zypper_params,
        timeout         => $patch_task_timeout,
        reboot          => $reboot,
        security_only   => $security_only,
        clean_cache     => $clean_cache,
        '_catch_errors' => true)

      # Pull out list of those that are ok/in error
      $patched = $patch_result.ok_set.names
      $not_patched = $patch_result.error_set.names
      $rebooting_result = $patch_result.ok_set.results.filter | $result | { $result.value['was_rebooted'] }
      $rebooting = $rebooting_result.map | $result | { $result.target.name }

      ### Wait for Reboot ###
      if $rebooting.empty {
        $post_patch_ready = $patched
      } else {
        # Adapted from puppetlabs-reboot
        $start_time = Timestamp()
        $wait_results = without_default_logging() || {
          # Iterating the Integer bounds the number of polling rounds; the
          # loop normally ends earlier via break() once nothing is pending
          # or the elapsed time reaches $reboot_wait_time.
          $reboot_wait_time.reduce({'pending' => $rebooting, 'ok' => []}) |$memo, $_| {
            if ($memo['pending'].empty or $memo['timed_out']) {
              break()
            }

            $plural = $memo['pending'].size > 1 ? {
              true    => 's',
              default => '',
            }
            out::message("Waiting for ${$memo['pending'].size} node${plural} to reboot. Note that a failed pe_patch::last_boot_time task is normal while a target is in the middle of rebooting, and may be safely ignored.")
            $current_boot_time_results = run_task('pe_patch::last_boot_time', $memo['pending'], _catch_errors => true)

            $failed_results = $current_boot_time_results.filter |$current_boot_time_res| {
              # If we errored, need to check again, since it's probably still rebooting
              if !$current_boot_time_res.ok {
                true
              } else {
                # If the boot time is the same as it was before we patched,
                # we haven't rebooted yet and need to check again.
                $target_name = $current_boot_time_res.target.name
                $begin_boot_time_res = $begin_boot_time_results.find($target_name)
                $current_boot_time_res.value == $begin_boot_time_res.value
              }
            }

            # Turn array of results into ResultSet so we can extract Targets
            $failed_targets = ResultSet($failed_results).targets.map |$t| { $t.name }
            $ok_targets = $memo['pending'] - $failed_targets

            $elapsed_time_sec = Integer(Timestamp() - $start_time)
            $timed_out = $elapsed_time_sec >= $reboot_wait_time

            if !$failed_targets.empty and !$timed_out {
              # Wait for targets to be available again before rechecking. If we end up failing
              # this wait on any of those nodes, we'll catch it in the next iteration.
              pe_patch::sleep(30)
              $remaining_time = $reboot_wait_time - $elapsed_time_sec
              wait_until_available($failed_targets, wait_time => $remaining_time, retry_interval => 1, '_catch_errors' => true)
            }

            # New accumulator for the next polling round.
            ({
              'pending'   => $failed_targets,
              'ok'        => $memo['ok'] + $ok_targets,
              'timed_out' => $timed_out,
            })
          }
        }

        # Anything still pending after the wait loop did not come back in time.
        $reboot_timed_out = $wait_results['pending']
        $post_patch_ready = $patched - $reboot_timed_out
      }
    }

    ### Post reboot script, Input: $post_patch_ready, Output: None ###
    # Run the post_reboot_scriptpath, if defined. Don't fail the plan
    # if the script fails. The user will be able to see the result in
    # the console.
    if $post_reboot_scriptpath {
      run_command($post_reboot_scriptpath, $post_patch_ready, '_catch_errors' => true)
    }

    ### Post patching health check, Input: $post_patch_ready, Output: $post_patch_puppet_run_passed ###
    ### Add'l result params: $post_patch_puppet_run_failed ###
    if $post_patch_ready.empty or !$run_health_check {
      $post_patch_puppet_run_passed = $post_patch_ready
    } else {
      # Sometimes a puppet run immediately after reboot fails, so give it a bit of time.
      pe_patch::sleep(30)
      $post_puppet_check = run_task('enterprise_tasks::run_puppet', $post_patch_ready,
        max_timeout     => 256,
        '_catch_errors' => true)
      $post_patch_puppet_run_passed = $post_puppet_check.ok_set.names
      $post_patch_puppet_run_failed = $post_puppet_check.error_set.names
    }
  }

  ### Defaults ###
  # Note: $targets and $target_array are always defined,
  # so no need to set a default value here.
  # The variables below are only assigned on some paths above, so fall back
  # to an empty list when a stage was skipped.
  $puppet_not_healthy_result = defined('$puppet_not_healthy') ? {
    true    => $puppet_not_healthy,
    default => [],
  }
  $pre_patch_puppet_run_failed_result = defined('$pre_patch_puppet_run_failed') ? {
    true    => $pre_patch_puppet_run_failed,
    default => [],
  }
  $patched_result = defined('$patched') ? {
    true    => $patched,
    default => [],
  }
  $not_patched_result = defined('$not_patched') ? {
    true    => $not_patched,
    default => [],
  }
  $post_patch_puppet_run_failed_result = defined('$post_patch_puppet_run_failed') ? {
    true    => $post_patch_puppet_run_failed,
    default => [],
  }
  $reboot_timed_out_result = defined('$reboot_timed_out') ? {
    true    => $reboot_timed_out,
    default => [],
  }

  # Output the results
  return({
    'targets'                      => $targets,
    'patchable_nodes'              => $target_array,
    'puppet_health_check_failed'   => $puppet_not_healthy_result,
    'pre_patch_puppet_run_failed'  => $pre_patch_puppet_run_failed_result,
    'patching_failed'              => $not_patched_result,
    'post_patch_puppet_run_failed' => $post_patch_puppet_run_failed_result,
    'reboot_timed_out'             => $reboot_timed_out_result,
    'nodes_patched'                => $patched_result,
    'counts'                       => {
      'patchable_nodes_count'        => $target_array.length,
      'puppet_health_check_failed'   => $puppet_not_healthy_result.length,
      'pre_patch_puppet_run_failed'  => $pre_patch_puppet_run_failed_result.length,
      'patching_failed'              => $not_patched_result.length,
      'post_patch_puppet_run_failed' => $post_patch_puppet_run_failed_result.length,
      'reboot_timed_out'             => $reboot_timed_out_result.length,
      'nodes_patched'                => $patched_result.length,
    }
  })
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment