@tom-butler
Created June 2, 2020 06:42
A shell script for replacing instances in an ASG whilst ensuring the deployments stay healthy.
#!/bin/bash
# set -e
# shellcheck disable=SC2178,SC2128
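#
# Rolls the EC2 instances behind a Kubernetes cluster's auto scaling groups:
# scale the ASG up, drain the oldest node, scale back down, and repeat until
# every node predating this run has been replaced.
#
# Usage (filename is illustrative):
#   ./roll-asg-nodes.sh [wait_limit_seconds] [max_nodes]
#   ./roll-asg-nodes.sh 900 50
#
# Assumes kubectl is configured against the target cluster, and that the aws
# CLI (with credentials for the ASGs) and jq are on PATH.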
# time to wait for deployments to be healthy
wait_limit=${1:-900} # default set to 15 mins
# nodes to patch in a single go
max_nodes=${2:-50} # default set to max 50 nodes
# Helper function that will return the name of the newest instance in the cluster
get-newest-instance-name(){
name=$(kubectl get nodes -o custom-columns=":metadata.creationTimestamp,:metadata.name" --no-headers | sort -k1 -r | awk '{print $2}' | head -1)
echo "$name"
}
# Helper function that will return the name of the oldest instance in the cluster
get-oldest-instance-name(){
name=$(kubectl get nodes -o custom-columns=":metadata.name" --sort-by=.metadata.creationTimestamp --no-headers | head -1)
echo "$name"
}
# Helper function that will return the creationTimestamp (as epoch seconds) of the oldest instance in the cluster
get-oldest-instance-timestamp(){
iso=$(kubectl get nodes -o custom-columns=":metadata.creationTimestamp" --sort-by=.metadata.creationTimestamp --no-headers | head -1)
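# NOTE: 'date -d' below is GNU date (Linux); on macOS/BSD the equivalent would
# be something like: date -j -f "%Y-%m-%dT%H:%M:%SZ" "$iso" +%s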
epoch=$(date -d"$iso" +%s)
echo "$epoch"
}
# Checks all deployments are healthy; sets the global 'ready' flag read by the main script below
wait-for-deployments(){
# Wait up to $wait_limit seconds (default 15 mins) for deployments to be healthy
max_wait=$wait_limit
ready=false
while [[ $max_wait -gt 0 ]] && [[ $ready == false ]]; do
# Check if deployments are healthy
ready=true
# deployment: an array of all the deployment names
mapfile -t deployment < <(kubectl get deployments --all-namespaces -o custom-columns=":metadata.name" --sort-by=.metadata.name --no-headers)
# available: an array of all available replicas same order as deployment
mapfile -t available < <(kubectl get deployments --all-namespaces -o custom-columns=":status.availableReplicas" --sort-by=.metadata.name --no-headers)
# desired: an array of all desired replicas same order as deployment
mapfile -t desired < <(kubectl get deployments --all-namespaces -o custom-columns=":status.replicas" --sort-by=.metadata.name --no-headers)
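# NOTE: the three kubectl calls above are assumed to observe the same set of
# deployments; a single call returning all three columns would avoid any race:
#   kubectl get deployments --all-namespaces \
#     -o custom-columns=":metadata.name,:status.availableReplicas,:status.replicas" \
#     --sort-by=.metadata.name --no-headers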
count="${#deployment[@]}"
for (( i=0; i<count; i++ )); do
if ! [[ "${desired[$i]}" =~ ^[0-9]+$ ]]; then
echo "Warning: Deployment ${deployment[$i]} doesn't have any replicas"
continue
fi
if [ "${available[$i]}" -lt "${desired[$i]}" ]; then
echo "Deployment ${deployment[$i]} not ready, desired pods: ${desired[$i]}, available pods: ${available[$i]}"
ready=false
fi
done
echo "Deployments ready: $ready"
if [ "$ready" = false ]; then
sleep 10
max_wait=$((max_wait - 10))
echo "Waited 10 seconds. Still waiting max. $max_wait"
fi
done
}
# From a node name, find the ASG the node is hosted in
find-asg(){
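# NOTE: this assumes nodes carry a custom 'instance-id' label (not a standard
# Kubernetes label; presumably applied at node bootstrap). On AWS the instance
# ID can also be parsed from the node's providerID, e.g.:
#   instanceid=$(kubectl get nodes "$1" -o jsonpath='{.spec.providerID}' | awk -F/ '{print $NF}')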
instanceid=$(kubectl get nodes "$1" -o jsonpath='{.metadata.labels.instance-id}')
asg=$(aws ec2 describe-tags --filters "Name=resource-id,Values=$instanceid" "Name=key,Values=aws:autoscaling:groupName" | jq -r '.Tags[0].Value')
echo "$asg"
}
# Scale up the ASG
scale-up-asg(){
max=$(aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names "$1" | jq '.AutoScalingGroups[0].MaxSize')
desired=$(aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names "$1" | jq '.AutoScalingGroups[0].DesiredCapacity')
desired=$((desired+1))
if [ "$desired" -le "$max" ]; then
aws autoscaling set-desired-capacity --auto-scaling-group-name "$1" --desired-capacity "$desired"
else
echo "Warning: Autoscaling Group at max, cannot scale up prematurely"
fi
}
# Scale down the ASG (this relies on the ASG termination policy being configured to remove the oldest node)
scale-down-asg(){
min=$(aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names "$1" | jq '.AutoScalingGroups[0].MinSize')
desired=$(aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names "$1" | jq '.AutoScalingGroups[0].DesiredCapacity')
desired=$((desired-1))
if [ "$desired" -ge "$min" ]; then
aws autoscaling set-desired-capacity --auto-scaling-group-name "$1" --desired-capacity "$desired"
else
echo "Warning: Autoscaling Group at min, cannot scale down"
fi
}
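# A stricter alternative (not used here) would be to terminate the drained
# instance explicitly, removing that exact node and decrementing the desired
# capacity in one call ($drained_instance_id is a hypothetical variable
# holding the EC2 instance ID of the drained node):
#   aws autoscaling terminate-instance-in-auto-scaling-group \
#     --instance-id "$drained_instance_id" --should-decrement-desired-capacity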
echo "Checking deployments are healthy"
wait-for-deployments
if [ "$ready" = false ]; then
echo "Deployments not in healthy state"
# ensure cluster autoscaler is back online
kubectl scale deployment/cluster-autoscaler-aws-cluster-autoscaler -n admin --replicas 1
exit 1
fi
start_time=$(date '+%s')
echo "Starting to patch at time: $start_time"
# ensure at least 2 dns pods are running so DNS survives a node drain
kubectl scale deployments/coredns --replicas=2 -n kube-system
# disable the cluster autoscaler so it doesn't fight the manual ASG scaling below
kubectl scale deployment/cluster-autoscaler-aws-cluster-autoscaler -n admin --replicas 0
node_count=0
# Run until every node that predates this script run has been replaced
until [ "$start_time" -lt "$(get-oldest-instance-timestamp)" ]; do
oldest_node=$(get-oldest-instance-name)
echo "Draining node $oldest_node"
# Scale up asg
asg=$(find-asg "$oldest_node")
scale-up-asg "$asg"
# Give it 60 seconds to create a new node
sleep 60
# Wait until newest node is ready
newest_node=$(get-newest-instance-name)
while [[ $(kubectl get node "$newest_node" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}') != "True" ]]; do
echo "Waiting for newest node to be ready: $newest_node" && sleep 1;
done
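# NOTE: a near-equivalent with a built-in timeout would be:
#   kubectl wait --for=condition=Ready "node/$newest_node" --timeout=300s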
# Cordon the node so nothing new schedules on it, then drain the pods off it; if something breaks move on
kubectl cordon "$oldest_node" && kubectl drain "$oldest_node" --delete-local-data --ignore-daemonsets --force || echo "Warning: node could not be drained, continuing"
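# NOTE: --delete-local-data was later renamed --delete-emptydir-data; this
# script uses the flag name that was current when it was written (mid-2020).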
# Wait until all deployments are healthy
echo "Waiting for deployments to be healthy"
wait-for-deployments
# scale down asg
scale-down-asg "$asg"
# Remove the node from kubernetes (So we don't keep trying to remove the same node)
kubectl delete node "$oldest_node"
node_count=$((node_count+1))
if [ "$node_count" -ge "$max_nodes" ]; then
echo "Patched max number of nodes, finishing"
break
fi
done
# ensure cluster autoscaler is back online
kubectl scale deployment/cluster-autoscaler-aws-cluster-autoscaler -n admin --replicas 1
echo "Patching complete, patched $node_count nodes"