A first hacky go at a script to help with zero-downtime cluster autoscaler downsizing of nodes (where you have deployments with only 1 replica)
FROM bitnami/kubectl AS kubectl

FROM ubuntu

COPY --from=kubectl /opt/bitnami/kubectl/bin/kubectl /usr/local/bin/
COPY scripts/* /usr/bin/

RUN chmod +x /usr/bin/k8sdrain.sh && chmod +x /usr/bin/job.sh

CMD [ "/usr/bin/job.sh" ]
#!/bin/bash
# Hacktastic method of querying the k8s API to trigger the auto "safe drain" script from here: https://gist.github.com/juliohm1978/fcfd21b26f9431c01978
# We:
# * Parse autoscaler logs to detect nodes it has wanted to drain in the last 5 mins
# * Slim these down to nodes that do not have critical blockers
# * Slim these down to nodes that are blocked due to PDBs (PodDisruptionBudgets)
# * Check they are not already cordoned
# * Run the k8sdrain.sh script to cordon the node and perform rolling restarts on all deployments & replicasets affected
#
# This is quite specific to our needs, but hopefully offers someone a starting point to implement something that suits them

AUTOSCALER_POD=$( kubectl get pods -n kube-system -l app="cluster-autoscaler" -o custom-columns=:metadata.name --no-headers )
CANDIDATES="$( kubectl logs -n kube-system "$AUTOSCALER_POD" | grep -i " for removal\$" | tail -n 5 )"
SINCE=$( date '+%H:%M:%S.%N' -d "5 minutes ago" )
IFS=$'\n'

for CANDIDATE in $CANDIDATES
do
    NODE=$( echo "$CANDIDATE" | awk '{print $5}' )
    TS=$( echo "$CANDIDATE" | awk '{print $2}' )

    if [[ "$TS" > "$SINCE" ]] ; then
        CRITICALPROBS=$( kubectl logs -n kube-system "$AUTOSCALER_POD" | grep -i "$NODE" | grep "cannot be removed: non-daemonset, non-mirrored, non-pdb-assigned kube-system pod present" | wc -l )

        if [[ "$CRITICALPROBS" == "0" ]] ; then
            PDBPROBS=$( kubectl logs -n kube-system "$AUTOSCALER_POD" | grep -i "$NODE" | grep "cannot be removed: not enough pod disruption budget to move" | wc -l )

            if [[ "$PDBPROBS" -gt 0 ]] ; then
                NODEINFO=$( kubectl get node "$NODE" | grep "$NODE" )
                CORDONED=$( echo "$NODEINFO" | grep -i "SchedulingDisabled" | wc -l )

                if [[ -n "$NODEINFO" && "$CORDONED" == "0" ]] ; then
                    echo "Found a node for safe draining. Draining single node: $NODE..."

                    # where /usr/bin/k8sdrain.sh is the gist from here: https://gist.github.com/juliohm1978/fcfd21b26f9431c01978
                    /usr/bin/k8sdrain.sh "$NODE" restart
                    exit 0
                fi
            fi
        fi
    fi
done
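If you want to eyeball what the script will pick up before letting the CronJob loose, the same log filter can be run by hand from anywhere with kubectl access (this just reuses the grep from the script above):

AUTOSCALER_POD=$( kubectl get pods -n kube-system -l app="cluster-autoscaler" -o custom-columns=:metadata.name --no-headers )
kubectl logs -n kube-system "$AUTOSCALER_POD" | grep -i " for removal\$" | tail -n 5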
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: autoscaler-drain-helper-role
rules:
  - apiGroups:
      - ""
      - "extensions"
      - "apps"
    resources:
      - nodes
      - endpoints
      - pods
      - pods/log
      - deployments
      - statefulsets
    verbs:
      - get
      - list
      - watch
      - update
      - patch
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: autoscaler-drain-helper
  namespace: autoscaler-drain-helper
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: autoscaler-drain-helper-api-access
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: autoscaler-drain-helper-role
subjects:
  - kind: ServiceAccount
    name: autoscaler-drain-helper
    namespace: autoscaler-drain-helper
---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: autoscaler-drain-helper
  namespace: autoscaler-drain-helper
spec:
  schedule: "*/15 * * * *"
  jobTemplate:
    spec:
      template:
        metadata:
          labels:
            app: autoscaler-drain-helper
          annotations:
            build_id: "$CI_JOB_ID"
        spec:
          serviceAccountName: autoscaler-drain-helper
          imagePullSecrets:
            - name: gitlab-registry
          restartPolicy: Never
          containers:
            - name: autoscaler-drain-helper-container
              env:
                {{SECRETS}}
              image: $CI_REGISTRY_IMAGE:$CI_REGISTRY_TAG
              # ^^ this bit is specific to our GitLab setup, but it is basically the built image from the Dockerfile above
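Deploying it is the usual apply once your CI (or a manual edit) has substituted the $CI_ variables and the {{SECRETS}} placeholder; the namespace needs to exist first, and you can kick off a one-off run from the CronJob to test it rather than waiting for the schedule (the manifest file name here is just whatever you saved it as):

kubectl create namespace autoscaler-drain-helper
kubectl apply -f autoscaler-drain-helper.yml
kubectl create job -n autoscaler-drain-helper --from=cronjob/autoscaler-drain-helper manual-test
kubectl logs -n autoscaler-drain-helper -l app=autoscaler-drain-helper --tail=50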
In its current state, this has some significant problems. The main one is that, at the point where the autoscaler logs a node as a candidate for removal, it hasn't actually calculated whether it would remove the node if it were able to drain it. This means we can end up draining nodes only to bring up new ones, and that can cycle.

Clearly it would be better to more closely emulate what the autoscaler is doing, and probably to write this in Go or some such. Still doing some more playing with it, though.