Skip to content

Instantly share code, notes, and snippets.

@freyes
Created December 14, 2020 03:12
Show Gist options
  • Save freyes/aefecee169d6f65f7a3b88df2d3ba76b to your computer and use it in GitHub Desktop.
Save freyes/aefecee169d6f65f7a3b88df2d3ba76b to your computer and use it in GitHub Desktop.
#!/bin/bash -eu
#
# Author: Felipe Reyes <felipe.reyes@canonical.com>
#
# References:
# - "Kubelet api server connection check in health checker" in kubernetes/node-problem-detector
# https://github.com/kubernetes/node-problem-detector/pull/489
# - "(1.17) Kubelet won't reconnect to Apiserver after NIC failure (use of closed network connection)" in kubernetes/kubernetes
# https://github.com/kubernetes/kubernetes/issues/87615
# Print the command-line help text to stdout.
# Takes no arguments. The text is emitted verbatim (quoted heredoc, so nothing
# inside it is expanded) and ends with one blank line.
print_usage () {
  cat <<'EOF'
Usage:
./monitor_kubelet_restart_if_needed.sh -n <OCCURRENCES> --window <SECONDS> --cooldown <SECONDS>
-n <OCCURRENCES> number of log line matches that need to be found to restart kubelet (default: 10).
--cooldown <SECONDS> number of seconds to wait since kubelet started before start evaluating the logs (default: 120).
--window <SECONDS> grep the logs for the last SECONDS available as reported by journalctl --since= (default: 600).

EOF
}
# Options given on the shebang line are lost when the script is run as
# "bash script.sh", so enable the same strict mode explicitly.
set -eu

# systemd unit that is monitored and, if needed, restarted.
SERVICE="snap.kubelet.daemon.service"
# Log message that signals kubelet lost its connection to the API server.
PATTERN="use of closed network connection"
TIME_WINDOW=600 # in seconds
COOLDOWN=120 # in seconds
MAX_ALLOWED_OCCURRENCES=10 # how many occurrences of the $PATTERN are allowed within $TIME_WINDOW

# Timestamp at which the main process of the unit was last started, as
# reported by systemd ("-f2-" keeps everything after the first '=').
KUBELET_UPTIME="$(systemctl show -p ExecMainStartTimestamp "$SERVICE" | cut -d'=' -f2-)"
# An empty value would make "date -d" silently resolve to today's midnight and
# the rest of the script would work with a bogus start time, so stop here.
# NOTE(review): "n/a" is also rejected in case systemd reports an unset
# timestamp that way - confirm against the systemd version in use.
if [ -z "$KUBELET_UPTIME" ] || [ "$KUBELET_UPTIME" = "n/a" ]; then
  echo "$(date) - ERROR - unable to determine when $SERVICE was started, is it running?" >&2
  exit 1
fi
# Start time of kubelet and current time, both in seconds since the epoch.
KUBELET_UPTIME_SECS=$(date -d "$KUBELET_UPTIME" +%s)
NOW=$(date +%s)
# Write one log line to stdout in the form "<date> - <LEVEL> - <message>".
# Arguments:
#   $1   - log level label (e.g. DEBUG, INFO)
#   $2.. - message words; joined into one string using the first character
#          of IFS (a space by default)
print_log() {
  local level=$1
  shift
  # "$*" (not "$@") is the correct way to collapse the arguments into a scalar.
  local msg="$*"
  printf '%s - %s - %s\n' "$(date)" "$level" "$msg"
}
# Print the smaller of two integers to stdout.
# Arguments:
#   $1 - first integer
#   $2 - second integer
# When both are equal the (identical) second value is printed.
min() {
  local first=$1 second=$2
  if [ "$first" -ge "$second" ]; then
    printf '%s\n' "$second"
  else
    printf '%s\n' "$first"
  fi
}
# Parse the command-line options and override the corresponding settings.
# Arguments: the script's positional parameters.
# Globals written: MAX_ALLOWED_OCCURRENCES (-n), COOLDOWN (--cooldown),
#                  TIME_WINDOW (--window).
# Every option requires a non-negative integer value. On an unknown option, a
# missing value or a non-numeric value, the usage text is printed and the
# script exits with status 1.
parse_args() {
  local opt value
  while (($# > 0)); do
    opt=$1
    case "$opt" in
      -n | --cooldown | --window)
        # Fail with a clear message instead of an "unbound variable" or a
        # failed shift when the option is the last word on the command line.
        if (($# < 2)); then
          echo "ERROR: option $opt requires a value" >&2
          print_usage
          exit 1
        fi
        if ! [[ "$2" =~ ^[0-9]+$ ]]; then
          echo "ERROR: option $opt expects a non-negative integer, got '$2'" >&2
          print_usage
          exit 1
        fi
        # Force base 10 so that a value such as "08" is not later rejected
        # as an invalid octal number by shell arithmetic.
        value=$((10#$2))
        case "$opt" in
          -n) MAX_ALLOWED_OCCURRENCES=$value ;;
          --cooldown) COOLDOWN=$value ;;
          --window) TIME_WINDOW=$value ;;
        esac
        shift 2
        ;;
      *)
        print_usage
        exit 1
        ;;
    esac
  done
}
parse_args "$@"
# Point in time (epoch seconds) that is $COOLDOWN seconds in the past; a
# kubelet started at or after it is still in its cooldown period.
THRESHOLD=$(( NOW - COOLDOWN ))
# Number of seconds the current kubelet instance has been running.
SINCE_STARTED=$(( NOW - KUBELET_UPTIME_SECS ))
# Do nothing while kubelet is still within the cooldown period, i.e. it was
# started no more than $COOLDOWN seconds ago.
if [ "$THRESHOLD" -le "$KUBELET_UPTIME_SECS" ]; then
  print_log DEBUG "kubelet process was started $SINCE_STARTED seconds ago, it's still in the cooldown period of $COOLDOWN seconds"
  exit 0
fi
# Look back the last $TIME_WINDOW seconds or since kubelet started, whichever
# is the smallest, because the logs of the previous kubelet instance must not
# be taken into account (otherwise old matches keep triggering restarts).
SECS_OF_LOG=$(min "$TIME_WINDOW" "$SINCE_STARTED")
# Count the matching log lines. "grep -c" exits with status 1 when the count
# is 0, which is not an error here, hence the "|| true".
OCCURRENCES=$(journalctl -u "$SERVICE" --since "-${SECS_OF_LOG}s" | grep -ci -- "$PATTERN" || true)
# Treat "no output at all" (e.g. grep could not run) as zero matches.
OCCURRENCES=${OCCURRENCES:-0}
if [ "$OCCURRENCES" -ge "$MAX_ALLOWED_OCCURRENCES" ]; then
  print_log INFO "Found $OCCURRENCES in the last $SECS_OF_LOG seconds, restarting kubelet now"
  systemctl restart "$SERVICE"
  # Propagate the result of the restart as the script's exit status.
  exit $?
else
  print_log DEBUG "Found $OCCURRENCES in the last $SECS_OF_LOG seconds"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment