Created
December 14, 2020 03:12
-
-
Save freyes/aefecee169d6f65f7a3b88df2d3ba76b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash -eu
#
# Author: Felipe Reyes <felipe.reyes@canonical.com>
#
# References:
# - "Kubelet api server connection check in health checker" in kubernetes/node-problem-detector
#   https://github.com/kubernetes/node-problem-detector/pull/489
# - "(1.17) Kubelet won't reconnect to Apiserver after NIC failure (use of closed network connection)" in kubernetes/kubernetes
#   https://github.com/kubernetes/kubernetes/issues/87615
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   writes the usage text, followed by one blank line, to stdout
#######################################
print_usage() {
  # Quoted delimiter: the text is emitted literally, nothing is expanded.
  cat <<'EOF'
Usage:
./monitor_kubelet_restart_if_needed.sh -n <OCCURRENCES> --window <SECONDS> --cooldown <SECONDS>
-n <OCCURRENCES> number of log line matches that need to be found to restart kubelet (default: 10).
--cooldown <SECONDS> number of seconds to wait since kubelet started before start evaluating the logs (default: 120).
--window <SECONDS> grep the logs for the last SECONDS available as reported by journalctl --since= (default: 600).

EOF
}
# systemd unit that is monitored and, if needed, restarted.
SERVICE="snap.kubelet.daemon.service"
# Log message that is counted in the unit's journal.
PATTERN="use of closed network connection"
TIME_WINDOW=600             # in seconds
COOLDOWN=120                # in seconds
MAX_ALLOWED_OCCURRENCES=10  # how many occurrences of $PATTERN are allowed within $TIME_WINDOW

# Start timestamp of the unit's main process, as reported by systemd
# ("ExecMainStartTimestamp=<value>"; only the value is kept).
KUBELET_UPTIME="$(systemctl show -p ExecMainStartTimestamp "$SERVICE" | cut -d'=' -f2)"
# An empty value must not reach `date -d`: GNU date parses "" as midnight
# today, which would silently produce a bogus uptime.
# NOTE(review): presumably an empty or "n/a" value means the unit has not been
# started (or systemctl is unavailable) - confirm against the systemd version in use.
if [ -z "$KUBELET_UPTIME" ] || [ "$KUBELET_UPTIME" = "n/a" ]; then
  echo "ERROR: could not determine when $SERVICE was started" >&2
  exit 1
fi
# Same timestamp as seconds since the epoch.
KUBELET_UPTIME_SECS=$(date -d "$KUBELET_UPTIME" +%s) || {
  echo "ERROR: could not parse start timestamp '$KUBELET_UPTIME' of $SERVICE" >&2
  exit 1
}
# Current time as seconds since the epoch.
NOW=$(date +%s)
#######################################
# Print one timestamped log line: "<date> - <LEVEL> - <message>".
# Arguments: $1  - log level label (e.g. DEBUG, INFO)
#            $2+ - message words, joined with single spaces
# Outputs:   writes the formatted line to stdout
#######################################
print_log() {
  local level=${1:-}
  # Only shift when there is something to shift, so a bare call does not fail.
  if (( $# > 0 )); then
    shift
  fi
  # "$*" joins the remaining arguments into one string on purpose.
  printf '%s - %s - %s\n' "$(date)" "$level" "$*"
}
#######################################
# Print the smaller of two integers.
# Arguments: $1 - first integer
#            $2 - second integer
# Outputs:   writes the smaller value to stdout ($2 when both are equal)
#######################################
min() {
  # Operands are quoted so an empty argument cannot change the shape of the test.
  if [ "$1" -ge "$2" ]; then
    printf '%s\n' "$2"
  else
    printf '%s\n' "$1"
  fi
}
#######################################
# Parse the command-line options and override the matching defaults.
# Globals:   MAX_ALLOWED_OCCURRENCES, COOLDOWN, TIME_WINDOW (written)
# Arguments: the script's arguments: -n <N>, --cooldown <SECS>, --window <SECS>
# Outputs:   on invalid input, an error on stderr followed by the usage text
# Returns:   0 on success; exits the script with status 1 on an unknown
#            option or a missing / non-numeric option value
#######################################
parse_args() {
  local opt value
  while (( $# > 0 )); do
    opt=$1
    case "$opt" in
      -n | --cooldown | --window)
        # The `$# < 2` check runs first so $2 is never expanded when absent.
        if (( $# < 2 )) || ! [[ $2 =~ ^[0-9]+$ ]]; then
          echo "ERROR: option $opt requires a non-negative integer value" >&2
          print_usage
          exit 1
        fi
        # Force base 10 so values such as "08" are not treated as octal later.
        value=$(( 10#$2 ))
        case "$opt" in
          -n) MAX_ALLOWED_OCCURRENCES=$value ;;
          --cooldown) COOLDOWN=$value ;;
          --window) TIME_WINDOW=$value ;;
        esac
        shift 2
        ;;
      *)
        print_usage
        exit 1
        ;;
    esac
  done
}
parse_args "$@"
# Cooldown check: kubelet must have started at or before THRESHOLD (epoch
# seconds) for its logs to be evaluated.
THRESHOLD=$(( NOW - COOLDOWN ))
SINCE_STARTED=$(( NOW - KUBELET_UPTIME_SECS ))
# Do nothing if kubelet was started after the threshold, i.e. it is still
# within its cooldown period.
if [ "$THRESHOLD" -le "$KUBELET_UPTIME_SECS" ]; then
  print_log DEBUG "kubelet process was started $SINCE_STARTED seconds ago, it's still in the cooldown period of $COOLDOWN seconds"
  exit 0
fi

# Look back $TIME_WINDOW seconds or to the moment kubelet started, whichever
# is shorter, so the logs of the previous kubelet instance are not counted.
SECS_OF_LOG=$(min "$TIME_WINDOW" "$SINCE_STARTED")
# grep -c exits 1 when it counts zero matches; `|| true` keeps that from
# aborting the script when it runs with `-e`.
OCCURRENCES=$(journalctl -u "$SERVICE" --since "-${SECS_OF_LOG}s" | grep -c -i -- "$PATTERN" || true)
OCCURRENCES=${OCCURRENCES:-0}

if [ "$OCCURRENCES" -ge "$MAX_ALLOWED_OCCURRENCES" ]; then
  print_log INFO "Found $OCCURRENCES in the last $SECS_OF_LOG seconds, restarting kubelet now"
  # Capture the status explicitly so the script exits with systemctl's result.
  rc=0
  systemctl restart "$SERVICE" || rc=$?
  exit "$rc"
else
  print_log DEBUG "Found $OCCURRENCES in the last $SECS_OF_LOG seconds"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment