Skip to content

Instantly share code, notes, and snippets.

@juan-lee
Last active September 3, 2018 17:10
Show Gist options
  • Save juan-lee/882063d529b13373989d6600c2c7eebd to your computer and use it in GitHub Desktop.
Save juan-lee/882063d529b13373989d6600c2c7eebd to your computer and use it in GitHub Desktop.
AKS Node Health Monitor
# systemd unit that runs health-monitor.sh in "container-runtime" mode.
[Unit]
Description=Kubernetes health monitoring for container runtime
# NOTE(review): ordering only; kube-master-configuration.service is not
# defined in this gist -- confirm it exists on the target nodes.
After=kube-master-configuration.service

[Service]
# Restart the monitor 10 seconds after it exits.
Restart=always
RestartSec=10
RemainAfterExit=yes
# Make sure the script is executable before every start (r-xr--r--).
ExecStartPre=/bin/chmod 544 /usr/local/bin/health-monitor.sh
ExecStart=/usr/local/bin/health-monitor.sh container-runtime

[Install]
# NOTE(review): kubernetes.target is not defined in this gist -- confirm the
# target exists, otherwise the unit will not be pulled in at boot.
WantedBy=kubernetes.target
#!/usr/bin/env bash
# Copyright 2016 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script is for master and node instance health monitoring, which is
# packed in kube-manifest tarball. It is executed through a systemd service
# in cluster/gce/gci/<master/node>.yaml. The env variables come from an env
# file provided by the systemd service.
# This script originated at https://github.com/kubernetes/kubernetes/blob/master/cluster/gce/gci/health-monitor.sh
# and has been modified for AKS.
set -o nounset
set -o pipefail
# We simply kill the process when there is a failure. Another systemd service will
# automatically restart the process.
#######################################
# Polls the container runtime and kills it when the health check fails.
# Restarting the killed runtime is left to systemd (not done here).
# Globals:
#   KUBE_HOME              (read) crictl is expected at ${KUBE_HOME}/bin/crictl
#   CONTAINER_RUNTIME      (read, optional) any value other than "docker"
#                          switches the health check to `crictl pods`
#   CONTAINER_RUNTIME_NAME (read, optional) unit passed to `systemctl kill`;
#                          defaults to "docker"
#   SLEEP_SECONDS          (read) pause after a successful health check
# Arguments:
#   None
# Outputs:
#   Progress and failure messages on stdout.
# Returns:
#   Does not return; loops until the process is terminated.
#######################################
function container_runtime_monitoring {
  local -r max_attempts=5
  local attempt=1
  local delay
  local -r crictl="${KUBE_HOME}/bin/crictl"
  local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}"

  # We still need to use `docker ps` when container runtime is "docker". This
  # is because dockershim is still part of kubelet today. When kubelet is down,
  # crictl pods will also fail, and docker will be killed. This is undesirable
  # especially when docker live restore is disabled.
  #
  # The command is kept as an array so it is executed without relying on
  # word-splitting of an unquoted string.
  local -a healthcheck_command=(docker ps)
  if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then
    healthcheck_command=("${crictl}" pods)
  fi

  # Container runtime startup takes time. Make initial attempts (with
  # exponential back-off: 2, 4, 8, ... seconds) before starting to kill the
  # container runtime.
  until timeout 60 "${healthcheck_command[@]}" > /dev/null; do
    if (( attempt == max_attempts )); then
      echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness."
      break
    fi
    # Report the delay that is actually slept, not the attempt counter.
    delay=$(( 2 ** attempt ))
    echo "${attempt} initial attempt \"${healthcheck_command[*]}\"! Trying again in ${delay} seconds..."
    sleep "${delay}"
    attempt=$(( attempt + 1 ))
  done

  while true; do
    if ! timeout 60 "${healthcheck_command[@]}" > /dev/null; then
      echo "Container runtime ${container_runtime_name} failed!"
      if [[ "${container_runtime_name}" == "docker" ]]; then
        # Dump stack of docker daemon for investigation.
        # Log file name looks like goroutine-stacks-TIMESTAMP and will be saved
        # to the exec root directory, which is /var/run/docker/ on Ubuntu and
        # COS.
        pkill -SIGUSR1 dockerd
      fi
      systemctl kill --kill-who=main "${container_runtime_name}"
      # Wait for a while, as we don't want to kill it again before it is
      # really up.
      sleep 120
    else
      sleep "${SLEEP_SECONDS}"
    fi
  done
}
#######################################
# Polls the kubelet healthz endpoint on 127.0.0.1:10255 and kills the kubelet
# unit when the probe fails. Restarting the kubelet is left to systemd.
# Globals:
#   SLEEP_SECONDS (read) pause after a successful probe
# Arguments:
#   None
# Outputs:
#   Status messages and the failed probe's output on stdout.
# Returns:
#   Does not return; loops until the process is terminated.
#######################################
function kubelet_monitoring {
  echo "Wait for 2 minutes for kubelet to be functional"
  # TODO(andyzheng0831): replace it with a more reliable method if possible.
  sleep 120
  local -r max_seconds=10
  local output=""
  while true; do
    # -f makes curl fail on HTTP errors; -s -S hides progress but keeps error
    # text, which is captured together with the response body.
    if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10255/healthz 2>&1); then
      # Print the response and/or errors verbatim (quoted, so the text is not
      # word-split or glob-expanded).
      printf '%s\n' "${output}"
      echo "Kubelet is unhealthy!"
      systemctl kill kubelet
      # Wait for a while, as we don't want to kill it again before it is
      # really up.
      sleep 60
    else
      sleep "${SLEEP_SECONDS}"
    fi
  done
}
############## Main Function ################
# Usage: health-monitor.sh <container-runtime/kubelet>
#
# Takes exactly one argument naming the component to monitor, loads optional
# overrides from /etc/default/kube-env, and hands control to the matching
# monitoring function.
if [[ "$#" -ne 1 ]]; then
  echo "Usage: health-monitor.sh <container-runtime/kubelet>"
  exit 1
fi

# NOTE(review): container_runtime_monitoring looks for crictl at
# ${KUBE_HOME}/bin/crictl, i.e. /usr/local/bin/bin/crictl with this default --
# confirm this is intended or overridden by kube-env.
KUBE_HOME="/usr/local/bin"
KUBE_ENV="/etc/default/kube-env"
if [[ -e "${KUBE_ENV}" ]]; then
  # The env file is optional; when present it may override the defaults set
  # above (SLEEP_SECONDS is assigned afterwards and therefore is not affected).
  # shellcheck source=/dev/null
  source "${KUBE_ENV}"
fi
SLEEP_SECONDS=10
component=$1

echo "Start kubernetes health monitoring for ${component}"
case "${component}" in
  container-runtime)
    container_runtime_monitoring
    ;;
  kubelet)
    kubelet_monitoring
    ;;
  *)
    # Keep the expansion inside the quotes so the component name is printed
    # verbatim instead of being word-split or glob-expanded.
    echo "Health monitoring for component ${component} is not supported!"
    ;;
esac
#!/usr/bin/env bash
# Installs health-monitor.sh plus the docker-monitor and kubelet-monitor
# systemd units from a pinned gist revision, then enables and (re)starts both
# services. Must be run with privileges sufficient to write under
# /usr/local/bin and /etc/systemd/system and to call systemctl.
set -euo pipefail

# Pinned raw-content base URL; all three files come from the same revision.
readonly GIST_RAW_BASE="https://gist.githubusercontent.com/juan-lee/882063d529b13373989d6600c2c7eebd/raw/46346b4f68a32cf3044ac1a85ab2e6c7fff47db7"

#######################################
# Downloads one file from the gist and sets its owner and mode.
# Globals:   GIST_RAW_BASE (read)
# Arguments: $1 - file name within the gist
#            $2 - destination path
#            $3 - permission mode passed to chmod
# Returns:   non-zero (aborting the script under `set -e`) if any step fails
#######################################
install_from_gist() {
  local -r name=$1
  local -r dest=$2
  local -r mode=$3

  # --fail: treat HTTP errors as failures instead of saving the server's
  # error page as the script/unit file.
  curl --fail --silent --show-error -o "${dest}" "${GIST_RAW_BASE}/${name}"
  chown root:root "${dest}"
  chmod "${mode}" "${dest}"
}

# install health-monitor.sh
install_from_gist health-monitor.sh /usr/local/bin/health-monitor.sh 0755

# install docker-monitor.service
install_from_gist docker-monitor.service /etc/systemd/system/docker-monitor.service 0644

# install kubelet-monitor.service
install_from_gist kubelet-monitor.service /etc/systemd/system/kubelet-monitor.service 0644

# enable services
systemctl daemon-reload
systemctl enable docker-monitor
systemctl restart docker-monitor
systemctl enable kubelet-monitor
systemctl restart kubelet-monitor
# systemd unit that runs health-monitor.sh in "kubelet" mode.
[Unit]
Description=Kubernetes health monitoring for kubelet
# NOTE(review): ordering only; kube-master-configuration.service is not
# defined in this gist -- confirm it exists on the target nodes.
After=kube-master-configuration.service

[Service]
# Restart the monitor 10 seconds after it exits.
Restart=always
RestartSec=10
RemainAfterExit=yes
# Make sure the script is executable before every start (r-xr--r--).
ExecStartPre=/bin/chmod 544 /usr/local/bin/health-monitor.sh
ExecStart=/usr/local/bin/health-monitor.sh kubelet

[Install]
# NOTE(review): kubernetes.target is not defined in this gist -- confirm the
# target exists, otherwise the unit will not be pulled in at boot.
WantedBy=kubernetes.target
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment