Last active
September 3, 2018 17:10
-
-
Save juan-lee/882063d529b13373989d6600c2c7eebd to your computer and use it in GitHub Desktop.
AKS Node Health Monitor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# systemd unit that runs health-monitor.sh in "container-runtime" mode.
[Unit]
Description=Kubernetes health monitoring for container runtime
# NOTE(review): ordering only; has no effect if this unit does not exist on
# the node -- confirm it is present on AKS.
After=kube-master-configuration.service

[Service]
# Restart the monitor script whenever it exits, after a 10 second pause.
Restart=always
RestartSec=10
# Previously listed twice; a single assignment is equivalent.
RemainAfterExit=yes
# Set the script's mode to 544 (r-xr--r--) before every start.
ExecStartPre=/bin/chmod 544 /usr/local/bin/health-monitor.sh
ExecStart=/usr/local/bin/health-monitor.sh container-runtime

[Install]
# NOTE(review): the unit is only pulled in at boot if kubernetes.target is
# defined and reached on this node -- confirm for AKS.
WantedBy=kubernetes.target
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Copyright 2016 The Kubernetes Authors. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
# This script is for master and node instance health monitoring, which is | |
# packed in kube-manifest tarball. It is executed through a systemd service | |
# in cluster/gce/gci/<master/node>.yaml. The env variables come from an env | |
# file provided by the systemd service. | |
# This script originated at https://github.com/kubernetes/kubernetes/blob/master/cluster/gce/gci/health-monitor.sh | |
# and has been modified for AKS. | |
# Treat expansion of an unset variable as an error (-u), and make a pipeline
# report failure when any of its stages fails (pipefail).
set -u -o pipefail
# We simply kill the process when there is a failure. Another systemd service
# will automatically restart the process.
#######################################
# Poll the container runtime forever and kill it whenever the health check
# fails, relying on systemd to bring it back up.
# Globals:
#   KUBE_HOME              (read) prefix used to locate bin/crictl
#   CONTAINER_RUNTIME      (read, optional) anything other than "docker"
#                          switches the health check to "crictl pods"
#   CONTAINER_RUNTIME_NAME (read, optional) systemd unit to kill; defaults
#                          to "docker"
#   SLEEP_SECONDS          (read) pause between successful health checks
# Arguments:
#   None
# Outputs:
#   Progress and failure messages on stdout.
# Returns:
#   Never returns; loops until the process is terminated.
#######################################
function container_runtime_monitoring {
  local -r max_attempts=5
  local attempt=1
  local delay
  local -r crictl="${KUBE_HOME}/bin/crictl"
  local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}"
  # We still need to use `docker ps` when container runtime is "docker". This is because
  # dockershim is still part of kubelet today. When kubelet is down, crictl pods
  # will also fail, and docker will be killed. This is undesirable especially when
  # docker live restore is disabled.
  # The command is kept as an array so it can be expanded without relying on
  # unquoted word splitting.
  local -a healthcheck_command=(docker ps)
  if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then
    healthcheck_command=("${crictl}" pods)
  fi

  # Container runtime startup takes time. Make a few initial attempts, with
  # exponential backoff, before we start killing the container runtime.
  until timeout 60 "${healthcheck_command[@]}" > /dev/null; do
    if (( attempt == max_attempts )); then
      echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness."
      break
    fi
    # Compute the backoff once so the logged delay matches the actual sleep.
    delay=$(( 2 ** attempt ))
    echo "${attempt} initial attempt \"${healthcheck_command[*]}\"! Trying again in ${delay} seconds..."
    sleep "${delay}"
    attempt=$(( attempt + 1 ))
  done

  while true; do
    if ! timeout 60 "${healthcheck_command[@]}" > /dev/null; then
      echo "Container runtime ${container_runtime_name} failed!"
      if [[ "${container_runtime_name}" == "docker" ]]; then
        # Dump stack of docker daemon for investigation.
        # Log file name looks like goroutine-stacks-TIMESTAMP and will be saved to
        # the exec root directory, which is /var/run/docker/ on Ubuntu and COS.
        pkill -SIGUSR1 dockerd
      fi
      systemctl kill --kill-who=main "${container_runtime_name}"
      # Wait for a while, as we don't want to kill it again before it is really up.
      sleep 120
    else
      sleep "${SLEEP_SECONDS}"
    fi
  done
}
#######################################
# Poll the kubelet healthz endpoint forever and kill the kubelet unit whenever
# the check fails, relying on systemd to bring it back up.
# Globals:
#   SLEEP_SECONDS (read) pause between successful health checks
# Arguments:
#   None
# Outputs:
#   Progress messages and the failing curl response/error on stdout.
# Returns:
#   Never returns; loops until the process is terminated.
#######################################
function kubelet_monitoring {
  echo "Wait for 2 minutes for kubelet to be functional"
  # TODO(andyzheng0831): replace it with a more reliable method if possible.
  sleep 120
  local -r max_seconds=10
  local output=""
  while true; do
    # -f makes curl fail on HTTP errors; -s -S silences progress but keeps the
    # error text, which is captured together with the body via 2>&1.
    if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10255/healthz 2>&1); then
      # Print the response and/or errors verbatim; quoting prevents word
      # splitting and glob expansion of the captured text.
      printf '%s\n' "${output}"
      echo "Kubelet is unhealthy!"
      systemctl kill kubelet
      # Wait for a while, as we don't want to kill it again before it is really up.
      sleep 60
    else
      sleep "${SLEEP_SECONDS}"
    fi
  done
}
############## Main Function ################
# Usage: health-monitor.sh <container-runtime/kubelet>
# Exits 1 when the argument count is wrong or the component is not supported;
# otherwise hands over to the matching monitoring loop.
if [[ "$#" -ne 1 ]]; then
  echo "Usage: health-monitor.sh <container-runtime/kubelet>"
  exit 1
fi

# NOTE(review): container_runtime_monitoring looks for crictl at
# ${KUBE_HOME}/bin/crictl, i.e. /usr/local/bin/bin/crictl with this value --
# confirm the intended location.
KUBE_HOME="/usr/local/bin"
KUBE_ENV="/etc/default/kube-env"
# Optional environment overrides; the file is sourced only when it exists.
if [[ -e "${KUBE_ENV}" ]]; then
  # shellcheck source=/dev/null
  source "${KUBE_ENV}"
fi

# Assigned after sourcing KUBE_ENV, so this value always wins.
SLEEP_SECONDS=10
component=$1
echo "Start kubernetes health monitoring for ${component}"
case "${component}" in
  container-runtime)
    container_runtime_monitoring
    ;;
  kubelet)
    kubelet_monitoring
    ;;
  *)
    echo "Health monitoring for component ${component} is not supported!"
    # Fail like the usage error above instead of reporting success.
    exit 1
    ;;
esac
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Installs health-monitor.sh plus the docker-monitor and kubelet-monitor
# systemd units from a pinned gist revision, then enables and (re)starts both
# services. The chown to root and the writes under /etc/systemd/system and
# /usr/local/bin presumably require running as root.
set -euo pipefail

# Raw-content base URL of the pinned gist revision all files are fetched from.
readonly GIST_RAW_URL="https://gist.githubusercontent.com/juan-lee/882063d529b13373989d6600c2c7eebd/raw/46346b4f68a32cf3044ac1a85ab2e6c7fff47db7"

#######################################
# Download one file from the gist and install it owned by root:root.
# Globals:   GIST_RAW_URL (read)
# Arguments: $1 - file name within the gist
#            $2 - destination path
#            $3 - permission mode for the destination
# Returns:   non-zero (aborting the script under set -e) if any step fails
#######################################
fetch_and_install() {
  local -r name=$1
  local -r dest=$2
  local -r mode=$3
  # --fail turns HTTP errors into a non-zero exit so an error page is never
  # installed in place of the real file.
  curl --fail --silent --show-error --location \
    -o "${dest}" "${GIST_RAW_URL}/${name}"
  chown root:root "${dest}"
  chmod "${mode}" "${dest}"
}

# install health-monitor.sh
fetch_and_install health-monitor.sh /usr/local/bin/health-monitor.sh 0755

# install docker-monitor.service
fetch_and_install docker-monitor.service /etc/systemd/system/docker-monitor.service 0644

# install kubelet-monitor.service
fetch_and_install kubelet-monitor.service /etc/systemd/system/kubelet-monitor.service 0644

# enable services
systemctl daemon-reload
systemctl enable docker-monitor
systemctl restart docker-monitor
systemctl enable kubelet-monitor
systemctl restart kubelet-monitor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# systemd unit that runs health-monitor.sh in "kubelet" mode.
[Unit]
Description=Kubernetes health monitoring for kubelet
# NOTE(review): ordering only; has no effect if this unit does not exist on
# the node -- confirm it is present on AKS.
After=kube-master-configuration.service

[Service]
# Restart the monitor script whenever it exits, after a 10 second pause.
Restart=always
RestartSec=10
# Previously listed twice; a single assignment is equivalent.
RemainAfterExit=yes
# Set the script's mode to 544 (r-xr--r--) before every start.
ExecStartPre=/bin/chmod 544 /usr/local/bin/health-monitor.sh
ExecStart=/usr/local/bin/health-monitor.sh kubelet

[Install]
# NOTE(review): the unit is only pulled in at boot if kubernetes.target is
# defined and reached on this node -- confirm for AKS.
WantedBy=kubernetes.target
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment