Skip to content

Instantly share code, notes, and snippets.

@portante
Last active December 4, 2021 13:47
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save portante/2b91dd7d49636c7e40fa53fb7ed1388b to your computer and use it in GitHub Desktop.
Save portante/2b91dd7d49636c7e40fa53fb7ed1388b to your computer and use it in GitHub Desktop.
A script to help get fluentd pods running on all labeled nodes of an OpenShift cluster. We need this because Kubernetes currently does not support priority or preemption, which we could otherwise use to ensure fluentd pods are always scheduled and run on properly labeled nodes.
#!/bin/bash
#######################################
# Remove the scratch directory used by this script. Installed as the EXIT
# trap, so it runs on every exit path.
# Globals:   TMPDIR (read) - path of the scratch directory
# Arguments: none
# Returns:   0 when TMPDIR is empty, otherwise the status of rm
#######################################
function finish {
  # Skip the removal when TMPDIR is empty (for example when mktemp failed),
  # and quote the path so a directory name containing whitespace is removed
  # as one argument instead of being word-split.
  if [ -n "$TMPDIR" ]; then
    rm -rf -- "$TMPDIR"
  fi
}
# Create a private scratch directory first, and only then arm the cleanup
# trap, so the trap never runs against a directory this script did not create.
TMPDIR=$(mktemp -d) || {
  echo "*** Error *** unable to create a temporary directory" >&2
  exit 1
}
trap finish EXIT

# Snapshot the cluster state into files that the rest of the script reads:
#   all-nodes            every node, one "oc get -o name" entry per line
#   labeled-nodes        nodes carrying logging-infra-fluentd=true
#   fluentd-pods-running "-o wide" lines of fluentd pods that match "Running"
# A failing oc call aborts the script; otherwise the empty files would make
# the counts below all zero and the script would report "Nothing to do".
if ! oc get nodes -o name > "$TMPDIR/all-nodes"; then
  echo "*** Error *** unable to list nodes" >&2
  exit 1
fi
if ! oc get nodes -o name -l logging-infra-fluentd=true > "$TMPDIR/labeled-nodes"; then
  echo "*** Error *** unable to list nodes labeled logging-infra-fluentd=true" >&2
  exit 1
fi
if ! oc get pods -n logging -l component=fluentd -o wide > "$TMPDIR/fluentd-pods-all"; then
  echo "*** Error *** unable to list fluentd pods" >&2
  exit 1
fi
# grep exits non-zero when no pod is Running; that is a valid (empty) result.
grep Running "$TMPDIR/fluentd-pods-all" > "$TMPDIR/fluentd-pods-running"

# $(( ... )) normalizes any padding some wc implementations put around the
# count.
total_nodes=$(( $(wc -l < "$TMPDIR/all-nodes") ))
total_nodes_labeled=$(( $(wc -l < "$TMPDIR/labeled-nodes") ))
total_fluentd=$(( $(wc -l < "$TMPDIR/fluentd-pods-running") ))

if [ "$total_nodes" -ne "$total_nodes_labeled" ]; then
  echo "*** Warning *** there are only $total_nodes_labeled of $total_nodes labeled for fluentd"
fi

# One running fluentd pod per labeled node means there is nothing to repair.
if [ "$total_nodes_labeled" -eq "$total_fluentd" ]; then
  echo "Nothing to do, all $total_nodes_labeled labeled nodes have fluentd pods running"
  exit 0
fi

missing=$(( total_nodes_labeled - total_fluentd ))
echo "Missing $missing fluentd pods, fixing ..."
#######################################
# Classify the state of a fluentd pod found on a node and report it.
# Arguments:
#   $1 - node the pod was found on (used in messages only)
#   $2 - pod name
#   $3 - pod state string
# Outputs:
#   A message on stdout when the pod is Running, when it is in an unexpected
#   state, or when an argument is missing.
# Returns:
#   0 - the pod is Pending, ContainerCreating or Evicted, i.e. it is not
#       running (yet) and the caller should keep working on the node
#   1 - the pod is Running, or it is in any other state (which is reported
#       as a warning and skipped)
# Exits the whole script with status 1 when any argument is empty.
#######################################
function check_fluentd_state {
  # Locals keep this function from overwriting the caller's variables of the
  # same names (node, pod, state, isnowrunning).
  local node=$1
  local pod=$2
  local state=$3
  local isnowrunning

  if [[ -z "$node" || -z "$pod" || -z "$state" ]]; then
    echo "Logic bomb! -- expected node (\"$node\"), pod (\"$pod\"), and state (\"$state\") arguments"
    exit 1
  fi

  case "$state" in
    Running)
      echo "Node $node is now running $pod."
      isnowrunning=1
      ;;
    Pending | ContainerCreating | Evicted)
      isnowrunning=0
      ;;
    *)
      # Any other state is treated as "done" so callers do not wait on it.
      isnowrunning=1
      echo "** Warning: pod $pod now exists, but is in a bad state, \"$state\", skipping"
      ;;
  esac

  return "$isnowrunning"
}
#######################################
# List the pods on a node with "oc adm manage-node --list-pods" and classify
# the fluentd pod found there, if any.
# Globals:
#   TMPDIR (read) - scratch directory; mng-node.out and mng-node.err inside
#                   it are overwritten on every call
# Arguments:
#   $1 - node as printed by "oc get nodes -o name"; only the part after the
#        last "/" is passed to oc
# Outputs:
#   Whatever check_fluentd_state prints for the pod.
# Returns:
#   0 - no fluentd pod is listed for the node, or check_fluentd_state
#       returned 0 for it (the caller should keep waiting)
#   otherwise the status returned by check_fluentd_state
#######################################
function check_fluentd_running_mngnode {
  local node=$1
  local listing="$TMPDIR/mng-node.out"
  local namespace pod ready state rest

  oc adm manage-node "${node##*/}" --list-pods 2> "$TMPDIR/mng-node.err" \
    | grep -F logging-fluentd > "$listing"

  # -s tests the file's size. The previous "-z <path>" tested whether the
  # path string was empty, which is never the case.
  if [[ ! -s "$listing" ]]; then
    return 0
  fi

  # Field 2 is the pod name and field 4 its state. Only the first matching
  # line is inspected, so several matches can no longer shift words into the
  # wrong argument positions of check_fluentd_state.
  read -r namespace pod ready state rest < "$listing"
  if [[ -z "$pod" ]]; then
    return 0
  fi

  check_fluentd_state "$node" "$pod" "$state"
}
#######################################
# Print how many lines of a file contain a whitespace-separated field that
# is exactly equal to a given string. Exact field matching keeps "node1"
# from also matching lines that mention "node10".
# Arguments: $1 - string to look for, $2 - file to scan
# Outputs:   the count on stdout (may be empty if awk cannot read the file)
#######################################
function count_lines_with_field {
  awk -v want="$1" '
    { for (i = 1; i <= NF; i++) if ($i == want) { hits++; break } }
    END { print hits + 0 }
  ' "$2" 2> /dev/null
}

#######################################
# Print the first whitespace-separated token of a file that starts with
# "logging-infra-fluentd=", wherever on its line it appears.
# Arguments: $1 - file to scan
# Outputs:   the token on stdout, or nothing when there is none
#######################################
function fluentd_label_token {
  awk '
    {
      for (i = 1; i <= NF; i++) {
        if (index($i, "logging-infra-fluentd=") == 1) { print $i; exit }
      }
    }
  ' "$1" 2> /dev/null
}

# Walk every labeled node and, when it has no running fluentd pod, cordon it,
# delete up to five of its pods, wait for fluentd to show up, and uncordon it.
# NOTE: the wait below has no timeout, and the "exit 1" paths between cordon
# and uncordon leave the node cordoned.
# Word-splitting of the awk output is intentional: one node entry per word.
for node in $(awk '{ print $1 }' "$TMPDIR/labeled-nodes"); do
  node_name=${node##*/}

  # Does this labeled node have a running fluentd?
  isrunning=$(count_lines_with_field "$node_name" "$TMPDIR/fluentd-pods-running")
  if [ "${isrunning:-0}" -eq 1 ]; then
    continue # Fluentd pod is already running, nothing to do ...
  fi

  # Double check it is properly labeled
  oc describe "$node" > "$TMPDIR/describe-node.out"
  label=$(fluentd_label_token "$TMPDIR/describe-node.out")
  if [ -z "$label" ]; then
    echo "Logic Bomb! -- Node $node_name missing label logging-infra-fluentd"
    exit 1
  fi
  # Quoted so that an empty label value is compared, not dropped from the test.
  if [ "${label#*=}" != "true" ]; then
    echo "Logic Bomb! -- Node $node_name incorrectly labeled with ${label}"
    exit 1
  fi

  # Look for a fluentd pod in the node description.
  count=$(grep -c logging-fluentd "$TMPDIR/describe-node.out")
  if [ "$count" -gt 1 ]; then
    echo "Logic bomb! -- We have more than one fluentd pod on $node, ($count)"
    exit 1
  fi
  isrunning=0
  if [ "$count" -eq 1 ]; then
    pod=$(awk '/logging-fluentd/ { print $2 }' "$TMPDIR/describe-node.out")
    state=$(oc get pod -n logging "$pod" | grep -v "STATUS" | awk '{ print $3 }')
    check_fluentd_state "$node" "$pod" "$state"
    isrunning=$?
  fi
  if [ "$isrunning" -ne 0 ]; then
    continue
  fi

  echo "fixing $node ..."

  # Cordon the node so nothing new is scheduled onto it while pods are deleted.
  if ! oc adm cordon "$node_name" > "$TMPDIR/node-cordoned-state" 2>&1; then
    echo "*** Warning *** unable to cordon off $node:"
    cat "$TMPDIR/node-cordoned-state"
    exit 1
  fi
  cordoned=$(grep -c -E " cordoned$" "$TMPDIR/node-cordoned-state")
  if [ "$cordoned" -ne 1 ]; then
    echo "*** Warning *** unable to cordon off $node: \"$(cat "$TMPDIR/node-cordoned-state")\""
    cat "$TMPDIR/node-cordoned-state"
    exit 1
  fi

  if oc adm manage-node "$node_name" --list-pods > "$TMPDIR/mng-node.out" 2> "$TMPDIR/mng-node.err"; then
    # Drop the header line so only pod rows remain.
    grep -v 'NAMESPACE' "$TMPDIR/mng-node.out" > "$TMPDIR/pods.lis"
    count=$(grep -c logging-fluentd "$TMPDIR/pods.lis")
    if [ "$count" -gt 1 ]; then
      echo "Logic bomb! -- We have more than one fluentd pod on $node, ($count)"
      exit 1
    fi
    isnowrunning=0
    if [ "$count" -eq 1 ]; then
      pod=$(awk '/logging-fluentd/ { print $2 }' "$TMPDIR/pods.lis")
      state=$(awk '/logging-fluentd/ { print $4 }' "$TMPDIR/pods.lis")
      check_fluentd_state "$node" "$pod" "$state"
      isnowrunning=$?
    fi

    if [ "$isnowrunning" -eq 0 ]; then
      # Delete the first five listed pods (field 1 = namespace, field 2 = pod).
      # oc reads /dev/null so it cannot consume the lines feeding this loop.
      head -n 5 "$TMPDIR/pods.lis" | while read -r namespace victim _; do
        [ -n "$victim" ] || continue
        oc delete pod -n "$namespace" "$victim" < /dev/null
      done

      # check_fluentd_running_mngnode returns 0 while fluentd is NOT up yet,
      # so this loops until it reports a non-zero status.
      while check_fluentd_running_mngnode "$node"; do
        echo "Sleeping 5 seconds for fluentd pod to show up..."
        sleep 5
      done
    fi
  else
    echo "*** Warning *** Error listing pods for deletion from $node:"
    cat "$TMPDIR/mng-node.out"
    cat "$TMPDIR/mng-node.err"
  fi

  # Always try to make the node schedulable again.
  if ! oc adm uncordon "$node_name" > "$TMPDIR/node-cordoned-state" 2>&1; then
    echo "*** Warning *** unable to uncordon $node:"
    cat "$TMPDIR/node-cordoned-state"
    exit 1
  fi
  cordoned=$(grep -c -E " uncordoned$" "$TMPDIR/node-cordoned-state")
  if [ "$cordoned" -ne 1 ]; then
    echo "*** Warning *** unable to uncordon $node:"
    cat "$TMPDIR/node-cordoned-state"
    exit 1
  fi
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment