testing with OpenShift nested libvirt cluster and setting time ahead 370 days
#!/bin/bash
set -euxo pipefail
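# The time skew is taken from the first argument (for example, '+370d'); it defaults to +400d below.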
final-check () {
    if ! oc wait co --all --for='condition=Available=True' --timeout=20s 1>/dev/null || \
       ! oc wait co --all --for='condition=Progressing=False' --timeout=20s 1>/dev/null || \
       ! oc wait co --all --for='condition=Degraded=False' --timeout=20s 1>/dev/null; then
        echo "Some ClusterOperators Degraded=True, Progressing=True, or Available=False"
        oc get co
        exit 1
    else
        echo "All ClusterOperators reporting healthy"
        oc get co
        oc get clusterversion
    fi
    exit 0
}
trap final-check EXIT
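# Approve any pending CSRs; after the jump ahead, kubelet certificates are expired and nodes re-request them.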
approveCSRs () {
    pendingCSRs=$(oc get csr | grep Pending | wc -l)
    if [ $pendingCSRs -ne 0 ]; then
        echo "Approving pending csrs"
        oc get csr -o name | xargs oc adm certificate approve
        sleep 30
    fi
}
# TODO: Need to improve this nodesReady check
checkNodesReady () {
    nodesReady=0
    retries=0
    while [ $nodesReady -ne 5 ] && [ $retries -lt 100 ]; do
        approveCSRs
        nodesReady=$(oc wait --for=condition=Ready node --all --timeout=30s | wc -l)
        if [ $nodesReady -eq 5 ]; then
            echo "All nodes Ready"
        fi
        (( retries++ ))
    done
    if [ $nodesReady -ne 5 ]; then
        echo "Some nodes NotReady"
        oc get nodes
        exit 1
    fi
}
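# Hard-restart any NotReady nodes via libvirt; this assumes each node name matches its libvirt domain name.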
jumpstartNodes () {
    approveCSRs
    # jumpstart any stuck nodes, during recovery nodes will be rebooted
    nodesDisabled=$(oc get nodes | grep "NotReady" | awk '{ print $1 }')
    if [ ! -z "${nodesDisabled}" ]; then
        nodeDisabledList=( $nodesDisabled )
        for i in "${nodeDisabledList[@]}"; do
            echo "Restarting stuck node ${i}..."
            sudo virsh destroy "${i}"
            sleep 30
            sudo virsh start "${i}"
            sleep 60
        done
        checkNodesReady
    fi
}
checkDegradedCOs () {
    retries=0
    # The image-pruner job in the openshift-image-registry namespace may be stuck due to time skew. This would
    # not happen if time were progressing naturally. Kill image-pruner jobs here.
    oc delete jobs --all -n openshift-image-registry
    # Supposedly fixed, but a lingering pod still trips up the insights-operator: https://bugzilla.redhat.com/show_bug.cgi?id=1919778
    oc delete pods --all -n openshift-insights --force --grace-period=0
    sleep 10
    while ! oc wait co --all --for='condition=Degraded=False' --timeout=20s && [ $retries -lt 100 ]; do
        (( retries++ ))
    done
}
checkProgressingCOs () {
    retries=0
    # The image-pruner job in the openshift-image-registry namespace may be stuck due to time skew. This would
    # not happen if time were progressing naturally. Kill image-pruner jobs here.
    oc delete jobs --all -n openshift-image-registry
    sleep 10
    while ! oc wait co --all --for='condition=Progressing=False' --timeout=20s && [ $retries -lt 100 ]; do
        jumpstartNodes
        (( retries++ ))
    done
}
checkAvailableCOs () {
    retries=0
    while ! oc wait co --all --for='condition=Available=True' --timeout=20s && [ $retries -lt 100 ]; do
        jumpstartNodes
        (( retries++ ))
    done
}
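# Main flow: stop NTP, gather node IPs, stop kubelets, remove containers (keeping kube-apiserver and etcd
# on the control plane until last), jump every machine ahead by the same SKEW, restart kubelets, and then
# wait for the nodes and ClusterOperators to recover.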
sudo systemctl stop chronyd
SKEW=${1:-+400d}
OC=${OC:-oc}
SSH=${SSH:-ssh}
control_nodes=$( ${OC} get nodes --selector='node-role.kubernetes.io/master' --template='{{ range $index, $_ := .items }}{{ range .status.addresses }}{{ if (eq .type "InternalIP") }}{{ if $index }} {{end }}{{ .address }}{{ end }}{{ end }}{{ end }}' )
compute_nodes=$( ${OC} get nodes --selector='!node-role.kubernetes.io/master' --template='{{ range $index, $_ := .items }}{{ range .status.addresses }}{{ if (eq .type "InternalIP") }}{{ if $index }} {{end }}{{ .address }}{{ end }}{{ end }}{{ end }}' )
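# run-on: run a snippet as root on each of the given nodes over SSH (the snippet is passed on stdin).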
function run-on {
    for n in ${1}; do ${SSH} core@"${n}" sudo 'bash -eEuxo pipefail' <<< "${2}"; done
}
ssh-keyscan -H ${control_nodes} ${compute_nodes} >> ~/.ssh/known_hosts
run-on "${control_nodes} ${compute_nodes}" "systemctl stop kubelet"
# Destroy all containers on compute_nodes.
run-on "${compute_nodes}" "crictl rm --all -f"
# Destroy all containers on control_nodes except KAS and etcd.
run-on "${control_nodes}" '
kas_id=$( crictl ps --name="^kube-apiserver$" -q )
# [[ -n "${kas_id}" ]]
etcd_id=$( crictl ps --name="^etcd$" -q )
# [[ -n "${etcd_id}" ]]
other_ids=$( crictl ps --all -q | ( grep -v -e "${kas_id}" -e "${etcd_id}" || true ) )
if [ -n "${other_ids}" ]; then
crictl rm -f ${other_ids}
fi;
'
# Delete all pods, especially the operator pods. This ensures the cluster needs KCM (kube-controller-manager) and KS (kube-scheduler) working when it starts back up.
${OC} delete pods --all -n openshift-kube-apiserver-operator --force --grace-period=0
${OC} delete pods --all -n openshift-kube-apiserver --force --grace-period=0
${OC} delete pods --all -n openshift-etcd-operator --force --grace-period=0
${OC} delete pods --all -n openshift-etcd --force --grace-period=0
${OC} delete pods -A --all --force --grace-period=0
# Delete all clusteroperator status to avoid stale status when the operator pod isn't started.
export bearer=$( oc -n openshift-cluster-version serviceaccounts get-token default )
export server=$( oc whoami --show-server )
for co in $( oc get co --template='{{ range .items }}{{ printf "%s\n" .metadata.name }}{{ end }}' ); do
    curl -k -X PATCH \
        -H "Authorization: Bearer ${bearer}" \
        -H "Accept: application/json" \
        -H "Content-Type: application/merge-patch+json" \
        "${server}/apis/config.openshift.io/v1/clusteroperators/${co}/status" \
        -d '{"status": null}' && echo
done
# Destroy the remaining containers on control_nodes
run-on "${control_nodes}" "crictl rm --all -f"
run-on "${control_nodes} ${compute_nodes}" "systemctl disable chronyd --now"
# Set the time only as a difference from the synced time so we don't introduce a skew between the machines, which would break etcd, leader election, and other components.
run-on "${control_nodes} ${compute_nodes}" "
timedatectl status
timedatectl set-ntp false
timedatectl set-time '${SKEW}'
timedatectl status
"
run-on "${control_nodes} ${compute_nodes}" "sleep 10 && systemctl start kubelet"
# now set date for host
sudo timedatectl set-time ${SKEW}
# wait for connectivity
# allow 4 minutes for date to propagate and to regain connectivity
set +e
retries=0
while ! oc get csr && [ $retries -lt 25 ]; do
    if [ $retries -eq 24 ]; then
        exit 1
    fi
    sleep 10
    (( retries++ ))
done
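# Relax strict mode for the recovery checks; individual oc wait failures below are retried in loops.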
set +eu
checkNodesReady
checkAvailableCOs
checkProgressingCOs
checkDegradedCOs