Testing an OpenShift nested-libvirt cluster by setting the time ahead 370 days (the script below defaults to a +400d skew unless one is passed as the first argument).
#!/bin/bash
set -euxo pipefail
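# Time-skew test for a nested-libvirt OpenShift cluster: stop NTP everywhere,
# jump every node's clock (and the hypervisor's) forward by $SKEW, then walk
# the cluster back to health. A sketch of the expected environment, inferred
# from what the script does rather than stated anywhere in the gist: it runs
# on the libvirt hypervisor host, with `oc` logged in as a cluster admin, SSH
# access as core@<node> to every node, and a 5-node cluster (the node count
# is hardcoded below).
#
# Usage (the script filename here is hypothetical):
#   ./skew-test.sh            # default skew of +400d
#   ./skew-test.sh +370d      # explicit skew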
final-check () {
  if ! oc wait co --all --for='condition=Available=True' --timeout=20s 1>/dev/null || \
     ! oc wait co --all --for='condition=Progressing=False' --timeout=20s 1>/dev/null || \
     ! oc wait co --all --for='condition=Degraded=False' --timeout=20s 1>/dev/null; then
    echo "Some ClusterOperators Degraded=True, Progressing=True, or Available=False"
    oc get co
    exit 1
  else
    echo "All ClusterOperators reporting healthy"
    oc get co
    oc get clusterversion
  fi
  exit 0
}
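# Run final-check on every exit path: with `set -e` any failed command aborts
# the script, and this trap still reports cluster health before it dies.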
trap final-check EXIT
approveCSRs () {
  pendingCSRs=$(oc get csr | grep Pending | wc -l)
  if [ "$pendingCSRs" -ne 0 ]; then
    echo "Approving pending csrs"
    # approve all outstanding CSRs in one shot so rebooted nodes can rejoin
    oc get csr -o name | xargs oc adm certificate approve
    sleep 30
  fi
}
# TODO: Need to improve this nodesReady check
checkNodesReady () {
  nodesReady=0
  retries=0
  while [ "$nodesReady" -ne 5 ] && [ "$retries" -lt 100 ]; do
    approveCSRs
    # `oc wait` prints one line per node that reached Ready; 5 is the
    # expected node count for this cluster
    nodesReady=$(oc wait --for=condition=Ready node --all --timeout=30s | wc -l)
    if [ "$nodesReady" -eq 5 ]; then
      echo "All nodes Ready"
    fi
    (( retries++ ))
  done
  if [ "$nodesReady" -ne 5 ]; then
    echo "Some nodes NotReady"
    oc get nodes
    exit 1
  fi
}
jumpstartNodes () {
  approveCSRs
  # jumpstart any stuck nodes; during recovery, nodes will be rebooted
  nodesDisabled=$(oc get nodes | grep "NotReady" | awk '{ print $1 }')
  if [ -n "${nodesDisabled}" ]; then
    nodeDisabledList=( $nodesDisabled )
    for i in "${nodeDisabledList[@]}"; do
      # assumes the libvirt domain names match the Kubernetes node names
      echo "Restarting stuck node ${i}..."
      sudo virsh destroy "${i}"
      sleep 30
      sudo virsh start "${i}"
      sleep 60
    done
    checkNodesReady
  fi
}
checkDegradedCOs () {
  retries=0
  # The image-pruner job in the openshift-image-registry namespace may be stuck
  # due to the time skew; this would not happen if time were progressing
  # naturally. Kill the image-pruner jobs here.
  oc delete jobs --all -n openshift-image-registry
  # Supposedly fixed, but a lingering pod can still trip up the insights
  # operator: https://bugzilla.redhat.com/show_bug.cgi?id=1919778
  oc delete pods --all -n openshift-insights --force --grace-period=0
  sleep 10
  while ! oc wait co --all --for='condition=Degraded=False' --timeout=20s && [ "$retries" -lt 100 ]; do
    (( retries++ ))
  done
}
checkProgressingCOs () {
  retries=0
  # The image-pruner job in the openshift-image-registry namespace may be stuck
  # due to the time skew; this would not happen if time were progressing
  # naturally. Kill the image-pruner jobs here.
  oc delete jobs --all -n openshift-image-registry
  sleep 10
  while ! oc wait co --all --for='condition=Progressing=False' --timeout=20s && [ "$retries" -lt 100 ]; do
    jumpstartNodes
    (( retries++ ))
  done
}
checkAvailableCOs () {
  retries=0
  while ! oc wait co --all --for='condition=Available=True' --timeout=20s && [ "$retries" -lt 100 ]; do
    jumpstartNodes
    (( retries++ ))
  done
}
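# ---- main ----
# From here on: stop NTP, skew every clock by the same delta, restart the
# kubelets, then use the helpers above to approve CSRs and nudge nodes and
# operators until the cluster reports healthy again.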
# stop NTP on the hypervisor host so the skew set below is not corrected
sudo systemctl stop chronyd
SKEW=${1:-+400d}
OC=${OC:-oc}
SSH=${SSH:-ssh}
control_nodes=$( ${OC} get nodes --selector='node-role.kubernetes.io/master' --template='{{ range $index, $_ := .items }}{{ range .status.addresses }}{{ if (eq .type "InternalIP") }}{{ if $index }} {{end }}{{ .address }}{{ end }}{{ end }}{{ end }}' )
compute_nodes=$( ${OC} get nodes --selector='!node-role.kubernetes.io/master' --template='{{ range $index, $_ := .items }}{{ range .status.addresses }}{{ if (eq .type "InternalIP") }}{{ if $index }} {{end }}{{ .address }}{{ end }}{{ end }}{{ end }}' )
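# The templates above collect each node's InternalIP into a space-separated
# list, e.g. (hypothetical addresses for a nested-libvirt install):
#   control_nodes="192.168.126.11 192.168.126.12 192.168.126.13"
#   compute_nodes="192.168.126.51 192.168.126.52"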
function run-on {
  # run a command (arg 2) via sudo bash on every node in the list (arg 1)
  for n in ${1}; do ${SSH} core@"${n}" sudo 'bash -eEuxo pipefail' <<< "${2}"; done
}
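# usage sketch: run-on "${control_nodes}" "timedatectl status"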
ssh-keyscan -H ${control_nodes} ${compute_nodes} >> ~/.ssh/known_hosts
run-on "${control_nodes} ${compute_nodes}" "systemctl stop kubelet"
# Destroy all containers on compute_nodes.
run-on "${compute_nodes}" "crictl rm --all -f"
# Destroy all containers on control_nodes except KAS and etcd.
run-on "${control_nodes}" ' | |
kas_id=$( crictl ps --name="^kube-apiserver$" -q ) | |
# [[ -n "${kas_id}" ]] | |
etcd_id=$( crictl ps --name="^etcd$" -q ) | |
# [[ -n "${etcd_id}" ]] | |
other_ids=$( crictl ps --all -q | ( grep -v -e "${kas_id}" -e "${etcd_id}" || true ) ) | |
if [ -n "${other_ids}" ]; then | |
crictl rm -f ${other_ids} | |
fi; | |
' | |
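# kube-apiserver and etcd are left running so the `oc` calls below still have
# an API server to talk to while everything else restarts.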
# Delete all pods, especially the operators. This makes sure the
# kube-controller-manager and kube-scheduler have to be working when
# everything starts again.
${OC} delete pods --all -n openshift-kube-apiserver-operator --force --grace-period=0
${OC} delete pods --all -n openshift-kube-apiserver --force --grace-period=0
${OC} delete pods --all -n openshift-etcd-operator --force --grace-period=0
${OC} delete pods --all -n openshift-etcd --force --grace-period=0
${OC} delete pods -A --all --force --grace-period=0
# Delete all clusteroperator status to avoid stale status when the operator pod isn't started.
export bearer=$( oc -n openshift-cluster-version serviceaccounts get-token default )
export server=$( oc whoami --show-server )
for co in $( oc get co --template='{{ range .items }}{{ printf "%s\n" .metadata.name }}{{ end }}' ); do
  curl -k -X PATCH \
    -H "Authorization: Bearer ${bearer}" \
    -H "Accept: application/json" \
    -H "Content-Type: application/merge-patch+json" \
    ${server}/apis/config.openshift.io/v1/clusteroperators/${co}/status \
    -d '{"status": null}' && echo
done
# Destroy the remaining containers on control_nodes
run-on "${control_nodes}" "crictl rm --all -f"
run-on "${control_nodes} ${compute_nodes}" "systemctl disable chronyd --now"
# Set the time only as a difference to the synced time, so we don't introduce
# a skew between the machines, which would break etcd, leader election, and others.
run-on "${control_nodes} ${compute_nodes}" "
timedatectl status
timedatectl set-ntp false
timedatectl set-time '${SKEW}'
timedatectl status
"
run-on "${control_nodes} ${compute_nodes}" "sleep 10 && systemctl start kubelet" | |
# now set date for host | |
sudo timedatectl set-time ${SKEW} | |
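# Optional sanity check: the host and every node should now agree on a time
# roughly ${SKEW} ahead of real time, e.g.:
#   date; run-on "${control_nodes} ${compute_nodes}" "date"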
# wait for connectivity
# allow ~4 minutes (25 tries x 10s) for the date to propagate and for the API to regain connectivity
set +e
retries=0
while ! oc get csr && [ "$retries" -lt 25 ]; do
  if [ "$retries" -eq 24 ]; then
    exit 1
  fi
  sleep 10
  (( retries++ ))
done
# the check functions tolerate transient failures, so drop errexit/nounset
set +eu
checkNodesReady
checkAvailableCOs
checkProgressingCOs
checkDegradedCOs
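# Note: the script deliberately leaves chronyd disabled and the clocks skewed;
# re-enable NTP (systemctl enable chronyd --now) once the test run is finished.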