Last active
August 8, 2019 15:37
-
-
Save csrwng/478138a529d05fc05ff5096cccdd5a43 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# CI entry point for OpenShift disaster-recovery / e2e test runs against an
# already-provisioned cluster (kubeconfig and credentials under /tmp/cluster).
#
# Enable strict mode *before* any other command runs. Previously it was only
# set midway through the preamble, so failures of the earlier mkdir/export
# steps went unnoticed.
set -euo pipefail

export GCP_SHARED_CREDENTIALS_FILE=/tmp/cluster/gce.json
export CLUSTER_TYPE=gcp
export HOME=/tmp/home
export ARTIFACT_DIR=/tmp/artifacts
export GCP_PROJECT=openshift-dev-installer
export ENABLE_FIPS=false
export PATH=/usr/libexec/origin:$PATH
mkdir -p "${HOME}" "${ARTIFACT_DIR}" "${ARTIFACT_DIR}/junit"
export KUBECONFIG=/tmp/cluster/admin.kubeconfig
# set up cloud-provider-specific env vars
export KUBE_SSH_BASTION="$( oc --insecure-skip-tls-verify get node -l node-role.kubernetes.io/master -o 'jsonpath={.items[0].status.addresses[?(@.type=="ExternalIP")].address}' ):22"
export KUBE_SSH_KEY_PATH=/tmp/cluster/ssh-privatekey
# Provider-specific SSH keys, provider arguments and test-provider JSON.
case "${CLUSTER_TYPE}" in
  gcp)
    export GOOGLE_APPLICATION_CREDENTIALS="${GCP_SHARED_CREDENTIALS_FILE}"
    export KUBE_SSH_USER=core
    mkdir -p ~/.ssh
    # gcloud tooling looks for the key under this well-known filename
    cp /tmp/cluster/ssh-privatekey ~/.ssh/google_compute_engine || true
    export PROVIDER_ARGS='-provider=gce -gce-zone=us-east1-c -gce-project=openshift-dev-installer'
    export TEST_PROVIDER='{"type":"gce","zone":"us-east1-c","projectid":"openshift-dev-installer"}'
    ;;
  aws)
    mkdir -p ~/.ssh
    cp /tmp/cluster/ssh-privatekey ~/.ssh/kube_aws_rsa || true
    export PROVIDER_ARGS="-provider=aws -gce-zone=us-east-1"
    # TODO: make openshift-tests auto-discover this from cluster config
    export TEST_PROVIDER='{"type":"aws","region":"us-east-1","zone":"us-east-1a","multizone":true,"multimaster":true}'
    export KUBE_SSH_USER=core
    ;;
  azure4)
    export TEST_PROVIDER='azure'
    ;;
esac
# create fips enable helper
#
# Render and apply a FIPS-enabling MachineConfig for every MachineConfigPool,
# then wait for the pools to start and finish rolling the config out.
# Globals: none written. Requires a working `oc` / KUBECONFIG.
function enable_fips() {
  local name i
  for name in $(oc get machineconfigpool --template='{{range .items}}{{.metadata.name}}{{"\n"}}{{end}}')
  do
    # Render the manifest via direct heredoc expansion instead of the previous
    # sed-based "{{name}}" templating, which was fragile around quoting.
    cat > /tmp/fips-mc.yaml <<EOF
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfig
metadata:
  labels:
    machineconfiguration.openshift.io/role: ${name}
  name: 99-fips-${name}
spec:
  fips: true
EOF
    oc create -f /tmp/fips-mc.yaml
  done
  # First wait for the pools to report Updating, then for Updated; each `oc
  # wait` already blocks for up to 5m, hence no sleep in the first loop.
  for i in $(seq 0 10); do oc wait machineconfigpool --all --for=condition=Updating --timeout=5m && break; done
  for i in $(seq 0 10); do oc wait machineconfigpool --all --for=condition=Updated --timeout=5m && break; sleep 30; done
}
# Optionally flip the cluster into FIPS mode, then work from a scratch dir.
case "${ENABLE_FIPS}" in
  true) enable_fips ;;
esac
mkdir -p /tmp/output
cd /tmp/output
# Run a command up to ATTEMPTS times, pausing between failed attempts.
#   $1   - number of attempts
#   $2.. - command and its arguments
# Env:
#   RETRY_IGNORE_EXIT_CODE - when non-empty, always return 0
#   RETRY_SLEEP            - seconds to sleep between attempts (default 10;
#                            previously hard-coded, now overridable)
# Returns: exit code of the last attempt (or 0, see above).
function retry() {
  local RETRY_IGNORE_EXIT_CODE="${RETRY_IGNORE_EXIT_CODE:-}"
  local RETRY_SLEEP="${RETRY_SLEEP:-10}"
  local ATTEMPTS="${1}"
  local rc=0
  local i
  shift
  for i in $(seq 0 $((ATTEMPTS-1))); do
    echo "--> ${*}"
    # Temporarily drop errexit so a failing attempt doesn't kill the script.
    set +e
    "${@}"
    rc="$?"
    set -e
    echo "--> exit code: ${rc}"
    test "${rc}" = 0 && break
    sleep "${RETRY_SLEEP}"
  done
  if [ -n "${RETRY_IGNORE_EXIT_CODE}" ]; then return 0; else return "${rc}"; fi
}
# Deploy an ssh bastion into the cluster and resolve its LoadBalancer address.
# Sets the global BASTION_HOST (consumed by bastion_ssh and direct scp calls).
function setup_ssh_bastion() {
  echo "Setting up ssh bastion"
  mkdir -p ~/.ssh || true
  cp "${KUBE_SSH_KEY_PATH}" ~/.ssh/id_rsa
  chmod 0600 ~/.ssh/id_rsa
  # CI pods run under a random UID with no passwd entry; ssh refuses to run
  # without one, so synthesize it when /etc/passwd is writable.
  if ! whoami &> /dev/null; then
    if [ -w /etc/passwd ]; then
      echo "${USER_NAME:-default}:x:$(id -u):0:${USER_NAME:-default} user:${HOME}:/sbin/nologin" >> /etc/passwd
    fi
  fi
  # -f: fail on HTTP errors so we never pipe an HTML error page into bash.
  curl -fsSL https://raw.githubusercontent.com/eparis/ssh-bastion/master/deploy/deploy.sh | bash
  # Wait up to ~10 minutes for the LB address. AWS publishes .hostname while
  # GCP/Azure publish .ip, so check both; `|| true` keeps a transient jsonpath
  # error (e.g. ingress not populated yet) from aborting the loop under set -e.
  local i
  BASTION_HOST=""
  for i in $(seq 0 60)
  do
    BASTION_HOST=$(oc get service -n openshift-ssh-bastion ssh-bastion -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null || true)
    if [ -z "${BASTION_HOST}" ]; then
      BASTION_HOST=$(oc get service -n openshift-ssh-bastion ssh-bastion -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true)
    fi
    if [ ! -z "${BASTION_HOST}" ]; then break; fi
    sleep 10
  done
}
# ssh to a cluster node through the bastion, retrying because nodes may be
# mid-reboot during DR scenarios.
#   $1   - user@host target
#   $2.. - remote command (optional)
# Requires BASTION_HOST (set by setup_ssh_bastion).
function bastion_ssh() {
  # "$@" (quoted) preserves each caller argument as-is; the previous unquoted
  # $@ re-split arguments on whitespace.
  retry 60 \
    ssh -o LogLevel=error -o ConnectionAttempts=100 -o ConnectTimeout=30 -o StrictHostKeyChecking=no \
    -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o LogLevel=error -o ServerAliveInterval=30 -o ConnectionAttempts=100 -o ConnectTimeout=30 -W %h:%p core@${BASTION_HOST} 2>/dev/null" \
    "$@"
}
# DR scenario "restore cluster state": snapshot etcd, mutate a MachineConfig
# (A -> B), restore the snapshot on every master and verify the cluster rolled
# back to the pre-snapshot state (file contents A).
function restore-cluster-state() {
  echo "Placing file /etc/rollback-test with contents A"
  cat > /tmp/machineconfig.yaml <<'EOF'
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfig
metadata:
  labels:
    machineconfiguration.openshift.io/role: master
  name: 99-rollback-test
spec:
  config:
    ignition:
      version: 2.2.0
    storage:
      files:
      - contents:
          source: data:,A
        filesystem: root
        mode: 420
        path: /etc/rollback-test
EOF
  oc create -f /tmp/machineconfig.yaml
  # Wait for the master pool to start, then finish, rolling out the config.
  function wait_for_machineconfigpool_to_apply() {
    local i
    for i in $(seq 0 10); do oc wait machineconfigpool/master --for=condition=Updating --timeout=5m && break; done
    for i in $(seq 0 10); do oc wait machineconfigpool/master --for=condition=Updated --timeout=5m && break; sleep 30; done
  }
  wait_for_machineconfigpool_to_apply
  setup_ssh_bastion
  echo "Make etcd backup on first master - /usr/local/bin/etcd-snapshot-backup.sh"
  FIRST_MASTER=$(oc get node -l node-role.kubernetes.io/master= -o name | head -n1 | cut -d '/' -f 2)
  bastion_ssh "core@${FIRST_MASTER}" "sudo -i /bin/bash -x /usr/local/bin/etcd-snapshot-backup.sh /root/assets/backup/snapshot.db && sudo -i cp /root/assets/backup/snapshot.db /tmp/snapshot.db && sudo -i chown core:core /tmp/snapshot.db"
  # TODO: upgrade conditionally here
  echo "Update rollback-test machineconfig"
  # NOTE(review): MachineConfigs are cluster-scoped, so -n is ignored here.
  oc patch machineconfig 99-rollback-test -n openshift-machine-api --patch '{"spec":{"config":{"storage":{"files":[{"contents":{"source":"data:,B","verification":{}},"filesystem":"root","mode":420,"path":"/etc/rollback-test"}]}}}}' --type=merge
  wait_for_machineconfigpool_to_apply
  echo "Distribute snapshot across all masters"
  mapfile -t MASTERS < <(oc get node -l node-role.kubernetes.io/master= -o name | cut -d '/' -f 2)
  for master in "${MASTERS[@]}"
  do
    # Give each master an ssh key so masters can scp between themselves.
    scp -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o ServerAliveInterval=30 -W %h:%p core@${BASTION_HOST}" "${KUBE_SSH_KEY_PATH}" "core@${master}":/home/core/.ssh/id_rsa
    bastion_ssh "core@${master}" "sudo -i chmod 0600 /home/core/.ssh/id_rsa"
    bastion_ssh "core@${FIRST_MASTER}" "scp -o StrictHostKeyChecking=no /tmp/snapshot.db core@${master}:/tmp/snapshot.db"
  done
  echo "Collect etcd names"
  for master in "${MASTERS[@]}"
  do
    # Single quotes on purpose: $(hostname -f) / ${ETCD_DNS_NAME} must expand
    # on the remote master, not locally.
    bastion_ssh "core@${master}" 'echo "etcd-member-$(hostname -f)" > /tmp/etcd_name && source /run/etcd/environment && echo "https://${ETCD_DNS_NAME}:2380" > /tmp/etcd_uri'
    bastion_ssh "core@${FIRST_MASTER}" "mkdir -p /tmp/etcd/${master} && scp -o StrictHostKeyChecking=no core@${master}:/tmp/etcd_name /tmp/etcd/${master}/etcd_name && scp -o StrictHostKeyChecking=no core@${master}:/tmp/etcd_uri /tmp/etcd/${master}/etcd_uri"
    bastion_ssh "core@${FIRST_MASTER}" "cat /tmp/etcd/${master}/etcd_name"
    bastion_ssh "core@${FIRST_MASTER}" "cat /tmp/etcd/${master}/etcd_uri"
  done
  echo "Assemble etcd connection string"
  bastion_ssh "core@${FIRST_MASTER}" 'rm -rf /tmp/etcd/connstring && mapfile -t MASTERS < <(ls /tmp/etcd) && echo ${MASTERS[@]} && for master in "${MASTERS[@]}"; do echo -n "$(cat /tmp/etcd/${master}/etcd_name)=$(cat /tmp/etcd/${master}/etcd_uri)," >> /tmp/etcd/connstring; done && sed -i '"'$ s/.$//'"' /tmp/etcd/connstring'
  echo "Restore etcd cluster from snapshot"
  for master in "${MASTERS[@]}"
  do
    echo "Running /usr/local/bin/etcd-snapshot-restore.sh on ${master}"
    bastion_ssh "core@${FIRST_MASTER}" "scp -o StrictHostKeyChecking=no /tmp/etcd/connstring core@${master}:/tmp/etcd_connstring"
    bastion_ssh "core@${master}" 'sudo -i /bin/bash -x /usr/local/bin/etcd-snapshot-restore.sh /tmp/snapshot.db $(cat /tmp/etcd_connstring)'
  done
  echo "Wait for API server to come up"
  for i in $(seq 0 10)
  do
    oc get nodes && break
    sleep 30
  done
  echo "Wait for MCO to rollout new configs"
  for i in $(seq 0 10); do oc get machineconfigpool/master > /dev/null && break; sleep 30; done
  wait_for_machineconfigpool_to_apply
  echo "Wait for all kube-apiserver pods to come back"
  for master in "${MASTERS[@]}"
  do
    oc get "pod/kube-apiserver-${master}" -n openshift-kube-apiserver -o name
    oc wait "pod/kube-apiserver-${master}" -n openshift-kube-apiserver --for condition=Ready --timeout=5m
  done
  echo "Verify 99-rollback-test machineconfig"
  MC="$(oc get machineconfig/99-rollback-test -o jsonpath='{.spec.config.storage.files[0].contents.source}')"
  if [[ "${MC}" != "data:,A" ]]; then
    echo "Unexpected MachineConfig output: ${MC}"
    exit 1
  fi
  echo "Verify /etc/rollback-test contents"
  rc=0
  for master in "${MASTERS[@]}"
  do
    # Record the failure instead of aborting, so every master gets checked.
    # (Previously rc was checked below but never set here, making the check
    # dead code.)
    bastion_ssh "core@${master}" 'sudo -i test "$(cat /etc/rollback-test)" == "A"' || rc=1
  done
  if [[ "${rc}" == "1" ]]; then exit 1; fi
  echo "Removing ssh-bastion"
  oc delete project openshift-ssh-bastion
  echo "Remove existing openshift-apiserver pods"
  # This would ensure "Pod 'openshift-apiserver/apiserver-xxx' is not healthy: container openshift-apiserver has restarted more than 5 times" test won't fail
  oc delete pod --all -n openshift-apiserver
}
# DR scenario "etcd quorum loss": destroy two of the three masters, restore
# etcd from the surviving member's backup, recreate the masters via the
# Machine API, fix DNS, and grow etcd back to full membership.
function recover-from-etcd-quorum-loss() {
  setup_ssh_bastion
  # Machine API won't let the user to destroy the node which runs the controller
  echo "Finding two masters to destroy"
  MAPI_POD=$(oc get pod -l k8s-app=controller -n openshift-machine-api --no-headers -o name)
  SURVIVING_MASTER_NODE=$(oc get "${MAPI_POD}" -n openshift-machine-api -o jsonpath='{.spec.nodeName}')
  mapfile -t MASTER_NODES_TO_REMOVE < <(oc get nodes -l node-role.kubernetes.io/master= -o name | grep -v "${SURVIVING_MASTER_NODE}")
  MASTER_MACHINES_TO_REMOVE=()
  for master in "${MASTER_NODES_TO_REMOVE[@]}"
  do
    MASTER_MACHINES_TO_REMOVE+=($(oc get "${master}" -o jsonpath='{.metadata.annotations.machine\.openshift\.io\/machine}' | cut -d '/' -f 2))
  done
  echo "Prepare etcd connstring"
  # Single quotes on purpose: expansion happens on the surviving master.
  bastion_ssh "core@${SURVIVING_MASTER_NODE}" 'source /run/etcd/environment && echo "etcd-member-$(hostname -f)=https://${ETCD_DNS_NAME}:2380" > /tmp/etcd_connstring'
  echo "Destroy two masters"
  # Scale down etcd quorum guard so the machine deletions are not blocked.
  oc scale --replicas=0 deployment.apps/etcd-quorum-guard -n openshift-machine-config-operator
  for machine in "${MASTER_MACHINES_TO_REMOVE[@]}"
  do
    retry 10 oc --request-timeout=5s -n openshift-machine-api delete machine "${machine}"
  done
  echo "Confirm meltdown"
  sleep 30
  # API server is expected to be down; if it still answers, the scenario failed.
  oc --request-timeout=5s get nodes && exit 1
  echo "Restore single etcd - /usr/local/bin/etcd-snapshot-restore.sh"
  bastion_ssh "core@${SURVIVING_MASTER_NODE}" 'sudo -i /bin/bash -x /usr/local/bin/etcd-snapshot-restore.sh /root/assets/backup/etcd/member/snap/db $(cat /tmp/etcd_connstring)'
  echo "Wait for API server to come up"
  retry 30 oc get nodes
  # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1707006
  echo "Restart SDN"
  retry 10 oc delete pods -l app=sdn -n openshift-sdn --wait=false
  echo "Create two masters via Machine API"
  retry 10 oc get machines -n openshift-machine-api
  # Clone existing masters, update IDs and oc apply
  SURVIVING_MASTER_MACHINE=$(oc get machine -l machine.openshift.io/cluster-api-machine-role=master -n openshift-machine-api -o name | cut -d '/' -f 2)
  SURVIVING_MASTER_NUM=${SURVIVING_MASTER_MACHINE##*-}
  SURVIVING_MASTER_PREFIX=${SURVIVING_MASTER_MACHINE%-*}
  # Double quotes so the *outer* shell expands the machine name. The previous
  # single-quoted form left ${SURVIVING_MASTER_MACHINE} to the inner sh, which
  # never sees the (unexported) variable and expanded it to empty.
  retry 10 sh -c "oc get --export machine ${SURVIVING_MASTER_MACHINE} -n openshift-machine-api -o yaml > /tmp/machine.template"
  MASTER_INDEX=0
  for i in $(seq 0 1); do
    # Skip the index already taken by the surviving master.
    if [[ "${MASTER_INDEX}" == "${SURVIVING_MASTER_NUM}" ]]; then MASTER_INDEX=$((MASTER_INDEX+1)); fi
    cat /tmp/machine.template \
      | sed 's;selfLink.*;;g' \
      | sed "s;name: ${SURVIVING_MASTER_PREFIX}-${SURVIVING_MASTER_NUM};name: ${SURVIVING_MASTER_PREFIX}-${MASTER_INDEX};" > /tmp/machine_${i}.yaml
    RETRY_IGNORE_EXIT_CODE=1 retry 5 oc create -n openshift-machine-api -f /tmp/machine_${i}.yaml
    MASTER_INDEX=$((MASTER_INDEX+1))
  done
  echo "Waiting for machines to be created"
  set +e
  NEW_MASTER_IPS=()
  for i in $(seq 0 60); do
    NEW_MASTER_IPS=($(oc -n openshift-machine-api \
      get machines \
      -l machine.openshift.io/cluster-api-machine-role=master \
      -o jsonpath='{range .items[*]}{.status.addresses[?(@.type=="InternalIP")].address}{"\n"}{end}' || true))
    if [[ "${#NEW_MASTER_IPS[@]}" == "3" ]]; then break; fi
    sleep 30
  done
  oc get machines -n openshift-machine-api
  set -e
  if [[ "${#NEW_MASTER_IPS[@]}" != "3" ]]; then
    echo "${NEW_MASTER_IPS[@]}"
    exit 1
  fi
  echo "Verify new master nodes have joined the cluster"
  FOUND_MASTERS=0
  for i in $(seq 1 60)
  do
    FOUND_MASTERS=($(oc --request-timeout=5s get nodes -l node-role.kubernetes.io/master= -o name --no-headers || true))
    if [[ "${#FOUND_MASTERS[@]}" == "3" ]]; then break; fi
    sleep 30
  done
  oc get nodes
  if [[ "${#FOUND_MASTERS[@]}" != "3" ]]; then
    echo "${FOUND_MASTERS[@]}"
    exit 1
  fi
  echo "Update DNS and LB"
  # NOTE(review): easy_install is long deprecated; kept because the CI image
  # offers no other pip bootstrap — confirm before modernizing.
  easy_install --user pip
  ~/.local/bin/pip install --user boto3
  cat > /tmp/update_route_53.py <<'PYTHON_EOF'
import boto3
import os
import sys

if len(sys.argv) < 4:
    print("Usage: ./update_route_53.py <DOMAIN> <RECORD> <IP>")
    sys.exit(1)

domain = sys.argv[1]
record = sys.argv[2]
ip = sys.argv[3]
print("record: %s" % record)
print("ip: %s" % ip)

client = boto3.client('route53')
r = client.list_hosted_zones_by_name(DNSName=domain, MaxItems="1")
zone_id = r['HostedZones'][0]['Id'].split('/')[-1]
response = client.change_resource_record_sets(
    HostedZoneId=zone_id,
    ChangeBatch={
        'Comment': 'add %s -> %s' % (record, ip),
        'Changes': [
            {
                'Action': 'UPSERT',
                'ResourceRecordSet': {
                    'Name': record,
                    'Type': 'A',
                    'TTL': 60,
                    'ResourceRecords': [{'Value': ip}]
                }
            }]
    })
PYTHON_EOF
  DOMAIN=$(oc whoami --show-server | grep -oP "api.\\K([^\\:]*)")
  for i in "${!NEW_MASTER_IPS[@]}"; do
    ETCD_NAME="etcd-${i}.${DOMAIN}"
    python /tmp/update_route_53.py "${DOMAIN}" "${ETCD_NAME}" "${NEW_MASTER_IPS[$i]}"
  done
  echo "Run etcd-signer"
  SURVIVING_MASTER_NODE_SHORT=${SURVIVING_MASTER_NODE%%.*}
  curl -fsSL -O https://raw.githubusercontent.com/hexfusion/openshift-recovery/master/manifests/kube-etcd-cert-signer.yaml.template
  sed "s;__MASTER_HOSTNAME__;${SURVIVING_MASTER_NODE_SHORT};g" kube-etcd-cert-signer.yaml.template > kube-etcd-cert-signer.yaml
  retry 10 oc create -f kube-etcd-cert-signer.yaml
  retry 10 oc get pod/etcd-signer -n openshift-config -o name
  retry 10 oc wait pod/etcd-signer -n openshift-config --for condition=ready
  echo "Grow etcd cluster to full membership"
  SURVIVING_MASTER_IP=$(oc get nodes "${SURVIVING_MASTER_NODE}" -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}')
  SETUP_ETCD_ENVIRONMENT=$(oc adm release info --image-for setup-etcd-environment)
  KUBE_CLIENT_AGENT=$(oc adm release info --image-for kube-client-agent)
  MASTERS=($(oc -n openshift-machine-api \
    get machines \
    -l machine.openshift.io/cluster-api-machine-role=master \
    -o jsonpath='{range .items[*]}{.status.addresses[?(@.type=="InternalDNS")].address}{"\n"}{end}'))
  for master in "${MASTERS[@]}"
  do
    if [[ "${master}" == "${SURVIVING_MASTER_NODE}" ]]; then continue; fi
    echo "Recovering ${master}"
    # Literal single quotes on purpose: $(hostname -f) must expand on the
    # remote master, not here.
    ETCD_HOSTNAME='etcd-member-$(hostname -f)'
    bastion_ssh "core@${master}" "sudo -i env SETUP_ETCD_ENVIRONMENT=${SETUP_ETCD_ENVIRONMENT} KUBE_CLIENT_AGENT=${KUBE_CLIENT_AGENT} /bin/bash -x /usr/local/bin/etcd-member-recover.sh ${SURVIVING_MASTER_IP} ${ETCD_HOSTNAME}"
  done
  for master in "${MASTERS[@]}"
  do
    retry 10 oc get "pod/etcd-member-${master}" -n openshift-etcd -o name
    retry 10 oc wait "pod/etcd-member-${master}" -n openshift-etcd --for condition=Ready
  done
  echo "Removing ssh-bastion"
  retry 10 oc delete project openshift-ssh-bastion
  echo "Scale etcd-quorum guard"
  retry 10 oc scale --replicas=3 deployment.apps/etcd-quorum-guard -n openshift-machine-config-operator
  echo "Remove etcd-signer"
  oc delete pod/etcd-signer -n openshift-config
  echo "Sleeping for a minute to make sure Prometheus are no longer firing"
  sleep 60
}
# Download, unpack and authenticate the gcloud CLI under /tmp; needed by
# GCP-specific e2e tests. Mutates PATH and CLOUDSDK_CONFIG for the caller.
function setup-google-cloud-sdk() {
  cd /tmp
  # -f: fail on HTTP errors instead of saving an error page as the tarball.
  curl -fsSL -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-256.0.0-linux-x86_64.tar.gz
  tar -xzf google-cloud-sdk-256.0.0-linux-x86_64.tar.gz
  export PATH=$PATH:/tmp/google-cloud-sdk/bin
  # -p so a repeated setup does not fail on the existing directory.
  mkdir -p gcloudconfig
  export CLOUDSDK_CONFIG=/tmp/gcloudconfig
  gcloud auth activate-service-account --key-file="${GCP_SHARED_CREDENTIALS_FILE}"
  gcloud config set project "${GCP_PROJECT}"
}
# Run the disaster-recovery snapshot-restore suite; terminates the script.
function run-dr-snapshot-tests() {
  local common_flags=(--provider "${TEST_PROVIDER:-}" -o /tmp/artifacts/e2e.log --junit-dir /tmp/artifacts/junit)
  openshift-tests run-dr restore-snapshot "${TEST_SUITE}" "${common_flags[@]}"
  exit 0
}
# Run the disaster-recovery quorum-restore suite; terminates the script.
function run-dr-quorum-tests() {
  local common_flags=(--provider "${TEST_PROVIDER:-}" -o /tmp/artifacts/e2e.log --junit-dir /tmp/artifacts/junit)
  openshift-tests run-dr quorum-restore "${TEST_SUITE}" "${common_flags[@]}"
  exit 0
}
# Run the upgrade suite against RELEASE_IMAGE_LATEST; terminates the script.
function run-upgrade-tests() {
  local common_flags=(--provider "${TEST_PROVIDER:-}" -o /tmp/artifacts/e2e.log --junit-dir /tmp/artifacts/junit)
  openshift-tests run-upgrade "${TEST_SUITE}" --to-image "${RELEASE_IMAGE_LATEST}" "${common_flags[@]}"
  exit 0
}
# Run the given conformance suite; terminates the script.
function run-tests() {
  local common_flags=(--provider "${TEST_PROVIDER:-}" -o /tmp/artifacts/e2e.log --junit-dir /tmp/artifacts/junit)
  openshift-tests run "${TEST_SUITE}" "${common_flags[@]}"
  exit 0
}
# Entry point: prepare gcloud on GCP, then execute one targeted test case.
case "${CLUSTER_TYPE}" in
  gcp)
    echo "Setting up gcloud"
    setup-google-cloud-sdk
    ;;
esac
#TEST_SUITE=openshift/conformance/parallel run-tests
openshift-tests run-test "[sig-scheduling] Multi-AZ Cluster Volumes [sig-storage] should only be allowed to provision PDs in zones where nodes exist [Suite:openshift/conformance/parallel] [Suite:k8s]"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment