Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Detach Azure disks used by Kubernetes pods that are in an error state because of disk-related issues
#!/bin/bash
# Command-line validation: the script needs exactly four positional
# arguments. With any other count the usage text is printed and the script
# stops with status 1.
if (( $# != 4 )); then
  printf '%s\n' \
    "Illegal number of parameters" \
    "" \
    "Usage: ./fix-azure-disk-double-mount.sh <azure subscription id> <azure resource_group> <kube context> <kube namespace>" \
    "" \
    "Find all pods in error, checks if it is related to an Azure disk issue. Then detach from azure VM these disks. It should trigger a resync automatically on kube side. Run the scripts multiple times if the first time is not the one."
  exit 1
fi

# Positional arguments, listed in the order they are given on the command line.
AZURE_SUB_ID=$1
AZURE_RG=$2
KUBE_CTX=$3
KUBE_NS=$4

# Newline constant used as the separator when building multi-line lists.
NL=$'\n'

# Accumulators, all starting empty.
KUBE_FAILED_DISKS=""   # names of the disks reported in pod attach errors
DISKS_MAPPING=""       # "<disk> <vm>" lines, one per attached data disk
ALL_DISKS=""           # initialised here; not referenced in the visible script
# First check that mandatory tools are here.
# `command -v` is used rather than `which`: it is a shell builtin and ANY
# non-zero status means "not found". The previous `[[ $? -eq 1 ]]` test only
# caught status 1, so any other failure status let the script continue with
# an empty binary path.
command -v jq >/dev/null 2>&1 || { echo >&2 "jq not installed. Aborting."; exit 1; }

# The Azure CLI is looked up as az.cmd first, then as az
# (az.cmd is presumably the launcher name on Windows shells -- confirm).
if ! AZ_BIN=$(command -v az.cmd 2>/dev/null); then
  if ! AZ_BIN=$(command -v az 2>/dev/null); then
    echo >&2 "az cli not found"
    exit 1
  fi
fi

if ! KUBE_BIN=$(command -v kubectl 2>/dev/null); then
  echo >&2 "kubectl not found"
  exit 1
fi

# Show which binaries will be used for the rest of the run.
echo ""
echo "using \"$AZ_BIN\" binary"
echo "using \"$KUBE_BIN\" binary"
# Switch contexts in azure and kube.
# Both switches are checked explicitly: carrying on after a failed switch
# would make the rest of the script inspect pods and detach disks in whatever
# subscription / cluster was active before the script started.
echo ""
echo "Switching to azure subscription $AZURE_SUB_ID ..."
if ! "$AZ_BIN" account set --subscription "$AZURE_SUB_ID"; then
  echo >&2 "failed to switch to azure subscription \"$AZURE_SUB_ID\". Aborting."
  exit 1
fi
echo "Switched to azure subscription \"$AZURE_SUB_ID\"."
echo ""
echo "Switching to context $KUBE_CTX ..."
# Use the kubectl binary resolved earlier in the script, like the other calls.
if ! "$KUBE_BIN" config use-context "$KUBE_CTX"; then
  echo >&2 "failed to switch to kube context \"$KUBE_CTX\". Aborting."
  exit 1
fi
# Find all failed pods: every pod of the namespace whose STATUS column is
# neither "Running" nor "Completed".
echo ""
echo "Loading failed pods from context $KUBE_CTX and namespace $KUBE_NS ..."
# A kubectl failure must not be reported as "no failed pods found".
if ! PODS_TABLE=$("$KUBE_BIN" get pods -n "$KUBE_NS" --no-headers); then
  echo >&2 "failed to list pods in context $KUBE_CTX and namespace $KUBE_NS"
  exit 1
fi
# --no-headers removes the header row, and the filter looks at the STATUS
# column only (3rd column of the default `kubectl get pods` output). The
# former whole-line greps also dropped failed pods whose NAME happened to
# contain "status", "running" or "completed".
PODS_LIST=$(printf '%s\n' "$PODS_TABLE" \
  | awk 'NF && tolower($3) != "running" && tolower($3) != "completed" { print $1 }')
if [ -z "$PODS_LIST" ]; then
  echo "no failed pods found"
  exit 0
fi
echo ""
echo "Failed pods found in context $KUBE_CTX and namespace $KUBE_NS :"
echo "$PODS_LIST" | sed -e 's/^/\* /'
# For each failed pod, find the ones with describe messages related to Azure
# disk attachment problems and extract the failed disk names.

# Print, one per line, the disk names quoted in Azure attach errors.
# Arguments: $1 - text of `kubectl describe pod` for one pod
# Outputs:   disk names on stdout; nothing when no known error is present
# "Blob is already in use" lines are looked at first; the
# "AttachDiskWhileBeingDetached" lines are only used when the first kind
# yields no disk. The quoted name is matched with '[^']*' so that a line
# holding several quoted tokens cannot make the match swallow the text
# between them.
extract_failed_disks() {
  local describe_text=$1
  local disks
  disks=$(printf '%s\n' "$describe_text" \
    | grep "Blob is already in use" \
    | grep -o "disk '[^']*' using" \
    | grep -o "'[^']*'" \
    | tr -d "'")
  if [ -z "$disks" ]; then
    disks=$(printf '%s\n' "$describe_text" \
      | grep "AttachDiskWhileBeingDetached" \
      | grep -o "disk '[^']*' to" \
      | grep -o "'[^']*'" \
      | tr -d "'")
  fi
  if [ -n "$disks" ]; then
    printf '%s\n' "$disks"
  fi
}

for pod_name in $PODS_LIST; do
  # Describe each pod once and reuse the text for both error patterns.
  POD_DESCRIPTION=$("$KUBE_BIN" describe pod "$pod_name" -n "$KUBE_NS")
  KUBE_DISKS=$(extract_failed_disks "$POD_DESCRIPTION")
  # Sometimes the error is not related to disk attachment error
  if [ -z "$KUBE_DISKS" ]; then
    continue
  fi
  if [ -z "$KUBE_FAILED_DISKS" ]; then
    KUBE_FAILED_DISKS="$KUBE_DISKS"
  else
    KUBE_FAILED_DISKS="${KUBE_FAILED_DISKS}${NL}${KUBE_DISKS}"
  fi
done
# The same disk can be reported by several pods/events: keep one line per disk.
KUBE_FAILED_DISKS=$(echo "$KUBE_FAILED_DISKS" | sort | uniq)
echo ""
echo "Failed disks found in context $KUBE_CTX and namespace $KUBE_NS :"
echo "$KUBE_FAILED_DISKS" | sed -e 's/^/\* /'
# Load all disks in all VMs.
# When no failed disk was identified there is nothing to detach: stop here
# instead of querying Azure for every VM (and possibly exiting with an error
# about missing VMs although no work was needed).
if [ -z "$KUBE_FAILED_DISKS" ]; then
  echo ""
  echo "no failed disk found, nothing to detach"
  exit 0
fi
echo ""
echo "Loading VM list from resource group \"$AZURE_RG\" ..."
# Only VMs whose name contains "agent" are kept.
VMS_LIST=$("$AZ_BIN" vm list -g "$AZURE_RG" | jq -r '.[] | .name' | grep agent)
if [ -z "$VMS_LIST" ]; then
  echo >&2 "no VM found in this resource group"
  exit 1
fi
echo ""
echo "VM found in resources group $AZURE_RG:"
echo "$VMS_LIST" | sed -e 's/^/\* /'
# Build DISKS_MAPPING: one "<disk> <vm>" line per data disk attached to each
# VM of VMS_LIST.

# Append one "<disk> <vm>" line to the global DISKS_MAPPING.
# Arguments: $1 - disk name, $2 - VM name
# Globals:   DISKS_MAPPING (written), NL (read, line separator)
add_disk_mapping() {
  local entry="$1 $2"
  # The separator is only inserted when the mapping already has content.
  DISKS_MAPPING="${DISKS_MAPPING:+${DISKS_MAPPING}${NL}}${entry}"
}

echo ""
for vm_name in $VMS_LIST; do
  echo "Loading disks for VM $vm_name ..."
  # A failed query is reported instead of being silently treated as
  # "this VM has no disk", which would later surface as a misleading
  # "not mounted on any VMs" message.
  if ! VM_VIEW=$("$AZ_BIN" vm get-instance-view --name "$vm_name" -g "$AZURE_RG"); then
    echo >&2 "failed to load disks for VM $vm_name, skipping it"
    continue
  fi
  # `[]?` keeps jq quiet when the VM has no dataDisks array at all.
  DISKS=$(printf '%s\n' "$VM_VIEW" | jq -r '.storageProfile.dataDisks[]? | .name')
  for disk in $DISKS; do
    add_disk_mapping "$disk" "$vm_name"
  done
done
# For each failed disk, detach it from all the VMs we found it on.

# Print, one per line, the VMs that have a given disk attached.
# Arguments: $1 - disk name
#            $2 - mapping text made of "<disk> <vm>" lines
# The disk column is compared for exact equality. A plain `grep <disk>` is a
# regex search over the whole line: "pvc-1" would also select "pvc-10", any
# VM whose name contains the disk name, and "." would match any character --
# which could lead to detaching the wrong disk/VM pair.
vms_with_disk() {
  printf '%s\n' "$2" | awk -v disk="$1" '$1 == disk { print $2 }'
}

for failed_disk in $KUBE_FAILED_DISKS; do
  VMS_WITH_THIS_DISK=$(vms_with_disk "$failed_disk" "$DISKS_MAPPING")
  if [ -z "$VMS_WITH_THIS_DISK" ]; then
    echo ""
    echo "Disk $failed_disk is not mounted on any VMs"
    continue
  fi
  echo ""
  echo "Disk $failed_disk found in VMs :"
  echo "$VMS_WITH_THIS_DISK" | sed -e 's/^/\* /'
  for vm_name in $VMS_WITH_THIS_DISK; do
    echo "attempting to detach disk $failed_disk from $vm_name in resources group $AZURE_RG ..."
    # Report a failed detach but keep going with the remaining VMs/disks.
    if ! "$AZ_BIN" vm disk detach --name "$failed_disk" -g "$AZURE_RG" --vm-name "$vm_name"; then
      echo >&2 "failed to detach disk $failed_disk from $vm_name"
    fi
  done
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment