Skip to content

Instantly share code, notes, and snippets.

@jaysonzhao
Last active October 25, 2024 09:46
OpenShift cluster health check
#!/bin/bash
#######################################
# Print the expiration date (notAfter) of the TLS certificate served by an
# endpoint, as reported by openssl.
# Arguments:
#   $1 - human-readable service name used in the messages
#   $2 - endpoint in host:port form, handed to `openssl s_client -connect`
# Outputs:
#   A "Checking certificate ..." heading, followed by either the expiration
#   date or an "Unable to retrieve ..." message, all on stdout.
#######################################
check_certificate_expiration() {
  local service_name=$1
  local endpoint=$2
  # Local so that repeated calls do not leak a stale value into the caller.
  local expiration_date

  printf '\nChecking certificate for %s at %s\n' "$service_name" "$endpoint"

  # stdin is /dev/null so s_client exits right after the handshake. stderr of
  # both openssl stages is discarded: a failed connection or an unparsable
  # certificate is reported through the empty-result branch below instead.
  expiration_date=$(
    openssl s_client -connect "$endpoint" </dev/null 2>/dev/null \
      | openssl x509 -noout -enddate 2>/dev/null \
      | sed -n 's/^notAfter=//p'
  )

  if [[ -z "$expiration_date" ]]; then
    echo "Unable to retrieve certificate expiration date for $service_name."
  else
    echo "$service_name certificate expires on: $expiration_date"
  fi
}
# Preflight 1: the OpenShift CLI has to be resolvable, otherwise nothing
# below can work.
command -v oc >/dev/null 2>&1 || {
  echo "oc command not found. Please install and configure the OpenShift CLI."
  exit 1
}

# Preflight 2: `oc whoami` only succeeds with a valid login session.
oc whoami >/dev/null 2>&1 || {
  echo "You are not logged in to an OpenShift cluster. Please log in using 'oc login'."
  exit 1
}
# Cluster overview as reported by `oc cluster-info`.
printf '%s\n' "Cluster Information:"
oc cluster-info

# Node overview: one row per node, wide output format.
printf '%b\n' "\nNode Health:"
oc get nodes -o wide
# Check each node's disk usage
echo -e "\nNode Disk Usage:"
# Read the space-separated node names into an array so that the loop iterates
# over properly quoted elements instead of an unquoted command substitution
# (which would also be subject to pathname expansion).
read -r -a node_names <<<"$(oc get nodes -o jsonpath='{.items[*].metadata.name}')"
for node in "${node_names[@]}"; do
  printf '\nNode: %s\n' "$node"
  # Run `df` for the root filesystem from a debug pod chrooted into the host;
  # the header row is filtered out so only the usage line is shown.
  oc debug "node/${node}" -- chroot /host df -h / | grep -v Filesystem
done
# Pod inventory across every namespace, wide output format.
printf '%b\n' "\nPod Health in All Namespaces:"
oc get pods --all-namespaces -o wide

# Current CPU/memory figures: first per node, then per pod in all namespaces.
printf '%b\n' "\nResource Consumption (CPU/Memory):"
oc adm top nodes
oc adm top pods --all-namespaces
# Check storage usage: list all persistent volumes, then all claims.
echo -e "\nPersistent Volume Usage:"
oc get pv -o wide
echo -e "\nPersistent Volume Claim Usage:"
oc get pvc --all-namespaces -o wide

# Enhanced PV usage information: selected fields of `oc describe` per volume.
echo -e "\nDetailed Persistent Volume Usage Information:"
# Collect the names into an array so each one is passed to `oc` as a single,
# quoted argument (no word-splitting or globbing on the expansion).
read -r -a pv_names <<<"$(oc get pv -o jsonpath='{.items[*].metadata.name}')"
for pv in "${pv_names[@]}"; do
  printf '\nPersistent Volume: %s\n' "$pv"
  oc describe pv "$pv" \
    | grep -E "Capacity|Access Modes|Reclaim Policy|Status|Claim|StorageClass|Reason"
done
# Status table of the cluster operators (`co` resources).
printf '%b\n' "\nCluster Operators Health:"
oc get co
# Collect information about pods with error status
echo -e "\nPods with Error Status:"
# The namespace list is fetched once and reused by both per-namespace reports
# below; it is stored in an array so every name is expanded as one quoted word.
read -r -a all_namespaces <<<"$(oc get namespaces -o jsonpath='{.items[*].metadata.name}')"
for ns in "${all_namespaces[@]}"; do
  printf '\nNamespace: %s\n' "$ns"
  # Show every pod whose phase is neither Running nor Succeeded.
  oc get pods -n "$ns" \
    --field-selector='status.phase!=Running,status.phase!=Succeeded' -o wide
done

# Collect error events
echo -e "\nError Events in All Namespaces:"
for ns in "${all_namespaces[@]}"; do
  printf '\nNamespace: %s\n' "$ns"
  # Only events of type Warning are listed.
  oc get events -n "$ns" --field-selector=type=Warning
done
# Gather alerts from Prometheus
echo -e "\nActive Alerts:"
PROMETHEUS_ROUTE=$(oc -n openshift-monitoring get route prometheus-k8s -o jsonpath='{.spec.host}')
if [[ -z "$PROMETHEUS_ROUTE" ]]; then
  echo "Prometheus route not found. Ensure the OpenShift monitoring stack is installed and accessible."
else
  # Find a secret holding a token for the prometheus-k8s service account.
  # More than one matching secret may exist; only the first is used, because
  # passing several names to `oc get secret` returns a List for which the
  # `.data.token` jsonpath below yields nothing.
  SECRET_NAME=$(oc get secrets -n openshift-monitoring -o jsonpath='{.items[*].metadata.name}' \
    | tr ' ' '\n' \
    | grep -m 1 '^prometheus-k8s-token-')

  BEARER_TOKEN=""
  if [[ -n "$SECRET_NAME" ]]; then
    # Extract the bearer token from the secret
    BEARER_TOKEN=$(oc get secret "$SECRET_NAME" -n openshift-monitoring -o jsonpath='{.data.token}' | base64 --decode)
  else
    # Clusters based on Kubernetes 1.24+ no longer auto-create token secrets
    # for service accounts; request a short-lived token instead. On an older
    # `oc` without this subcommand the call fails and the token stays empty.
    BEARER_TOKEN=$(oc create token prometheus-k8s -n openshift-monitoring 2>/dev/null)
  fi

  if [[ -z "$SECRET_NAME" && -z "$BEARER_TOKEN" ]]; then
    echo "Prometheus token secret not found."
  # -f makes curl fail on HTTP errors (e.g. 401/403) and -S still prints the
  # reason, so a failed query is not mistaken for "no alerts".
  elif ! ALERTS_JSON=$(curl -s -S -f -k -H "Authorization: Bearer $BEARER_TOKEN" "https://$PROMETHEUS_ROUTE/api/v1/alerts"); then
    echo "Failed to query the Prometheus alerts API at https://$PROMETHEUS_ROUTE/api/v1/alerts."
  # jq exits non-zero when the payload is not the expected JSON shape.
  elif ! ALERTS=$(jq '.data.alerts[] | {alertname: .labels.alertname, severity: .labels.severity, description: .annotations.description, state: .state}' <<<"$ALERTS_JSON"); then
    echo "Unexpected response from the Prometheus alerts API."
  elif [[ -z "$ALERTS" ]]; then
    echo "No active alerts found."
  else
    # Explicit identity filter: not every jq version accepts a missing program.
    jq . <<<"$ALERTS"
  fi
fi
# Check cluster-level certificate expiration
echo -e "\nCluster-Level Certificate Expiration Dates:"

#######################################
# Turn an API server URL into the host:port endpoint to probe with openssl.
# The port given in the URL is kept; 6443 is only used when the URL carries
# no port at all.
# Arguments:
#   $1 - server URL, e.g. https://api.example.com:6443
# Outputs:
#   host:port on stdout
#######################################
api_server_endpoint() {
  local hostport=${1#*://} # drop the scheme
  hostport=${hostport%%/*}  # drop any trailing path
  if [[ "$hostport" =~ :[0-9]+$ ]]; then
    printf '%s\n' "$hostport"
  else
    printf '%s:6443\n' "$hostport"
  fi
}

# Check kube api ca: print the certificate-not-after annotation stored on the
# kube-apiserver-to-kubelet-signer secret.
echo -e "\nCluster-Level api server signer Dates:"
oc -n openshift-kube-apiserver-operator get secret kube-apiserver-to-kubelet-signer -o jsonpath='{.metadata.annotations.auth\.openshift\.io/certificate-not-after}'

# API Server certificate: probe the endpoint the current session talks to.
API_SERVER_ENDPOINT=$(api_server_endpoint "$(oc whoami --show-server)")
# Host-only value, kept under its original name.
API_SERVER_URL=${API_SERVER_ENDPOINT%:*}
check_certificate_expiration "API Server" "$API_SERVER_ENDPOINT"

# Console certificate (skipped with a message when the route is absent, the
# same way the registry route is handled below).
CONSOLE_ROUTE=$(oc get route -n openshift-console console -o jsonpath='{.spec.host}' 2>/dev/null)
if [[ -n "$CONSOLE_ROUTE" ]]; then
  check_certificate_expiration "Console" "$CONSOLE_ROUTE:443"
else
  echo "Console route not found."
fi

# Registry certificate
REGISTRY_ROUTE=$(oc get route -n openshift-image-registry default-route -o jsonpath='{.spec.host}' 2>/dev/null)
if [[ -n "$REGISTRY_ROUTE" ]]; then
  check_certificate_expiration "Registry" "$REGISTRY_ROUTE:443"
else
  echo "Registry route not found. Ensure the default route is configured for the image registry."
fi
#######################################
# Sum the CPU column (field 2) of `oc adm top nodes --no-headers` output.
# Values with an "m" suffix are millicores; bare numbers are taken as whole
# cores. Non-numeric values count as 0.
# Inputs:  node usage rows on stdin
# Outputs: total in cores with three decimals, e.g. 1.750
#######################################
total_node_cpu_cores() {
  LC_ALL=C awk '
    {
      cpu = $2
      if (cpu ~ /m$/) millicores += cpu + 0
      else            millicores += cpu * 1000
    }
    END { printf "%.3f\n", millicores / 1000 }'
}

#######################################
# Sum the memory column (field 4) of `oc adm top nodes --no-headers` output,
# using the leading number of each value (e.g. "3010Mi" counts as 3010).
# Inputs:  node usage rows on stdin
# Outputs: integer total on stdout
#######################################
total_node_memory_mib() {
  LC_ALL=C awk '{ mib += $4 } END { printf "%d\n", mib }'
}

# Summarize resource usage
echo -e "\nResource Usage Summary:"
# Query the metrics once so both totals come from the same snapshot.
NODE_USAGE=$(oc adm top nodes --no-headers)
TOTAL_CPU=$(printf '%s\n' "$NODE_USAGE" | total_node_cpu_cores)
TOTAL_MEMORY=$(printf '%s\n' "$NODE_USAGE" | total_node_memory_mib)
echo "Total CPU usage across all nodes: $TOTAL_CPU cores"
echo "Total Memory usage across all nodes: $TOTAL_MEMORY MiB"
echo -e "\nHealth check complete."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment