Last active
October 25, 2024 09:46
Openshift cluster health check
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash

#######################################
# Print the expiration date (notAfter) of the TLS certificate served at an
# endpoint, as reported by openssl.
# Arguments:
#   $1 - human-readable service name used in the report lines
#   $2 - endpoint in host:port form, passed to `openssl s_client -connect`
# Outputs:
#   Writes a "Checking ..." line to stdout, followed by either the
#   expiration date or an "Unable to retrieve ..." message.
#######################################
check_certificate_expiration() {
  local service_name=$1
  local endpoint=$2
  # Declared local so the result does not leak into (or clobber) a global.
  local expiration_date

  echo -e "\nChecking certificate for $service_name at $endpoint"

  # stdin comes from /dev/null so s_client exits once the handshake is done.
  # stderr of both openssl calls is discarded so a failed connection results
  # only in the "Unable to retrieve" message below.
  expiration_date=$(
    openssl s_client -connect "$endpoint" </dev/null 2>/dev/null \
      | openssl x509 -noout -enddate 2>/dev/null \
      | cut -d= -f2
  )

  if [[ -z "$expiration_date" ]]; then
    echo "Unable to retrieve certificate expiration date for $service_name."
  else
    echo "$service_name certificate expires on: $expiration_date"
  fi
}
# Preflight: everything below shells out to `oc` against a logged-in cluster,
# so stop early (exit status 1) when either precondition is missing.
# Diagnostics are written to stderr so they remain visible when stdout is
# redirected into a report file.

# Check if oc is installed
if ! command -v oc >/dev/null 2>&1; then
  echo "oc command not found. Please install and configure the OpenShift CLI." >&2
  exit 1
fi

# Check if logged in to OpenShift
if ! oc whoami >/dev/null 2>&1; then
  echo "You are not logged in to an OpenShift cluster. Please log in using 'oc login'." >&2
  exit 1
fi

# The alert and certificate sections also call these tools. A missing one is
# not fatal for the rest of the report, so only warn.
for required_tool in curl jq openssl base64; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    echo "Warning: '$required_tool' not found; checks that depend on it will be incomplete." >&2
  fi
done
# Cluster overview as reported by `oc cluster-info`.
printf '%s\n' "Cluster Information:"
oc cluster-info

# Node overview: one row per node, wide output format.
printf '\n%s\n' "Node Health:"
oc get nodes --output=wide
# Report root-filesystem usage for every node by running df through `oc debug`.
printf '\n%s\n' "Node Disk Usage:"
read -r -a cluster_nodes <<< "$(oc get nodes -o jsonpath='{.items[*].metadata.name}')"
for node in "${cluster_nodes[@]}"; do
  printf '\nNode: %s\n' "$node"
  # df runs chrooted into /host; grep drops df's "Filesystem" header row.
  oc debug "node/${node}" -- chroot /host df -h / | grep -v Filesystem
done
# Pod inventory across every namespace, wide output format.
printf '\n%s\n' "Pod Health in All Namespaces:"
oc get pods -o wide --all-namespaces

# Current CPU/memory figures: first per node, then per pod in all namespaces.
printf '\n%s\n' "Resource Consumption (CPU/Memory):"
oc adm top nodes
oc adm top pods --all-namespaces
# Storage overview: persistent volumes, then claims in every namespace.
printf '\n%s\n' "Persistent Volume Usage:"
oc get pv --output=wide
printf '\n%s\n' "Persistent Volume Claim Usage:"
oc get pvc --output=wide --all-namespaces

# Per-volume detail: keep only the `oc describe` lines matching these fields.
printf '\n%s\n' "Detailed Persistent Volume Usage Information:"
pv_field_pattern="Capacity|Access Modes|Reclaim Policy|Status|Claim|StorageClass|Reason"
read -r -a pv_names <<< "$(oc get pv -o jsonpath='{.items[*].metadata.name}')"
for pv in "${pv_names[@]}"; do
  printf '\nPersistent Volume: %s\n' "$pv"
  oc describe pv "$pv" | grep -E "$pv_field_pattern"
done
# List the cluster operators (`co` is the short name for clusteroperators).
printf '\n%s\n' "Cluster Operators Health:"
oc get clusteroperators
# Per-namespace problem reports: pods that are neither Running nor Succeeded,
# followed by Warning events.
#
# The namespace list is fetched once and reused by both loops, which saves an
# API round-trip and guarantees both reports cover the same namespaces.
namespaces=()
read -r -a namespaces <<< "$(oc get namespaces -o jsonpath='{.items[*].metadata.name}')"

# Collect information about pods with error status
# NOTE(review): the selector filters on pod phase only; pods whose phase is
# Running are not listed here even if their containers are unhealthy.
echo -e "\nPods with Error Status:"
for ns in "${namespaces[@]}"; do
  echo -e "\nNamespace: $ns"
  oc get pods -n "$ns" --field-selector=status.phase!=Running,status.phase!=Succeeded -o wide
done

# Collect error events
echo -e "\nError Events in All Namespaces:"
for ns in "${namespaces[@]}"; do
  echo -e "\nNamespace: $ns"
  oc get events -n "$ns" --field-selector=type=Warning
done
# Gather alerts from Prometheus
#
# Flow: resolve the prometheus-k8s route, obtain a bearer token for the
# prometheus-k8s service account, query /api/v1/alerts, and print a condensed
# view of each alert (name, severity, description, state).
echo -e "\nActive Alerts:"
PROMETHEUS_ROUTE=$(oc -n openshift-monitoring get route prometheus-k8s -o jsonpath='{.spec.host}')
if [[ -z "$PROMETHEUS_ROUTE" ]]; then
  echo "Prometheus route not found. Ensure the OpenShift monitoring stack is installed and accessible."
else
  # Find the secret containing the token for the prometheus-k8s service
  # account. Several secrets can match the prefix; use the first one so the
  # lookup below always receives exactly one name.
  SECRET_NAME=$(oc get secrets -n openshift-monitoring -o jsonpath='{.items[*].metadata.name}' \
    | tr ' ' '\n' \
    | grep '^prometheus-k8s-token-' \
    | head -n 1)

  BEARER_TOKEN=""
  if [[ -n "$SECRET_NAME" ]]; then
    # Extract the bearer token from the secret
    BEARER_TOKEN=$(oc get secret "$SECRET_NAME" -n openshift-monitoring -o jsonpath='{.data.token}' | base64 --decode)
  else
    # No token secret present (newer clusters may not create one
    # automatically): request a short-lived token instead.
    BEARER_TOKEN=$(oc create token prometheus-k8s -n openshift-monitoring 2>/dev/null)
  fi

  if [[ -z "$BEARER_TOKEN" && -z "$SECRET_NAME" ]]; then
    echo "Prometheus token secret not found."
  elif [[ -z "$BEARER_TOKEN" ]]; then
    echo "Unable to read a bearer token from secret $SECRET_NAME."
  # -f makes curl fail on HTTP errors so an unreachable or unauthorized API is
  # reported as such rather than being mistaken for "no alerts".
  # NOTE(review): -k disables TLS verification of the route certificate.
  elif ! ALERTS_RESPONSE=$(curl -sS -k -f -H "Authorization: Bearer $BEARER_TOKEN" "https://$PROMETHEUS_ROUTE/api/v1/alerts"); then
    echo "Unable to query the Prometheus alerts API at https://$PROMETHEUS_ROUTE/api/v1/alerts."
  else
    # `[]?` tolerates a response without .data.alerts instead of erroring.
    ALERTS=$(printf '%s\n' "$ALERTS_RESPONSE" \
      | jq '.data.alerts[]? | {alertname: .labels.alertname, severity: .labels.severity, description: .annotations.description, state: .state}')
    if [[ -z "$ALERTS" ]]; then
      echo "No active alerts found."
    else
      printf '%s\n' "$ALERTS" | jq .
    fi
  fi
fi
# Check cluster-level certificate expiration
echo -e "\nCluster-Level Certificate Expiration Dates:"

#######################################
# Convert a server URL into the host:port form used for TLS probing.
# Strips the scheme and any path, keeps an explicit port, and appends the
# HTTPS default port 443 when the URL carries none.
# Arguments:
#   $1 - server URL, e.g. https://api.example.com:6443
# Outputs:
#   Writes host:port to stdout.
# Returns:
#   0 on success, 1 when the URL is empty or has no host part.
#######################################
api_endpoint_from_url() {
  local url=$1
  local hostport
  hostport=${url#*://}     # drop the scheme, if any
  hostport=${hostport%%/*} # drop any path
  case "$hostport" in
    '') return 1 ;;
    *\]) hostport="${hostport}:443" ;; # bracketed IPv6 literal without a port
    *:*) ;;                            # explicit port already present
    *) hostport="${hostport}:443" ;;
  esac
  printf '%s\n' "$hostport"
}

# Check kube api ca
echo -e "\nCluster-Level api server signer Dates:"
oc -n openshift-kube-apiserver-operator get secret kube-apiserver-to-kubelet-signer -o jsonpath='{.metadata.annotations.auth\.openshift\.io/certificate-not-after}'
echo # the jsonpath output has no trailing newline

# API Server certificate: probe the port the cluster actually advertises
# rather than assuming 6443.
API_SERVER_ENDPOINT=$(api_endpoint_from_url "$(oc whoami --show-server)")
if [[ -n "$API_SERVER_ENDPOINT" ]]; then
  API_SERVER_URL=${API_SERVER_ENDPOINT%:*} # host only
  check_certificate_expiration "API Server" "$API_SERVER_ENDPOINT"
else
  API_SERVER_URL=""
  echo "Unable to determine the API server endpoint from 'oc whoami --show-server'."
fi

# Console certificate
CONSOLE_ROUTE=$(oc get route -n openshift-console console -o jsonpath='{.spec.host}' 2>/dev/null)
if [[ -n "$CONSOLE_ROUTE" ]]; then
  check_certificate_expiration "Console" "$CONSOLE_ROUTE:443"
else
  echo "Console route not found."
fi

# Registry certificate
REGISTRY_ROUTE=$(oc get route -n openshift-image-registry default-route -o jsonpath='{.spec.host}' 2>/dev/null)
if [[ -n "$REGISTRY_ROUTE" ]]; then
  check_certificate_expiration "Registry" "$REGISTRY_ROUTE:443"
else
  echo "Registry route not found. Ensure the default route is configured for the image registry."
fi
# Summarize resource usage
echo -e "\nResource Usage Summary:"

#######################################
# Sum the CPU and memory columns of `oc adm top nodes --no-headers` output.
# CPU values with an "m" suffix are millicores and are converted to cores;
# memory values are summed by their numeric prefix.
# Inputs:
#   stdin - rows of the form: NAME CPU CPU% MEMORY MEMORY%
# Outputs:
#   Writes "<cpu_cores> <memory>" to stdout (CPU with three decimals).
#######################################
sum_top_nodes() {
  LC_ALL=C awk '
    {
      cpu = $2 + 0
      if ($2 ~ /m$/) cpu /= 1000
      cpu_total += cpu
      mem_total += $4 + 0
    }
    END { printf "%.3f %d\n", cpu_total, mem_total }
  '
}

# Query the metrics once and derive both totals from the same snapshot.
if TOP_NODES_OUTPUT=$(oc adm top nodes --no-headers) && [[ -n "$TOP_NODES_OUTPUT" ]]; then
  read -r TOTAL_CPU TOTAL_MEMORY <<< "$(printf '%s\n' "$TOP_NODES_OUTPUT" | sum_top_nodes)"
  echo "Total CPU usage across all nodes: $TOTAL_CPU cores"
  echo "Total Memory usage across all nodes: $TOTAL_MEMORY MiB"
else
  TOTAL_CPU=""
  TOTAL_MEMORY=""
  echo "Node metrics are unavailable; cannot summarize CPU and memory usage."
fi

echo -e "\nHealth check complete."
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment