jaysonzhao/gist:118f5d5f989fceffacd8ea5cd32bf7d7

## gistfile1.txt
#!/bin/bash

# Print the expiry date (notAfter) of the TLS certificate served at an endpoint.
# Arguments:
#   $1 - service name, used only in the printed messages
#   $2 - endpoint in host:port form, handed to `openssl s_client -connect`
# Outputs:
#   A "Checking certificate ..." banner followed by either the expiry date or a
#   message saying the date could not be retrieved.
check_certificate_expiration() {
    local service_name=$1
    local endpoint=$2
    # Declared local so the result does not leak into the caller's scope.
    local expiration_date

    echo -e "\nChecking certificate for $service_name at $endpoint"
    # stdin from /dev/null gives s_client an immediate EOF instead of an
    # interactive session. stderr of both openssl stages is dropped: a failed
    # handshake is reported by the "Unable to retrieve" branch below.
    expiration_date=$(
        openssl s_client -connect "$endpoint" </dev/null 2>/dev/null \
            | openssl x509 -noout -dates 2>/dev/null \
            | sed -n 's/^notAfter=//p'
    )

    if [ -z "$expiration_date" ]; then
        echo "Unable to retrieve certificate expiration date for $service_name."
    else
        echo "$service_name certificate expires on: $expiration_date"
    fi
}

# Preflight: everything below shells out to `oc`, so stop early with a clear
# message when the CLI is missing or there is no active login.

# Check if oc is installed
if ! command -v oc &> /dev/null; then
    # Fatal diagnostics go to stderr so they stay out of a captured report.
    echo "oc command not found. Please install and configure the OpenShift CLI." >&2
    exit 1
fi

# Check if logged in to OpenShift
if ! oc whoami &> /dev/null; then
    echo "You are not logged in to an OpenShift cluster. Please log in using 'oc login'." >&2
    exit 1
fi

# Get cluster information
echo "Cluster Information:"
oc cluster-info

# Check health of all nodes
echo -e "\nNode Health:"
oc get nodes -o wide

# Check each node's disk usage
echo -e "\nNode Disk Usage:"
# The jsonpath prints the node names separated by spaces; the unquoted command
# substitution is deliberate so the loop receives one word per node.
for node in $(oc get nodes -o jsonpath='{.items[*].metadata.name}'); do
    echo -e "\nNode: $node"
    # Run df against the host's root filesystem (via chroot /host) and drop the
    # header row so only the usage line is shown per node.
    oc debug "node/$node" -- chroot /host df -h / | grep -v Filesystem
done

# Check health of all pods in all namespaces
echo -e "\nPod Health in All Namespaces:"
oc get pods --all-namespaces -o wide

# Check resource consumption
echo -e "\nResource Consumption (CPU/Memory):"
oc adm top nodes
oc adm top pods --all-namespaces

# Check storage usage
echo -e "\nPersistent Volume Usage:"
oc get pv -o wide
echo -e "\nPersistent Volume Claim Usage:"
oc get pvc --all-namespaces -o wide

# Enhanced PV usage information
echo -e "\nDetailed Persistent Volume Usage Information:"
# The jsonpath prints the PV names separated by spaces; the unquoted command
# substitution is deliberate so the loop receives one word per volume.
for pv in $(oc get pv -o jsonpath='{.items[*].metadata.name}'); do
    echo -e "\nPersistent Volume: $pv"
    # Keep only the summary fields of the (long) describe output.
    oc describe pv "$pv" \
        | grep -E "Capacity|Access Modes|Reclaim Policy|Status|Claim|StorageClass|Reason"
done

# Check cluster operators' health
echo -e "\nCluster Operators Health:"
oc get co

# Both per-namespace reports below walk the same namespace list, so it is
# fetched once and reused instead of querying the API twice.
NAMESPACES=$(oc get namespaces -o jsonpath='{.items[*].metadata.name}')

# Collect information about pods with error status
echo -e "\nPods with Error Status:"
# $NAMESPACES is intentionally unquoted: it is a space-separated list of names.
for ns in $NAMESPACES; do
    echo -e "\nNamespace: $ns"
    # Anything that is neither Running nor Succeeded is listed.
    oc get pods -n "$ns" --field-selector='status.phase!=Running,status.phase!=Succeeded' -o wide
done

# Collect error events
echo -e "\nError Events in All Namespaces:"
for ns in $NAMESPACES; do
    echo -e "\nNamespace: $ns"
    oc get events -n "$ns" --field-selector=type=Warning
done


# Gather alerts from Prometheus
echo -e "\nActive Alerts:"
PROMETHEUS_ROUTE=$(oc -n openshift-monitoring get route prometheus-k8s -o jsonpath='{.spec.host}')
if [ -z "$PROMETHEUS_ROUTE" ]; then
    echo "Prometheus route not found. Ensure the OpenShift monitoring stack is installed and accessible."
elif ! command -v jq &> /dev/null; then
    echo "jq command not found. Please install jq to list active alerts."
else
    # Find the secret containing the token for the prometheus-k8s service account.
    # Names are printed one per line and only the first match is kept, so
    # SECRET_NAME is always a single name even when several token secrets exist.
    SECRET_NAME=$(oc get secrets -n openshift-monitoring \
            -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' \
        | grep -m 1 '^prometheus-k8s-token-')

    BEARER_TOKEN=""
    if [ -n "$SECRET_NAME" ]; then
        # Extract the bearer token from the secret
        BEARER_TOKEN=$(oc get secret "$SECRET_NAME" -n openshift-monitoring -o jsonpath='{.data.token}' | base64 --decode)
    else
        # NOTE(review): fallback for clusters without a prometheus-k8s-token-*
        # secret; assumes the installed oc supports `oc create token` - confirm.
        BEARER_TOKEN=$(oc create token prometheus-k8s -n openshift-monitoring 2>/dev/null)
    fi

    if [ -z "$BEARER_TOKEN" ]; then
        echo "Prometheus token secret not found."
    elif ! ALERTS_RESPONSE=$(curl -sS -k --fail -H "Authorization: Bearer $BEARER_TOKEN" "https://$PROMETHEUS_ROUTE/api/v1/alerts"); then
        # A failed request must not be reported as "no alerts".
        echo "Unable to query Prometheus alerts at https://$PROMETHEUS_ROUTE."
    else
        # `[]?` keeps jq quiet when the response carries no alerts array.
        ALERTS=$(printf '%s\n' "$ALERTS_RESPONSE" | jq '.data.alerts[]? | {alertname: .labels.alertname, severity: .labels.severity, description: .annotations.description, state: .state}')

        if [ -z "$ALERTS" ]; then
            echo "No active alerts found."
        else
            # Explicit identity filter: older jq versions require a filter argument.
            echo "$ALERTS" | jq .
        fi
    fi
fi

# Check cluster-level certificate expiration
echo -e "\nCluster-Level Certificate Expiration Dates:"

# Check kube api ca
echo -e "\nCluster-Level api server signer Dates:"
# The trailing {"\n"} terminates the line; the bare annotation value is printed
# without a newline otherwise.
oc -n openshift-kube-apiserver-operator get secret kube-apiserver-to-kubelet-signer -o jsonpath='{.metadata.annotations.auth\.openshift\.io/certificate-not-after}{"\n"}'


# API Server certificate
# Split the server URL into host and port instead of assuming 6443, so clusters
# that serve the API on another port are probed on the right endpoint.
API_SERVER_ADDRESS=$(oc whoami --show-server)
API_SERVER_ADDRESS=${API_SERVER_ADDRESS#*://}   # drop the scheme
API_SERVER_ADDRESS=${API_SERVER_ADDRESS%%/*}    # drop any path
API_SERVER_URL=${API_SERVER_ADDRESS%%:*}
if [[ "$API_SERVER_ADDRESS" == *:* ]]; then
    API_SERVER_PORT=${API_SERVER_ADDRESS##*:}
else
    # No explicit port in an https URL means the default HTTPS port.
    API_SERVER_PORT=443
fi
if [ -n "$API_SERVER_URL" ]; then
    check_certificate_expiration "API Server" "$API_SERVER_URL:$API_SERVER_PORT"
else
    echo "API server URL not found. Ensure you are logged in to the cluster."
fi

# Console certificate
CONSOLE_ROUTE=$(oc get route -n openshift-console console -o jsonpath='{.spec.host}' 2>/dev/null)
if [ -n "$CONSOLE_ROUTE" ]; then
    check_certificate_expiration "Console" "$CONSOLE_ROUTE:443"
else
    # Same guard as the registry check below: do not probe ":443" with no host.
    echo "Console route not found. Ensure the web console is enabled."
fi

# Registry certificate
REGISTRY_ROUTE=$(oc get route -n openshift-image-registry default-route -o jsonpath='{.spec.host}' 2>/dev/null)
if [ -n "$REGISTRY_ROUTE" ]; then
    check_certificate_expiration "Registry" "$REGISTRY_ROUTE:443"
else
    echo "Registry route not found. Ensure the default route is configured for the image registry."
fi

# Summarize resource usage

# Total the CPU and memory columns of `oc adm top nodes --no-headers` rows.
# Input (stdin): one row per node; column 2 is CPU, column 4 is memory.
# CPU values ending in "m" are millicores and are converted to cores; the
# memory column is summed on its leading number (e.g. "2048Mi" counts as 2048).
# Output: "<cpu_cores> <memory>" on a single line.
sum_node_usage() {
    # LC_ALL=C pins the decimal separator to "." regardless of the user's locale.
    LC_ALL=C awk '
        {
            cpu = $2
            if (cpu ~ /m$/) {
                sub(/m$/, "", cpu)
                cores += cpu / 1000
            } else {
                cores += cpu
            }
            mem += $4
        }
        END { printf "%.3f %d\n", cores, mem }
    '
}

echo -e "\nResource Usage Summary:"
# Query the metrics once so both totals come from the same sample.
NODE_METRICS=$(oc adm top nodes --no-headers)
if [ -z "$NODE_METRICS" ]; then
    echo "Node metrics are unavailable; cannot summarize resource usage."
else
    read -r TOTAL_CPU TOTAL_MEMORY <<< "$(printf '%s\n' "$NODE_METRICS" | sum_node_usage)"
    echo "Total CPU usage across all nodes: $TOTAL_CPU cores"
    echo "Total Memory usage across all nodes: $TOTAL_MEMORY MiB"
fi

echo -e "\nHealth check complete."
	#!/bin/bash

	# Print the expiry date (notAfter) of the TLS certificate served at an endpoint.
	# Arguments:
	#   $1 - service name, used only in the printed messages
	#   $2 - endpoint in host:port form, handed to `openssl s_client -connect`
	# Outputs:
	#   A "Checking certificate ..." banner followed by either the expiry date or a
	#   message saying the date could not be retrieved.
	check_certificate_expiration() {
	    local service_name=$1
	    local endpoint=$2
	    # Declared local so the result does not leak into the caller's scope.
	    local expiration_date

	    echo -e "\nChecking certificate for $service_name at $endpoint"
	    # Real (unescaped) pipes: a backslash-escaped pipe is passed as a literal
	    # argument and no pipeline is formed. stdin from /dev/null gives s_client
	    # an immediate EOF; stderr of both openssl stages is dropped because a
	    # failed handshake is reported by the "Unable to retrieve" branch below.
	    expiration_date=$(
	        openssl s_client -connect "$endpoint" </dev/null 2>/dev/null \
	            | openssl x509 -noout -dates 2>/dev/null \
	            | sed -n 's/^notAfter=//p'
	    )

	    if [ -z "$expiration_date" ]; then
	        echo "Unable to retrieve certificate expiration date for $service_name."
	    else
	        echo "$service_name certificate expires on: $expiration_date"
	    fi
	}

	# Preflight: everything below shells out to `oc`, so stop early with a clear
	# message when the CLI is missing or there is no active login.

	# Check if oc is installed
	if ! command -v oc &> /dev/null; then
	    # Fatal diagnostics go to stderr so they stay out of a captured report.
	    echo "oc command not found. Please install and configure the OpenShift CLI." >&2
	    exit 1
	fi

	# Check if logged in to OpenShift
	if ! oc whoami &> /dev/null; then
	    echo "You are not logged in to an OpenShift cluster. Please log in using 'oc login'." >&2
	    exit 1
	fi

	# Get cluster information
	echo "Cluster Information:"
	oc cluster-info

	# Check health of all nodes
	echo -e "\nNode Health:"
	oc get nodes -o wide

	# Check each node's disk usage
	echo -e "\nNode Disk Usage:"
	# The jsonpath prints the node names separated by spaces; the unquoted command
	# substitution is deliberate so the loop receives one word per node.
	for node in $(oc get nodes -o jsonpath='{.items[*].metadata.name}'); do
	    echo -e "\nNode: $node"
	    # A real pipe is required here: an escaped one is handed to df as a
	    # literal argument. The grep drops df's header row.
	    oc debug "node/$node" -- chroot /host df -h / | grep -v Filesystem
	done

	# Check health of all pods in all namespaces
	echo -e "\nPod Health in All Namespaces:"
	oc get pods --all-namespaces -o wide

	# Check resource consumption
	echo -e "\nResource Consumption (CPU/Memory):"
	oc adm top nodes
	oc adm top pods --all-namespaces

	# Check storage usage
	echo -e "\nPersistent Volume Usage:"
	oc get pv -o wide
	echo -e "\nPersistent Volume Claim Usage:"
	oc get pvc --all-namespaces -o wide

	# Enhanced PV usage information
	echo -e "\nDetailed Persistent Volume Usage Information:"
	# The jsonpath prints the PV names separated by spaces; the unquoted command
	# substitution is deliberate so the loop receives one word per volume.
	for pv in $(oc get pv -o jsonpath='{.items[*].metadata.name}'); do
	    echo -e "\nPersistent Volume: $pv"
	    # Unescaped "|" both for the pipeline and inside the -E pattern, where a
	    # backslash-escaped bar would match a literal "|" instead of alternating.
	    oc describe pv "$pv" \
	        | grep -E "Capacity|Access Modes|Reclaim Policy|Status|Claim|StorageClass|Reason"
	done

	# Check cluster operators' health
	echo -e "\nCluster Operators Health:"
	oc get co

	# Both per-namespace reports below walk the same namespace list, so it is
	# fetched once and reused instead of querying the API twice.
	NAMESPACES=$(oc get namespaces -o jsonpath='{.items[*].metadata.name}')

	# Collect information about pods with error status
	echo -e "\nPods with Error Status:"
	# $NAMESPACES is intentionally unquoted: it is a space-separated list of names.
	for ns in $NAMESPACES; do
	    echo -e "\nNamespace: $ns"
	    # Anything that is neither Running nor Succeeded is listed.
	    oc get pods -n "$ns" --field-selector='status.phase!=Running,status.phase!=Succeeded' -o wide
	done

	# Collect error events
	echo -e "\nError Events in All Namespaces:"
	for ns in $NAMESPACES; do
	    echo -e "\nNamespace: $ns"
	    oc get events -n "$ns" --field-selector=type=Warning
	done


	# Gather alerts from Prometheus
	echo -e "\nActive Alerts:"
	PROMETHEUS_ROUTE=$(oc -n openshift-monitoring get route prometheus-k8s -o jsonpath='{.spec.host}')
	if [ -z "$PROMETHEUS_ROUTE" ]; then
	    echo "Prometheus route not found. Ensure the OpenShift monitoring stack is installed and accessible."
	elif ! command -v jq &> /dev/null; then
	    echo "jq command not found. Please install jq to list active alerts."
	else
	    # Find the secret containing the token for the prometheus-k8s service account.
	    # Names are printed one per line and only the first match is kept, so
	    # SECRET_NAME is always a single name even when several token secrets exist.
	    # (All pipes in this section are real pipes; escaped ones never form a pipeline.)
	    SECRET_NAME=$(oc get secrets -n openshift-monitoring \
	            -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' \
	        | grep -m 1 '^prometheus-k8s-token-')

	    BEARER_TOKEN=""
	    if [ -n "$SECRET_NAME" ]; then
	        # Extract the bearer token from the secret
	        BEARER_TOKEN=$(oc get secret "$SECRET_NAME" -n openshift-monitoring -o jsonpath='{.data.token}' | base64 --decode)
	    else
	        # NOTE(review): fallback for clusters without a prometheus-k8s-token-*
	        # secret; assumes the installed oc supports `oc create token` - confirm.
	        BEARER_TOKEN=$(oc create token prometheus-k8s -n openshift-monitoring 2>/dev/null)
	    fi

	    if [ -z "$BEARER_TOKEN" ]; then
	        echo "Prometheus token secret not found."
	    elif ! ALERTS_RESPONSE=$(curl -sS -k --fail -H "Authorization: Bearer $BEARER_TOKEN" "https://$PROMETHEUS_ROUTE/api/v1/alerts"); then
	        # A failed request must not be reported as "no alerts".
	        echo "Unable to query Prometheus alerts at https://$PROMETHEUS_ROUTE."
	    else
	        # `[]?` keeps jq quiet when the response carries no alerts array.
	        ALERTS=$(printf '%s\n' "$ALERTS_RESPONSE" | jq '.data.alerts[]? | {alertname: .labels.alertname, severity: .labels.severity, description: .annotations.description, state: .state}')

	        if [ -z "$ALERTS" ]; then
	            echo "No active alerts found."
	        else
	            # Explicit identity filter: older jq versions require a filter argument.
	            echo "$ALERTS" | jq .
	        fi
	    fi
	fi

	# Check cluster-level certificate expiration
	echo -e "\nCluster-Level Certificate Expiration Dates:"

	# Check kube api ca
	echo -e "\nCluster-Level api server signer Dates:"
	# The trailing {"\n"} terminates the line; the bare annotation value is printed
	# without a newline otherwise.
	oc -n openshift-kube-apiserver-operator get secret kube-apiserver-to-kubelet-signer -o jsonpath='{.metadata.annotations.auth\.openshift\.io/certificate-not-after}{"\n"}'


	# API Server certificate
	# Parameter expansion replaces the sed pipeline (whose escaped pipes and
	# delimiters could not work) and keeps the port from the server URL instead
	# of assuming 6443.
	API_SERVER_ADDRESS=$(oc whoami --show-server)
	API_SERVER_ADDRESS=${API_SERVER_ADDRESS#*://}   # drop the scheme
	API_SERVER_ADDRESS=${API_SERVER_ADDRESS%%/*}    # drop any path
	API_SERVER_URL=${API_SERVER_ADDRESS%%:*}
	if [[ "$API_SERVER_ADDRESS" == *:* ]]; then
	    API_SERVER_PORT=${API_SERVER_ADDRESS##*:}
	else
	    # No explicit port in an https URL means the default HTTPS port.
	    API_SERVER_PORT=443
	fi
	if [ -n "$API_SERVER_URL" ]; then
	    check_certificate_expiration "API Server" "$API_SERVER_URL:$API_SERVER_PORT"
	else
	    echo "API server URL not found. Ensure you are logged in to the cluster."
	fi

	# Console certificate
	CONSOLE_ROUTE=$(oc get route -n openshift-console console -o jsonpath='{.spec.host}' 2>/dev/null)
	if [ -n "$CONSOLE_ROUTE" ]; then
	    check_certificate_expiration "Console" "$CONSOLE_ROUTE:443"
	else
	    # Same guard as the registry check below: do not probe ":443" with no host.
	    echo "Console route not found. Ensure the web console is enabled."
	fi

	# Registry certificate
	REGISTRY_ROUTE=$(oc get route -n openshift-image-registry default-route -o jsonpath='{.spec.host}' 2>/dev/null)
	if [ -n "$REGISTRY_ROUTE" ]; then
	    check_certificate_expiration "Registry" "$REGISTRY_ROUTE:443"
	else
	    echo "Registry route not found. Ensure the default route is configured for the image registry."
	fi

	# Summarize resource usage

	# Total the CPU and memory columns of `oc adm top nodes --no-headers` rows.
	# Input (stdin): one row per node; column 2 is CPU, column 4 is memory.
	# CPU values ending in "m" are millicores and are converted to cores; the
	# memory column is summed on its leading number (e.g. "2048Mi" counts as 2048).
	# Output: "<cpu_cores> <memory>" on a single line.
	sum_node_usage() {
	    # LC_ALL=C pins the decimal separator to "." regardless of the user's locale.
	    LC_ALL=C awk '
	        {
	            cpu = $2
	            if (cpu ~ /m$/) {
	                sub(/m$/, "", cpu)
	                cores += cpu / 1000
	            } else {
	                cores += cpu
	            }
	            mem += $4
	        }
	        END { printf "%.3f %d\n", cores, mem }
	    '
	}

	echo -e "\nResource Usage Summary:"
	# Query the metrics once (through a real pipe into the summing helper) so both
	# totals come from the same sample.
	NODE_METRICS=$(oc adm top nodes --no-headers)
	if [ -z "$NODE_METRICS" ]; then
	    echo "Node metrics are unavailable; cannot summarize resource usage."
	else
	    read -r TOTAL_CPU TOTAL_MEMORY <<< "$(printf '%s\n' "$NODE_METRICS" | sum_node_usage)"
	    echo "Total CPU usage across all nodes: $TOTAL_CPU cores"
	    echo "Total Memory usage across all nodes: $TOTAL_MEMORY MiB"
	fi

	echo -e "\nHealth check complete."