Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@funkytaco
Forked from rvanbutselaar/OpenShift Monitoring
Created June 2, 2020 19:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save funkytaco/8e87493e2e361ce0412de6d4fdb1ef19 to your computer and use it in GitHub Desktop.
Save funkytaco/8e87493e2e361ce0412de6d4fdb1ef19 to your computer and use it in GitHub Desktop.
# Docker daemon
# Prints the unit state ("active", "inactive", ...) for the docker service.
systemctl is-active docker
# Check that Docker volume group has adequate space
# Prints 1 when (Data Space Available / Data Space Total) is greater than 0.3, otherwise 0,
# so 1 is the healthy result here.
# NOTE(review): only field 4 of each `docker info` line is used (the number, not the unit),
# so this assumes both values are reported in the same unit -- confirm.
echo $(echo \"$(docker info 2>/dev/null | awk '/Data Space Available/ {print $4}') / $(docker info 2>/dev/null | awk '/Data Space Total/ {print $4}')\" | bc -l) '>' 0.3 | bc -l
# Check that Docker volume group has adequate metadata space
# Same ratio test as above, applied to the "Metadata Space" lines: 1 means more than 30% is free.
echo $(echo \"$(docker info 2>/dev/null | awk '/Metadata Space Available/ {print $4}') / $(docker info 2>/dev/null | awk '/Metadata Space Total/ {print $4}')\" | bc -l) '>' 0.3 | bc -l
# etcd is active
systemctl is-active etcd
# etcd volume is not too full
# Prints 1 when the value exceeds 70 (volume too full) and 0 otherwise, so 0 is the healthy
# result -- the opposite polarity of the Docker space checks above.
# NOTE(review): assumes field 5 of the `lvs` row matching "etcd" is the usage percentage -- confirm.
echo "$(lvs | awk '/etcd/ {print $5}') > 70" | bc
# Master API service is active
master:443/healthz
# Single-master installs: prints the unit state of the combined master service.
systemctl is-active atomic-openshift-master
# Master API service is active (multi-master)
systemctl is-active atomic-openshift-master-api
# Master controller service is active (multi-master)
systemctl is-active atomic-openshift-master-controller
# Node service is active
systemctl is-active atomic-openshift-node
# Node’s local data storage volume is not too full
# Prints 1 when the value exceeds 70 (volume too full) and 0 otherwise, so 0 is the healthy result.
# NOTE(review): assumes field 5 of the `lvs` row matching "origin" is the usage percentage -- confirm.
echo "$(lvs | awk '/origin/ {print $5}') > 70" | bc
# openvswitch service is active
systemctl is-active openvswitch
# OpenShift Components
Service bus
Routers
Registry
DNSmasq
# Logging
Elasticsearch
Kibana
Fluentd
# Metrics
Hawkular
Heapster
Cassandra
# Health of master API endpoint
curl -H "Authorization: Bearer $(oc whoami -t)" https://<my_cluster_api>:8443/healthz | grep ok
# Health of router
curl http://router.default.svc.cluster.local:1936/healthz | grep 200
# Health of registry
curl -I https://docker-registry.default.svc.cluster.local:5000/healthz | grep 200
# Health of EFK logging stack
https://github.com/redhat-cop/openshift-toolkit/blob/master/health_check/elasticsearch-health-check-ocp34.sh
# Health of metrics stack
https://github.com/redhat-cop/openshift-toolkit/blob/master/health_check/metrics-health-check.sh
# CPU usage
Requests
Load
# Memory usage
Requests
Used
Memory reserved Total in cluster (aggregate over all nodes)
#
# Prometheus
#
https://github.com/wkulhanek/openshift-prometheus
# Number of cores each machine in cluster has
machine_cpu_cores
# Total number of cores in cluster
sum(machine_cpu_cores)
# Percentage of total cluster CPU in use
# Cores consumed by the root cgroup of every node (5-minute rate of CPU seconds)
# divided by the total number of cores in the cluster. The result is a fraction
# in the range 0..1; multiply by 100 for a percentage.
sum(rate(container_cpu_usage_seconds_total{id="/"}[5m])) / sum(machine_cpu_cores)
# Percentage of total cluster memory in use
sum(container_memory_rss) / sum(machine_memory_bytes)
# Total number of consumed cores in cluster:
sum(sort_desc(rate(container_cpu_usage_seconds_total{id="/"}[5m])))
# Number of containers that start or restart over previous 10 minutes
sum(changes(container_start_time_seconds[10m]))
# Number of mutating API requests being made to control plane
# Per-second rate (5-minute window) of POST/PUT/DELETE/PATCH requests, summed with the
# instance, type and code labels removed, sorted from highest to lowest.
# NOTE(review): drop_common_labels() was removed in Prometheus 2.0; on a 2.x server this
# query needs the wrapper dropped -- confirm the target Prometheus version.
sort_desc(drop_common_labels(sum without (instance,type,code) (rate(apiserver_request_count{verb=~"POST|PUT|DELETE|PATCH"}[5m]))))
# Number of non-mutating API requests being made to control plane
# Same aggregation as above for the read-only verbs GET/LIST/WATCH.
# NOTE(review): also relies on drop_common_labels(), removed in Prometheus 2.0 -- confirm the target version.
sort_desc(drop_common_labels(sum without (instance,type,code) (rate(apiserver_request_count{verb=~"GET|LIST|WATCH"}[5m]))))
# Top 10 pods doing most receive network traffic
topk(10, (sum by (pod_name) (rate(container_network_receive_bytes_total[5m]))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment