Skip to content

Instantly share code, notes, and snippets.

@superseb
Last active August 12, 2023 17:10
Show Gist options
  • Save superseb/3cccbfa910bf2fbe831ede4f201284c3 to your computer and use it in GitHub Desktop.
Save superseb/3cccbfa910bf2fbe831ede4f201284c3 to your computer and use it in GitHub Desktop.
Troubleshooting Kubernetes commands

Troubleshooting Kubernetes commands

Commands belonging to the Rancher webinar Troubleshooting Kubernetes

etcd

Check etcd members

docker exec etcd etcdctl member list

Check endpoint health

# Health-check every etcd member: build the comma-separated endpoint list from
# `member list` (field 5 = client URL). Quote the command substitution so the
# shell never word-splits or globs it (ShellCheck SC2046).
docker exec etcd etcdctl endpoint health --endpoints="$(docker exec etcd /bin/sh -c "etcdctl member list | cut -d, -f5 | sed -e 's/ //g' | paste -sd ','")"

Check endpoint status

# Show status (DB size, leader, raft index) for every etcd member as a table.
# Endpoint list is derived the same way as the health check; the substitution
# is quoted to avoid word-splitting/globbing (ShellCheck SC2046).
docker exec etcd etcdctl endpoint status --endpoints="$(docker exec etcd /bin/sh -c "etcdctl member list | cut -d, -f5 | sed -e 's/ //g' | paste -sd ','")" --write-out table

Fill up etcd

# Deliberately fill etcd to trigger the NOSPACE quota alarm (demo only!).
docker exec -ti etcd sh
# ~1.5 MB of zeros (1500 * 1024 bytes)
dd if=/dev/zero of=testfile.out bs=1500 count=1024
# Each put creates a new revision of the same key; redirect the file into
# etcdctl directly instead of the useless `cat file |` pipe. Loop ends when
# the space-quota alarm makes `put` fail.
while true; do etcdctl put key < testfile.out || break; done

Check alarm status

docker exec etcd etcdctl alarm list

Compact

# Extract the current revision from the JSON endpoint status and compact up to
# it. `egrep` is deprecated — use `grep -E`; `[0-9]+` (not `*`) so an empty
# match can never slip through.
rev=$(docker exec etcd etcdctl endpoint status --write-out json | grep -Eo '"revision":[0-9]+' | grep -Eo '[0-9]+')
docker exec etcd etcdctl compact "$rev"

Defrag

# Defragment every member to reclaim space freed by compaction. Quote the
# endpoint-list command substitution (ShellCheck SC2046).
docker exec etcd etcdctl defrag --endpoints="$(docker exec etcd /bin/sh -c "etcdctl member list | cut -d, -f5 | sed -e 's/ //g' | paste -sd ','")"

Disarm

docker exec etcd etcdctl alarm disarm
docker exec etcd etcdctl alarm list
(empty output — all alarms have been cleared)

etcd debug logging

curl -XPUT -d '{"Level":"DEBUG"}' --cacert $(docker exec etcd printenv ETCDCTL_CACERT) --cert $(docker exec etcd printenv ETCDCTL_CERT) --key $(docker exec etcd printenv ETCDCTL_KEY) https://localhost:2379/config/local/log

Restore to info logging

curl -XPUT -d '{"Level":"INFO"}' --cacert $(docker exec etcd printenv ETCDCTL_CACERT) --cert $(docker exec etcd printenv ETCDCTL_CERT) --key $(docker exec etcd printenv ETCDCTL_KEY) https://localhost:2379/config/local/log 

etcd metrics: Get all metrics for wal_fsync_duration_seconds

curl -s --cacert $(docker exec etcd printenv ETCDCTL_CACERT) --cert $(docker exec etcd printenv ETCDCTL_CERT) --key $(docker exec etcd printenv ETCDCTL_KEY) $(docker exec etcd printenv ETCDCTL_ENDPOINTS)/metrics | grep wal_fsync_duration_seconds

etcd metrics: Loop and calculate percentage of wal_fsync_duration_seconds within 8ms (requires bc)

while true; do echo "scale=2;$(curl -s --cacert $(docker exec etcd printenv ETCDCTL_CACERT) --cert $(docker exec etcd printenv ETCDCTL_CERT) --key $(docker exec etcd printenv ETCDCTL_KEY) $(docker exec etcd printenv ETCDCTL_ENDPOINTS)/metrics | grep 'etcd_disk_wal_fsync_duration_seconds_bucket{le="0.002"}' | awk '{ print $2+0 }') / $(curl -s --cacert $(docker exec etcd printenv ETCDCTL_CACERT) --cert $(docker exec etcd printenv ETCDCTL_CERT) --key $(docker exec etcd printenv ETCDCTL_KEY) $(docker exec etcd printenv ETCDCTL_ENDPOINTS)/metrics | grep wal_fsync_duration_seconds_count  | awk '{ print $2+0 }')" | bc; done

Leader changes

# curl -s --cacert $(docker exec etcd printenv ETCDCTL_CACERT) --cert $(docker exec etcd printenv ETCDCTL_CERT) --key $(docker exec etcd printenv ETCDCTL_KEY) https://localhost:2379/metrics  | grep ^etcd_server_leader_changes_seen_total

kube-apiserver

kube-apiserver to etcd-servers connectivity check

for etcdserver in $(docker inspect kube-apiserver --format='{{range .Args}}{{.}}{{"\n"}}{{end}}' | grep etcd-servers | awk -F= '{ print $2 }' | tr ',' '\n'); do SSLDIR=$(docker inspect kube-apiserver --format '{{ range .Mounts }}{{ if eq .Destination "/etc/kubernetes" }}{{ .Source }}{{ end }}{{ end }}'); echo "Validating connection to ${etcdserver}/health"; curl -w '\nConnect:%{time_connect}\nStart Transfer: %{time_starttransfer}\nTotal: %{time_total}\nResponse code: %{http_code}\n' --cacert $SSLDIR/ssl/kube-ca.pem --cert $SSLDIR/ssl/kube-apiserver.pem --key $SSLDIR/ssl/kube-apiserver-key.pem "${etcdserver}/health"; done

kube-apiserver responsiveness

for cip in $(kubectl get nodes -l "node-role.kubernetes.io/controlplane=true" -o jsonpath='{range.items[*].status.addresses[?(@.type=="InternalIP")]}{.address}{"\n"}{end}'); do kubectl --kubeconfig kube_config_cluster.yml --server https://${cip}:6443 get nodes -v6 2>&1 | grep round_trippers; done

kube-controller-manager

Find current leader

kubectl -n kube-system get endpoints kube-controller-manager -o jsonpath='{.metadata.annotations.control-plane\.alpha\.kubernetes\.io/leader}'                                                                     
{"holderIdentity":"seb-doctl-ubuntu-5_96fb83ba-6023-11e9-a7a7-429a019f0230","leaseDurationSeconds":15,"acquireTime":"2019-04-16T08:42:57Z","renewTime":"2019-04-16T10:36:25Z","leaderTransitions":1}

kube-scheduler

Find current leader

kubectl -n kube-system get endpoints kube-scheduler -o jsonpath='{.metadata.annotations.control-plane\.alpha\.kubernetes\.io/leader}'

kubelet

Show kubelet stats

curl -sLk --cacert /etc/kubernetes/ssl/kube-ca.pem --cert /etc/kubernetes/ssl/kube-node.pem --key /etc/kubernetes/ssl/kube-node-key.pem https://127.0.0.1:10250/stats

Generic

Liveness check

# Demo Pod for exec-based liveness probes: the container creates /tmp/healthy,
# sleeps 30s, removes it, then idles — so the probe passes for ~30s and fails
# afterwards, causing kubelet to restart the container.
apiVersion: v1
kind: Pod
metadata:
  labels:
    test: liveness
  name: liveness-exec
spec:
  containers:
  - name: liveness
    # NOTE(review): k8s.gcr.io is the legacy registry; registry.k8s.io is the
    # current one — confirm image availability on newer clusters.
    image: k8s.gcr.io/busybox
    args:
    - /bin/sh
    - -c
    - touch /tmp/healthy; sleep 30; rm -rf /tmp/healthy; sleep 600
    livenessProbe:
      exec:
        # Probe succeeds while the file exists (cat exit 0), fails once removed.
        command:
        - cat
        - /tmp/healthy
      initialDelaySeconds: 5
      periodSeconds: 5
kubectl get events --field-selector involvedObject.kind=Pod -w

Describe $resource

kubectl describe pod

Get events with filter

kubectl get events --field-selector involvedObject.kind=Pod -w

Check Pending pods

kubectl get pods --all-namespaces -o go-template='{{range .items}}{{if eq .status.phase "Pending"}}{{.spec.nodeName}}{{" "}}{{.metadata.name}}{{" "}}{{.metadata.namespace}}{{" "}}{{range .status.conditions}}{{.message}}{{";"}}{{end}}{{"\n"}}{{end}}{{end}}'

Nodes

Node difference check

kubectl get nodes -o custom-columns=NAME:.metadata.name,OS:.status.nodeInfo.osImage,KERNEL:.status.nodeInfo.kernelVersion,RUNTIME:.status.nodeInfo.containerRuntimeVersion,KUBELET:.status.nodeInfo.kubeletVersion,KUBEPROXY:.status.nodeInfo.kubeProxyVersion

Show taints

kubectl get nodes -o custom-columns=NAME:.metadata.name,TAINTS:.spec.taints

Show labels

kubectl get nodes --show-labels

Show node conditions

kubectl get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{$node.metadata.name}}{{": "}}{{.type}}{{":"}}{{.status}}{{"\n"}}{{end}}{{end}}'

Show node conditions that could cause issues

kubectl get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if ne .type "Ready"}}{{if eq .status "True"}}{{$node.metadata.name}}{{": "}}{{.type}}{{":"}}{{.status}}{{"\n"}}{{end}}{{else}}{{if ne .status "True"}}{{$node.metadata.name}}{{": "}}{{.type}}{{": "}}{{.status}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'

DNS

Check if internal cluster names resolve

kubectl run -it --rm --restart=Never busybox --image=busybox:1.28 -- nslookup kubernetes.default 

Check if external name resolves

kubectl run -it --rm --restart=Never busybox --image=busybox:1.28 -- nslookup www.google.com

Check upstream DNS nameserver(s)

# Print /etc/resolv.conf of every kube-dns pod to see which upstream
# nameservers each DNS pod inherited from its host.
# Fixes: `read -r` (don't mangle backslashes), quote "$pod", and use the
# `exec ... -- cmd` form — the bare `exec pod cmd` syntax is deprecated and
# removed in newer kubectl releases.
kubectl -n kube-system get pods -l k8s-app=kube-dns --no-headers -o custom-columns=NAME:.metadata.name,HOSTIP:.status.hostIP | while read -r pod host; do echo "Pod ${pod} on host ${host}"; kubectl -n kube-system exec "$pod" -c kubedns -- cat /etc/resolv.conf; done

Ingress

Check responsiveness of Ingress Controller

kubectl -n ingress-nginx get pods -l app=ingress-nginx -o custom-columns=POD:.metadata.name,NODE:.spec.nodeName,IP:.status.podIP --no-headers | while read ingresspod nodename podip; do echo "=> Testing from ${ingresspod} on ${nodename} (${podip})"; curl -o /dev/null --connect-timeout 5 -s -w 'Connect: %{time_connect}\nStart Transfer: %{time_starttransfer}\nTotal: %{time_total}\nResponse code: %{http_code}\n' -k http://${podip}/healthz; done 

Add packet loss of 40% to one node running Ingress controller

tc qdisc add dev eth0 root netem loss 40% && sleep 120 && tc qdisc del dev eth0 root netem loss 40%

Check responsiveness Ingress -> Pods

kubectl run nginx --image=nginx --port=80 --expose
# Minimal Ingress routing foo.bar.com/ to the `nginx` Service created above.
# NOTE(review): extensions/v1beta1 Ingress was removed in Kubernetes 1.22;
# on current clusters use networking.k8s.io/v1, where the backend is written
# as `service: {name: nginx, port: {number: 80}}`. Kept as-is here because
# the jsonpath queries in this document read the v1beta1 field names.
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: nginx-ingress
  annotations:
    nginx.ingress.kubernetes.io/rewrite-target: /
spec:
  rules:
  - host: foo.bar.com
    http:
      paths:
      - path: /
        backend:
          serviceName: nginx
          servicePort: 80
# From every ingress-nginx pod, curl every endpoint of every Ingress-backed
# Service to map out which ingress node cannot reach which endpoint node.
# Reconstructed: the original was wrapped mid-token by the page ("get i/ng",
# "%{/time_total}"); also fixes the "connectto" typo, adds `read -r`, and
# quotes all expansions.
kubectl -n ingress-nginx get pods -l app=ingress-nginx -o custom-columns=POD:.metadata.name,NODE:.spec.nodeName,IP:.status.podIP --no-headers | \
  while read -r ingresspod nodename podip; do
    echo "=> Testing from ${ingresspod} on ${nodename} (${podip})"
    kubectl -n default get ing -o custom-columns=NAMESPACE:.metadata.namespace,HOST:.spec.rules[].host,SERVICE:.spec.rules[].http.paths[].backend.serviceName --no-headers | \
      while read -r namespace host service; do
        echo "==> Found host ${host} with service ${service} in ${namespace}"
        kubectl -n "$namespace" get ep "$service" -o go-template='{{range .subsets}}{{range .addresses}}{{ .ip}}{{" "}}{{ .nodeName}}{{"\n"}}{{end}}{{end}}' | \
          while read -r ep epnodename; do
            echo "==> Connecting to ${ep} on ${epnodename}"
            if kubectl -n ingress-nginx exec "$ingresspod" -- curl -o /dev/null --connect-timeout 5 -s -w 'Connect:%{time_connect}\nStart Transfer: %{time_starttransfer}\nTotal: %{time_total}\nResponse code: %{http_code}\n' --resolve "${host}:80:${ep}" "http://${host}:80"; then
              echo OK
            else
              echo "FAIL: ${nodename} cannot connect to ${epnodename}"
            fi
          done
      done
  done

Check static NGINX config

for pod in $(kubectl -n ingress-nginx get pods -l app=ingress-nginx -o custom-columns=NAME:.metadata.name --no-headers); do kubectl -n ingress-nginx exec $pod -- cat /etc/nginx/nginx.conf; done

Use checksum to find differences

# Checksum each controller's rendered nginx.conf to spot config drift.
# NOTE(review): `md5` is the macOS digest tool; on Linux use `md5sum` instead.
for pod in $(kubectl -n ingress-nginx get pods -l app=ingress-nginx -o custom-columns=NAME:.metadata.name --no-headers); do echo $pod; kubectl -n ingress-nginx exec $pod -- cat /etc/nginx/nginx.conf | md5; done

Exclude instance specific and randomized lines

# Same checksum, but first strip lines that legitimately differ per instance
# (resolver/nameserver config and the randomized "PEM sha" line) so only real
# drift changes the hash. The three chained `grep -v` passes are collapsed
# into one grep with multiple -e patterns.
# NOTE(review): `md5` is the macOS digest tool; on Linux use `md5sum` instead.
for pod in $(kubectl -n ingress-nginx get pods -l app=ingress-nginx -o custom-columns=NAME:.metadata.name --no-headers); do echo "$pod"; kubectl -n ingress-nginx exec "$pod" -- cat /etc/nginx/nginx.conf | grep -v -e nameservers -e resolver -e "PEM sha" | md5; done

Check dynamic NGINX config

for pod in $(kubectl -n ingress-nginx get pods -l app=ingress-nginx -o custom-columns=NAME:.metadata.name --no-headers); do echo $pod; kubectl -n ingress-nginx exec $pod -- curl -s http://127.0.0.1:18080/configuration/backends; done

Use checksum to view differences

# Checksum each controller's dynamic backend configuration (served on the
# local status port 18080) to compare pods.
# NOTE(review): `md5` is the macOS digest tool; on Linux use `md5sum` instead.
for pod in $(kubectl -n ingress-nginx get pods -l app=ingress-nginx -o custom-columns=NAME:.metadata.name --no-headers); do echo $pod; kubectl -n ingress-nginx exec $pod -- curl -s http://127.0.0.1:18080/configuration/backends | md5; done

Pretty print using jq

for pod in $(kubectl -n ingress-nginx get pods -l app=ingress-nginx -o custom-columns=NAME:.metadata.name --no-headers); do echo $pod; kubectl -n ingress-nginx exec $pod -- curl -s http://127.0.0.1:18080/configuration/backends | jq .; done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment