Last active February 6, 2024 09:49
# routes
oc get routes --all-namespaces
# get the image shas from a release
oc adm release info --pullspecs | awk '{print " - " $2}'
# use butane for machine configs
# p&f issues
grep -r '' 'gcp-audit/quay-io-openshift-release-dev-ocp-v4-0-art-dev-sha256-3090cd5333971e522a2cb54e8586308cb5388e8c8011ffbdfd8305db0b0d8a41/audit_logs/kube-apiserver' | grep "watch=true" | grep 429 | grep -v '"userAgent":"kubelet' | grep '"username"' | wc -l
# query loki for etcd
sum by (instance) (rate({job="etcd"} | json | duration > 400ms [1m]))
kinit sbatsche@REDHAT.COM
# debug no api
# top against node
oc adm top node
# etcd pprof via crictl: fetch the heap profile from the etcd container's
# debug endpoint (using the ETCDCTL_* cert variables set inside the container)
# and save it on the host as /tmp/heap
crictl exec -it b2c59200242fe sh -c 'curl --cert $ETCDCTL_CERT --key $ETCDCTL_KEY --cacert $ETCDCTL_CACERT https://localhost:2379/debug/pprof/heap' > /tmp/heap
# atop
atop -r atop -1 -f -D -p
# count resources etcd
etcdctl get / --prefix --keys-only | grep -oP "(?<=/\/).+?(?=\/)" | sort | uniq -c
# custom dep
replace => v0.0.0-20200729104859-26a400dab398
# backup example
oc get cm cluster-backup-pod -n openshift-etcd -o "jsonpath={.data['backup-pod\.yaml']}"
# node reboots
oc get events -o json | jq '.items[] | select((.reason=="Rebooted") or .reason=="Reboot") | .lastTimestamp + " -> " + .reason + " -> " + + " -> " + .message'
# parse events
curl -s | jq '.items[] |select(.reason=="UnhealthyEtcdMember") | .message'
# list leases
for lease in $(etcdctl lease list);do etcdctl lease timetolive $lease; done
for token in $(etcdctl get --prefix / --keys-only | sort -u);do etcdctl get $token -w json | etcdctl lease timetolive $(printf "%x\n" $(python -c 'import json,sys;print json.load(sys.stdin)["kvs"][0]["lease"]'));done
grep '"verb":' exlode | sort | uniq
# list event reasons
cat events.json | grep '\"reason\": \"' | sort -u -k2 | awk '{ print $2 }'
# base tel metrics
# metrics join
id_version_ebs_account_internal:cluster_subscribed + on(_id) (topk(1,cluster_version{type="current",version=~"4.\\d+\\.\\d+"}))
# more joins
id_version_ebs_account_internal:cluster_subscribed{managed="true"} + on(version) group_left() (topk(1,cluster_version{type="current",version=~"4.\\d+\\.\\d+"}))
# big join
label_replace(count(max by (_id) ((cluster_operator_up{name="ingress"} == 0) and on (_id) (cluster_operator_up{name="authentication"} == 0))), "which", "both", "mode", "") or
label_replace(count(max by (_id) ((cluster_operator_up{name="authentication"} == 0))), "which", "authentication", "mode", "") or
label_replace(count(max by (_id) ((cluster_operator_up{name="ingress"} == 0))), "which", "ingress", "mode", "")
# more prom
count by (version) (id_version_ebs_account_internal:cluster_subscribed + on(version) group_left(_blah) 0*(topk(1, cluster_version{type="current",version=~"4\\.\\d+\\.\\d+"})))
# parse events
jq '.items[] | select(.source.component=="kube-apiserver-operator-revisioncontroller" and .reason=="RevisionTriggered") | {lastTimestamp: .lastTimestamp, message: .message, source: .source }' events.json
### vsphere get console logs
Navigate to datastore view -> click datastore -> files -> <virtual machine name> directory -> click "serial.log" -> click "download"
oc rsh -n openshift-etcd $(oc get pods -n openshift-etcd -o jsonpath='{.items[0]}')
crictl ps -a --label "io.kubernetes.pod.namespace=openshift-etcd" -o json | jq -r '.containers[].id' | xargs -n1 crictl logs
crictl ps -a --label "io.kubernetes.pod.namespace=openshift-etcd" -o json | jq -r '.containers[] |"\(.id) \("' | xargs -n 2 bash -c 'crictl logs -t $0 &> $1-$0.log'
# cleanup
find ~/clusters/{aws,gcp,azure} -mindepth 2 -maxdepth 2 -type d -exec bash -c '$1/bin/openshift-install --dir "$1" destroy cluster' _ {} \;
# upgrade to the release image named by $UPGRADE_RELEASE
# (the flag and its value must be separated by "=")
oc adm upgrade --to-image="${UPGRADE_RELEASE}" --force
# download CI run
wget -r -e robots=off -np -H
# grep kube log
sed -rn 's/^(\w+ [0-9]{1,2} [0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}) ([^ ]+) ([^ ]+): [^ ]+ ([^ ]+) .* (Readiness probe for "etcd-quorum-guard-[[:alnum:]]+-[[:alnum:]]+).* (\w+$)/\1 \2 \3 \4 \5 \6/p' kubelet_service.log
# grep kube newer
sed -rn 's/(\w+ [0-9]{1,2} [0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}.[0-9]+) ([^ ]+) ([^ ]+): [^ ]+ ([^ ]+) .* (Readiness probe for "etcd-quorum-guard-[[:alnum:]]+-[[:alnum:]]+).* (.*+$)/\1 \2 \6/p' kubelet_service.log | sort -k 2 | grep failure
# nicer way
cat host_service_logs/masters/kubelet_service.log | awk '/Readiness probe for \"etcd-quorum-guard.*/ { print $1, $2, $3, $4, $10, $11, $13, $14; }'
# check for crashing containers
oc get po -A | grep -vE "(Running|Completed)"
# patch to see upgrade paths
oc patch clusterversion/version --patch '{"spec":{"upstream":""}}' --type=merge
# update latest on cluster
podman pull --authfile=/var/lib/kubelet/config.json $image
# grab MCO MachineConfig for etcd
oc get machineconfig 00-master -o jsonpath='{[?(@.path=="/etc/kubernetes/manifests/etcd-member.yaml")]contents.source}'
# grab ci runs
gsutil -m cp -r gs://origin-ci-test/logs/canary-openshift-ocp-installer-e2e-azure-4.2/290/ .
# vsphere install
# merge json pullsecrets
# merge all json in dir into 1 file
# jq -s slurps every input file into one array; the filter then flattens the
# values of each document into a single list written to manifest.json
jq -s '[.[][]]' *.json > manifest.json
# podman build rhel
sudo podman build --authfile=./PULL_SECRET_LOCATION -f images/tests/Dockerfile.rhel .
# search CI errors*API+data+in+etcd.*&maxAge=336h&context=2&type=all
# oc debug node
oc debug node/
chroot /host
## create/simulate latency
dmsetup create delayed
## watch keyspace and print counts by resource
ETCDCTL_API=3 etcdctl watch / --prefix -w fields > out | watch 'cat out | grep -oP "(?<=/\/).+?(?=\/)" | sort | uniq -c'
# convert to decimal
printf "%.2f" 7.516192768e+09
# p99 raw
echo $(( $(printf "%.f" 2.127424e+06) *99/100 ))
## refresh token
oc registry login --to=PULL_SECRET_LOCATION
## get IP of node
oc get node -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}{"\n"}'
## list registries
oc get secret pull-secret -n openshift-config -o jsonpath={.data.'\.dockerconfigjson'} | base64 -d | jq -r '.auths | to_entries[]' | jq -r '.key'
## release
# print the installer imagestream's image reference up to (not including) the
# first "@"; the jsonpath is quoted so the shell does not glob on "[0]"
oc get imagestream installer -n openshift -o jsonpath='{.status.tags[0].items[0].dockerImageReference}' | grep -Eo '^[^@]+'
## image
oc adm release info --image-for kube-etcd-signer-server --registry-config=./PULL_SECRET_LOCATION
# check cluster version
oc --config=${INSTALL_DIR}/auth/kubeconfig get clusterversion -oyaml
# etcd logs
master-logs etcd etcd &> etcd_server.log
# etcd related tasks
# external testing 2 stage docker
FROM openshift/origin-release:golang-1.10 AS builder
# openshift 4
CONTAINER=$(runc list | grep `pgrep etcd` | awk '{print $1}'); runc exec $CONTAINER etcd --version
# etcd
oc get pods --all-namespaces | grep etcd
# get image of container
oc get pod -o "jsonpath={range .status.containerStatuses[*]}{.name}{'\t'}{.state}{'\t'}{.image}{'\n'}{end}" -n kube-system etcd-member-ip-10-0-18-84.ec2.internal
# send to docker hub with podman
sudo podman push --authfile ~sbatsche/.docker/config.json localhost/machine-config-operator:v3.11.0-699-g100373ce-dirty hexfusion/machine-config-operator:latest
# give kube-system perms for operator
oc create clusterrolebinding etcd_operator --clusterrole=cluster-admin --serviceaccount=kube-system:default
# location of certs on bootstrap
# MCO regen
go test ./pkg/controller/template/... -u
my guess is
on bootstrap:
etcd-metrics-ca content is empty
in cluster:
etcd-metrics-ca config maps exists and is non-empty.
a good way would be
use the release image generated for your PR in CI and use it to create a cluster using the installer.
how to find release image:
```2019/03/06 20:48:58 Create release image```
then `OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE=<release-image> create cluster`
and on bootstrap node check `/etc/mcs/machine-configs` and compare them with `oc get machine-configs` the generated ones
sudo podman run --quiet --rm image kube-client-agent
# check cert
openssl x509 -text -noout -in cert.pem
# check cert with output from oc
oc get cm -n openshift-config-managed csr-controller-ca -o json | jq -r '.data["ca-bundle.crt"]' | openssl x509 -text -noout -in -
# check CA created cert
openssl verify -verbose -x509_strict -CAfile ca.crt somecert.crt
# check csr
openssl req -text -noout -verify -in test.pem
# verify that domain.crt chains to the CA in ca.crt
# (openssl option names are case-sensitive: -CAfile, not -CAFile)
openssl verify -verbose -CAfile ca.crt domain.crt
# decode base64-encoded certs.
# For each of tls.crt / tls.key in the etcd-metrics-proxy-client secret, take
# the value column from the YAML output and write the decoded bytes to
# etcd-metrics-proxy-client.<ext> in the current directory.
for ext in crt key; do
  oc -n openshift-config get secrets etcd-metrics-proxy-client -o yaml \
    | grep "tls.${ext}" \
    | awk '{ print $2 }' \
    | base64 --decode > "etcd-metrics-proxy-client.${ext}"
done
### etcd watch keyspace
ETCDCTL_API=3 etcdctl watch / --prefix -w fields
### build latest CI release.
$ oc login and get link
## go to site
## get a release IE
## create cluster
# docker login
docker login -u hexfusion -p $(oc whoami -t)
# release
docker run -it -v $(pwd)/ci-operator:/ci-operator:z --from-dir /ci-operator/config/ --to-dir /ci-operator/job
# exec into etcd
id=$(sudo crictl ps --name etcd-member | awk 'FNR==2{ print $1}') && sudo crictl exec -it $id /bin/sh
# export certs
# The -name patterns are quoted so the shell passes them to find untouched
# instead of expanding them against files in the current directory.
export ETCDCTL_API=3 ETCDCTL_CACERT=/etc/ssl/etcd/ca.crt ETCDCTL_CERT="$(find /etc/ssl/ -name '*peer*crt')" ETCDCTL_KEY="$(find /etc/ssl/ -name '*peer*key')"
# use etcdctl
ETCDCTL_API=3 etcdctl --cert=$(find / -name 'system:etcd-peer*.crt') --key=$(find / -name 'system:etcd-peer*.key') --cacert=$(find / -name 'ca.crt') member list
# print cluster ID.
etcdctl member list -w fields | grep -oP '(?<=ClusterID\"\s:\s).*' | xargs printf '%x\n'
# list a records from SRV
dig +noall +answer SRV _etcd-server-ssl._tcp.hexfusion.local | grep -oP '(?<=2380 ).*[^\.]' | xargs| sed -e 's/ /,/g'
# list ipv4 address
ip -o addr | grep -oP '(?<=inet )(\d{1,3}\.?){4}'
# size of secrets
etcdctl get --prefix / -w fields | grep -oP '(?<=Value\" : ")(.*)' | wc -c
# selinux
ausearch -m avc -c etcd
# create initial_cluster.
etcdctl member list -w json | jq -r '.members[] | [.name,.peerURLs[0]] | "\(.[0])=\(.[1])" ' | xargs | sed -e 's/ /,/g'
# Operator debug
# extract the payload for what CVO manages.
oc adm release extract --to=release-image
# Run git through the alpine/git container image via podman.
# Arguments: passed through unchanged to the container's entrypoint.
# The current directory is bind-mounted at /root and at /git (the latter with
# the ":z" volume option); the container is removed when the command exits.
# Runs in a subshell, so the function's status is podman's exit status.
function git () {
  (sudo podman run --rm -v "${PWD}:/root" --volume "${PWD}:/git:z" alpine/git "$@")
}
# regex to parse kubelet
# cleanup clusters
find ./clusters/{aws,gcp} -maxdepth 1 -type d -exec bash -c 'openshift-install --dir "$1" destroy cluster' _ {} \;
dump prom

mkdir -p $ARTIFACT_DIR/metrics
echo "Snapshotting prometheus ..."
oc --insecure-skip-tls-verify exec -n openshift-monitoring prometheus-k8s-0 -- tar cvzf - -C /prometheus .  >$ARTIFACT_DIR/metrics/prometheus.tar.gz

sum(container_memory_usage_bytes{image!="",}) by (namespace)

get commit sha of release image

$ sudo podman pull --authfile  ~/.PULL_SECRET_BUILD  $(oc get pods -n openshift-etcd-operator -o json | jq  -r '.items[].spec.containers[0].env[] | select(.name=="OPERATOR_IMAGE")'.value)

Trying to pull
Getting image source signatures
Copying blob 4fbc3bafa3d4 skipped: already exists  
Copying blob 34971b2d1eb9 skipped: already exists  
Copying blob 2ccc210e15d6 done  
Copying blob ca166bc0bd99 done  
Copying blob 685a3b67eda6 done  
Copying config f04b9935bd done  
Writing manifest to image destination
Storing signatures

git checkout $(sudo podman inspect 84a4766806bb56652311518e75c8d8e9b77b8f16c0662c6f9052c254427b24c0 | jq -r .[].Labels.\"\")

$ git log -1 -p -m ":/cache the client based on the endpoints to avoid reconstruction"

  POST /test-index/events
      "firstTimestamp":{ "type" : "date" },
      "count":{ "type" : "int" },
          "kind":{"type" : "text"},
          "name":{"type" : "text"},
          "uuid":{"type" : "text"}
      "kind":{"type" : "text" },
      "lastTimestamp":{"type" : "date" },
      "message":{ "type" :"text" },
          "creationTimestamp":{"type": "date"},
          "name":{"type": "text"},
          "namespace":{"type": "text"},
          "resourceVersion":{"type": "text"},
          "selfLink":{"type": "text"},
          "uuid":{"type": "text"}
      "reason":{ "type":"text" },
      "reportingComponent":{ "type":"text" },
      "reportingInstance" :{ "type":"text" },
          "component":{"type": "text"},
          "host":{"type": "text"}

dump prom

#!/usr/bin/env bash
# Run a command in the background, writing its stdout to a target file, while
# holding off when 45 or more background jobs are already listed.
# Globals:   FILTER (read, optional) - name of a command to pipe the output
#            through before it is written to the target.
# Arguments: $1 - path of the output file; $2.. - the command and its arguments.
# Outputs:   echoes the command line to stdout before starting it.
function queue() {
  local TARGET="${1}"
  # Drop the target so "${@}" holds only the command to run.
  shift
  local LIVE
  LIVE="$(jobs | wc -l)"
  # Poll once a second until the job count drops below the limit.
  while [[ "${LIVE}" -ge 45 ]]; do
    sleep 1
    LIVE="$(jobs | wc -l)"
  done
  echo "${@}"
  if [[ -n "${FILTER:-}" ]]; then
    "${@}" | "${FILTER}" >"${TARGET}" &
  else
    "${@}" >"${TARGET}" &
  fi
}
# Snapshot the Prometheus data directory and its target metadata into
# ${ARTIFACT_DIR}/metrics using the queue helper.
mkdir -p "${ARTIFACT_DIR}/metrics"
# NOTE(review): the label selector is empty here; fill in the label that
# identifies the prometheus container before relying on this lookup.
id=$(crictl ps -q --label "")
if [ -z "$id" ]; then
  echo "prom container not found.."
else
  echo "Snapshotting prometheus (may take 15s) ..."
  queue "${ARTIFACT_DIR}/metrics/prometheus.tar.gz" crictl exec "$id" tar cvzf - -C /prometheus .
  FILTER=gzip queue "${ARTIFACT_DIR}/metrics/prometheus-target-metadata.json.gz" crictl exec "$id" /bin/bash -c "curl -G http://localhost:9090/api/v1/targets/metadata --data-urlencode 'match_target={instance!=\"\"}'"
  # queue starts its commands in the background; block until they finish.
  wait
fi

