Skip to content

Instantly share code, notes, and snippets.

@hexfusion
Last active February 6, 2024 09:49
Show Gist options
  • Save hexfusion/0ac019af41a19e8e1fe62f6cfb7435cc to your computer and use it in GitHub Desktop.
Save hexfusion/0ac019af41a19e8e1fe62f6cfb7435cc to your computer and use it in GitHub Desktop.
# routes
oc get routes --all-namespaces
# get the image shas from a release
oc adm release info quay.io/openshift-release-dev/ocp-release:4.14.10-x86_64 --pullspecs | awk '{print " - " $2}'
# use butane for machine configs
https://docs.openshift.com/container-platform/4.8/installing/install_config/installing-customizing.html
# p&f issues
grep -r 'apiserver.openshift.io/watch-rate-limit' 'gcp-audit/quay-io-openshift-release-dev-ocp-v4-0-art-dev-sha256-3090cd5333971e522a2cb54e8586308cb5388e8c8011ffbdfd8305db0b0d8a41/audit_logs/kube-apiserver' | grep "watch=true" | grep 429 | grep -v '"userAgent":"kubelet' | grep '"username"' | wc -l
# query loki for etcd
sum by (instance) (rate({job="etcd"} | json | duration > 400ms [1m]))
#
kinit sbatsche@REDHAT.COM
# debug no api
https://docs.openshift.com/container-platform/4.6/support/gathering-cluster-data.html#support-generating-a-sosreport-archive_gathering-cluster-data
# top against node
oc adm top node
# etcd pprof crictl
crictl exec -it b2c59200242fe sh -c 'curl --cert $ETCDCTL_CERT --key $ETCDCTL_KEY --cacert $ETCDCTL_CACERT https://localhost:2379/debug/pprof/heap' > /tmp/heap
# atop
atop -r atop -1 -f -D -p
# count resources etcd
etcdctl get / --prefix --keys-only | grep -oP "(?<=/kubernetes.io\/).+?(?=\/)" | sort | uniq -c
# custom dep
replace github.com/openshift/library-go => github.com/hexfusion/library-go v0.0.0-20200729104859-26a400dab398
# backup example
oc get cm cluster-backup-pod -n openshift-etcd -o "jsonpath={.data['backup-pod\.yaml']}"
# node reboots
oc get events -o json | jq '.items[] | select((.reason=="Rebooted") or .reason=="Reboot") | .lastTimestamp + " -> " + .reason + " -> " + .involvedObject.name + " -> " + .message'
# parse events
curl -s https://storage.googleapis.com/origin-ci-test/pr-logs/pull/openshift_cluster-etcd-operator/350/pull-ci-openshift-cluster-etcd-operator-master-e2e-aws/1463/artifacts/e2e-aws/gather-extra/events.json | jq '.items[] |select(.reason=="UnhealthyEtcdMember") | .message'
# list leases
for lease in $(etcdctl lease list);do etcdctl lease timetolive $lease; done
for token in $(etcdctl get --prefix /openshift.io/oauth/accesstokens --keys-only | sort -u);do etcdctl get $token -w json | etcdctl lease timetolive $(printf "%x\n" $(python -c 'import json,sys;print json.load(sys.stdin)["kvs"][0]["lease"]'));done
grep '"verb":' exlode | sort | uniq
# list event reasons
cat events.json | grep '\"reason\": \"' | sort -u -k2 | awk '{ print $2 }'
# base tel metrics
id_version_ebs_account_internal:cluster_subscribed
# metrics join
id_version_ebs_account_internal:cluster_subscribed + on(_id) (topk(1,cluster_version{type="current",version=~"4.\\d+\\.\\d+"}))
# more joins
id_version_ebs_account_internal:cluster_subscribed{managed="true"} + on(version) group_left() (topk(1,cluster_version{type="current",version=~"4.\\d+\\.\\d+"}))
# big join
label_replace(count(max by (_id) ((cluster_operator_up{name="ingress"} == 0) and on (_id) (cluster_operator_up{name="authentication"} == 0))), "which", "both", "mode", "") or
label_replace(count(max by (_id) ((cluster_operator_up{name="authentication"} == 0))), "which", "authentication", "mode", "") or
label_replace(count(max by (_id) ((cluster_operator_up{name="ingress"} == 0))), "which", "ingress", "mode", "")
# more prom
count by (version) (id_version_ebs_account_internal:cluster_subscribed + on(version) group_left(_blah) 0*(topk(1, cluster_version{type="current",version=~"4\\.\\d+\\.\\d+"})))
# parse events
jq '.items[] | select(.source.component=="kube-apiserver-operator-revisioncontroller" and .reason=="RevisionTriggered") | {lastTimestamp: .lastTimestamp, message: .message, source: .source }' events.json
### vsphere get console logs
Navigate to datastore view -> click datastore -> files -> <virtual machine name> directory -> click "serial.log" -> click "download"
oc rsh -n openshift-etcd $(oc get pods -n openshift-etcd -o jsonpath='{.items[0].metadata.name}')
crictl ps -a --label "io.kubernetes.pod.namespace=openshift-etcd" -o json | jq -r '.containers[].id' | xargs -n1 crictl logs
crictl ps -a --label "io.kubernetes.pod.namespace=openshift-etcd" -o json | jq -r '.containers[] |"\(.id) \(.metadata.name)"' | xargs -n 2 bash -c 'crictl logs -t $0 &> $1-$0.log'
# cleanup
find ~/clusters/{aws,gcp,azure} -mindepth 2 -maxdepth 2 -type d -exec bash -c '$1/bin/openshift-install --dir "$1" destroy cluster' _ {} \;
# upgrade
oc adm upgrade --to-image registry.svc.ci.openshift.org/ocp/release:$UPGRADE_RELEASE --force
# download CI run
wget -r -e robots=off -np -H https://gcsweb-ci.svc.ci.openshift.org/gcs/origin-ci-test/pr-logs/pull/24458/pull-ci-openshift-origin-master-e2e-aws-serial/12333/artifacts/e2e-aws-serial/
# grep kube log
sed -rn 's/^(\w+ [0-9]{1,2} [0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}) ([^ ]+) ([^ ]+): [^ ]+ ([^ ]+) .* (Readiness probe for "etcd-quorum-guard-[[:alnum:]]+-[[:alnum:]]+).* (\w+$)/\1 \2 \3 \4 \5 \6/p' kubelet_service.log
# grep kube newer
sed -rn 's/(\w+ [0-9]{1,2} [0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}.[0-9]+) ([^ ]+) ([^ ]+): [^ ]+ ([^ ]+) .* (Readiness probe for "etcd-quorum-guard-[[:alnum:]]+-[[:alnum:]]+).* (.*+$)/\1 \2 \6/p' kubelet_service.log | sort -k 2 | grep failure
# nicer way
cat host_service_logs/masters/kubelet_service.log | awk '/Readiness probe for \"etcd-quorum-guard.*/ { print $1, $2, $3, $4, $10, $11, $13, $14; }'
# check for crashing containers
oc get po -A | grep -vE "(Running|Completed)"
# patch to see upgrade paths
oc patch clusterversion/version --patch '{"spec":{"upstream":"https://openshift-release.svc.ci.openshift.org/graph"}}' --type=merge
# update latest on cluster
podman pull --authfile=/var/lib/kubelet/config.json $image
# grab MCO MachineConfig for etcd
oc get machineconfig 00-master -o jsonpath='{.spec.config.storage.files[?(@.path=="/etc/kubernetes/manifests/etcd-member.yaml")].contents.source}'
# grab ci runs
gsutil -m cp -r gs://origin-ci-test/logs/canary-openshift-ocp-installer-e2e-azure-4.2/290/ .
# vsphere install
https://github.com/openshift/installer/tree/master/upi/vsphere
https://vcsa.vmware.devcluster.openshift.com/ui
# merge json pullsecrets
jq -s '.[0] * .[1]' CORE_PULL_SECRET CI_PULL_SECRET &> MASTER_PULL_SECRET
# merge all json in dir into 1 file
jq -s '[.[][]]' *.json > manifest.json
# podman build rhel
sudo podman build --authfile=./PULL_SECRET_LOCATION -f images/tests/Dockerfile.rhel .
# search CI errors
https://ci-search-ci-search-next.svc.ci.openshift.org/?search=failed%3A.*API+data+in+etcd.*&maxAge=336h&context=2&type=all
# oc debug node
oc debug node/ip-10-0-137-127.us-east-2.compute.internal
chroot /host
## create/simulate latency
## https://www.enodev.fr/posts/emulate-a-slow-block-device-with-dm-delay.html
dmsetup create delayed
## watch keyspace and print counts by resource
ETCDCTL_API=3 etcdctl watch / --prefix -w fields > out | watch 'cat out | grep -oP "(?<=/kubernetes.io\/).+?(?=\/)" | sort | uniq -c'
# convert to decimal
printf "%.2f" 7.516192768e+09
# p99 raw
echo $(( $(printf "%.f" 2.127424e+06) *99/100 ))
## refresh token
oc registry login --to=PULL_SECRET_LOCATION
## get IP of node
oc get node ip-10-0-143-125.us-east-2.compute.internal -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}{"\n"}'
## list registries
oc get secret pull-secret -n openshift-config -o jsonpath={.data.'\.dockerconfigjson'} | base64 -d | jq -r '.auths | to_entries[]' | jq -r '.key'
## release
oc get imagestream installer -n openshift -o jsonpath={.status.tags[0].items[0].dockerImageReference} | egrep -o '^[^@]+'
## image
oc adm release info --image-for kube-etcd-signer-server --registry-config=./PULL_SECRET_LOCATION
# check cluster version
oc --config=${INSTALL_DIR}/auth/kubeconfig get clusterversion -oyaml
# etcd logs
master-logs etcd etcd &> etcd_server.log
# etcd related tasks
https://docs.openshift.com/container-platform/3.11/day_two_guide/host_level_tasks.html#day-two-guide-etcd-backup
# external testing 2 stage docker
# https://mojo.redhat.com/docs/DOC-1178565?sr=search&searchId=429ba108-213b-4b81-87f2-b667aca3e228&searchIndex=0
FROM openshift/origin-release:golang-1.10 AS builder
# openshift 4
CONTAINER=$(runc list | grep `pgrep etcd` | awk '{print $1}'); runc exec $CONTAINER etcd --version
# etcd
oc get pods --all-namespaces | grep etcd
# get image of container
oc get pod -o "jsonpath={range .status.containerStatuses[*]}{.name}{'\t'}{.state}{'\t'}{.image}{'\n'}{end}" -n kube-system etcd-member-ip-10-0-18-84.ec2.internal
# send to docker hub with podman
sudo podman push --authfile ~sbatsche/.docker/config.json localhost/machine-config-operator:v3.11.0-699-g100373ce-dirty hexfusion/machine-config-operator:latest
# give kube-system perms for operator
oc create clusterrolebinding etcd_operator --clusterrole=cluster-admin --serviceaccount=kube-system:default
# location of certs on bootstrap
/var/opt/openshift/tls
/sysroot/ostree/deploy/redhat-coreos-maipo/var/opt/openshift/tls/
# MCO regen
go test ./pkg/controller/template/... -u
my guess is
on bootstrap:
https://github.com/openshift/machine-config-operator/pull/517/files#diff-8fb88a4862bafc203a34072446df1407R52
etcd-metrics-ca content is empty
in cluster:
https://github.com/openshift/machine-config-operator/pull/517/files#diff-554de5523753fda8c93e7008c9bd947fR287
etcd-metrics-ca config maps exists and is non-empty.
a good way would be
use the release image generated for your PR in CI and use it to create a cluster using the installer.
how to find release image:
```2019/03/06 20:48:58 Create release image registry.svc.ci.openshift.org/ci-op-l4qnhir7/release:latest```
from https://openshift-gce-devel.appspot.com/build/origin-ci-test/pr-logs/pull/openshift_machine-config-operator/517/pull-ci-openshift-machine-config-operator-master-e2e-aws/2369?log#log
then `OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE=<release-image> create cluster`
and on bootstrap node check `/etc/mcs/machine-configs` and compare them with `oc get machine-configs` the generated ones
##
sudo podman run --quiet --rm quay.io/hexfusion/origin-release:v4.0 image kube-client-agent
# check cert
openssl x509 -text -noout -in cert.pem
# check cert with output from oc
oc get cm -n openshift-config-managed csr-controller-ca -o json | jq -r '.data["ca-bundle.crt"]' | openssl x509 -text -noout -in -
# check CA created cert
openssl verify -verbose -x509_strict -CAfile ca.crt somecert.crt
# check csr
openssl req -text -noout -verify -in test.pem
# verify key was signed by csr
openssl verify -verbose -CAfile ca.crt domain.crt
# decode base64-encoded certs.
for ext in crt key; do oc -n openshift-config get secrets etcd-metrics-proxy-client -o yaml | grep tls.${ext} | awk '{ print $2 }' | base64 --decode > etcd-metrics-proxy-client.${ext}; done
### etcd watch keyspace
ETCDCTL_API=3 etcdctl watch / --prefix -w fields
### build latest CI release.
$ oc login and get link
## go to site https://openshift-release.svc.ci.openshift.org/
## get a release IE registry.svc.ci.openshift.org/ocp/release:4.1.0-0.ci-2019-04-29-142604
## OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE=registry.svc.ci.openshift.org/ocp/release:4.1.0-0.ci-2019-04-29-142604 create cluster
# docker login
docker login -u hexfusion -p $(oc whoami -t) registry.svc.ci.openshift.org
# release
docker run -it -v $(pwd)/ci-operator:/ci-operator:z registry.svc.ci.openshift.org/ci/ci-operator-prowgen:latest --from-dir /ci-operator/config/ --to-dir /ci-operator/job
# exec into etcd
id=$(sudo crictl ps --name etcd-member | awk 'FNR==2{ print $1}') && sudo crictl exec -it $id /bin/sh
# export certs
export ETCDCTL_API=3 ETCDCTL_CACERT=/etc/ssl/etcd/ca.crt ETCDCTL_CERT=$(find /etc/ssl/ -name *peer*crt) ETCDCTL_KEY=$(find /etc/ssl/ -name *peer*key)
# use etcdctl
ETCDCTL_API=3 etcdctl --cert=$(find / -name 'system:etcd-peer*.crt') --key=$(find / -name 'system:etcd-peer*.key') --cacert=$(find / -name 'ca.crt') member list
# print cluster ID.
etcdctl member list -w fields | grep -oP '(?<=ClusterID\"\s:\s).*' | xargs printf '%x\n'
# list a records from SRV
dig +noall +answer SRV _etcd-server-ssl._tcp.hexfusion.local | grep -oP '(?<=2380 ).*[^\.]' | xargs| sed -e 's/ /,/g'
# list ipv4 address
ip -o addr | grep -oP '(?<=inet )(\d{1,3}\.?){4}'
# size of secrets
etcdctl get --prefix /kubernetes.io/secrets -w fields | grep -oP '(?<=Value\" : ")(.*)' | wc -c
# selinux
ausearch -m avc -c etcd
# create initial_cluster.
etcdctl member list -w json | jq -r '.members[] | [.name,.peerURLs[0]] | "\(.[0])=\(.[1])" ' | xargs | sed -e 's/ /,/g'
#####
# Operator debug
#####
# extract the payload for what CVO manages.
oc adm release extract --from=quay.io/hexfusion/origin-release:v4.2 --to=release-image
#######################################
# Run git from the alpine/git container image via `sudo podman` instead of
# a locally installed git binary.
# The current working directory is bind-mounted twice: at /root, and at
# /git with the ":z" volume option.
# Arguments: all arguments are forwarded unchanged to the image's entrypoint.
# Returns:   the exit status of `sudo podman run`.
#######################################
function git () {
  # The mount specs are quoted so a working directory containing whitespace
  # stays a single argument instead of being word-split.
  sudo podman run --rm \
    -v "${PWD}:/root" \
    --volume "${PWD}:/git:z" \
    alpine/git "$@"
}
# regex to parse kubelet
https://gist.github.com/hexfusion/88e45f9d2c0ce6530bd4e3fa0bd9cfde
# cleanup clusters
find ./clusters/{aws,gcp} -maxdepth 1 -type d -exec bash -c 'openshift-install --dir "$1" destroy cluster' _ {} \;
@hexfusion
Copy link
Author

hexfusion commented Feb 19, 2020

dump prom

#!/bin/bash
# Snapshot the /prometheus directory of pod prometheus-k8s-0 (namespace
# openshift-monitoring) into metrics/prometheus.tar.gz under the current
# working directory.
ARTIFACT_DIR=$PWD
# Paths are quoted so a working directory containing whitespace still works.
mkdir -p "${ARTIFACT_DIR}/metrics" || exit 1
echo "Snapshotting prometheus ..."
# tar writes the archive to stdout inside the pod ("-f -"); the local
# redirect stores that stream as the tarball.
oc --insecure-skip-tls-verify exec -n openshift-monitoring prometheus-k8s-0 -- tar cvzf - -C /prometheus . >"${ARTIFACT_DIR}/metrics/prometheus.tar.gz"

@hexfusion
Copy link
Author

metrics

sum(container_memory_usage_bytes{image!="",}) by (namespace)

@hexfusion
Copy link
Author

get commit sha of release image

$ sudo podman pull --authfile  ~/.PULL_SECRET_BUILD  $(oc get pods -n openshift-etcd-operator -o json | jq  -r '.items[].spec.containers[0].env[] | select(.name=="OPERATOR_IMAGE")'.value)

Trying to pull registry.svc.ci.openshift.org/ocp/4.5-2020-03-16-085352@sha256:c6aac32c2ebb7fa9c3915617368f830473308450d53975644cc931801e60997a...
Getting image source signatures
Copying blob 4fbc3bafa3d4 skipped: already exists  
Copying blob 34971b2d1eb9 skipped: already exists  
Copying blob 2ccc210e15d6 done  
Copying blob ca166bc0bd99 done  
Copying blob 685a3b67eda6 done  
Copying config f04b9935bd done  
Writing manifest to image destination
Storing signatures
f04b9935bde071777630da63ebd1be66fdc692edeac8f67b69373e7d567c17cb

git checkout $(sudo podman inspect 84a4766806bb56652311518e75c8d8e9b77b8f16c0662c6f9052c254427b24c0 | jq -r .[].Labels.\"io.openshift.build.commit.id\")

$ git log -1 -p -m ":/cache the client based on the endpoints to avoid reconstruction"

@hexfusion
Copy link
Author

hexfusion commented Mar 29, 2020

  POST /test-index/events
 {
  "items":{
    "properties":{
      "firstTimestamp":{ "type" : "date" },
      "count":{ "type" : "int" },
      "involvedObject":{
        "properties":{
          "kind":{"type" : "text"},
          "name":{"type" : "text"},
          "uuid":{"type" : "text"}
        }
      },
      "kind":{"type" : "text" },
      "lastTimestamp":{"type" : "date" },
      "message":{ "type" :"text" },
      "metadata":{
        "properties":{
          "creationTimestamp":{"type": "date"},
          "name":{"type": "text"},
          "namespace":{"type": "text"},
          "resourceVersion":{"type": "text"},
          "selfLink":{"type": "text"},
          "uuid":{"type": "text"}
        }
      },
      "reason":{ "type":"text" },
      "reportingComponent":{ "type":"text" },
      "reportingInstance" :{ "type":"text" },
      "source":{
        "properties":{
          "component":{"type": "text"},
          "host":{"type": "text"}
        }
      }
    }
  }
}

@hexfusion
Copy link
Author

hexfusion commented Jul 11, 2020

dump prom

#!/usr/bin/env bash
#######################################
# Launch a command in the background with its stdout captured in a file,
# holding back while 45 or more jobs are already listed by `jobs`.
# Globals:   FILTER (read, optional) - when non-empty, the command's stdout
#            is piped through this command before being written.
# Arguments: $1 - file that receives the output
#            $2.. - command and arguments to run
# Outputs:   echoes the command line to stdout before starting it.
#######################################
function queue() {
  local out_file="${1}"
  shift
  local running
  # Re-count the job list once per second until it drops below the limit.
  until running="$(jobs | wc -l)"; [[ "${running}" -lt 45 ]]; do
    sleep 1
  done
  echo "${@}"
  if [[ -z "${FILTER:-}" ]]; then
    "${@}" >"${out_file}" &
  else
    "${@}" | "${FILTER}" >"${out_file}" &
  fi
}
# Snapshot prometheus data and target metadata from the local prometheus
# container (looked up with crictl) into metrics/ under the current directory.
ARTIFACT_DIR=$PWD
mkdir -p "${ARTIFACT_DIR}/metrics"
id=$(crictl ps -q --label "io.kubernetes.container.name=prometheus")
if [[ -z "${id}" ]]; then
  # Without a container id the crictl exec calls below cannot work, so stop
  # here instead of running them with an empty argument.
  echo "prom container not found.." >&2
  exit 1
fi
echo "Snapshotting prometheus (may take 15s) ..."
queue "${ARTIFACT_DIR}/metrics/prometheus.tar.gz" crictl exec "${id}" tar cvzf - -C /prometheus .
FILTER=gzip queue "${ARTIFACT_DIR}/metrics/prometheus-target-metadata.json.gz" crictl exec "${id}" /bin/bash -c "curl -G http://localhost:9090/api/v1/targets/metadata --data-urlencode 'match_target={instance!=\"\"}'"
# queue starts its commands in the background; wait so the script does not
# return before the output files have been completely written.
wait

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment