Skip to content

Instantly share code, notes, and snippets.

@hexfusion
Last active May 24, 2023 21:02
Show Gist options
  • Save hexfusion/dc9d82caa4b20ed8d1c29d01f9a57f0c to your computer and use it in GitHub Desktop.
Save hexfusion/dc9d82caa4b20ed8d1c29d01f9a57f0c to your computer and use it in GitHub Desktop.
# sniff hypervisor
dmesg | grep Hypervisor
[ 0.000000] Hypervisor detected: KVM
# get key in bytes
etcdctl get $key -w fields | grep -oP "(?<=Value\" : \").*" | wc -c
# defrag status in MB
cat etcd_info/endpoint_status.json | jq '(.[0].Status.dbSize - .[0].Status.dbSizeInUse)/1000/1000'
3095.384064
# count objects
sh-4.4# etcdctl get / --prefix --keys-only | sed '/^$/d' | cut -d/ -f3 | sort | uniq -c | sort -rn
# jq skip non json
# -R feeds each input line to jq as a raw string; `fromjson?` parses it and
# silently drops lines that are not valid JSON, then .query is extracted.
cat "$json" | jq -R 'fromjson? | .query'
# time range
cat $json | jq -R 'fromjson? | select((.ts >= "2021-04-14T02:48") and (.ts <= "2021-04-14T03:50"))'
# hh new revision
oc patch etcd cluster -p='{"spec": {"forceRedeploymentReason": "recovery-'"$( date --rfc-3339=ns )"'"}}' --type=merge
# parse protobuf
cat data-1/member/snap/*.snap | protoc --decode_raw
# debug
curl --cert $ETCDCTL_CERT --key $ETCDCTL_KEY --cacert $ETCDCTL_CACERT -L https://127.0.0.1:2379/config/local/log -XPUT -d '{"level": "error"}'
curl --cert $ETCDCTL_CERT --key $ETCDCTL_KEY --cacert $ETCDCTL_CACERT -L https://127.0.0.1:2379/config/local/log -XPUT -d '{"level": "debug"}'
# grab slow queries and leader elections
# Prints "<timestamp> <duration>" for "took too long (<duration>s" lines and
# "<timestamp> elected leader <id> at term <n>" for election lines.
# Seconds and fractional seconds use [0-9] so timestamps containing a 0
# (e.g. 02:48:05.100432) are matched and captured in full.
sed -rn \
  -e 's/.*\s([0-9]{4}-[0-9]{1,2}-[0-9]{2}\s[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{1,10}).*\btook too long \(([0-9]{1,4}\.[0-9]{1,12}s).*/\1 \2/p' \
  -e 's/.*\s([0-9]{4}-[0-9]{1,2}-[0-9]{2}\s[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{1,10}).*(elected leader [0-9a-z]{1,20} at term [0-9]{1,5}).*/\1 \2/p' \
  file
# metrics
grep -oP '(?<=took too long \().*(\d{1,6})' * | sort
# grafana dashboards
3070 # etcd default
https://github.com/cloud-bulldozer/arsenal/blob/master/openshift-performance-dashboard/grafana/on-cluster-latest.json
# run etcd metrics dump
curl https://gist.githubusercontent.com/hexfusion/f9a10ef97ca2bbd70b754a038c4e05c2/raw/9e9d2c877116c801417778e17027fc19d4798bd7/ocp4-etcd-get-metrics.sh | bash
# grab size from etcd metrics
grep -oP '(?<=size:)[0-9]+' | sort -n
# use bbolt to fix freelist
bbolt compact -o ./fixed.db ./snapshot-apa600001.db
# observe writes
echo 1 > /proc/sys/vm/block_dump
journalctl -f
# heavy compact
# Take the first "revision" number out of the endpoint status JSON and
# compact the keyspace up to that revision.
# grep -E replaces the obsolescent egrep alias.
rev=$(etcdctl3 endpoint status --write-out="json" | grep -Eo '"revision":[0-9]*' | grep -Eo -m1 '[0-9]*')
etcdctl3 compact "$rev"
# metrics
histogram_quantile(0.95, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (instance,le))
# number of daily leader elections
changes(etcd_server_leader_changes_seen_total{job="$etcd_name"}[1d])
# percentage of roundtrip below 6.4ms
sum(rate(etcd_network_peer_round_trip_time_seconds_bucket{le="0.0064"}[5m])) by (instance) / sum(rate(etcd_network_peer_round_trip_time_seconds_count[5m])) by (instance) * 100
# etcd_disk_backend_commit_duration_seconds_bucket lt 0.032 by percentage
sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{le="0.032"}[5m])) by (instance) / sum(rate(etcd_disk_backend_commit_duration_seconds_count[5m])) by (instance) * 100
# WAL fsync: the etcd FAQ says p99 should be under 10ms; this shows the percentage of fsyncs that complete within 16ms
sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{le="0.016"}[5m])) by (instance) / sum(rate(etcd_disk_wal_fsync_duration_seconds_count[5m])) by (instance) * 100
# number of watch streams
sum(grpc_server_started_total{grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"})
# number of lease streams
sum(grpc_server_started_total{grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"})
# RPC Failed Rate
sum(rate(grpc_server_handled_total{grpc_type="unary",grpc_code!="OK"}[5m]))
# RPC Rate
sum(rate(grpc_server_started_total{grpc_type="unary"}[5m]))
# DB Size
etcd_mvcc_db_total_size_in_bytes
# has leader
sum(etcd_server_has_leader)
# RSS
process_resident_memory_bytes{job="$etcd_name"}
# gRPC traffic in (Client)
rate(etcd_network_client_grpc_received_bytes_total{job="$etcd_name"}[5m])
# gRPC traffic out (Client)
rate(etcd_network_client_grpc_sent_bytes_total{job="$etcd_name"}[5m])
# gRPC traffic in (Peer)
sum(rate(etcd_network_peer_received_bytes_total{job="$etcd_name"}[5m])) by (instance)
# gRPC traffic out (Peer)
sum(rate(etcd_network_peer_sent_bytes_total{job="$etcd_name"}[5m])) by (instance)
# CPU %
rate(process_cpu_seconds_total{job="etcd"}[5m]) * 100
# number of CFS periods in which the container was throttled over the last 5m
increase(container_cpu_cfs_throttled_periods_total{container_name!="<your-container>",namespace="<your-namespace>"}[5m])
# etcd CPU total with cadvisor
# A label matcher needs a quoted value; pod_name!="" keeps only series whose
# pod_name label is non-empty.
rate(container_cpu_usage_seconds_total{container_name=~"etcd.*",pod_name!=""}[5m])
# RSS per namespace
process_resident_memory_bytes{endpoint="etcd-metrics",job="etcd",namespace="openshift-etcd",service="etcd"}
##### etcd log grep strings
E | # errors
C | # catastrophic
pkg/osutil: received terminated # killed
# kubelet
etcd-member.yaml": invalid pod: [spec.initContainers[0].image: Required value] # MCO not sending image
Started container etcd-member
mcdorig
podman run --volume "$PWD:/mount:z" docker.io/ljishen/fio /mount/etcd.fio > result
## fio config
[global]
name=custom
filename=/mount/custom-delete-me
rw=write
bs=2300
fdatasync=1
iodepth=128
[file1]
size=128M
ioengine=libaio
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
container_memory_rss{namespace="openshift-etcd", container="etcd-member"} / (1024 ^ 3)
# 4.4+
container_memory_rss{namespace="openshift-etcd", container="etcd"} / (1024 ^ 3)
sum(rate(etcd_server_leader_changes_seen_total[5m]))
#3.11 memory query
container_memory_rss{namespace=~"kube-system",pod_name=~"master-etcd.*",container_name="etcd"} / (1024 ^ 3)
@hexfusion
Copy link
Author

curl -s --cert $ETCDCTL_CERT --key $ETCDCTL_KEY --cacert $ETCDCTL_CACERT https://127.0.0.1:2379/debug/pprof/heap > heap

@hexfusion
Copy link
Author

#!/bin/bash
# Print, for every key under the podnetworkconnectivitychecks prefix, the
# number of bytes `etcdctl get` returns for it (wc -c over the full output).

# Read one key per line into the array instead of relying on word-splitting,
# so keys are never split on whitespace or expanded as globs. Blank lines in
# the --keys-only listing are dropped and duplicates removed.
mapfile -t LIST < <(
  etcdctl get --prefix /kubernetes.io/controlplane.operator.openshift.io/podnetworkconnectivitychecks/ --keys-only \
    | sed '/^$/d' \
    | sort -u
)

for key in "${LIST[@]}"; do
  echo "$key size: $(etcdctl get "$key" | wc -c)"
done

@hexfusion
Copy link
Author

dump prom 3.11

$ oc -n openshift-monitoring exec prometheus-k8s-0 -c prometheus -- /bin/bash \
    -c "tar cvJf - --warning=no-file-changed /prometheus 2>/dev/null" > prometheus.tar.xz
$ oc -n openshift-monitoring exec prometheus-k8s-1 -c prometheus -- /bin/bash \
    -c "tar cvJf - --warning=no-file-changed /prometheus 2>/dev/null" > prometheus.tar.xz

To import it and check for integrity, I spin a local Prometheus container with
the following:

$ sudo podman run --rm -it -p 9090:9090 -u 0 \
    -v $(pwd)/prometheus:/prometheus/:Z docker.io/prom/prometheus:v2.16.0

@hexfusion
Copy link
Author

hexfusion commented Oct 8, 2020

Inject latency

#!/bin/bash
# Inject fdatasync latency into an etcd container picked interactively.
#
# Builds a "<node> <container id>" line for the container named "etcd" in
# each pod matched by the `etcd` selector in openshift-etcd, lets the user
# choose one with fzf, looks up the container PID with crictl on that node,
# and attaches strace there to delay every fdatasync call on entry.

# .containerID[8:] drops the first 8 characters of the ID
# (presumably the "cri-o://" runtime prefix; verify on your cluster).
choice=$(oc get --namespace openshift-etcd --selector etcd pods -o json | jq -r '.items[] | .spec.nodeName + " " + (.status.containerStatuses[] | select(.name=="etcd") | .containerID[8:])' | fzf)

# Stop when nothing was selected (fzf cancelled, or no pods were listed)
# rather than running `oc debug` against an empty node name.
if [[ -z "$choice" ]]; then
  echo "no etcd container selected" >&2
  exit 1
fi

IFS=' ' read -r node container_id <<< "$choice"

pid=$(oc debug --quiet "nodes/$node" -- chroot /host crictl inspect -o go-template --template '{{.info.pid}}' "$container_id")
if [[ -z "$pid" ]]; then
  echo "could not resolve pid for container $container_id on $node" >&2
  exit 1
fi

# NOTE(review): delay_enter=800000 is microseconds per strace(1), i.e. 0.8s per call; confirm.
oc debug --quiet "nodes/$node" -- chroot /host strace -Tfe inject=fdatasync:delay_enter=800000 -e trace=fdatasync -p "$pid"

compose

#!/bin/bash
# Inject fdatasync latency into the first container listed by
# `docker-compose ps -q`, by attaching strace to its main PID.
set -x

# One container ID per line; read them without word-splitting or globbing.
mapfile -t CONTAINER_IDS < <(docker-compose ps -q)
if (( ${#CONTAINER_IDS[@]} == 0 )); then
  echo "no docker-compose containers found" >&2
  exit 1
fi

PID=$(docker inspect --format '{{ .State.Pid }}' "${CONTAINER_IDS[0]}")
echo "injecting latency into container id ${CONTAINER_IDS[0]}"

sudo strace -Tfe inject=fdatasync:delay_enter=2400000 -e trace=fdatasync -p "$PID"

@hexfusion
Copy link
Author

hexfusion commented Oct 21, 2020

terms and latency parse

awk '
# Reads etcd log lines on stdin.
# Term changes: print fields 1-2 (timestamp) and fields 7-11 of any line
# matching "became ... at term".
/became.*at term/ {
        print $1, $2, $7, $8, $9, $10, $11
}
# Slow requests: the third-from-last field is expected to look like
# "(1.234s)". Splitting it on "(", "s" and ")" leaves the number in a[2];
# only requests slower than 1 are reported.
# The closing parenthesis is escaped because an unmatched ")" in an ERE
# is undefined by POSIX.
/took too long.*[0-9]+s\)/ {
        split($(NF-2), a, "[(s)]")
        if (a[2] > 1) {
                print $1, $2, "slow request took:", $(NF-2)
        }
}' | tr -d '()'

sort .. | sort -n -k 6

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment