Skip to content

Instantly share code, notes, and snippets.

@dcapwell
Last active January 12, 2018 02:23
Show Gist options
  • Save dcapwell/d290fcc82710c43c38482db5b8939629 to your computer and use it in GitHub Desktop.
Save dcapwell/d290fcc82710c43c38482db5b8939629 to your computer and use it in GitHub Desktop.
Usage, Saturation, Errors checklist script
#!/usr/bin/env bash
##
## This script automates collecting metrics that represent (U)tilization, (S)aturation, and (E)rrors (USE).
## It mostly automates the checklist in [1], so more details can be found there.
##
## References
## * [1] http://www.brendangregg.com/USEmethod/use-linux.html
##
#set -x
#set -e
#set -o pipefail
set -u
# Print a section heading.
# Arguments: words of the heading; they are joined with single spaces.
# Outputs:   a blank line, "### <heading> ###", and another blank line on stdout.
banner() {
  printf '\n### %s ###\n\n' "$*"
}
# Print a one-line hint for the reader of the report.
# Arguments: words of the hint; they are joined with single spaces.
# Outputs:   "// <hint>" on stdout.
comment() {
  printf '// %s\n' "$*"
}
# Show a command line and then execute it, best effort.
# Arguments: the command, either as separate words or as one quoted string
#            (use the quoted form for pipelines and && chains).
# Outputs:   a blank line, "[COMMAND] <command line>", then the command's
#            stdout and stderr merged onto stdout.
# Returns:   always 0; a failing or missing tool must not stop the report.
run() {
  printf '\n[COMMAND] %s\n' "$*"
  # eval is what lets a single quoted argument carry pipes and redirections.
  eval "$*" 2>&1 || :
}
# Sampling interval (seconds) and number of samples handed to the interval
# based tools below (vmstat, dstat, mpstat, top, pidstat, sar -n, iostat,
# iotop). Both may be overridden from the environment, e.g.
#   DELAY=5 COUNT=3 ./script.sh
DELAY=${DELAY:-1}
COUNT=${COUNT:-10}
# The values are spliced into command lines that are later eval'd, so accept
# nothing but positive integers.
if ! [[ $DELAY =~ ^[1-9][0-9]*$ && $COUNT =~ ^[1-9][0-9]*$ ]]; then
  printf 'DELAY and COUNT must be positive integers (got DELAY=%s COUNT=%s)\n' \
    "$DELAY" "$COUNT" >&2
  exit 2
fi
# Static description of the machine: identity, OS release, kernel, NICs,
# CPU topology, load, perf-related kernel settings and the process table.
banner "Host Details"
run hostname -f
run cat /etc/*release*
run uname -a
# Query every interface the kernel exposes instead of assuming one is called
# eth0. Fall back to the historical name when /sys/class/net is unavailable.
nics=(/sys/class/net/*)
[[ -e "${nics[0]}" ]] || nics=(eth0)
for nic in "${nics[@]}"; do
  # %q keeps unusual interface names intact through the eval inside run.
  run ethtool "$(printf '%q' "${nic##*/}")"
done
run cat /proc/cpuinfo
run lscpu
run numactl --hardware
run uptime
# should be 1
run cat /proc/sys/kernel/perf_event_paranoid
# should be 0
run cat /proc/sys/kernel/kptr_restrict
# sudo -n never prompts; without cached credentials the command simply fails
# and run moves on.
run sudo -n perf list
run ps -ef
# CPU: utilization (system-wide, per-cpu, per-process) and saturation
# (system-wide, per-process). Each comment call tells the reader which
# column(s) of the following command's output to look at.
banner "CPU utilization (system-wide)"
comment '"us" + "sy" + "st"'
run vmstat -n "$DELAY" "$COUNT"
comment 'sum fields except "%idle" and "%iowait"'
run sar -u
comment 'sum fields except "idl" and "wai"'
run dstat -c "$DELAY" "$COUNT"
run numastat
run sudo -n perf stat -a -- sleep 30

banner "CPU utilization (per-cpu)"
# could also run sar -P ALL if mpstat is not installed
comment 'sum fields except "%idle" and "%iowait"'
run mpstat -P ALL "$DELAY" "$COUNT"

banner "CPU utilization (per-process)"
comment '"%CPU"'
# Quoted as one string so the pipe is executed by run, not by the caller.
run "echo P | top -b -d $DELAY -n $COUNT"
comment '"%CPU"'
run pidstat "$DELAY" "$COUNT"

banner "CPU saturation (system-wide)"
comment '"r" > CPU count'
run vmstat "$DELAY" "$COUNT"
comment '"runq-sz" > CPU count'
run sar -q
comment '"run" > CPU count'
run dstat -p "$DELAY" "$COUNT"

banner "CPU saturation (per-process)"
comment '2nd field (sched_info.run_delay)'
# "pid=" suppresses the ps header line; without it the loop would also try
# to read /proc/PID/schedstat. Processes that exit between ps and cat just
# produce an error message, which run tolerates.
for pid in $(ps -A -o pid=); do
  run cat "/proc/$pid/schedstat"
done
# some kernel settings block this, so don't fail and move on
# (run always returns 0, so no extra guard is needed)
comment '(shows "Average" and "Maximum" delay per-schedule)'
run "sudo -n perf sched record -- sleep 1 && sudo -n perf sched latency -v"
run "sudo -n perf timechart"
# Memory: utilization, saturation (system-wide and per-process) and errors.
banner "Memory utilization (system-wide)"
comment '"Mem:" (main memory), "Swap:" (virtual memory)'
run free -m
comment '"free" (main memory), "swap" (virtual memory)'
run vmstat "$DELAY" "$COUNT"
comment '"%memused"'
run sar -r
comment '"free"'
run dstat -m "$DELAY" "$COUNT"
comment 'for kmem slab usage'
run sudo -n slabtop -s c --once

banner "Memory saturation (system-wide)"
comment '"si"/"so" (swapping)'
run vmstat "$DELAY" "$COUNT"
comment '"pgscank" + "pgscand" (scanning)'
run sar -B
run sar -W

banner "Memory saturation (per-process)"
comment '10th field (min_flt)'
# "pid=" suppresses the ps header line so only real PIDs are visited.
for pid in $(ps -A -o pid=); do
  run cat "/proc/$pid/stat"
done

banner "Memory errors"
# Case-insensitive: the kernel's OOM report is spelled "Killed process ...".
run "dmesg | grep -i killed"
run dmesg
# Network: utilization, saturation and errors. Every command is handed to
# run as one string; run prints it and evals it.
banner 'Network utilization'
comment '"rxKB/s"/max "txKB/s"/max'
run "sar -n DEV $DELAY $COUNT"
comment 'RX/TX tput / max bandwidth'
run 'ip -s link'
comment '"bytes" RX/TX tput/max'
run 'cat /proc/net/dev'

banner 'Network saturation'
comment '"overruns", "dropped"'
run 'ifconfig'
# The spelling below matches what the hint string has always printed.
comment '"segments retransmited"'
run 'netstat -s'
comment '*drop and *fifo metrics'
run 'sar -n EDEV'

banner 'Network errors'
comment ' "errors", "dropped"'
run 'ifconfig'
comment '"RX-ERR"/"TX-ERR"'
run 'netstat -i'
comment '"rxerr/s" "txerr/s"'
run 'sar -n EDEV'
comment '"errs", "drop"'
run 'cat /proc/net/dev'
# Storage device I/O (utilization, saturation, errors), storage capacity and
# storage controller.
banner "I/O utilization (system-wide)"
comment '"%util"'
run iostat -xz "$DELAY" "$COUNT"
comment '"%util"'
run sar -d

banner "I/O utilization (per-process)"
run sudo -n iotop --batch "--delay=$DELAY" "--iter=$COUNT"
run pidstat -d
# "pid=" suppresses the ps header line. The grep runs inside run so that the
# "[COMMAND]" header naming the process is kept next to the matching line.
# Matching the bare field name covers "se.statistics.iowait_sum" as well.
# NOTE(review): newer kernels appear to print just "iowait_sum" - confirm.
for pid in $(ps -A -o pid=); do
  run "grep -F iowait_sum /proc/$pid/sched"
done

banner "I/O saturation"
comment '"avgqu-sz" > 1, or high "await"'
# NOTE(review): -n may be rejected by recent sysstat releases - confirm on
# the target hosts.
run iostat -xnz "$DELAY" "$COUNT"

# The per-device error counters belong to the "errors" part of the checklist.
banner "I/O errors"
# NUL-delimited read keeps every path in one piece; %q protects it through
# the eval inside run.
while IFS= read -r -d '' errfile; do
  run cat "$(printf '%q' "$errfile")"
done < <(find /sys/devices/ -name ioerr_cnt -print0)

banner "Storage"
run swapon -s
run free
run "grep -i swap /proc/meminfo"
run df -h
run "sudo -n grep -i err /var/log/messages"

banner "Storage Controller"
comment 'sum devices and compare to known IOPS/tput limits per-card'
run iostat -xz "$DELAY" "$COUNT"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment