Skip to content

Instantly share code, notes, and snippets.

@FlorianHeigl
Last active February 29, 2024 10:46
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save FlorianHeigl/7790daac7f84a491e10f59dfd862432a to your computer and use it in GitHub Desktop.
Save FlorianHeigl/7790daac7f84a491e10f59dfd862432a to your computer and use it in GitHub Desktop.
omd tuning script
#!/usr/bin/env bash
# License: BSD
# Author: Florian Heigl
#
# OMD tuning script: prints system and site figures that matter for tuning.
set -eu

# Pull in the site configuration if it is there, so that its variables are
# available further down. The file is syntax-checked (`bash -n`) before it is
# sourced; an unreadable or unparsable file is skipped rather than being fatal.
SITECFG=~/etc/omd/site.conf
if [[ -r "$SITECFG" ]] && bash -eun "$SITECFG"; then
  # shellcheck source=/dev/null
  source "$SITECFG"
fi

# Determine total memory. MemTotal in /proc/meminfo is reported in kB, the same
# figure `free` prints as "total", but its label is not translated, so the
# lookup keeps working in non-English locales (where `free` no longer prints
# "Mem:").
TOTAL=$(awk '/^MemTotal:/ {print $2}' /proc/meminfo)
case "$TOTAL" in
  '' | *[!0-9]*)
    printf 'cannot determine total memory from /proc/meminfo\n' >&2
    exit 1
    ;;
esac
# List of test commands. Each entry is a shell snippet that the output loop
# runs through `eval`; tests[N] belongs to messages[N].
#
# The snippets are stored unexpanded on purpose (hence the single quotes and
# the escaped `\$`): variables and command substitutions are resolved when a
# test is run, not while this list is being built. A lookup that fails (for
# example no running rrdcached) then only affects its own test instead of
# aborting the whole script under `set -e`.
# shellcheck disable=SC2016
tests[1]='grep -c ^processor /proc/cpuinfo'
# TOTAL is set earlier in the script (assumed to be in KiB, so this yields GiB).
tests[2]='echo "$(( TOTAL / 1024 / 1024 )) GB"'
tests[3]='cat /proc/sys/kernel/pid_max'
tests[4]='cat /proc/sys/fs/file-max'
tests[5]="awk '{print \$1}' /proc/sys/fs/file-nr"
# Mount options of the filesystem holding OMD_ROOT. The source reported by df
# is compared exactly against the first column of /proc/mounts, so /dev/sda1
# does not also match /dev/sda10 and non-/dev sources work as well.
tests[6]="awk -v dev=\"\$(df --output=source \"\$OMD_ROOT\" | tail -n 1)\" '\$1 == dev {print \$4}' /proc/mounts"
# Sum of column 6 of `ps ux` (RSS) over all site processes, as a share of TOTAL.
tests[7]="ps hux -U \"\$OMD_SITE\" | awk -v total=\"\$TOTAL\" '{ sum += \$6 } END { printf \"%.2f%%\\n\", sum / total * 100 }'"
# messages[8] is the process limit and messages[9] the file limit, so the
# matching ulimit switches are -u (processes) and -n (open files).
tests[8]='ulimit -u'
tests[9]='ulimit -n'
tests[10]='lsof | grep -c -- "$OMD_SITE"'
tests[11]='ps -ef | grep -c -- "$OMD_SITE"'
# rrdcached can kill the system if it falls behind, the worker setting in OMD is not correctly applied, so you normally
# only have the default number of workers!
# The counters in /proc/<pid>/io are bytes; three divisions by 1024 give the GB
# that messages[12] and messages[13] announce. `pgrep -o` picks a single PID
# even if more than one rrdcached process matches.
tests[12]="awk '/^write_bytes/ {print \$2 / 1024 / 1024 / 1024}' \"/proc/\$(pgrep -o -u \"\$OMD_SITE\" rrdcached)/io\""
tests[13]="awk '/^read_bytes/ {print \$2 / 1024 / 1024 / 1024}' \"/proc/\$(pgrep -o -u \"\$OMD_SITE\" rrdcached)/io\""
# active check worker, can block and _will_ block on down hosts (they're executed anyway)
tests[14]='pgrep -u "$OMD_SITE" checkhelper | wc -l'
# cmk workers, needed to get check throughput. beware there are rolling restart issues with some buggy checks that OOM them.
tests[15]='pgrep -u "$OMD_SITE" -f "python /omd/sites/$OMD_SITE/bin/cmk --keepalive" | wc -l'
tests[16]="grep -c 'Resource temp' ~/var/log/cmc.log"
# connected to number of livestatus slots
tests[17]='pgrep -u "$OMD_SITE" -f "python /omd/sites/$OMD_SITE/bin/liveproxyd" | wc -l'
tests[18]="grep -c 'Site is considered dead. Closing all connections.' ~/var/log/liveproxyd.log"
tests[19]="grep -c -E 'Cannot forward next' ~/var/log/liveproxyd.log"
tests[20]="curl -s \"localhost:\${CONFIG_APACHE_TCP_PORT}/server-status\" | grep -E 'requests currently being processed'"
tests[21]="grep -c 'WARNING: ping-queueing has lasted' ~/var/log/cmc.log" # is this the right log?
# The network interface defaults to eth0; set OMD_TUNE_NIC to inspect another one.
tests[22]="ethtool -g \"\${OMD_TUNE_NIC:-eth0}\" | grep -A1 'Current hardware' | grep ^RX | awk '{print \$2}'"
# NOTE(review): this sums the 4th field of the matching `ethtool -S` lines;
# whether that is the counter column depends on the driver's output - confirm.
tests[23]="ethtool -S \"\${OMD_TUNE_NIC:-eth0}\" | grep -i OOB | awk '{sum+=\$4} END {print sum}'"
#tests[24]="" # lost the command
# Labels for the tests, each prefixed with the component it belongs to.
# messages[N] is the label printed for tests[N]; numbering starts at 1.
messages=(
  [1]="System: CPU Cores"
  [2]="System: Total Memory"
  [3]="System: process limit"
  [4]="System: file limit"
  [5]="System: open file handles"
  [6]="System: OMD_ROOT mount info"
  [7]="OMD Site: Total memory used"
  [8]="OMD Site: process limit"
  [9]="OMD Site: file limit"
  [10]="OMD Site: open file handles"
  [11]="OMD Site: running processes"
  [12]="rrdcached: written GB"
  [13]="rrdcached: read GB"
  [14]="CMC: active check workers"
  [15]="CMC: check_mk workers"
  [16]="CMC: resources exhaustion errors"
  [17]="Liveproxyd: processes"
  [18]="Liveproxyd: remote site conn dead errors"
  [19]="Liveproxyd: remote site query aborted errors"
  [20]="Apache: worker usage"
  [21]="icmphelper: ping-queue over 100ms errors "
  [22]="System: RX buffer size "
  [23]="System: NIC out of buffer errrors "
  [24]="System: driver rx/tx drop errors "
)
#TODO:
# would be nice, but these can also simply be monitored directly:
# memory stats per process group
# wait on the site's processes
# determine the site uptime and relate it to process usage (but then these can also just be monitored)
# possibly also include:
# compressing / caching in site.conf, sensible settings yes/no
#######################################
# Run the tests and print one "<label> : <result>" line per label.
# A label that has no command in tests[] (for example because the command was
# lost) is reported as "n/a"; reading the missing array element directly would
# abort the script under `set -u`.
# Globals:   messages (read), tests (read)
# Arguments: none
# Outputs:   one line per entry of messages, on stdout
#######################################
run_tests() {
  local index result
  for index in "${!messages[@]}"; do
    if [[ -n "${tests[$index]:-}" ]]; then
      # A failing test must not stop the report; its (possibly empty) output
      # is printed as-is.
      result=$(eval "${tests[$index]}") || true
    else
      result="n/a"
    fi
    printf '%s : %s\n' "${messages[$index]}" "$result"
  done
}

run_tests
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment