Skip to content

Instantly share code, notes, and snippets.

@g0053
Forked from lunr/smon.sh
Created December 5, 2021 15:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save g0053/1e77d6db7fd859089f0fba56963918bd to your computer and use it in GitHub Desktop.
Save g0053/1e77d6db7fd859089f0fba56963918bd to your computer and use it in GitHub Desktop.
Bash script to monitor a set of servers
#!/bin/bash
# This script will loop through a set of servers and print their load average,
# memory usage statistics and cpu statistics
# If a server's load average is at a warning or critical level, the text color
# of the host and the load average change to yellow (warning) or red (critical)
# The primary SSH Command has a timeout feature in it. If your server is under
# load and cannot respond fast enough, the script will alert on connection timeout.
# If you are running this script on a Mac, it will notify you by running `say`
# and bringing the terminal windows to the front
#
# author: jamesp@teamddm.com
# date: 2011-08-25
# version: 0.9.7
# Usage: smon.sh [-1] [SECONDS]   (SECONDS defaults to 30)
# Options: -1 will only run script once, not loop every X seconds
# Notes: key-based SSH logins are required.
# 1-minute load average above which a host is shown as warning / critical
LOAD_WARN=1.5
LOAD_CRITICAL=2.25
# Set to 1 by the -1 option: run a single pass instead of looping
ONCE=0
# Login as root
HOSTLIST=(
"woody.example.com"
)
# Login as you
UHOSTLIST=(
"buzz.example.com"
)
while getopts "1" OPTION
do
  case $OPTION in
    1)
      ONCE=1
      ;;
    *)
      # getopts has already printed a diagnostic for the unknown option
      echo "Usage: ${0##*/} [-1] [SECONDS]" >&2
      exit 2
      ;;
  esac
done
# Drop the parsed options so that $1 is the SECONDS argument; reading $1
# before this point would pick up "-1" itself as the interval
shift $((OPTIND - 1))
# Polling interval in seconds, 30 when not given on the command line
HOWOFTEN=${1:-30}
#######################################
# Poll one host over SSH and print its load average, memory usage and CPU
# statistics. The host name and 1-minute load are colourised yellow above
# LOAD_WARN and red above LOAD_CRITICAL.
# Globals:
#   LOAD_WARN, LOAD_CRITICAL (read)  - 1-minute load thresholds
#   NOTIFYONCRITICAL, WHOISCRITICAL (written) - set to 1 / the host name when
#     the load is above LOAD_CRITICAL
# Arguments:
#   $1 - SSH login user
#   $2 - host name
# Outputs:
#   Writes the report (or a "Connection timeout" line) to stdout
#######################################
function monit {
  # Locals, so the caller's environment ($USER, $HOST) is not clobbered
  local login=$1
  local host=$2
  local stats flat loadavg curload five fifteen partial memusage cpustat
  local level loadprt

  # Login to the server, timing out after 5 seconds; stderr is folded into the
  # captured output so remote errors do not clutter the screen.
  # Any non-zero ssh status is reported as a connection timeout.
  if ! stats=$(ssh -o ConnectTimeout=5 "$login@$host" "uptime; free -m | grep -i 'mem\|buffers/cache'; iostat -c 2>&1" 2>&1)
  then
    printf '\033[37;41m%s: Connection timeout\033[0m\n\n' "$host"
    # say/osascript only exist on macOS; skip the notification elsewhere
    if command -v say >/dev/null 2>&1; then
      say -v Vicki 'Host unresponsive on ' "${host%%.*}"
    fi
    if command -v osascript >/dev/null 2>&1; then
      osascript -e 'tell application "System Events" to set visible of process "Terminal" to true'
    fi
    return
  fi

  # Single-line copy of the output for the "text between two markers" parsing
  # below (quoted, so remote output is never glob-expanded locally)
  flat=$(printf '%s\n' "$stats" | tr '\n' ' ')

  # Text between "load average(s):" and "Mem:", whitespace-normalised
  loadavg=$(printf '%s\n' "$flat" \
    | awk -F'load averages?:' '{ print $2 }' \
    | awk -F'Mem:' '{ print $1 }' \
    | awk '{ $1 = $1; print }')
  # Split into 1/5/15-minute values here rather than relying on fixed field
  # positions in the uptime line, which shift with the uptime format
  read -r curload five fifteen _ <<<"$loadavg"
  curload=${curload//,/}
  partial="$five $fifteen"

  # Memory: "used / total MB (free MB)". The free figure is taken from the
  # "-/+ buffers/cache:" line when free prints one, otherwise from the 7th
  # column of the Mem: line ("available" in newer free output)
  memusage=$(printf '%s\n' "$stats" | awk '
    $1 == "Mem:"       { total = $2; used = $3; avail = $7 }
    /buffers\/cache:/  { avail = $NF }
    END { print used " / " total "MB (" avail "MB free)" }')

  # IOSTAT report order: %user, %nice, %system, %iowait
  cpustat=$(printf '%s\n' "$flat" \
    | awk -F'avg-cpu:' '{ print $2 }' \
    | awk -F'idle' '{ print $2 }' \
    | awk '{ print $1 "%us, " $3 "%sy, " $2 "%ni, " $4 "%wa" }')

  # Classify the 1-minute load; an empty or non-numeric value compares as 0
  level=$(awk -v load="$curload" -v warn="$LOAD_WARN" -v crit="$LOAD_CRITICAL" '
    BEGIN {
      if (load + 0 > crit + 0)      print "critical"
      else if (load + 0 > warn + 0) print "warn"
      else                          print "ok"
    }')

  # Print the host name, changing terminal colours if critical or warning
  case $level in
    critical)
      printf '\033[37;41m%s\033[0m\n' "$host"
      # Blinking white-on-red for the 1-minute load
      printf -v loadprt '\033[5m\033[37;41m%s\033[0m\033[25m, %s' "$curload" "$partial"
      NOTIFYONCRITICAL=1
      WHOISCRITICAL=$host
      ;;
    warn)
      printf '\033[1;33m%s\033[0m\n' "$host"
      printf -v loadprt '\033[1;33m%s\033[0m, %s' "$curload" "$partial"
      ;;
    *)
      printf '%s\n' "$host"
      loadprt=$loadavg
      ;;
  esac

  # Print statistics
  printf '\tLoad average: %s\n' "$loadprt"
  printf '\tMemory: %s\n' "$memusage"
  printf '\tCPU: %s\n' "$cpustat"
  printf '\n'
}
# Main loop: poll every configured host, raise a notification if any host
# reported a critical load, then either exit (-1 option) or sleep and repeat
while :
do
  clear
  # Reset the flags that monit sets when a host is above LOAD_CRITICAL
  NOTIFYONCRITICAL=false
  WHOISCRITICAL=false
  NOW=$(date)
  echo -e "Monitoring servers every $HOWOFTEN seconds... \t\t $NOW\n\n"
  # Loop through root@HOST servers
  for HOST in "${HOSTLIST[@]}"; do
    monit 'root' "$HOST"
  done
  # Loop through <YOU>@HOST servers
  for UHOST in "${UHOSTLIST[@]}"; do
    monit "$(whoami)" "$UHOST"
  done
  # If any of the hosts reported critical load, execute notification:
  # speak the short host name and bring the terminal window to the front.
  # say/osascript only exist on macOS, so each is skipped when not installed.
  if [ "$NOTIFYONCRITICAL" == "1" ]
  then
    echo -e '\t\033[37;41mCRITICAL FOUND\033[0m\n'
    #osascript -e "beep 2"
    if command -v say >/dev/null 2>&1; then
      say -v Vicki 'load critical on ' "${WHOISCRITICAL%%.*}"
    fi
    if command -v osascript >/dev/null 2>&1; then
      osascript -e 'tell application "System Events" to set visible of process "Terminal" to true'
    fi
  fi
  # If Once option is set, then only run the loop once
  if [ "$ONCE" == "1" ]
  then
    echo -e "\n\t\t-- Running once --\n"
    exit
  fi
  # Print that we're done looping through servers
  echo -e '\n\t -- last update:' "$(date +%T)" ' --\n'
  # Sleep the loop for X seconds. If sleep fails (e.g. the interval is not a
  # valid duration) stop, rather than re-polling every host in a tight loop.
  if ! sleep "$HOWOFTEN"; then
    echo "sleep failed for interval '$HOWOFTEN'; stopping" >&2
    exit 1
  fi
  # continue infinite loop
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment