Created
August 23, 2011 15:14
-
-
Save lunr/1165392 to your computer and use it in GitHub Desktop.
Bash script to monitor a set of servers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# This script loops through a set of servers and prints their load average,
# memory usage statistics and CPU statistics.
# If a server's load average is at a warning or critical level, the text color
# of the host and the load average changes to yellow (warning) or red (critical).
# The primary SSH command has a timeout feature in it. If your server is under
# load and cannot respond fast enough, the script will alert on connection timeout.
# If you are running this script on a Mac, it will notify you by running `say`
# and bringing the Terminal window to the front.
#
# author: jamesp@teamddm.com
# date: 2011-08-25
# version: 0.9.7
# Usage: smon.sh [-1] [SECONDS]
# Options: -1 will only run the script once, not loop every SECONDS seconds
#          SECONDS defaults to 30 when omitted
# Notes: key-based SSH logins are required.
# Load thresholds (1-minute load average) for the warning and critical colors.
LOAD_WARN=1.5
LOAD_CRITICAL=2.25
# Defaults; both may be overridden from the command line by parse_args.
HOWOFTEN=30
ONCE=0
# Login as root
HOSTLIST=(
  "woody.example.com"
)
# Login as you
UHOSTLIST=(
  "buzz.example.com"
)

#######################################
# Parse the command line: smon.sh [-1] [SECONDS]
# Globals:
#   ONCE     (written) 1 when -1 was given, otherwise 0
#   HOWOFTEN (written) refresh interval; 30 when SECONDS is omitted
# Arguments:
#   The script's positional parameters ("$@").
# Returns:
#   0 on success; exits with status 2 on an unknown option.
#######################################
parse_args() {
  # OPTIND is made local so the function can be called more than once.
  local OPTIND=1 OPTION
  ONCE=0
  while getopts "1" OPTION; do
    case "$OPTION" in
      1)
        ONCE=1
        ;;
      *)
        printf 'Usage: %s [-1] [SECONDS]\n' "${0##*/}" >&2
        exit 2
        ;;
    esac
  done
  # Drop the parsed options so that $1 is the interval, not "-1".
  shift $((OPTIND - 1))
  HOWOFTEN=${1:-30}
}

parse_args "$@"
# Succeeds (status 0) when load $1 is numerically greater than threshold $2.
# An empty or non-numeric load is treated as 0, so it never exceeds.
_monit_load_exceeds() {
  awk -v cur="$1" -v max="$2" 'BEGIN { exit ((cur + 0 > max + 0) ? 0 : 1) }'
}

#######################################
# Print load average, memory usage and CPU statistics for a single host.
# Runs `uptime`, `free -m` and `iostat -c` on the host in one SSH session and
# parses the combined output. The host name and the 1-minute load are printed
# in yellow above LOAD_WARN and in red above LOAD_CRITICAL.
# Globals:
#   LOAD_WARN, LOAD_CRITICAL        (read)
#   NOTIFYONCRITICAL, WHOISCRITICAL (written when the load is critical)
# Arguments:
#   $1 - user to log in as
#   $2 - host to query
# Outputs:
#   Writes the report, or a "Connection timeout" banner, to stdout.
# Returns:
#   0 after printing a report, 1 when the SSH command exited non-zero.
#######################################
function monit {
  # Locals keep the caller's environment (notably $USER) untouched.
  local user=$1
  local host=$2
  local stats flat load_raw load1 load5 load15 rest cur_load
  local mem_usage cpu_usage host_line load_line

  # Log in with a 5 second connect timeout. stderr is merged into the captured
  # output so SSH errors do not clutter the display.
  if ! stats=$(ssh -o ConnectTimeout=5 "${user}@${host}" \
    "uptime; free -m | grep -i 'mem\|buffers/cache'; iostat -c 2>&1" 2>&1); then
    # Cannot connect to host, connection timeout, unresponsive?
    printf '\033[37;41m%s: Connection timeout\033[0m\n\n' "$host"
    # The spoken/visual notifications only exist on macOS; skip them elsewhere.
    if command -v say >/dev/null 2>&1; then
      say -v Vicki 'Host unresponsive on ' "${host%%.*}"
    fi
    if command -v osascript >/dev/null 2>&1; then
      osascript -e 'tell application "System Events" to set visible of process "Terminal" to true'
    fi
    return 1
  fi

  # Join the output into one line so each section can be cut out by its label.
  flat=${stats//$'\n'/ }

  # Text between "load average(s):" and "Mem:" holds the three load figures.
  load_raw=$(awk -F'load averages?:' '{ print $2 }' <<<"$flat" \
    | awk -F'Mem:' '{ print $1 }')
  read -r load1 load5 load15 rest <<<"$load_raw"
  cur_load=${load1//,/}

  # After "Mem:" the fields are: total used free shared buffers cached, then
  # (older `free` only) "-/+ buffers/cache: used free". When that second line
  # is absent, fall back to the sixth column.
  # NOTE(review): the sixth column is "available" on newer procps - confirm.
  mem_usage=$(awk -F'Mem:' '{ print $2 }' <<<"$flat" \
    | awk '{
        free_mb = ($8 == "buffers/cache:") ? $10 : $6
        print $2 " / " $1 "MB (" free_mb "MB free)"
      }')

  # IOSTAT report order: %user, %nice, %system, %iowait
  cpu_usage=$(awk -F'avg-cpu:' '{ print $2 }' <<<"$flat" \
    | awk -F'idle' '{ print $2 }' \
    | awk '{ print $1 "%us, " $3 "%sy, " $2 "%ni, " $4 "%wa" }')

  # Check server load and if critical or warn, then change terminal colors
  if _monit_load_exceeds "$cur_load" "$LOAD_CRITICAL"; then
    printf -v host_line '\033[37;41m%s\033[0m' "$host"
    # Blinking red 1-minute load, followed by the 5 and 15 minute figures.
    printf -v load_line '\033[5m\033[37;41m%s\033[0m\033[25m, %s %s' \
      "$cur_load" "$load5" "$load15"
    NOTIFYONCRITICAL=1
    WHOISCRITICAL=$host
  elif _monit_load_exceeds "$cur_load" "$LOAD_WARN"; then
    printf -v host_line '\033[1;33m%s\033[0m' "$host"
    printf -v load_line '\033[1;33m%s\033[0m, %s %s' \
      "$cur_load" "$load5" "$load15"
  else
    host_line=$host
    load_line="$load1 $load5 $load15"
  fi

  # Print statistics
  printf '%s\n' "$host_line"
  printf '\tLoad average: %s\n' "$load_line"
  printf '\tMemory: %s\n' "$mem_usage"
  printf '\tCPU: %s\n\n' "$cpu_usage"
}
# Main loop: redraw the report for every configured host, raise a notification
# if any host reported a critical load, then either exit (when ONCE is 1) or
# sleep HOWOFTEN seconds and repeat.
# `whoami` is looked up once rather than once per host.
CURRENT_USER=$(whoami)
while :; do
  clear
  # monit sets these when a host is over LOAD_CRITICAL; reset them each pass.
  NOTIFYONCRITICAL=false
  WHOISCRITICAL=false
  printf 'Monitoring servers every %s seconds... \t\t %s\n\n\n' "$HOWOFTEN" "$(date)"

  # Loop through root@HOST servers
  for HOST in "${HOSTLIST[@]}"; do
    monit 'root' "$HOST"
  done

  # Loop through <YOU>@HOST servers
  for UHOST in "${UHOSTLIST[@]}"; do
    monit "$CURRENT_USER" "$UHOST"
  done

  # If any of the hosts report critical load, execute notification.
  # Notifications speak the short host name and bring the terminal window to
  # the front; both tools are macOS-only, so they are skipped when missing.
  if [[ "$NOTIFYONCRITICAL" == "1" ]]; then
    printf '\t\033[37;41mCRITICAL FOUND\033[0m\n\n'
    #osascript -e "beep 2"
    if command -v say >/dev/null 2>&1; then
      say -v Vicki 'load critical on ' "${WHOISCRITICAL%%.*}"
    fi
    if command -v osascript >/dev/null 2>&1; then
      osascript -e 'tell application "System Events" to set visible of process "Terminal" to true'
    fi
  fi

  # If Once option is set, then only run the loop once
  if [[ "$ONCE" == "1" ]]; then
    printf '\n\t\t-- Running once --\n\n'
    exit
  fi

  # print that we're done looping through servers
  printf '\n\t -- last update: %s  --\n\n' "$(date +%T)"

  # Sleep the loop for X seconds
  sleep "$HOWOFTEN"
  # continue infinite loop
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment