Skip to content

Instantly share code, notes, and snippets.

@caot
Last active November 29, 2017 19:20
Show Gist options
  • Save caot/535cbb4457911f457315 to your computer and use it in GitHub Desktop.
Save caot/535cbb4457911f457315 to your computer and use it in GitHub Desktop.
Check on cluster jobs' status and email a report of idle jobs
#!/bin/bash
# Shared settings and environment setup for the cluster check_disk report.
# Everything defined here is a global read by the functions further down.

NL=$'\n'
#EXP='DISK WARNING|DISK OK'

# Status strings searched for in the check_disk output.
OK="DISK OK"
WARNING="DISK WARNING"
CRITICAL="DISK CRITICAL"

# Recipient of every mail sent by this script.
EMAIL=someone@somecorp.com

# EXP is the alternation of the statuses worth a mail; REG is the same
# alternation with every blank replaced by the [[:space:]] character class.
EXP="${WARNING}|${CRITICAL}"
REG=${EXP// /[[:space:]]}

# https://www.monitoring-plugins.org/doc/man/check_disk.html
# check_disk_cmd is run as-is on this host; check_disk_rmt prefixes it with
# the module load and is the string handed to pbsnodes_check.sh.
check_disk_cmd="check_disk -w 30% -c 20% -p"
check_disk_rmt="module load nagios/plugins-2.2.1 && ${check_disk_cmd}"

# Working directory: the marker files and pbsnodes_check.sh are addressed
# relative to it.
# NOTE(review): the result of cd is not checked; on failure the script keeps
# running in whatever directory it was started from.
PATHR=/home/tangc/clustercheck
cd "$PATHR"

# Source the first login file that exists; mail a notice when neither does.
if [[ -f ~/.bash_profile ]]; then
  source ~/.bash_profile
elif [[ -f ~/.bashrc ]]; then
  source ~/.bashrc
else
  echo $(date) "Not Found ~/.bash_profile or ~/.bashrc" \
    | mail -s "Not Found ~/.bash_profile or ~/.bashrc, Cluster check_disk" "$EMAIL"
fi

module load nagios/plugins-2.2.1

# Default minimum number of seconds between two mails for the same path
# (one hour); checktimestamp may raise it.
emailInterval=3600
#######################################
# Work out how many seconds have passed since the marker file named by
# $timestamp was last modified, and pick the minimum interval between mails.
# Globals:
#   timestamp      (read)    path of the marker file
#   current        (written) current time, seconds since the epoch
#   last_modified  (written) mtime of the marker, or $current when the marker
#                            is missing, unnamed or unreadable
#   diff           (written) current - last_modified (0 without a marker)
#   h              (written) current hour, 0-23
#   emailInterval  (written) set to 12 h on Monday-Friday before 08:00 or
#                            from 18:00 on, to 24 h on Saturday/Sunday;
#                            otherwise left at its previous value
# Outputs:
#   Three diagnostic lines (current, last_modified, diff) on stdout.
#######################################
checktimestamp() {
  current=$(date +%s)

  # The name is quoted so that an empty or blank-containing value cannot
  # turn the test into "[ -f ]" (always true) or split into several words.
  if [[ -f "$timestamp" ]]; then
    # Fall back to "now" if stat fails, so the subtraction below never sees
    # an empty operand.
    last_modified=$(stat -c "%Y" -- "$timestamp") || last_modified=$current
  else
    last_modified=$current
  fi
  diff=$(( current - last_modified ))

  echo "current: " "$current"
  echo "last_modified: " "$last_modified"
  echo "diff: " "$diff"

  # set email_interval
  # 10# forces base 10: date prints hours such as 08 and 09, which would
  # otherwise be rejected as invalid octal numbers.
  h=$(( 10#$(date +%H) ))
  if (( 10#$(date +%u) <= 5 )); then
    # Monday-Friday: only stretch the interval outside 08:00-17:59.
    if (( h < 8 || h > 17 )); then
      emailInterval=$(( 3600 * 12 ))
    fi
  else
    emailInterval=$(( 3600 * 24 ))
  fi
}
#######################################
# Mail $mailcontent when it reports a problem, at most once per
# $emailInterval seconds for each checked path.
# A marker file _timestamp<PATHX with "/" turned into "_"> in the current
# directory records when the last mail went out; it is removed as soon as
# the content no longer matches, so the next problem is mailed immediately.
# Globals:
#   mailcontent, REG, EXP, PATHX, EMAIL, emailInterval (read)
#   countwcl   (written) number of lines of mailcontent matching REG
#   timestamp  (written) name of the marker file
#   diff       (read)    set by checktimestamp
# Outputs:
#   Diagnostic lines on stdout.
#######################################
to_mail() {
  # REG is quoted: it contains [[:space:]], which an unquoted expansion would
  # submit to pathname expansion before grep ever saw it.
  countwcl=$(printf '%s\n' "${mailcontent}" | grep -cE -- "$REG")
  timestamp="_timestamp${PATHX//\//_}"
  echo "_timestamp: $timestamp"
  echo "emailInterval: $emailInterval"
  checktimestamp
  echo "countwcl: $countwcl"

  # send mail?
  # diff == 0 means there is no marker yet (nothing mailed so far);
  # diff > emailInterval means the previous mail is old enough to repeat.
  if (( countwcl > 0 && (diff == 0 || diff > emailInterval) )); then
    echo 'email ...'
    printf '%s\n' "${mailcontent}" \
      | sed -e 's/\r/\n/g' \
      | mail -s "-- Cluster check_disk ${PATHX} with status $EXP [$countwcl]" "$EMAIL"
    touch -- "./$timestamp"
    ls -lrt -- "$timestamp"
  elif (( countwcl < 1 )); then
    # Nothing to report any more: forget the last mail time.
    [[ -f "$timestamp" ]] && rm -- "./$timestamp"
    # test for status OK, NO email
    #echo `date` "NO email ...\\"${mailcontent} | mail -s "-- $countwcl, $diff, $emailInterval, Cluster check_disk ${PATHX} with status OK, #NO email ..." $EMAIL
  fi
  echo "diff2: " "$diff"
}
#######################################
# Run the remote disk check for $PATHX through pbsnodes_check.sh, collect
# the per-node result files and hand the assembled report to to_mail.
# Globals:
#   check_disk_rmt, PATHX (read)
#   mailcontent, mailcontentB, mailcontentC (written)
# NOTE(review): checknode and checkjob are not defined in this file;
# presumably scheduler commands found on PATH - confirm.
#######################################
check_path () {
  sh pbsnodes_check.sh "${check_disk_rmt} ${PATHX}"
  # Fixed pause before the result files are read.
  sleep 3s

  # Part A: drop "DISK OK" / "No route to host" lines, then let awk print a
  # line starting with "compute-" together with the line that follows it,
  # once a non-empty line has been seen after that "compute-" line.
  mailcontent=$(
    cat /tmp/pssh-to-be-deleted-Cu8vih*/* \
      | grep -E -v "DISK OK|No route to host" \
      | awk '/^compute-/ {x=$0; matched=flag=1; next} matched {x=x"\n"$0; matched=0; } $0&&flag {print x"\n"; flag=0}'
  )

  # Part B: one checknode section per reported node; the node name is the
  # text before the first dot of each "compute-" line.
  mailcontentB=$(
    echo "${mailcontent}" \
      | grep compute- \
      | awk -F'.' '{ print $1 }' \
      | sort -V \
      | while read node; do
          echo '-------'$node'---------------------------------------'
          checknode $node
        done
  )

  # Part C: one checkjob section per second field of every "Jobs:" line
  # found in part B.
  mailcontentC=$(
    echo "${mailcontentB}" \
      | grep Jobs: \
      | awk '{ print $2 }' \
      | sort \
      | while read job; do
          echo '-----'$job'-----------------------------------------'
          checkjob $job
        done
  )

  # The three parts are joined with one blank line between them.
  printf -v mailcontent '%s\n\n%s\n\n%s' "${mailcontent}" "${mailcontentB}" "${mailcontentC}"
  to_mail
}
#######################################
# Run check_path once for each file system watched on the compute nodes.
# Globals:
#   PATHX (written) the path currently being checked
#   p     (written) loop variable, same value as PATHX
# Outputs:
#   A "<path> <date>" line before and a "<date> \n" line after every check.
#######################################
check_computing_nodes () {
  local -a watched=(/var/spool/torque /tmp /boot /opt)

  for p in "${watched[@]}"; do
    echo $p $(date)
    PATHX=$p
    check_path $p
    # Short pause between two paths.
    sleep 0.2s
    echo $(date) "\n"
  done
}
#######################################
# Run check_disk on this host for each watched path and pass the result,
# prefixed with "<hostname>: ", to to_mail.
# Globals:
#   check_disk_cmd (read)    command line, expanded unquoted on purpose so
#                            that it splits into the command and its options
#   PATHX, mailcontent, p (written)
#######################################
check_computing_headnode() {
  local head_name disk_report

  for p in /export/home; do
    PATHX=$p
    head_name=$(hostname)
    disk_report=$(${check_disk_cmd} ${p})
    mailcontent="${head_name}: ${disk_report}"
    to_mail #${mailcontent}
  done
}
# Entry point: run the compute-node checks first, then the check on this host.
check_computing_nodes
check_computing_headnode
#!/bin/bash
# ref http://unix.stackexchange.com/questions/19008/automatically-run-commands-over-ssh-on-many-servers
#
# Run the command given as arguments on every compute node listed by
# pbsnodes, in parallel over ssh, and store each node's output in
#   ${TMPDIR:-/tmp}/pssh-to-be-deleted-Cu8vihiWFhATbsGlSrUO.<date>/<node>
# The script returns only after every ssh has finished.
# Only POSIX sh constructs are used, so it also works when started as
# "sh <script>".
echo $(date)
echo "$*"

tmp_dir_prefix=pssh-to-be-deleted-Cu8vihiWFhATbsGlSrUO
tmp_base=${TMPDIR:-/tmp}

# Remove the results of earlier runs from the same base directory the new
# results are written to; -f keeps the very first run free of errors.
rm -rf -- "${tmp_base}/${tmp_dir_prefix}".*
tmpdir="${tmp_base}/${tmp_dir_prefix}.$(date +"%Y%m%d.%M%S")"
mkdir -p -- "$tmpdir"

# Command line executed on each node: a blank line, the node's own hostname,
# then the output of the requested command. The two steps run one after the
# other so the hostname line always comes first in the result file.
remote_cmd='printf "\n$(hostname)\n"; echo "$('"$*"')"'

node_list=$(/usr/local/bin/pbsnodes -a | grep compute- | grep -E -v status | sort -V)

# The loop reads from a here-document rather than from a pipe: a piped loop
# runs in a subshell, where neither $count nor the background ssh jobs would
# be visible to the wait below.
count=0
while read -r userhost; do
  # An empty node list still yields one empty here-document line.
  [ -n "$userhost" ] || continue
  echo "--${userhost}--"
  #userhostname=$(echo ${userhost} | sed -e 's/\.local//g')
  #echo --${userhostname}--
  # -n keeps ssh from consuming the loop's stdin (the node list).
  ssh -XY -n -o BatchMode=yes "$userhost" "$remote_cmd" > "${tmpdir}/${userhost}" 2>&1 &
  count=$((count + 1))
  sleep 0.06
done <<EOF
${node_list}
EOF

# Block until all $count background ssh sessions have ended.
wait
#ls -lrt ${tmpdir}
#cat ${tmpdir}/*
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment