Last active
November 29, 2017 19:20
-
-
Save caot/535cbb4457911f457315 to your computer and use it in GitHub Desktop.
Check cluster jobs' status and email a report of idle jobs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Cluster disk check: runs the monitoring-plugins `check_disk` command for a
# set of paths and e-mails a report when a WARNING or CRITICAL status shows up.

NL=$'\n'

# Status prefixes looked for in the check_disk output.
#EXP='DISK WARNING|DISK OK'
OK="DISK OK"
WARNING="DISK WARNING"
CRITICAL="DISK CRITICAL"

# Recipient of every mail sent by this script.
EMAIL=someone@somecorp.com

# Statuses that trigger a mail; also quoted verbatim in the mail subject.
EXP="$WARNING|$CRITICAL"
# The same alternation as an extended regex in which each blank matches any
# whitespace character. Built with parameter expansion, so nothing is
# word-split or globbed on the way.
REG=${EXP// /[[:space:]]}

# https://www.monitoring-plugins.org/doc/man/check_disk.html
# The path to check is appended after the trailing -p by the callers.
check_disk_cmd="check_disk -w 30% -c 20% -p"
# Variant sent to the compute nodes: loads the plugin module first.
check_disk_rmt="module load nagios/plugins-2.2.1 && ${check_disk_cmd}"
# Working directory. pbsnodes_check.sh is invoked, and the _timestamp* marker
# files are created, relative to it, so running from anywhere else is useless.
PATHR=/home/tangc/clustercheck
cd "$PATHR" || {
  echo "$(date) cannot cd to $PATHR" >&2
  exit 1
}

# Pull in the login environment; report by mail when neither file exists.
# NOTE(review): presumably needed so that `module` is defined when the script
# runs non-interactively (e.g. from cron) - confirm.
if [[ -f ~/.bash_profile ]]; then
  . ~/.bash_profile
elif [[ -f ~/.bashrc ]]; then
  . ~/.bashrc
else
  echo "$(date)" "Not Found ~/.bash_profile or ~/.bashrc" \
    | mail -s "Not Found ~/.bash_profile or ~/.bashrc, Cluster check_disk" "$EMAIL"
fi

module load nagios/plugins-2.2.1

# Minimum number of seconds between two alert mails for the same path
# (1 hour); checktimestamp raises it outside working hours.
emailInterval=$(( 3600 * 1 ))
#######################################
# Works out how long ago the marker file was last touched and chooses the
# minimum interval between two alert mails.
# Globals:
#   timestamp      (read)    marker file name; the file may not exist
#   current        (written) now, in seconds since the epoch
#   last_modified  (written) mtime of the marker, or `current` without marker
#   diff           (written) current - last_modified; 0 when there is no marker
#   h              (written) current hour of the day as a decimal number
#   emailInterval  (written) set to 12 h on weekdays before 08:00 or from
#                            18:00 on, to 24 h on Saturday/Sunday; otherwise
#                            left at whatever value it already had
# Outputs:
#   Three debug lines ("current:", "last_modified:", "diff:") on stdout.
#######################################
checktimestamp() {
  local dow

  current=$(date +%s)
  # Quoted test: an empty or blank-containing marker name must count as
  # "no marker" instead of breaking the test and the arithmetic below.
  if [[ -f "$timestamp" ]]; then
    # Fall back to "now" if the file vanished or stat failed.
    last_modified=$(stat -c "%Y" "$timestamp") || last_modified=$current
  else
    last_modified=$current
  fi
  diff=$(( current - last_modified ))

  echo "current: " "$current"
  echo "last_modified: " "$last_modified"
  echo "diff: " "$diff"

  # set email_interval
  # 10# forces base 10 so that "08" and "09" are not parsed as octal.
  h=$(( 10#$(date +%H) ))
  dow=$(( 10#$(date +%u) ))   # 1 = Monday ... 7 = Sunday
  if (( dow <= 5 )); then
    if (( h < 8 || h > 17 )); then
      emailInterval=$(( 3600 * 12 ))
    fi
  else
    emailInterval=$(( 3600 * 24 ))
  fi
}
#######################################
# Mails the collected report when it contains a WARNING/CRITICAL line, at most
# once per emailInterval for a given path.
# Globals:
#   mailcontent    (read)    report text to scan and send
#   REG            (read)    extended regex of the statuses that trigger a mail
#   EXP, EMAIL     (read)    used in the mail subject / as recipient
#   PATHX          (read)    checked path; names the marker file and subject
#   emailInterval  (read)    minimum seconds between two mails
#   countwcl       (written) number of report lines matching REG
#   timestamp      (written) marker file name: "_timestamp" followed by PATHX
#                            with every "/" turned into "_"
#   diff           (read)    set by checktimestamp; 0 means no marker yet
# Outputs:
#   Debug lines on stdout; the marker file is created or refreshed in the
#   current directory after a mail and removed once no line matches any more.
#######################################
to_mail() {
  # grep -c counts matching lines directly; REG is quoted because it contains
  # "[" and "|", which must reach grep untouched.
  countwcl=$(printf '%s\n' "${mailcontent}" | grep -Ec -- "$REG")
  timestamp="_timestamp${PATHX//\//_}"
  echo "_timestamp: $timestamp"
  echo "emailInterval: $emailInterval"
  checktimestamp
  echo "countwcl: $countwcl"

  # send mail?  Only when there is something to report and either no mail is
  # on record (diff == 0) or the last one is older than the interval.
  if (( countwcl > 0 && (diff == 0 || diff > emailInterval) )); then
    echo 'email ...'
    printf '%s\n' "${mailcontent}" \
      | sed -e 's/\r/\n/g' \
      | mail -s "-- Cluster check_disk ${PATHX} with status $EXP [$countwcl]" "$EMAIL"
    touch "./$timestamp"
    ls -lrt "$timestamp"
  elif (( countwcl < 1 )); then
    # Everything is fine again: forget the last mail so that the next problem
    # is reported immediately.
    rm -f "./$timestamp"
    # test for status OK, NO email
    #echo `date` "NO email ...\\"${mailcontent} | mail -s "-- $countwcl, $diff, $emailInterval, Cluster check_disk ${PATHX} with status OK, #NO email ..." $EMAIL
  fi
  echo "diff2: " "$diff"
}
#######################################
# Runs the remote disk check for one path on the compute nodes, builds the
# report from the per-node result files and hands it to to_mail.
# Globals:
#   PATHX           (read/written) path being checked; set from $1 when given
#   check_disk_rmt  (read)         command prefix executed on the nodes
#   TMPDIR          (read)         root of the result directories (default /tmp)
#   mailcontent     (written)      report passed on to to_mail
# Arguments:
#   $1 - path to check (optional; defaults to the current PATHX)
#######################################
check_path() {
  local mailcontentB mailcontentC node job

  # Callers already pass the path; honour it while keeping PATHX as fallback.
  PATHX=${1:-$PATHX}

  sh pbsnodes_check.sh "${check_disk_rmt} ${PATHX}"
  sleep 3s

  # Result files are looked up under the same root pbsnodes_check.sh writes
  # to (${TMPDIR:-/tmp}), not under a hard-coded /tmp.
  # After dropping the OK / unreachable lines, the awk program prints a node
  # name line together with the line following it, but only when a further
  # non-empty line is left for that node, i.e. only for nodes with a problem.
  mailcontent=$(cat "${TMPDIR:-/tmp}"/pssh-to-be-deleted-Cu8vih*/* \
    | grep -Ev "DISK OK|No route to host" \
    | awk '/^compute-/ {x=$0; matched=flag=1; next} matched {x=x"\n"$0; matched=0; } $0&&flag {print x"\n"; flag=0}')

  # One checknode section per affected node (name cut at the first dot).
  mailcontentB=$(echo "${mailcontent}" \
    | grep compute- \
    | awk -F'.' '{ print $1 }' \
    | sort -V \
    | while read -r node; do
        echo '-------'"${node}"'---------------------------------------'
        checknode "${node}"
      done)

  # One checkjob section per value following "Jobs:" in the checknode output.
  mailcontentC=$(echo "${mailcontentB}" \
    | grep Jobs: \
    | awk '{ print $2 }' \
    | sort \
    | while read -r job; do
        echo '-----'"${job}"'-----------------------------------------'
        checkjob "${job}"
      done)

  mailcontent="${mailcontent}"$'\n\n'"${mailcontentB}"$'\n\n'"${mailcontentC}"
  to_mail
}
#######################################
# Checks a fixed list of paths on the compute nodes, one after another.
# Globals:
#   PATHX (written) - path currently being checked
# Outputs:
#   Per path: "<path> <date>" before the check, then the date followed by an
#   empty line after it.
#######################################
check_computing_nodes() {
  local p
  for p in /var/spool/torque /tmp /boot /opt; do
    echo "$p" "$(date)"
    PATHX=$p
    check_path "$p"
    sleep 0.2s
    # printf gives a real blank line; bash's echo printed "\n" literally.
    printf '%s\n\n' "$(date)"
  done
}
#######################################
# Runs check_disk locally for the head node's paths and passes each result,
# prefixed with "<hostname>: ", to to_mail.
# Globals:
#   check_disk_cmd (read)    command line to run; the path is appended
#   PATHX          (written) path currently being checked
#   mailcontent    (written) "<hostname>: <check_disk output>"
#######################################
check_computing_headnode() {
  local p
  for p in /export/home; do
    PATHX=$p
    # check_disk_cmd holds a command plus its options, so it is expanded
    # unquoted on purpose; only the path argument is quoted.
    mailcontent="$(hostname): $(${check_disk_cmd} "${p}")"
    to_mail #${mailcontent}
  done
}
# Entry point: check the compute nodes first, then the head node.
check_computing_nodes
check_computing_headnode
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# ref http://unix.stackexchange.com/questions/19008/automatically-run-commands-over-ssh-on-many-servers
#
# Runs the command line given as arguments on every compute node listed by
# `pbsnodes -a`, in parallel over ssh, and stores each node's output in
#   ${TMPDIR:-/tmp}/<tmp_dir_prefix>.<date>.<time>/<node>
# Returns only after every ssh job has finished.
# Kept free of bash-only syntax because the caller starts it through `sh`.

echo "$(date)"
echo "$*"

tmp_dir_prefix=pssh-to-be-deleted-Cu8vihiWFhATbsGlSrUO
tmp_root=${TMPDIR:-/tmp}

# Remove the results of earlier runs from the same root the new directory is
# created in; -f keeps the first run (nothing to delete) quiet.
rm -rf -- "${tmp_root}/${tmp_dir_prefix}".*
tmpdir=${tmp_root}/${tmp_dir_prefix}.$(date +"%Y%m%d.%H%M%S")
mkdir -p "$tmpdir" || exit 1

# ssh joins its arguments with blanks anyway, so one string is equivalent.
remote_cmd=$*

# The loop is the last stage of a pipeline and therefore runs in a subshell.
# The ssh jobs are children of that subshell, so the `wait` has to live in
# the same group; counting jobs here and waiting after the pipeline never
# worked because the counter was lost with the subshell.
/usr/local/bin/pbsnodes -a | grep compute- | grep -Ev status | sort -V | {
  while read -r userhost; do
    echo "--${userhost}--"
    #userhostname=$(echo ${userhost} | sed -e 's/\.local//g')
    #echo --${userhostname}--
    # -n keeps ssh from eating the node list on stdin.
    ssh -XY -n -o BatchMode=yes "${userhost}" \
      'printf "\n$(hostname)\n" & echo "$('"${remote_cmd}"')"' \
      > "${tmpdir}/${userhost}" 2>&1 &
    sleep 0.06
  done
  wait
}

#ls -lrt ${tmpdir}
#cat ${tmpdir}/*
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment