Last active
February 18, 2021 10:38
-
-
Save baileythegreen/48cf072d2016d3e6a544233d90b9e8dd to your computer and use it in GitHub Desktop.
This is a set of files used to monitor disk usage on a remote cluster with a SunGridEngine-type scheduler. It is designed to run as a daemon process with very low overhead, that runs periodic checks for how much of a group's space is being used. It is intended to catch runaway jobs before they can write Terabytes of meaningless textfiles, or to …
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file is only here so the gist will be named correctly. Ignore it. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This is a dummy flag. | |
You can delete this file to trigger the infinite loop to fail. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/bash | |
# In order to run this as a daemon, use: | |
# | |
# nohup ./monitor_daemon.sh & | |
# | |
# NOT: | |
# | |
# sh monitor_daemon.sh | |
# . monitor_daemon.sh | |
# | |
# | |
# `nohup` ensures that the process will not be broken by logging out of the system.
# Invoking the script as `./monitor_daemon.sh` (rather than `sh monitor_daemon.sh` or sourcing it with `. monitor_daemon.sh`)
# ensures it will be named 'monitor_daemon.sh' and not 'sh' in the output of `ps`,
# so you can still find and kill it, if necessary.
# `&` places it in the background so you can start it and then continue doing other things. | |
# The script should ideally be started just after midnight; otherwise the days used for the comparisons might be slightly off.
# This may or may not be problematic. | |
# To use this, the following strings will need to be filled in: | |
# <path_to_group_space> | |
# <path_to_script> | |
# <quota> # this should be the part of the `quota` output that you want to consider. | |
# It's two lines in this case: a path, and then a line in the form of `51945.46 GB of 60416.00 GB (85.97%) used ` | |
# <email> | |
# You may wish to replace references to 'Eddie' with the name of your system. | |
#######################################
# Submit an array job that runs storage_calculator.sh once per directory.
#
# The directories to inspect are written, one per line, to
# <group space>/.dirlist; the array job is sized to the number of lines.
#
# Site settings: replace each <...> placeholder below (angle brackets
# included) and keep the surrounding quotes.
#
# Arguments: none
# Outputs:   overwrites <group space>/.dirlist; a diagnostic goes to stderr
#            when nothing matches
# Returns:   1 when no directory matches, the status of the failed write when
#            .dirlist cannot be written, otherwise the exit status of qsub
#######################################
storage_calculator() {
  local group_space='<path_to_group_space>'
  local script_dir='<path_to_script>'
  local -a folders=()
  local entry

  # Intentionally unquoted so that a glob pattern in group_space expands;
  # this replaces parsing the output of `ls -d`.
  # shellcheck disable=SC2206
  local -a candidates=( $group_space )

  # Keep only entries that exist (an unmatched glob stays literal).
  for entry in "${candidates[@]}"; do
    if [[ -e "$entry" ]]; then
      folders+=("$entry")
    fi
  done

  # An empty list would otherwise be submitted as the invalid range 1-0.
  if (( ${#folders[@]} == 0 )); then
    printf 'storage_calculator: nothing matches %s\n' "$group_space" >&2
    return 1
  fi

  printf '%s\n' "${folders[@]}" > "${group_space}/.dirlist" || return
  qsub -t "1-${#folders[@]}" "${script_dir}/storage_calculator.sh"
}
# ---------------------------------------------------------------------------
# Main monitoring loop.
#
# Runs forever (this is intended): four checks per pass of the inner loop,
# one every CHECK_INTERVAL seconds. Delete ~/.monitor_flag to make the daemon
# stop at its next check.
#
# Files used (relative to the directory the daemon was started from):
#   newday.usage     readings taken so far in the current pass, one per line
#   <Weekday>.usage  the four readings of that weekday, on a single line
# ---------------------------------------------------------------------------

# Site settings: replace each <...> placeholder (angle brackets included)
# and keep the surrounding quotes.
readonly QUOTA_PATTERN='<quota>'  # selects the wanted entry in `quota` output
readonly ALERT_EMAIL='<email>'    # recipient of every notification

readonly CHECK_INTERVAL=21600     # seconds between two checks (six hours)

# Alert thresholds; the values were chosen arbitrarily.
readonly PERCENT_LIMIT=95         # percent of the quota in use
readonly SIX_HOUR_LIMIT=1000      # growth since the previous check
readonly DAY_LIMIT=3000           # growth since the same check yesterday
readonly WEEK_LIMIT=5000          # growth since the same check a week ago

# Print the contents of usage file $1; print nothing if it does not exist yet.
usage_values() {
  [[ -f "$1" ]] || return 0
  cat -- "$1"
}

# Print $1 - $2 when both are integers; print nothing otherwise, so that a
# missing baseline simply disables the corresponding comparison.
usage_delta() {
  local current=$1 baseline=$2
  [[ "$current" =~ ^-?[0-9]+$ && "$baseline" =~ ^-?[0-9]+$ ]] || return 0
  printf '%s\n' "$(( current - baseline ))"
}

# Succeed when delta $1 is known and at least as large as limit $2.
exceeds() {
  local delta=$1 limit=$2
  [[ -n "$delta" ]] && (( delta >= limit ))
}

while true; do
  for slot in {0..3}; do
    # The flag file is the off switch: without it, stop.
    if [[ ! -f ~/.monitor_flag ]]; then
      exit 0
    fi

    # Weekday names select the history files.
    today=$(date +"%A")
    yesterday=$(date --date="1 days ago" +"%A")

    # Until it is overwritten at the last check of the pass, today's file
    # still holds the readings from one week ago.
    # Word-splitting is the parser here: values may be separated by spaces
    # or newlines.
    # shellcheck disable=SC2207
    yesterday_usage=( $(usage_values "${yesterday}.usage") )
    # shellcheck disable=SC2207
    sennight_usage=( $(usage_values "${today}.usage") )

    # Reading taken at the previous check.
    previous=''
    if (( slot == 0 )); then
      # Start the pass with an empty file so that leftovers from an
      # interrupted run cannot shift the slot positions.
      : > newday.usage
      # The previous check was the last one recorded yesterday.
      history_count=${#yesterday_usage[@]}
      if (( history_count > 0 )); then
        previous=${yesterday_usage[history_count - 1]}
      fi
    else
      # shellcheck disable=SC2207
      today_usage=( $(usage_values newday.usage) )
      previous=${today_usage[slot - 1]:-}
    fi

    # Query the quota once so that both figures describe the same moment.
    # The matching line and the one after it are joined; field 2 is the
    # amount used and field 7 the percentage.
    quota_line=$(quota | grep -A 1 -- "$QUOTA_PATTERN" | tr -d '\n')
    percent_float=$(awk '{print $7}' <<< "$quota_line" | tr -d '()%')
    used_float=$(awk '{print $2}' <<< "$quota_line" | tr -d '()%')
    # Round to whole numbers; an unreadable value counts as 0.
    percent=$(printf '%.*f\n' 0 "${percent_float:-0}")
    used=$(printf '%.*f\n' 0 "${used_float:-0}")

    # Record the new reading.
    printf '%s\n' "$used" >> newday.usage

    # After the last check of the pass, store the readings as today's
    # history (all on one line) and clear the scratch file.
    if (( slot == 3 )); then
      # shellcheck disable=SC2207
      day_values=( $(usage_values newday.usage) )
      printf '%s\n' "${day_values[*]}" > "${today}.usage"
      rm -- newday.usage
    fi

    # Growth relative to each baseline; empty when the baseline is missing.
    six=$(usage_delta "$used" "$previous")
    twentyfour=$(usage_delta "$used" "${yesterday_usage[slot]:-}")
    week=$(usage_delta "$used" "${sennight_usage[slot]:-}")

    # Checks run in order of precedence and only the first hit is reported.
    # If none trips, a mail is still sent as a sign of life.
    if (( percent >= PERCENT_LIMIT )); then
      mailx -s "Eddie storage alert" "$ALERT_EMAIL" <<< "$used_float GB ($percent %) of the Eddie space is full."
      storage_calculator
    elif exceeds "$six" "$SIX_HOUR_LIMIT"; then
      mailx -s "Eddie storage alert" "$ALERT_EMAIL" <<< "$used GB ($percent %) of the Eddie space is full. Six hours ago, ${previous} GB was being used."
      storage_calculator
    elif exceeds "$twentyfour" "$DAY_LIMIT"; then
      mailx -s "Eddie storage alert" "$ALERT_EMAIL" <<< "$used GB ($percent %) of the Eddie space is full. Yesterday, ${yesterday_usage[slot]} GB was being used."
      storage_calculator
    elif exceeds "$week" "$WEEK_LIMIT"; then
      mailx -s "Eddie storage alert" "$ALERT_EMAIL" <<< "$used GB ($percent %) of the Eddie space is full. Last week, ${sennight_usage[slot]} GB was being used."
      storage_calculator
    else
      mailx -s "Eddie storage is fine" "$ALERT_EMAIL" <<< "Everything is good."
    fi

    sleep "$CHECK_INTERVAL"
  done
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/bash | |
#$ -N storage_check | |
#$ -wd <path_to_working_directory> | |
#$ -l h_vmem=2G | |
#$ -l h_rt=24:00:00 | |
#$ -e /dev/null | |
#$ -o /dev/null | |
### Comments ### | |
# This file takes a file with a list of directories, one per line, | |
# and runs `du` on them. It is intended to help diagnose which user | |
# might be using more disk space than they are aware. | |
# List of directories to measure, one per line, read from the job's working
# directory.
parameter_file=./.dirlist

# SGE_TASK_ID selects which line of the list this task handles. Refuse to
# continue unless it is a positive integer: with an empty selection the du
# command below would expand to `du /*`.
task_id=${SGE_TASK_ID:-}
if [[ ! "$task_id" =~ ^[1-9][0-9]*$ ]]; then
  printf 'storage_check: SGE_TASK_ID is not a task number: %s\n' "$task_id" >&2
  exit 1
fi

line=$(sed -n -e "${task_id} p" "${parameter_file}")
if [[ -z "$line" ]]; then
  printf 'storage_check: no line %s in %s\n' "$task_id" "$parameter_file" >&2
  exit 1
fi

# Only the last path component is used; it is resolved relative to the
# working directory.
line=$(basename -- "${line}")

#exec &> ${line}.space_usage.out
#exec 2> ${line}.space_usage.error

# One summary line per entry inside the directory, plus a grand total.
du --total --summarize -h -- "${line}"/* > "${line}.space_usage"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment