Skip to content

Instantly share code, notes, and snippets.

@jab416171
Created April 28, 2014 00:00
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jab416171/11358501 to your computer and use it in GitHub Desktop.
Save jab416171/11358501 to your computer and use it in GitHub Desktop.
#!/bin/sh
#
# (c) 2010 Western Digital Technologies, Inc. All rights reserved.
#
# monitorVolume.sh
# Note: this is called by cron
#
# NOTE(review): the header names monitorVolume.sh, but this body only checks
# free space on the data volume -- confirm the intended file name.
#
# Compares the data-volume usage percentage against two thresholds:
#   - above MAX_USAGE_THRESH (or when /tmp/tst_freespace exists): send the
#     "disk near capacity" alert once and create FREESPACE_STATUS_FILE
#   - at or below MIN_USAGE_THRESH: remove FREESPACE_STATUS_FILE again
# Between the two thresholds nothing changes, so the alert does not flap.
#
PATH=/sbin:/bin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin
. /usr/local/sbin/share-param.sh
. /etc/nas/alert-param.sh
. /etc/system.conf
. /etc/wdcomp.d/wd-nas/wd-nas.conf 2> /dev/null

MAX_USAGE_THRESH=95
MIN_USAGE_THRESH=93

# FREESPACE_STATUS_FILE is not defined in this script (presumably it is set by
# one of the sourced files). Without it the file tests below are meaningless,
# so stop instead of acting on an empty path.
if [ -z "${FREESPACE_STATUS_FILE}" ]; then
  echo "$0: FREESPACE_STATUS_FILE is not set" >&2
  exit 1
fi

# check DataVolume percent used
percentUsed=$(getDataVolumePercentUsed.sh)
echo "% used=${percentUsed} MAX=${MAX_USAGE_THRESH}"

# Only run the integer comparisons when the helper printed a plain
# non-negative integer; otherwise `[ ... -gt ... ]` would just raise errors.
case "${percentUsed}" in
  '' | *[!0-9]*) usageIsNumeric=0 ;;
  *) usageIsNumeric=1 ;;
esac
if [ "${usageIsNumeric}" -eq 0 ]; then
  echo "$0: unexpected usage value \"${percentUsed}\"" >&2
fi

# /tmp/tst_freespace forces the "near capacity" path regardless of usage.
if [ -f /tmp/tst_freespace ] \
  || { [ "${usageIsNumeric}" -eq 1 ] && [ "${percentUsed}" -gt "${MAX_USAGE_THRESH}" ]; }; then
  # Alert only on the transition into the near-capacity state.
  if [ ! -f "${FREESPACE_STATUS_FILE}" ]; then
    sendAlert.sh "${diskNearCapacity}"
  fi
  # NOTE(review): kept as a second, separate check to preserve the original
  # sequencing around sendAlert.sh; merge with the check above if sendAlert.sh
  # never touches the status file.
  if [ ! -f "${FREESPACE_STATUS_FILE}" ]; then
    touch "${FREESPACE_STATUS_FILE}"
    incUpdateCount.pm system_state
  fi
elif [ "${usageIsNumeric}" -eq 1 ] && [ "${percentUsed}" -le "${MIN_USAGE_THRESH}" ]; then
  # Usage dropped back to the lower threshold: clear the state.
  if [ -f "${FREESPACE_STATUS_FILE}" ]; then
    rm -f "${FREESPACE_STATUS_FILE}"
    incUpdateCount.pm system_state
  fi
fi
#!/bin/bash
#
# (c) 2013 Western Digital Technologies, Inc. All rights reserved.
#
# monitorio - Monitor disk activity, and put system into standby. Also, monitor to trigger file tally process
##
PATH=/sbin:/bin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin

# Helper libraries and configuration; only the wd-nas.conf and ledConfig.sh
# includes are allowed to be missing quietly.
source /lib/lsb/init-functions
. /etc/priority.conf
. /etc/system.conf
. /usr/local/sbin/drive_helper.sh
. /etc/wdcomp.d/wd-nas/wd-nas.conf 2> /dev/null
if [ -f /usr/local/sbin/ledConfig.sh ]; then
  source /usr/local/sbin/ledConfig.sh
fi

# Paths used by the idle counter, the tally daemon and the media crawler.
MIN_SINCE_DISK_ACCESS="/tmp/minutes_since_disk_access"
TALLY_PIDFILE="/var/run/tally.pid"
TALLY_DAEMON="/usr/local/bin/tally"
TALLY_PIPE="/var/local/nas_file_tally/tallyd.pipe"
MEDIACRAWLER_REWALK="/tmp/mediacrawler_rewalk"

# Only give monitorio 20% of the CPU Max: create a dedicated cgroup, set its
# cpu.shares, and move this shell into it.
MONITORIO_CPU_SHARE=20
CGROUP_MONITORIO="/sys/fs/cgroup/monitorio"
mkdir -p "${CGROUP_MONITORIO}"
echo "${MONITORIO_CPU_SHARE}" > "${CGROUP_MONITORIO}/cpu.shares"
echo "$$" > "${CGROUP_MONITORIO}/tasks"

# File remembering the last `df` usage figure that caused a trigger.
total_df_file="${WD_NAS_VAR_DIR}/total_df"
# trigger tally (or share size) when df result changes by TALLY_TRIGGER_THRESH_KB
TALLY_TRIGGER_THRESH_KB=1000000
# Stream a size listing of everything under /shares to the tally daemon.
#
# Creates the named pipe TALLY_PIPE if needed, starts TALLY_DAEMON in the
# background through start-stop-daemon, then writes one record per `ls` entry
# into the pipe:
#   "#4:<size>:<directory>/<name>~~~~"
# followed by the end marker "#0:0:/tmp/TALLYEND.DONE~~~~".
# (An earlier variant terminated records with four NUL bytes instead of
# "~~~~"; a debugging variant wrote to /var/local/nas_file_tally/tallyd.txt
# first and then copied that file into the pipe.)
#
# Globals: TALLY_PIPE, TALLY_PIDFILE, TALLY_DAEMON, monitorio_nice (read)
file_tally() {
  if [ ! -p "${TALLY_PIPE}" ]; then
    mkfifo "${TALLY_PIPE}"
  fi
  start-stop-daemon --start --quiet --oknodo --nicelevel "${monitorio_nice}" \
    --pidfile "${TALLY_PIDFILE}" --make-pidfile --background \
    --exec "${TALLY_DAEMON}" --
  # `ls -s -R` prints "<dir>:" headers, "total N" lines and "<size> <name>"
  # entries. The name is recovered by stripping the leading size column;
  # searching for the second field with index() could match inside the size
  # itself (size 4096, name "40") and corrupt the record.
  ls -s1NRA --block-size=1 /shares | awk '
    {
      if ($1 ~ /^[0-9]+$/) {
        name = $0;
        sub(/^[ \t]*[0-9]+ /, "", name);
        printf("#4:%s:%s/%s~~~~", $1, current_dir, name);
      }
      else {
        if ($1 != "total") {
          # directory header: drop the trailing ":"
          current_dir = (substr($0, 1, length($0) - 1));
        }
      }
    }
    END {
      printf("#0:0:/tmp/TALLYEND.DONE~~~~");
    }
  ' > "${TALLY_PIPE}"
}
# Block until the marker file /tmp/ready exists, logging a message (also to
# stderr, via `logger -s`) and sleeping 5 seconds between checks.
wait_system_ready() {
  until [ -f "/tmp/ready" ]; do
    logger -s "$0: waiting for system to become ready.."
    sleep 5
  done
}
# Scratch files for share-size listings: the combined listing, and the two
# partial listings it is assembled from.
tmp_share_size="/tmp/share_size"
tmp_internal_share_size="/tmp/internal_share_size"
tmp_external_share_size="/tmp/external_share_size"
# Recompute the size listing of internal shares and rebuild the combined file.
#
# Runs getShareSize.sh once for every non-hidden directory directly under
# /shares, stores the output in tmp_internal_share_size, then rewrites
# tmp_share_size as internal listing followed by external listing.
# A partial listing that does not exist yet is skipped instead of making
# `cat` print an error.
#
# Globals: tmp_internal_share_size, tmp_external_share_size, tmp_share_size
calculate_share_size() {
  local part
  find /shares -maxdepth 1 -mindepth 1 -type d -not -name ".*" -print0 \
    | xargs -0 -I {} getShareSize.sh {} > "${tmp_internal_share_size}"
  for part in "${tmp_internal_share_size}" "${tmp_external_share_size}"; do
    if [ -f "${part}" ]; then
      cat "${part}"
    fi
  done > "${tmp_share_size}"
}
# Recompute the size listing of external shares and rebuild the combined file.
#
# Runs getShareSize.sh once for every symbolic link directly under /shares,
# stores the output in tmp_external_share_size, then rewrites tmp_share_size
# as internal listing followed by external listing.
# A partial listing that does not exist yet is skipped instead of making
# `cat` print an error.
#
# Globals: tmp_internal_share_size, tmp_external_share_size, tmp_share_size
calculate_external_share_size() {
  local part
  find /shares -maxdepth 1 -mindepth 1 -type l -print0 \
    | xargs -0 -I {} getShareSize.sh {} > "${tmp_external_share_size}"
  for part in "${tmp_internal_share_size}" "${tmp_external_share_size}"; do
    if [ -f "${part}" ]; then
      cat "${part}"
    fi
  done > "${tmp_share_size}"
}
# Decide whether data-volume usage changed enough to warrant a re-tally.
#
# Reads the "used" column (third field) of the first `df` row mentioning
# /DataVolume and compares it with the figure stored in total_df_file.
# Prints "trigger" when no figure has been stored yet, or when the absolute
# difference is at least TALLY_TRIGGER_THRESH_KB; prints an empty line
# otherwise. On "trigger" the stored figure is replaced by the current one.
#
# `df` is sampled once, so the value compared is the value stored. Only the
# first matching row is used, so extra /DataVolume rows can neither duplicate
# the "trigger" output nor write a multi-line figure.
#
# Globals: total_df_file, TALLY_TRIGGER_THRESH_KB (read)
# Outputs: "trigger" or an empty line on stdout
checkDataTrigger() {
  local used_kb stored_kb result

  used_kb=$(df | awk '/\/DataVolume/ && !seen { print $3; seen = 1 }')

  result="trigger"
  if [ -f "${total_df_file}" ]; then
    stored_kb=""
    read -r stored_kb _ < "${total_df_file}"
    result=""
    if [ -n "${used_kb}" ]; then
      result=$(awk -v used="${used_kb}" -v stored="${stored_kb}" \
        -v thresh="${TALLY_TRIGGER_THRESH_KB}" \
        'BEGIN { d = used - stored; if (d < 0) d = -d; if (d >= thresh) printf("trigger") }')
    fi
  fi

  if [ "${result}" = "trigger" ]; then
    if [ -n "${used_kb}" ]; then
      printf '%s\n' "${used_kb}" > "${total_df_file}"
    else
      # no /DataVolume row: store an empty figure
      : > "${total_df_file}"
    fi
  fi
  echo "${result}"
}
# Make sure the directory that will hold the share-size cache exists.
# Expansions are quoted so a path containing whitespace is not split.
mkdir -p "$(dirname "${SHARE_SIZE_CACHE}")"
# Integer-typed state used by the monitoring loop.
declare -i sleepcount
declare -i rootdisk_thresh
declare -i enterStandbyTime=0
# Clear markers that may be left over from a previous run.
rm -f /tmp/standby
rm -f "${MEDIACRAWLER_REWALK}"
# Standby settings (standby_time and standby_enable are read later on).
source /etc/standby.conf
# Reset the idle-minute counter and refresh the standby thresholds.
#
# In the "emergency" run level (as reported by getRunLevel.pl) standby is
# forced on with a 1 minute threshold, since the drive should go into standby
# as early as possible. Otherwise /etc/standby.conf is re-read and the
# root-disk threshold is set to one minute less than standby_time.
#
# Globals written: sleepcount, standby_time, rootdisk_thresh, standby_enable
resetSleepCount() {
  sleepcount=0
  case "$(getRunLevel.pl)" in
    emergency)
      standby_time=1
      rootdisk_thresh=1
      standby_enable="enabled"
      ;;
    *)
      source /etc/standby.conf
      rootdisk_thresh=$(expr $standby_time - 1)
      ;;
  esac
}
# Work out which block devices to watch: the root disk is taken from the
# "root=" argument of the kernel command line, the data-volume disk from
# dataVolumeDevice.
currentRootDevice=$(awk -F= 'BEGIN{RS=" "}{ if ($1=="root") print $2 }' /proc/cmdline)
rootDisk=$(basename "${currentRootDevice}")
dataVolumeDisk=$(basename "${dataVolumeDevice}")
# Word splitting is intentional: internalDrives prints a list of devices.
drivelist=($(internalDrives))
echo "0" > "${MIN_SINCE_DISK_ACCESS}"
# wait for system to become ready
wait_system_ready
# run file tally at startup (in the background)
if [ ! -f "${TALLY_DAEMON}" ]; then
  logger "Tally daemon not installed, exiting tally function"
  ## if tally not present, then call calculate_share_size
  calculate_share_size
  calculate_external_share_size
  # -f: the cache entry may not exist yet on a first start
  rm -f "${SHARE_SIZE_CACHE}"
  ln -s "${tmp_share_size}" "${SHARE_SIZE_CACHE}"
else
  file_tally &
fi
# In debug mode, enable kernel block-I/O dump messages and clear the ring buffer.
if [ "$1" == "debug" ]; then
  echo "1" > /proc/sys/vm/block_dump
  dmesg -c > /dev/null
fi
# Main loop. Outer loop: wait until at least one drive is out of standby,
# handle the wake-up, then enter the inner loop. Inner loop: once a minute,
# compare /proc/diskstats counters to count idle minutes, trigger tally /
# share-size recalculation on data-volume writes, and put the drives into
# standby once the idle count reaches standby_time.
while :; do
# Probe the drives; stop at the first one whose `hdparm -C` output does not
# mention "standby". standby_test ends up 0 only if every probed drive
# reported standby.
# NOTE(review): if drivelist is empty, standby_test is never assigned here.
for i in ${drivelist[@]}; do
hdparm -C $i | grep -q "standby"
standby_test=$?
[ "$standby_test" -eq "1" ] && break
done
if [ "$standby_test" -eq "0" ]; then
# all drives still in standby: poll again in 5 seconds
sleep 5
continue
else
# /tmp/standby is created below when this script puts the drives to sleep,
# so finding it here means the drives have just woken up.
if [ -f /tmp/standby ]; then
standby_since=`stat --format %z /tmp/standby`
rm -f /tmp/standby
# Cancel blue color and turn on green if applicable
ledCtrl.sh LED_EV_DISK_STBY LED_STAT_OK
### This will allow individual components to register for wake-up events
run-parts /etc/nas/wakeup.d
###
touch ${MEDIACRAWLER_REWALK}
currentTime=`date +%s`
timeInStandby=`expr $currentTime - $enterStandbyTime`
echo "exit standby after $timeInStandby (since $standby_since)"
logger "exit standby after $timeInStandby (since $standby_since)"
if [ "$1" == "debug" ]; then
dmesg -c
fi
fi
resetSleepCount
echo $sleepcount > ${MIN_SINCE_DISK_ACCESS}
# NOTE(review): trigger_tally is not read anywhere in the code shown here.
trigger_tally=0
# Baseline counters from /proc/diskstats for the matching device name ($3):
# field 6 is sectors read and field 10 is sectors written, per the kernel
# iostats documentation.
iow_root=`awk -v disk="${rootDisk}" '{if ($3==disk) print $10}' /proc/diskstats`
ior_datavol=`awk -v disk="${dataVolumeDisk}" '{if ($3==disk) print $6}' /proc/diskstats`
iow_datavol=`awk -v disk="${dataVolumeDisk}" '{if ($3==disk) print $10}' /proc/diskstats`
if [ "$1" == "debug" ]; then
# NOTE(review): the *2 variables printed here are only assigned inside the
# inner loop, so they are empty (or stale) at this point.
echo "Init ior_datavol=$ior_datavol ior_datavol2=$ior_datavol2"
echo " iow_datavol=$iow_datavol iow_datavol2=$iow_datavol2"
echo " iow_root=$iow_root iow_root2=$iow_root2"
dmesg -c
fi
while :; do
# Wait for 60 seconds
sleep 60
# Sample the same counters again; a changed value means I/O in the last minute.
iow_root2=`awk -v disk="${rootDisk}" '{if ($3==disk) print $10}' /proc/diskstats`
ior_datavol2=`awk -v disk="${dataVolumeDisk}" '{if ($3==disk) print $6}' /proc/diskstats`
iow_datavol2=`awk -v disk="${dataVolumeDisk}" '{if ($3==disk) print $10}' /proc/diskstats`
# check for file tally sync: only when the data volume was written to AND
# checkDataTrigger reports a large enough change in `df` usage
if [ "$iow_datavol" -ne "$iow_datavol2" ] && [ "`checkDataTrigger`" == "trigger" ]; then
incUpdateCount.pm data_volume_write
monitorFreeSpace.sh
if [ -f $TALLY_DAEMON ]; then
# also run tally if installed
pidofproc -p $TALLY_PIDFILE $TALLY_DAEMON >/dev/null
# non-zero status: the tally daemon is not running, so start a tally
if [ $? -ne 0 ]; then
file_tally
fi
createBackupTally.sh
else
## if tally not present, then call calculate_share_size
calculate_share_size
fi
fi
# calculate size of external shares. Note that this must be done outside of "checkDataTrigger" so that it is done more often.
calculate_external_share_size
# use data volume writes until near sleep threshold, then check all disk writes
# NOTE(review): the next line stores the literal string "sleepcount" (no "$"),
# and old_sleepcount is not read anywhere in the code shown here.
old_sleepcount=sleepcount
# At the root-disk threshold, only count the minute as idle when the root disk
# saw no writes; below it, only when the data volume saw no reads or writes.
# Anything else counts as activity and resets the counter.
if [ $((sleepcount)) -eq $((rootdisk_thresh)) ] && [ "$iow_root" -eq "$iow_root2" ]; then
sleepcount=$((sleepcount+1))
elif [ $((sleepcount)) -lt $((rootdisk_thresh)) ] && [ "$ior_datavol" -eq "$ior_datavol2" ] && [ "$iow_datavol" -eq "$iow_datavol2" ]; then
sleepcount=$((sleepcount+1))
else
resetSleepCount
fi
echo $sleepcount > ${MIN_SINCE_DISK_ACCESS}
if [ "$1" == "debug" ]; then
[ "$sleepcount" != "0" ] && echo "sleepcount: $sleepcount"
[ "$sleepcount" == "0" ] && echo "Disk activity:"
echo "... ior_datavol=$ior_datavol ior_datavol2=$ior_datavol2"
echo "... iow_datavol=$iow_datavol iow_datavol2=$iow_datavol2"
echo "... iow_root=$iow_root iow_root2=$iow_root2"
# dmesg -c
fi
# The new samples become the baseline for the next minute.
ior_datavol=$ior_datavol2
iow_datavol=$iow_datavol2
iow_root=$iow_root2
smartTestStatus=`getSmartTestStatus.sh | awk '{print $1}'`
# Enter standby when it is enabled, the idle count has reached standby_time,
# and no SMART test is reported as in progress.
if [ "$standby_enable" == "enabled" ] && [ "$sleepcount" -eq "$standby_time" ] && [ "$smartTestStatus" != "inprogress" ]; then
touch /tmp/standby
enterStandbyTime=`date +%s`
echo "Enter standby"
if [ "$1" == "debug" ]; then
echo "`date`: Enter standby "
dmesg -c > /dev/null
fi
# ask every drive to enter standby now
for i in ${drivelist[@]}; do
hdparm -y $i >/dev/null
done
# turn on solid blue if applicable
ledCtrl.sh LED_EV_DISK_STBY LED_STAT_IN_PROG
sleep 5
# leave the inner loop; the outer loop now polls for wake-up
break
fi
done
fi
done
#!/bin/sh
#
# (c) 2010 Western Digital Technologies, Inc. All rights reserved.
#
# monitorSmartStatus.sh
# Note: this is called by cron
#
# Runs the SMART health check (`smartctl -H`) on every internal drive. On the
# first detected failure (or when /tmp/tst_smart exists) it sends the SMART
# failure alert, sets the LED error state and bumps the system_state update
# counter; /tmp/smart_fail remembers that the alert was already sent.
#
PATH=/sbin:/bin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin
. /etc/system.conf
. /usr/local/sbin/share-param.sh
. /etc/nas/alert-param.sh
. /usr/local/sbin/drive_helper.sh
[ -f /usr/local/sbin/ledConfig.sh ] && . /usr/local/sbin/ledConfig.sh

# exit if in standby
if [ -f /tmp/standby ]; then
  exit 0
fi

# exit if system with no internal drives
# ("=" rather than "==": the latter is not understood by every /bin/sh)
if [ "${DVC_DRIVE_COUNT}" = "0" ]; then
  exit 0
fi

atLeastOneDriveFailed=FALSE
# A plain word-split loop replaces the former bash array, which is a syntax
# error in shells that do not implement arrays.
for drive in $(internalDrives); do
  # a drive counts as failed unless smartctl prints PASSED
  if ! smartctl -d ata -H "${drive}" | grep -q PASSED; then
    atLeastOneDriveFailed=TRUE
  fi
done

# /tmp/tst_smart forces the failure path.
if [ "$atLeastOneDriveFailed" = "TRUE" ] || [ -f /tmp/tst_smart ]; then
  # alert only once per failure episode
  if [ ! -f /tmp/smart_fail ]; then
    sendAlert.sh "${driveSmartFail}"
    ledCtrl.sh LED_EV_DISK_SMART LED_STAT_ERR
    incUpdateCount.pm system_state
  fi
  touch /tmp/smart_fail
else
  rm -f /tmp/smart_fail
fi
#!/bin/bash
#
# (c) 2012 Western Digital Technologies, Inc. All rights reserved.
#
# monitorTemperature.sh
# Note: This is called by init-script monitorTemperature
#
# NB: this script needs bash (it uses `source`, `[[ =~ ]]`, arrays, `local`,
# `${!name}` indirection and `for (( ))`), hence the bash interpreter line.
#
# This script is responsible to monitor temperature of internal drives
# and take actions if temperature is not normal
#
# It takes following actions depending on temperature of internal drives
# if temperature of any drive > TF
# - change led to RED
# - send shutdown alert
# - change run-level to emergency
# - exit
#
# if temperature of any drive between T2 & TF
# - send shutdown-warning alert
# - start a shutdown-warning timer of 1 HR
# - if timer expires change run-level to emergency
# - exit
#
# if temperature of any drive between T1 & T2
# - send high-temperature warning alert
# - exit
#
# To restart all services & get back to normal
# if temperature of all drives <= T2 - hysteresis (the HYSTERISIS setting)
# - send normal temperature alert
# - change led to GREEN
# - change run-level to application
# - exit
#
## --- Includes
PATH=/sbin:/bin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin
source /usr/local/sbin/share-param.sh
source /etc/system.conf
source /etc/nas/alert-param.sh # ( for alerts )
source /usr/local/sbin/drive_helper.sh # ( for internalDrives() )
source /usr/local/sbin/wdStatus.sh # ( for $WDST_XXX status codes )
source /etc/wdcomp.d/wd-nas/temperature-monitor.conf
[ -f /usr/local/sbin/ledConfig.sh ] && . /usr/local/sbin/ledConfig.sh
## --- Constants
## Acronyms for the temperature states
NM=${STATE_NORMAL}
WR=${STATE_WARNING}
SW=${STATE_SHUTDOWN_WARNING}
SI=${STATE_SHUTDOWN_IMMEDIATE}
UK=${STATE_UNKNOWN}
## Internal Constants
## NB: TURN OFF BEFORE CHECKING-IN
DEBUG=0 ## for debugging
## For Testing
## - set TEST=1
## - set DEBUG=1
## - disable the infinite "for MONITOR_TIMER" loop
## - enter different temperatures on input
## - to test with infinite "for MONITOR_TIMER" loop set different values for TEMP_TX
TEST=0
## Logger facility
FAC=local2
## Table of allowed actions based on last & curr state.
## One array per last state (ACTION<last>), indexed by the current state;
## `eval` is used because the array name is built from the state value.
## last | curr -- NM WR SW SI UK
## |
eval ACTION${NM}="( act_noop act_warning act_start_timer act_emergency act_noop )"
eval ACTION${WR}="( act_normal act_noop act_start_timer act_emergency act_noop )"
eval ACTION${SW}="( act_normal act_hysterisis act_check_timer act_emergency act_check_timer )"
eval ACTION${SI}="( act_restart act_hysterisis act_cooldown act_cooldown act_cooldown )"
eval ACTION${UK}="( act_UK_2_NM act_warning act_start_timer act_emergency act_noop )" ## should never be called as UK state is never saved
## --- Global Variables
drive_list=
last_state=
curr_state=
curr_temp=
## --- Functions
## Read the temperature of one drive through SMART.
##
## Arguments:
##   $1 - drive device, e.g. /dev/sda
##
## Outputs:
##   stdout - the temperature, only when it is a plain number
##   ${SMART_STATE} - always overwritten with whatever value was read
##
## Returns:
##   ${WDST_OK} on success, ${WDST_NOTFOUND} when no numeric value was found
##
## E.g. getDriveTemperature "/dev/sda"
getDriveTemperature() {
  local drive=${1}
  local temp

  ## tenth column of the Temperature_Celsius row of the SMART attribute table
  temp=$(smartctl -d ata -A "${drive}" \
    | awk '$2 == "Temperature_Celsius" { print $10 }')
  echo "${temp}" > "${SMART_STATE}"

  if [[ "${temp}" =~ ^[0-9]+$ ]]; then
    ## debug
    if [ "${DEBUG}" -ne 0 ]; then
      logger -p "${FAC}.debug" "$0: Drive ${drive} temperature is ${temp}"
    fi
    ## hand the reading back to the caller
    echo "${temp}"
    return ${WDST_OK}
  fi

  ## anything that is not a number counts as "not found"
  logger -p "${FAC}.err" "$0: Non-numeric drive temperature \"${temp}\" obtained"
  return ${WDST_NOTFOUND}
}
## Get the current temperature state across all drives
##
## Input (globals):
##   drive_list - drives to check (e.g. /dev/sda /dev/sdb)
##   TEMP_T1, TEMP_T2, TEMP_TF - temperature thresholds
##
## Output (globals):
##   curr_state - combined temperature state (NM, WR, SW, SI or UK)
##   curr_temp - temperature associated with that state
##
## Returns:
##   ${WDST_OK}
##
## E.g. drive_list=( /dev/sda /dev/sdb ); determineCurrentState
determineCurrentState()
{
  local drive
  local temp
  local rc
  local state
  local drive_temp
  local drive_state
  local prev_state
  local transition
  ## allowed temperature states transitions across all drives
  ## NB: "TRANSITION" is treated as a 2-D array: one array per previous state,
  ## indexed by the state of the drive being looked at. The combined state
  ## only moves towards the more severe state; UK is kept only while nothing
  ## worse than NM has been seen.
  ## prev | next --- NM WR SW SI UK
  ## |
  eval TRANSITION${NM}="( ${NM} ${WR} ${SW} ${SI} ${UK} )"
  eval TRANSITION${WR}="( ${WR} ${WR} ${SW} ${SI} ${WR} )"
  eval TRANSITION${SW}="( ${SW} ${SW} ${SW} ${SI} ${SW} )"
  eval TRANSITION${SI}="( ${SI} ${SI} ${SI} ${SI} ${SI} )"
  eval TRANSITION${UK}="( ${UK} ${WR} ${SW} ${SI} ${UK} )"
  ## debug
  if [ "${DEBUG}" -ne 0 ]; then
    logger -p "${FAC}.debug" "$0: Getting current temperature state"
  fi
  ## init
  drive_state=${NM}
  drive_temp=0
  ## loop through the drive list & finalize temperature state using
  ## TRANSITION table
  ## NB: left unquoted on purpose so an empty drive_list yields no iteration
  for drive in ${drive_list[@]}
  do
    ## save prev state
    prev_state=${drive_state}
    ## get the current drive temperature; capture the status immediately,
    ## before any other command overwrites $?
    temp=$(getDriveTemperature "${drive}")
    rc=$?
    ## TEST ONLY: the temperature typed in replaces the measured one
    if [ "${TEST}" -ne 0 ]; then
      echo -n "Enter temperature: "
      read -r temp
      logger -p "${FAC}.debug" "$0: INPUT temperature is ${temp}"
      rc=${WDST_OK}
    fi
    ## reset temperature to 0 if not defined
    if [ "${rc}" -ne "${WDST_OK}" ] || [ -z "${temp}" ]; then
      temp=0
    fi
    ## NB: ${temp} is integer value
    ## determine the temperature state of this drive
    if [ "${temp}" -eq 0 ]; then
      state=${UK}
    elif [ "${temp}" -le "${TEMP_T1}" ]; then
      state=${NM}
    elif [ "${temp}" -le "${TEMP_T2}" ]; then
      state=${WR}
    elif [ "${temp}" -le "${TEMP_TF}" ]; then
      state=${SW}
    else
      state=${SI}
    fi
    ## get the actual drive state using the TRANSITION table
    transition=TRANSITION${prev_state}[${state}]
    drive_state=${!transition}
    ## update drive temperature if state changes or temperature increases
    if [ "${drive_temp}" -eq 0 ] || [ "${drive_state}" -ne "${prev_state}" ] || [ "${drive_temp}" -lt "${temp}" ]; then
      drive_temp=${temp}
    fi
    ## optimization: break the loop if current state is SI (shutdown immediate)
    if [ "${drive_state}" -eq "${SI}" ]; then break; fi
  done
  ## debug
  if [ "${DEBUG}" -ne 0 ]; then
    logger -p "${FAC}.debug" "$0: Current Temperature - ${drive_temp}, Current State - ${drive_state}"
  fi
  ## pass to global variables
  curr_state=${drive_state}
  curr_temp=${drive_temp}
  return ${WDST_OK}
}
## --- Action Handlers
## Immediate thermal shutdown: record the SI state, raise the over-temperature
## flag, set the LED error state, send the alert, stop the shutdown timer,
## notify the system and switch to the emergency run level.
## Returns: ${WDST_OK}
act_emergency() {
  ## NB: curr_state is always ${SI} in this action
  [ "${DEBUG}" -ne 0 ] \
    && logger -p "${FAC}.debug" "$0: Action Shutdown, Temperature - ${curr_temp}"

  ## persist the new state and mark the over-temperature condition
  printf '%s\n' "${SI}" > "${TEMP_STATE}"
  touch "${OVER_TEMP_FLAG}"

  ## LED to error state, then the immediate-shutdown alert
  ledCtrl.sh LED_EV_THERMO LED_STAT_ERR
  sendAlert.sh "${thermalShutdownImmediate}"

  ## a value of 0 means the shutdown timer is stopped
  printf '%s\n' 0 > "${TEMP_SHUTDOWN_TIMER}"

  ## tell the system about the thermal state change
  incUpdateCount.pm ${THERMAL_STATE_NFY_ID}

  logger -p "${FAC}.emerg" "$0: Current temperature(${curr_temp}) is over max-threshold, stopping all services"

  ## stop services by entering the emergency run level
  changeRunLevel.pl --level=emergency
  return ${WDST_OK}
}
## Back to normal after a thermal shutdown: clear the over-temperature flag,
## reset the thermal LED event, send the "temperature normal" alert, record
## the NM state, notify the system and return to the application run level.
## Returns: ${WDST_OK}
act_restart() {
  [ "${DEBUG}" -ne 0 ] \
    && logger -p "${FAC}.debug" "$0: Action Restart, Temperature - ${curr_temp}"

  ## over-temperature condition is gone
  rm -f "${OVER_TEMP_FLAG}"

  ## LED back to OK (green only if system health is good)
  ledCtrl.sh LED_EV_THERMO LED_STAT_OK
  sendAlert.sh "${temperatureNormal}"

  ## persist the normal state
  printf '%s\n' "${NM}" > "${TEMP_STATE}"

  ## tell the system about the thermal state change
  incUpdateCount.pm ${THERMAL_STATE_NFY_ID}

  logger -p "${FAC}.notice" "$0: Temperature of all drives(${curr_temp}) is now normal, restarting all services"

  ## bring services back by entering the application run level
  changeRunLevel.pl --level=app
  return ${WDST_OK}
}
## Still too hot after a shutdown: only log that the shutdown is maintained.
## Returns: ${WDST_OK}
act_cooldown() {
  if [ "${DEBUG}" -ne 0 ]; then
    logger -p "${FAC}.debug" "$0: Action Cooldown, Temperature - ${curr_temp}"
  fi
  logger -p "${FAC}.notice" "$0: Current temperature(${curr_temp}) is still hot, maintaining shutdown"
  return ${WDST_OK}
}
## Nothing to do for this state change (apart from an optional debug log).
## Returns: ${WDST_OK}
act_noop() {
  [ "${DEBUG}" -ne 0 ] \
    && logger -p "${FAC}.debug" "$0: Action noop, Temperature - ${curr_temp}"
  return ${WDST_OK}
}
## Enter shutdown-warning: record the SW state, set the LED warning state,
## send the pending-shutdown alert, store the current UTC epoch time as the
## timer start, notify the system and log the condition.
## Returns: ${WDST_OK}
act_start_timer() {
  ## NB: curr_state is always ${SW} in this action
  [ "${DEBUG}" -ne 0 ] \
    && logger -p "${FAC}.debug" "$0: Action Shutdown-Warning, Starting Timer, Temperature - ${curr_temp}"

  ## persist the shutdown-warning state
  printf '%s\n' "${SW}" > "${TEMP_STATE}"

  ## LED to warning (blinking yellow), then the pending-shutdown alert
  ledCtrl.sh LED_EV_THERMO LED_STAT_WARN
  sendAlert.sh "${thermalShutdownPending}"

  ## the timer file holds the start time in seconds since the epoch
  date -u +%s > "${TEMP_SHUTDOWN_TIMER}"

  ## tell the system about the thermal state change
  incUpdateCount.pm ${THERMAL_STATE_NFY_ID}

  logger -p "${FAC}.crit" "$0: Over-Temperature condition(${curr_temp}), Shutdown-Warning, Timer started"
  return ${WDST_OK}
}
## Check the shutdown-warning timer while the temperature stays in SW.
##
## While the timer (start time stored in ${TEMP_SHUTDOWN_TIMER}) has run for
## at most ${MAX_SW_TIME} seconds nothing happens. Once it has expired the
## timer is stopped, the thermal LED event is reset, curr_state is forced to
## ${SI} and the action for last_state -> SI is executed.
##
## Globals: last_state (read), curr_state, action_hdlr (written)
## Returns:
##   ${WDST_FAILED} when no timer is running (file missing, empty, 0 or not
##   a number); ${WDST_OK} while the timer has not expired; otherwise the
##   status of the dispatched action
act_check_timer()
{
  local start_time
  local curr_time
  ## NB: curr_state is always ${SW} in this action
  ## debug
  if [ "${DEBUG}" -ne 0 ]; then
    logger -p "${FAC}.debug" "$0: Action Shutdown-Warning, Checking Timer, Temperature - ${curr_temp}"
  fi
  ## get start time (first word of the timer file), should never be 0
  start_time=""
  if [ -r "${TEMP_SHUTDOWN_TIMER}" ]; then
    read -r start_time _ < "${TEMP_SHUTDOWN_TIMER}"
  fi
  ## a missing, empty or non-numeric value would break the arithmetic below,
  ## so it is reported the same way as a stopped timer
  if ! [[ "${start_time}" =~ ^[0-9]+$ ]] || [ "${start_time}" -eq 0 ]; then
    return ${WDST_FAILED}
  fi
  ## get current time
  curr_time=$(date -u +%s)
  ## no action if timer has not expired
  ## NB: Temperature state shall remain SW
  if [ $(( curr_time - start_time )) -le "${MAX_SW_TIME}" ]; then
    ## debug
    if [ "${DEBUG}" -ne 0 ]; then
      logger -p "${FAC}.debug" "$0: Timer has not expired"
    fi
    return ${WDST_OK}
  fi
  ## --- Timer has expired
  ## log notice
  logger -p "${FAC}.notice" "$0: Over-Temperature condition(${curr_temp}), Timer expired"
  ## stop the timer
  echo 0 > "${TEMP_SHUTDOWN_TIMER}"
  ## reset the thermal led event
  ledCtrl.sh LED_EV_THERMO LED_STAT_OK
  ## modify state to SI
  curr_state=${SI}
  ## initiate last_state->SI action
  ## execute the action based on last & current state
  action_hdlr=ACTION${last_state}[${curr_state}]
  ${!action_hdlr}
  return $?
}
## Enter the warning state: record WR, send the high-temperature alert,
## notify the system and log the condition.
## Returns: ${WDST_OK}
act_warning() {
  [ "${DEBUG}" -ne 0 ] \
    && logger -p "${FAC}.debug" "$0: Action Normal -> Warning, Temperature - ${curr_temp}"

  ## persist the warning state
  printf '%s\n' "${WR}" > "${TEMP_STATE}"

  ## high-temperature alert
  sendAlert.sh "${systemTemperatureHigh}"

  ## tell the system about the thermal state change
  incUpdateCount.pm ${THERMAL_STATE_NFY_ID}

  logger -p "${FAC}.warning" "$0: High-Temperature(${curr_temp}) condition detected"
  return ${WDST_OK}
}
## Return to normal without a run-level change: stop the shutdown timer,
## reset the thermal LED event, send the "temperature normal" alert, record
## the NM state, notify the system and log the change.
## Returns: ${WDST_OK}
act_normal() {
  [ "${DEBUG}" -ne 0 ] \
    && logger -p "${FAC}.debug" "$0: Action Shutdown-Warning -> Normal, Temperature - ${curr_temp}"

  ## a value of 0 means the shutdown timer is stopped
  printf '%s\n' 0 > "${TEMP_SHUTDOWN_TIMER}"

  ## clear the thermal LED event, then the normal-temperature alert
  ledCtrl.sh LED_EV_THERMO LED_STAT_OK
  sendAlert.sh "${temperatureNormal}"

  ## NB: the state file is updated only "after" the actions above completed
  printf '%s\n' "${NM}" > "${TEMP_STATE}"

  ## tell the system about the thermal state change
  incUpdateCount.pm ${THERMAL_STATE_NFY_ID}

  logger -p "${FAC}.notice" "$0: Temperature of all drives(${curr_temp}) is now normal"
  return ${WDST_OK}
}
## Apply hysteresis when cooling down from SW or SI into the warning band.
##
## If curr_temp is still above TEMP_T2 - HYSTERISIS the previous state is
## kept; otherwise the state becomes NM. The action for
## last_state -> (adjusted) curr_state is then executed.
##
## Globals: curr_temp, last_state (read), curr_state, action_hdlr (written)
## Returns: the status of the dispatched action
act_hysterisis() {
  [ "${DEBUG}" -ne 0 ] \
    && logger -p "${FAC}.debug" "$0: Action Hysterisis, Temperature - ${curr_temp}"

  ## default to normal, stay in the last state while inside the hysteresis band
  curr_state=${NM}
  if (( curr_temp > TEMP_T2 - HYSTERISIS )); then
    curr_state=${last_state}
  fi

  ## look up and run the handler for last_state -> curr_state; its status is
  ## the status of this function
  action_hdlr=ACTION${last_state}[${curr_state}]
  ${!action_hdlr}
}
## Unknown -> Normal: just record the NM state.
## NB: No need to notify system state-change as last state was unknown
## Returns: ${WDST_OK}
act_UK_2_NM() {
  [ "${DEBUG}" -ne 0 ] \
    && logger -p "${FAC}.debug" "$0: Action Unknown -> Normal, Temperature - ${curr_temp}"

  ## persist the normal state
  printf '%s\n' "${NM}" > "${TEMP_STATE}"
  return ${WDST_OK}
}
## --- Main script
## Initializes the saved temperature state and the shutdown timer, then loops
## forever: read the last saved state, determine the current state of all
## drives, and run the handler stored in ACTION<last_state>[curr_state].
{
## exit if system has no internal drives
if [ "${DVC_DRIVE_COUNT}" == "0" ]; then
exit 0
fi
## get list of drives
drive_list=( `internalDrives` )
## exit if no drives are found
## (${drive_list} without an index is the first array element)
if [ -z "${drive_list}" ]; then
exit 0
fi
logger -p ${FAC}.info "$0: Starting Temperature Monitor"
## init temp state to normal if not over temperature
## (an existing over-temperature flag makes the initial state SI instead)
if [ ! -f "${TEMP_STATE}" ]; then
echo "${NM}" > "${TEMP_STATE}"
if [ -f "${OVER_TEMP_FLAG}" ]; then
echo "${SI}" > "${TEMP_STATE}"
fi
fi
## init shutdown timer if not in shutdown-warning state
## NOTE(review): the value read here is not validated before the numeric
## test (validation only happens inside the loop below) -- confirm an empty
## or corrupt state file cannot occur at this point.
last_state=( `cat ${TEMP_STATE}` )
if [ ${last_state} -ne ${SW} ]; then
echo 0 > "${TEMP_SHUTDOWN_TIMER}"
fi
## loop every MONITOR_TIMER seconds
## NB: disable loop for if TEST=1
## The sleep sits in the loop's step expression, so it also runs after an
## iteration that ends with `continue`; the command substitution expands to
## nothing, leaving the step expression empty.
for (( ; ; `sleep ${MONITOR_TIMER}`)); do
## debug
if [ ${DEBUG} -ne 0 ]; then
logger -p ${FAC}.debug "$0: Starting Temperature Monitor Run"
fi
## get the last saved temperature state
## NB: This state was saved in an earlier run of this script
last_state=( `cat ${TEMP_STATE}` )
## fall back to NM when the saved state is missing, not a number, or not
## below N_STATES
if [ $? -ne ${WDST_OK} ] || [ -z "${last_state}" ] || ! [[ "${last_state}" =~ ^[0-9]+$ ]] || [ ${last_state} -ge ${N_STATES} ]; then
last_state=${NM}
## initialize the last state file
echo "${NM}" > "${TEMP_STATE}"
fi
## skip run if in standby & last state is normal
if [ -f "${STANDBY_STATE}" ] && [ ${last_state} -eq ${NM} ]; then
## debug
if [ ${DEBUG} -ne 0 ]; then
logger -p ${FAC}.debug "$0: Skipping run as system is in standby"
fi
continue
fi
## determine the current temperature state
## NB: This function shall set $curr_state & $curr_temp global vars
determineCurrentState
if [ $? -ne ${WDST_OK} ] || [ -z "${curr_state}" ]; then
## debug
if [ ${DEBUG} -ne 0 ]; then
logger -p ${FAC}.debug "$0: Failed to determine current state; continuing"
fi
continue
fi
## execute the action based on last & current state
## NB: Cannot execute ${array${last_state}[${curr_state}]} directly
## It must be saved in to a variable x & executed using ${!x}
action_hdlr=ACTION${last_state}[${curr_state}]
## debug
if [ ${DEBUG} -ne 0 ]; then
logger -p ${FAC}.debug "$0: Last state=${last_state}; action=`echo ${!action_hdlr}`"
fi
## run the handler named by the table entry
${!action_hdlr}
if [ $? -ne ${WDST_OK} ]; then
logger -p ${FAC}.err "$0: Failed to execute the action `echo ${!action_hdlr}`; last state=${last_state}; curr temp=${curr_temp}"
continue
fi
## debug
if [ ${DEBUG} -ne 0 ]; then
logger -p ${FAC}.debug "$0: Finished Temperature Monitor Run"
fi
## break if TEST
if [ ${TEST} -ne 0 ]; then
logger -p ${FAC}.debug "$0: Finished Test Run"
break
fi
done ## for MONITOR_TIMER infinite loop
## only reached when the loop was left through the TEST break above
logger -p ${FAC}.info "$0: Finished Temperature Monitor"
exit 0
}
## --- End of Main script
#!/bin/sh
#
# (c) 2010 Western Digital Technologies, Inc. All rights reserved.
#
# monitorVolume.sh
# Note: this is called by cron
#
# Checks that the data volume device shows up in the `df` output. On the
# first detected failure (or when /tmp/tst_volume exists) it sends the volume
# failure alert and bumps the system_state update counter; /tmp/volume_failed
# remembers that the alert was already sent. A lock file keeps two instances
# from running at the same time.
#
PATH=/sbin:/bin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin
. /usr/local/sbin/share-param.sh
. /etc/nas/alert-param.sh
. /etc/system.conf
. /etc/wdcomp.d/wd-nas/wd-nas.conf 2> /dev/null
[ -f /usr/local/sbin/ledConfig.sh ] && . /usr/local/sbin/ledConfig.sh

lockFile="/tmp/monitorVolume"

# exit if in standby, or factory restore in progress
# (quoted: with an unset reformatDataVolume an unquoted `[ -f ]` is true and
# the script would always stop here)
if [ -f /tmp/standby ] || [ -f "${reformatDataVolume}" ]; then
  exit 0
fi

# exit if system with no internal drives
# ("=" rather than "==": the latter is not understood by every /bin/sh)
if [ "${DVC_DRIVE_COUNT}" = "0" ]; then
  exit 0
fi

# exit if already another instance of script is in progress
if ! lockfile-create --retry 0 "${lockFile}" > /dev/null 2>&1; then
  exit 0
fi

# If script were to take longer than 5 minutes: keep touching the lock in the
# background for as long as the script runs
lockfile-touch "${lockFile}" &
pid="$!"

# clean up mutual exclusion on every exit path: stop the background toucher
# and remove the lock
releaseLock() {
  kill "${pid}" > /dev/null 2>&1
  lockfile-remove "${lockFile}" > /dev/null 2>&1
}
trap releaseLock EXIT

# An unset device name counts as "not mounted", which matches what the
# unquoted grep did before (it failed with a usage error).
volumeMounted=0
if [ -n "${dataVolumeDevice}" ] && df | grep -q -- "${dataVolumeDevice}"; then
  volumeMounted=1
fi

# /tmp/tst_volume forces the failure path.
if [ "${volumeMounted}" -eq 0 ] || [ -f /tmp/tst_volume ]; then
  # alert only once per failure episode
  if [ ! -f /tmp/volume_failed ]; then
    sendAlert.sh "${volumeFailure}"
    incUpdateCount.pm system_state
  fi
  ledCtrl.sh LED_EV_VOLUME LED_STAT_ERR
  touch /tmp/volume_failed
else
  rm -f /tmp/volume_failed
fi
exit 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment