Skip to content

Instantly share code, notes, and snippets.

@earthgecko
Created February 24, 2015 16:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save earthgecko/5588dc17c8ebe2a7c082 to your computer and use it in GitHub Desktop.
Save earthgecko/5588dc17c8ebe2a7c082 to your computer and use it in GitHub Desktop.
skyline horizon watcher
#!/bin/bash
#### snype_skyline_horizon.sh ####
#
####
# DESCRIPTION
# This mitigates against two failure modes: horizon running but not populating
# redis, and horizon not running properly in general.
#
# THIS IS JUST A GIST
# THIS IS JUST A GIST
# Deal with logging and with variables such as SERVER and statsd_node yourself,
# along with monit (if you have it in the mix) and your own skyline log paths.
# These are just bits for reference: a pattern to run via cron every minute.
# The example script log is simply flushed here on every run.
# Path of this watcher's own log. The file is emptied at the start of each run,
# so it only ever holds the output of the most recent invocation.
LOGFILE="/var/log/skyline/snype_skyline_horizon.log"
true > "$LOGFILE"
#######################################
# Append one timestamped line to the watcher log.
# Globals:   LOGFILE (read) - path of the file that is appended to
# Arguments: $1 - message text
#            $2 - severity label written in front of the message
#                 (callers in this script pass error, warn or notice)
# Outputs:   appends "<YYYYmmddHHMMSS> - <label>: <message>" to LOGFILE
#######################################
log() {
  local log_string="$1"
  local output_type="$2"
  # LOGFILE is quoted so that a path containing whitespace or glob characters is
  # treated as a single file name instead of failing with "ambiguous redirect".
  # printf '%s' writes the message verbatim regardless of its content.
  printf '%s - %s: %s\n' \
    "$(date +%Y%m%d%H%M%S)" "$output_type" "$log_string" >> "$LOGFILE"
}
# Wait 30 seconds before running any check.
# NOTE(review): the reason for the delay is not stated here; presumably it
# offsets the checks from the start of the cron minute - confirm.
sleep 30

# HORIZON_RESTART is the flag every later check reads and sets:
# 0 = leave horizon alone, 1 = restart it at the end of the script.
HORIZON_RESTART=0

# A missing horizon.log always leads to a restart; the process count only
# decides which message is logged.
if ! [ -f /var/log/skyline/horizon.log ]; then
  CHECK_PROCS=$(ps aux | grep -v grep | grep -c "horizon-agent.py start")
  case "$CHECK_PROCS" in
    0)
      log "No horizon processes running" error
      ;;
    *)
      log "horizon processes running, but no logfile, restarting horizon" error
      ;;
  esac
  HORIZON_RESTART=1
fi
# Staleness check: when horizon.log has not been modified for more than 180
# seconds, flag horizon for a restart. Skipped if a restart is already pending.
#
# TIMESTAMP is "now" in epoch seconds. It was previously never assigned, so the
# subtraction below produced a negative number and the check could never fire.
# A value already provided by the environment or earlier in the script is kept.
TIMESTAMP=${TIMESTAMP:-$(date +%s)}
if [ "$HORIZON_RESTART" -eq 0 ]; then
  # Only do the arithmetic when stat actually returned a modification time; an
  # empty value would otherwise break the subtraction.
  if LOGFILE_EPOCH=$(stat --format=%Y /var/log/skyline/horizon.log); then
    SECONDS_DIFFERENCE=$(( TIMESTAMP - LOGFILE_EPOCH ))
    if [ "$SECONDS_DIFFERENCE" -gt 180 ]; then
      log "The horizon log was last modified $SECONDS_DIFFERENCE seconds ago" warn
      log "horizon set to restart" notice
      HORIZON_RESTART=1
    fi
  else
    log "Could not read the modification time of the horizon log" warn
  fi
fi
# Empty-queue check: mitigates against horizon running, but not populating
# redis. If more than 3 of the last 30 lines of horizon.log report an empty,
# timed-out worker queue, the matching lines are copied into this watcher's log
# and horizon is flagged for a restart. Skipped if a restart is already pending.
if [ "$HORIZON_RESTART" -eq 0 ]; then
  horizon_log=/var/log/skyline/horizon.log
  empty_queue_msg="worker queue is empty and timed out"

  EMPTY_QUEUE=$(tail -n 30 "$horizon_log" | grep -c "$empty_queue_msg")
  if (( EMPTY_QUEUE > 3 )); then
    log "The horizon log reports worker queue is empty and timed out" warn
    # Keep the evidence: append the offending log lines to the watcher log.
    tail -n 30 "$horizon_log" | grep "$empty_queue_msg" >> "$LOGFILE"
    log "horizon set to restart" notice
    HORIZON_RESTART=1
  fi
fi
# Restart horizon when the HORIZON_RESTART flag is set, then report the restart
# to statsd as a gauge whose value is the epoch timestamp.
if [ "$HORIZON_RESTART" -eq 1 ]; then
  # Mitigate monit race condition: monit is stopped before horizon is touched
  # and started again afterwards.
  # NOTE(review): presumably this keeps monit from restarting horizon itself
  # while the stop/start below is in progress - confirm.
  log "Stopping monit" notice
  /sbin/service monit stop >> "$LOGFILE"
  log "Restarting horizon" notice
  /etc/init.d/horizon stop >> "$LOGFILE"
  sleep 10
  /etc/init.d/horizon start >> "$LOGFILE"
  log "Starting monit" notice
  /sbin/service monit start >> "$LOGFILE"

  # TIMESTAMP was previously never assigned, which produced a gauge line with
  # an empty value. Keep a value that is already set, otherwise use "now".
  TIMESTAMP=${TIMESTAMP:-$(date +%s)}
  restart_metric="$HOSTNAME.skyline.horizon.restarted:$TIMESTAMP|g"

  # statsd_node has to be supplied by whoever deploys this script. Without it
  # nc would be run with no host, yet the log would still claim success.
  if [ -n "${statsd_node:-}" ]; then
    printf '%s\n' "$restart_metric" | nc -w 5 -u "$statsd_node" 8125
    log "Submitted $restart_metric to $statsd_node" notice
  else
    log "statsd_node is not set, $restart_metric was not submitted" warn
  fi
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment