Skip to content

Instantly share code, notes, and snippets.

@aivanise
Created December 7, 2021 07:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aivanise/5414924f1bbcf84cbdff0a26916b75ee to your computer and use it in GitHub Desktop.
Save aivanise/5414924f1bbcf84cbdff0a26916b75ee to your computer and use it in GitHub Desktop.
#!/bin/bash
#
# automatic (linux) router failover script
# pings around and changes the default route to backup
# if all the sites are not available for TIMEOUT seconds
# source the script to get useful control functions (type help for help)
# run it every minute or so to fail the default route over automatically
# depending on the availability of the IPs
# IPS: put the list of IP addresses to be pinged prefixed by #IP, only the second field is looked at, you can put the comment in the rest
#IP 194.25.0.60 resolv-h.dtag.de
#IP 194.25.0.68 resolv-f.dtag.de
#IP 194.25.0.52 resolv-l.dtag.de
#IP 109.234.111.81 domaindiscount24.com
#IP 185.194.238.252 ip-projects.de
#IP 104.244.42.65 twitter.com
#IP 8.8.8.8 google.dns1
#IP 8.8.4.4 google.dns2
#IP 62.138.238.100 t-online.de.1
#IP 62.138.239.100 t-online.de.2
#IP 1.1.1.1 cloudflare dns
# Seconds to wait before re-checking a route that looks down.
# A non-empty TIMEOUT from the environment takes precedence.
TIMEOUT=${TIMEOUT:-30} # wait so many seconds to make sure the line is down
# pipe to this to log (default syslog)
LOGGER="logger -t failover -p local3.info"
# routes definition:
# first one is main, second one is backup
ROUTES="87.130.124.57 192.168.2.253"
# interfaces routes live on (i.e. eth0 eth1, can be the same),
# listed in the same order as ROUTES
INTERFACES="ftto vdsl"
# router internal ip, if you have a router cluster, whoever owns this IP is the active member
ROUTERIP=192.168.220.254

# Split the space-separated lists into arrays (read -a avoids pathname
# expansion that an unquoted array assignment would perform).
read -r -a routes <<< "$ROUTES"
MAINROUTE=${routes[0]}
BACKUPROUTE=${routes[1]}
read -r -a ifaces <<< "$INTERFACES"
# Interface names must come from INTERFACES, not from ROUTES; otherwise
# getdev would hand an IP address to "ip ... dev".
MAINIFACE=${ifaces[0]}
BACKUPIFACE=${ifaces[1]}
# do not fail back earlier than this
FAILOVER_THRESHOLD_SEC=600
#### no user serviceable parts below this line (ahahah)
# are we root or not? Privileged commands are prefixed with $SUDO.
if [[ $(id -u) != 0 ]]; then
  SUDO=sudo
fi
# Static lookup from a gateway address to its interface name.
# Globals:   MAINROUTE, BACKUPROUTE, MAINIFACE, BACKUPIFACE (read)
# Arguments: $1 - gateway address to look up
# Outputs:   the matching interface name, or "unkn" when the address
#            is neither the main nor the backup route
function getdev {
  local gateway=$1
  if [[ $gateway == $MAINROUTE ]]; then
    echo $MAINIFACE
  elif [[ $gateway == $BACKUPROUTE ]]; then
    echo $BACKUPIFACE
  else
    echo unkn
  fi
}
# Check a route by pinging the list of IPs given in the "#IP" lines of this script.
# Globals:   SUDO, LOGGER (read); dev, gw, script, IPS, IPS_CNT, UNR, UNR_CNT,
#            LEFT (written - none of them is declared local)
# Arguments: $1 - one of the IPs from the $ROUTES variable above
# Outputs:   the number of unreachable test IPs on stdout; diagnostics go to $LOGGER
function check_route {
# check a few ips over route r
# prints: count of unreachable hosts; when the interface is not UP or a test
# route cannot be installed, the full count of test IPs is printed.
# NOTE(review): an earlier comment promised -1 for linkdown, but nothing in
# this function ever prints -1.
local route=$1
dev=$(getdev $route)
# gw is the "via ... dev ... onlink" suffix shared by the route replace/delete calls below
gw="via $route dev $dev onlink"
# locate the file holding the #IP lines: $0 when executed, BASH_SOURCE[0] when sourced
script=$0
[[ "${BASH_SOURCE[0]}" != "${0}" ]] && script=${BASH_SOURCE[0]};
# second space-separated field of every line starting with #IP
IPS=$(grep '^#IP' $script | cut -d' ' -f 2 ) #TEST | awk '{print "1.2.3.4"}');
IPS_CNT=$(echo $IPS | wc -w)
# now test
# gw is always a non-empty string here, so the effective condition is whether
# the "ip link show" output for the device contains "UP"
if [[ "$gw" ]] && /sbin/ip link show dev $dev 2> /dev/null | grep -q UP; then
# make sure that all test IPs are rerouted through the device
for cip in $IPS; do
# if a host route cannot be installed, report every IP as unreachable and
# return immediately (routes installed so far are not removed on this path)
$SUDO /sbin/ip route replace $cip $gw 2>/dev/null || { echo $IPS_CNT; return; };
done
UNR=$(/usr/bin/fping -u $IPS 2>/dev/null | xargs echo) # check
UNR_CNT=$(echo $UNR | wc -w) # how many are down?
LEFT=$(($IPS_CNT - $UNR_CNT)) # how many are up?
else
# main device is down, so fake all unreachable
UNR=$IPS
UNR_CNT=$IPS_CNT
LEFT=0
fi
# clean up the routes
for cip in $IPS; do
[[ "$gw" ]] && $SUDO /sbin/ip route delete $cip $gw 2>/dev/null
done
# debug
# fewer than 3 reachable test IPs is logged as "route down"
if [[ "$LEFT" -lt 3 ]]; then
echo "$route down, unreachable $UNR" | $LOGGER
else
if [[ $UNR_CNT -gt 0 ]]; then
# log the script lines matching each unreachable host so the list can be tuned
( echo -n the following is still unreachable, tweak the list of hosts :
for host in $UNR; do echo -n $(grep $host $script); done; echo ) | $LOGGER
fi
fi
echo "$UNR_CNT"
}
# Tell whether a gateway is currently used by a default route.
# Arguments: $1 - gateway IP address
# Returns:   0 if the output of "ip r s default" contains the address as a
#            whole word, non-zero otherwise (including an empty argument)
function route_active {
  # is route active anywhere?
  local route="$1"
  # an empty pattern would be a grep usage error or match everything
  [[ -n "$route" ]] || return 1
  # -F: literal string, so dots are not regex wildcards
  # -w: whole-word match, so 192.168.2.25 does not match 192.168.2.253
  /sbin/ip r s default | grep -qFw -- "$route"
}
# Decide whether a route is usable, re-checking once before giving up.
# Globals:   TIMEOUT (read), unr (written)
# Arguments: $1 - gateway address, passed on to check_route
# Returns:   0 when check_route reports at most 3 unreachable hosts (and
#            not -1) on the first or second attempt, 1 otherwise
function route_reachable {
  local route="$1"
  local attempts_left=2
  while :; do
    unr=$(check_route $route)
    attempts_left=$((attempts_left - 1))
    if [[ "$unr" -gt 3 ]] || [[ "$unr" -eq -1 ]]; then
      # looks down: give up after the second attempt, otherwise
      # wait TIMEOUT seconds and check again to make sure
      [[ $attempts_left -gt 0 ]] || return 1
      sleep $TIMEOUT
    else
      return 0
    fi
  done
}
# Point the default route back at the main gateway.
# Globals: MAINROUTE, SUDO, LOGGER, force (read)
# The route is replaced when the main gateway is not already the default,
# or unconditionally when $force is non-empty. "failback done" is logged
# in every case.
function failback {
  local target=$MAINROUTE
  local switch_needed=0
  if route_active $target; then
    [[ -n "$force" ]] && switch_needed=1
  else
    switch_needed=1
  fi
  if [[ $switch_needed -eq 1 ]]; then
    $SUDO /sbin/ip r r default via $target dev $(getdev $target) onlink
  fi
  echo failback done | $LOGGER
}
# Point the default route at the backup gateway.
# Globals: BACKUPROUTE, SUDO, LOGGER, force (read)
# The route is replaced when the backup gateway is not already the default,
# or unconditionally when $force is non-empty. "failover done" is logged
# in every case.
function failover {
  local target=$BACKUPROUTE
  local switch_needed=0
  if route_active $target; then
    [[ -n "$force" ]] && switch_needed=1
  else
    switch_needed=1
  fi
  if [[ $switch_needed -eq 1 ]]; then
    $SUDO /sbin/ip r r default via $target dev $(getdev $target) onlink
  fi
  echo failover done | $LOGGER
}
# Housekeeping after a route switch: refresh the /tmp/last_failover
# timestamp file, make it world-accessible, then print the status summary.
# Globals: SUDO (read)
function post_failover {
  local stamp=/tmp/last_failover
  $SUDO touch $stamp
  $SUDO chmod 777 $stamp
  lines
}
# Print a human readable summary: current default route, whether the
# /tmp/no_failback marker exists, the change time of /tmp/last_failover,
# and the last 6 syslog lines containing "failover".
# Globals: SUDO (read)
function lines {
  local current
  current=$(/sbin/ip r s | fgrep default)
  echo default route for internal network: $current
  if [[ -f /tmp/no_failback ]]; then
    echo "/tmp/no_failback is set"
  fi
  if [[ -f /tmp/last_failover ]]; then
    # keep only the part of the timestamp before the first dot
    echo "last failover (/tmp/last_failover) was at " $(stat --format +%z /tmp/last_failover | cut -d. -f 1)
  fi
  echo last 6 events:
  $SUDO fgrep failover /var/log/syslog | tail -6
}
# Report which gateway the default route currently uses.
# Globals: MAINROUTE, BACKUPROUTE (read), route (written, not local)
# Outputs: MAIN, BACKUP or UNKNOWN on stdout
# Returns: 0 for MAIN, 1 for BACKUP; for UNKNOWN the status is that of
#          the final echo
function linkstate {
  # third field of "ip r s default" is the gateway address
  route=$(ip r s default | awk '{print $3}')
  if [[ $route == $MAINROUTE ]]; then
    echo "MAIN"
    return 0
  elif [[ $route == $BACKUPROUTE ]]; then
    echo "BACKUP"
    return 1
  fi
  echo UNKNOWN
}
# Print the network state as five 0/1 characters, consumed by the
# decision table in the main program. Bit order, left to right:
#   1. main route reachable     2. backup route reachable
#   3. main route active        4. backup route active
#   5. /tmp/no_failback exists
# Globals: MAINROUTE, BACKUPROUTE (read)
function getstate {
  if route_reachable $MAINROUTE; then printf 1; else printf 0; fi
  if route_reachable $BACKUPROUTE; then printf 1; else printf 0; fi
  if route_active $MAINROUTE; then printf 1; else printf 0; fi
  if route_active $BACKUPROUTE; then printf 1; else printf 0; fi
  if [[ -f /tmp/no_failback ]]; then printf 1; else printf 0; fi
}
# Print the list of control commands available after sourcing this script.
# Note: once sourced, this function takes the place of the shell's
# builtin "help" in that session.
# Globals: MAINROUTE, BACKUPROUTE, FAILOVER_THRESHOLD_SEC (read, expanded
#          inside the here-document)
function help {
  cat <<EOF
internet lines commands:
lines - human readable link state
linkstate - simple link state MAIN/BACKUP/UNKNOWN
getstate - binary link state
failover - fail over to backup line
failback - fail back to main line
check_route
$MAINROUTE|
$BACKUPROUTE - 0 is OK, more than 3 is bad
touch /tmp/last_failover - prevent further failovers for $FAILOVER_THRESHOLD_SEC seconds
touch /tmp/no_failback - prevent failbacks forever
EOF
}
######## main program ###########################
# When the file is sourced rather than executed, only the functions above
# are wanted: show a short banner on an interactive stdin and stop here,
# before the main program runs.
if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then
  if [[ -t 0 ]]; then
    # command output is passed as %s arguments so that any % or backslash
    # in it is printed literally instead of being parsed as format text
    printf '\nsystem state: %s\nlink state: %s\n *** type help for more commands ***\n\n' \
      "$(systemctl is-system-running)" "$(linkstate)"
  fi
  return
fi
# Command dispatch: the first argument selects a sub-command; with any other
# (or no) argument the automatic failover check below is run.
if [[ "$1" == "linkstate" ]]; then
linkstate
elif [[ "$1" == "rebalancevpn" ]]; then
# NOTE(review): rebalancevpn is not defined in the visible part of this file
rebalancevpn $2 $3
elif [[ "$1" == "vpnips" ]]; then
# NOTE(review): vpnips is not defined in the visible part of this file
vpnips $2 $3
else
# cleanup old failovers that might be stuck due to network outages
PIDS=$(pgrep -f failover.sh)
for pid in $PIDS; do
# kill only instances without parameters (i.e. no monitoring):
# skip ourselves, and require that /proc/<pid>/cmdline holds exactly one
# NUL-separated entry
if [[ $pid != $$ ]] && [[ -f /proc/$pid/cmdline ]] && [[ $(xargs -n 1 -0 echo 2> /dev/null < /proc/$pid/cmdline | wc -l) -eq 1 ]]; then
$SUDO kill -9 $pid 2>/dev/null
fi
done
#### main loop #####
# if we are not the master router, just set the default route to the master and do nothing
# (master = the host whose "ip addr show" output contains "$ROUTERIP/")
if ! ip addr show | fgrep -q $ROUTERIP/; then
if ! ip route show default | fgrep -q $ROUTERIP; then
# NOTE(review): unlike the other route changes in this script, this one is
# not prefixed with $SUDO
ip route replace default via $ROUTERIP
echo "We are not the master, setting route to master and bailing out" | $LOGGER
fi
exit
fi
# don't flap too often: skip all checks while /tmp/last_failover was changed
# less than FAILOVER_THRESHOLD_SEC seconds ago
if [[ -f /tmp/last_failover ]] && [[ $(( $(date +%s) - $(stat /tmp/last_failover -c %Z) )) -lt $FAILOVER_THRESHOLD_SEC ]]; then
echo last failover was less than $FAILOVER_THRESHOLD_SEC s ago, skip checks | $LOGGER
else
# where are we? also record for automatic failback
# (a non-empty $state already present in the environment is used as is)
[[ -z "$state" ]] && state=$(getstate)
echo "state: $state" | $LOGGER
# NOTE(review): chown/chmod run before the append below, so they act on a
# file that may not exist yet on the first run
$SUDO chown root:root /tmp/linkstates
$SUDO chmod 777 /tmp/linkstates
# append "<epoch seconds> <state>" to the history file
# NOTE(review): $state is interpolated into the bash -c command string
$SUDO bash -c "echo $(date +%s) $state >> /tmp/linkstates"
# decision table
# see function getstate() above for the meaning of the bits
# 10110 01110 01111 10111 11110 11111 - both active, impossible - do nothing
# 11100 11101 10100 10101 - main route reachable and active - do nothing
# 01011 - backup active and reachable , no failback - do nothing
# 11011 - backup active, main reachable, consider auto failback
# 01010 - same but failback not possible - do nothing
# 00000 00001 00010 00011 00100 00101 00110 00111 - nothing reachable, bummer - do nothing
# 11010 - main reachable, backup active - failback
# 11000 11001 10000 10001 - nothing active, main reachable - failback
# 10010 10011 - backup active but main reachable - failback
# 01000 01001 - nothing active, backup reachable - failover
# 01100 01101 - main down, backup up - failover
# one-liner that lists all 32 five-bit states:
# for i in $(seq 0 31); do a="0000"$(bc <<< "obase=2; $i"); echo ${a:${#a}-5:5}; done
# states not listed in the case below match no arm and cause no action
case "$state" in
# failover (also creates /tmp/no_failback)
01000|01001|01100|01101) failover; $SUDO touch /tmp/no_failback; $SUDO chmod 777 /tmp/no_failback; post_failover ;;
# failback (also removes /tmp/no_failback)
11010|11000|11001|10000|10001|10010|10011) $SUDO rm -f /tmp/no_failback; failback; post_failover ;;
# recent failover, consider auto failback?
11011)
# were we in the same state for at least an hour?
# awk prints "ok" only when exactly 60 history entries are newer than
# 60 minutes ago and every one of them recorded state 11011
fbok=$(awk -vts=$(date -d "60 min ago" +%s) 'BEGIN { cnt=0; bad=0 } $1>ts { cnt++; if ($2 != "11011") bad++ } END { if (cnt==60 && bad==0 ) print "ok" }' /tmp/linkstates)
if [[ "$fbok" ]]; then
# rotate the history files (.2 -> .3, .1 -> .2, current -> .1) before failing back
$SUDO mv -f /tmp/linkstates.2 /tmp/linkstates.3
$SUDO mv -f /tmp/linkstates.1 /tmp/linkstates.2
$SUDO mv -f /tmp/linkstates /tmp/linkstates.1
$SUDO rm -f /tmp/no_failback
failback;
post_failover
fi ;;
# if all is normal, do nothing
11100|11101|10100|10101) ;;
esac
fi
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment