Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save rogerbush8/3ebcedfdc295c1a248cc to your computer and use it in GitHub Desktop.
Save rogerbush8/3ebcedfdc295c1a248cc to your computer and use it in GitHub Desktop.
nat-heartbeat-failover-monitor-script-for-aws
#!/usr/bin/python
# aws-instance-monitor
#
# This is a command-line tool, as well as a monitor/failover script that is designed
# for HA NAT, but should be usable when the following conditions are met:
#
# 1. Used in AWS VPC Route Table - VPC routing describes methods of egress
# for outbound traffic. Typically, a NAT will have a 0.0.0.0/0 rule
# and the route will be in a table associated with a Private Subnet.
# Thus, all Internet traffic (0.0.0.0/0) leaving from the Private Subnet
# is routed over the user instance (NAT). There are other user instances
# that go into the routing table (e.g. IPsec tunnel). These should work
# also, as this mechanism just switches the routing by substituting a
# different (healthy) user instance into each of the routes which reference
# the (unhealthy) user instance.
# 2. Pair of instances - they monitor each other with ping check. Split
# network is possible, and not handled by this simple script (nor can
# it be handled, completely, by two scripts on two boxes). A split
# network might see the NATs competing for the route (flipping back and
# forth at the failover rate).
# 3. The monitor script is designed to be run on both of the instances in
# the pair.
#
# Ideas for this python script came from Jinesh Varia's BASH script and
# article on HA NAT instances in VPC: http://aws.amazon.com/articles/2781451301784570
#
# Commands:
#
# There are 3 commands: show-affected-routes, swap-instance, run-failover-monitor
#
# show-affected-routes essentially takes 3 search criteria: --env, --region, and --instance.
# The --env is given by an optional AWS Tag on the RouteTable. It may be ignored by
# passing in --env "" on the command-line. This is used, for example, to tag "prod" and
# "dev" environments, as an additional check to make sure we are not affecting production.
# show-affected-routes will select routes that match the --env, --region (AWS regions,
# e.g. 'us-east-1') and --instance (AWS instanceId of the NAT which will be swapped out
# for a new instanceId, e.g. 'i-301223ca').
#
# swap-instance uses the same search as show-affected-routes to find the routes that
# reference the instance to swap. It then calls `aws ec2 replace-route` for each route to
# change the instance. It does not do any checking on the destination CIDR block
# (e.g. 0.0.0.0/0); it merely keeps that CIDR block in place. This allows the script to be
# used, for example, for IPsec tunnel gateways.
#
# run-failover-monitor starts this script up in an infinite loop which performs a
# ping check on the peer instance. When the peer instance becomes unreachable, the
# script invokes the swap-instance action to swap this instance in as the healthy
# instance.
#
# Dependencies:
#
# This script is designed to have as few dependencies as possible. All of the code is in
# this single script, except for common Python libraries that are installed. In addition
# the script has the following dependencies:
#
# 1. AWS CLI must be installed - the commands called are `aws ec2 describe-route-tables`,
# `describe-instances`, `replace-route`, `create-route`, `start-instances` and `stop-instances`.
# 2. Each instance must have permission to successfully run those commands.
# 3. ping is used for the ping test
# 4. curl is used to fetch our InstanceId when we run in monitor mode.
#
import sys
import os
import subprocess
import json
import copy
import argparse
import glob
import imp
import re
import time
from datetime import date
from datetime import datetime
class App(object):
    """Route-table failover commands and the peer-monitoring loop.

    Every AWS operation is performed by shelling out to the `aws` CLI (and
    `ping` / `curl`); results come back as plain dicts and lists. The code is
    written to run unchanged on Python 2.7 and Python 3.
    """

    # Regexes matched against the stderr of a failed `aws ec2 replace-route` /
    # `create-route` call, keyed by the symbol reported back to the caller.
    _ROUTE_ERROR_MAP = {
        'InvalidInstanceId': 'InvalidInstanceID.NotFound',
        'InvalidRoute': r"no route defined.*CreateRoute",
    }

    # Timing of the failover monitor (counts and seconds).
    _NUM_PINGS = 3
    _WAIT_BETWEEN_PINGS = 2
    _WAIT_FOR_INSTANCE_STOP = 60
    _WAIT_FOR_INSTANCE_START = 300

    def __init__(self):
        self.__init_aws_env()

    def __init_aws_env(self):
        """Source aws-apitools-common.sh in bash and copy its environment here.

        If this is not done, the appropriate permissions are not set for aws
        calls on the role.
        """
        command = ['bash', '-c', '. /etc/profile.d/aws-apitools-common.sh && env']
        # universal_newlines=True yields text (not bytes) on Python 3 as well.
        proc = subprocess.Popen(command, stdout=subprocess.PIPE,
                                universal_newlines=True)
        out, _ = proc.communicate()
        # splitlines() drops the line terminator, so values are stored without
        # a trailing "\n".
        for line in out.splitlines():
            key, sep, value = line.partition("=")
            if not sep or not key:
                # Not a KEY=VALUE line (e.g. continuation of a multi-line value).
                continue
            os.environ[key] = value

    def __fixup_route_tags(self, data):
        """Flatten each RouteTable's Tags list of Key/Value items to a dict, in place."""
        for rt in data['RouteTables']:
            rt['Tags'] = dict(
                (item['Key'], item['Value']) for item in rt.get('Tags') or [])

    def __exec_cmd(self, cmd, error_map=None, throws=False):
        """Run `cmd` in a subshell, capturing stdout and stderr.

        error_map maps symbol -> regex; on a non-zero exit with non-empty
        stderr, a symbol whose regex matches stderr is stored in the result.

        Returns { out : stdout, err : stderr, rc : returncode, symbol : match }.
        Raises Exception(symbol, rc, err) when rc != 0 and `throws` is set.
        """
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                shell=True, universal_newlines=True)
        out, err = proc.communicate()
        # strip off leading "\n" from err if exists
        if err.startswith("\n"):
            err = err[1:]
        result = {'out': out, 'err': err, 'rc': proc.returncode, 'symbol': None}
        if proc.returncode != 0 and err:
            for symbol, pattern in (error_map or {}).items():
                if re.search(pattern, err):
                    result['symbol'] = symbol
        if proc.returncode == 0 or not throws:
            return result
        raise Exception(result['symbol'], result['rc'], result['err'])

    def __aws_json(self, cmd):
        """Run an aws CLI command and return its parsed JSON output.

        Raises Exception(symbol, rc, err) carrying the CLI's stderr when the
        command exits non-zero, rather than failing later on empty output.
        """
        result = self.__exec_cmd(cmd, throws=True)
        return json.loads(result['out'])

    def __fetch_routes_raw(self, region):
        """Return describe-route-tables output for `region`, Tags flattened to dicts."""
        data = self.__aws_json(
            "aws ec2 describe-route-tables --region {0} --output json".format(region))
        self.__fixup_route_tags(data)
        return data

    def __describe_instance_unwrapped(self, region, instance_id):
        """Return the describe-instances record for `instance_id`, or None if absent."""
        data = self.__aws_json(
            "aws ec2 describe-instances --region {0} --output json --instance-ids {1}".format(
                region, instance_id))
        for reservation in data.get('Reservations') or []:
            for candidate in reservation.get('Instances') or []:
                if candidate.get('InstanceId') == instance_id:
                    return candidate
        return None

    def __start_instance(self, region, instance_id):
        """Start the instance; returns the parsed start-instances output."""
        return self.__aws_json(
            "aws ec2 start-instances --region {0} --output json --instance-ids {1}".format(
                region, instance_id))

    def __stop_instance(self, region, instance_id):
        """Stop the instance; returns the parsed stop-instances output."""
        return self.__aws_json(
            "aws ec2 stop-instances --region {0} --output json --instance-ids {1}".format(
                region, instance_id))

    def __validate_and_format_rtb_cidr(self, rtb_cidr):
        """Parse "<rtb-id>,<dest-cidr>" strings into route descriptors.

        rtb_cidr is a list of such strings, or None. Returns a list of
        { routeTableInstanceId : 'x', cidr : 'y' } dicts (empty for None).
        Raises Exception when an entry does not match the expected format.
        """
        if rtb_cidr is None:
            return []
        parsed = []
        for spec in rtb_cidr:
            m = re.match(r"(rtb-.*),(\d+[.]\d+[.]\d+[.]\d+[/]\d+)", spec)
            if not m:
                raise Exception("--rtb_cidr did not match regex (must be rtb-id,x.x.x.x/x)")
            parsed.append({'routeTableInstanceId': m.group(1), 'cidr': m.group(2)})
        return parsed

    def __run_route_cmd(self, cmd, dry_run):
        """Run a route-changing aws command and summarise the outcome.

        Returns { ok, symbol, rc, err }. In dry-run mode the command is only
        logged and a successful result is returned.
        """
        if dry_run:
            self.log("DRY RUN {0}".format(cmd))
            return {'ok': True, 'symbol': None, 'rc': 0, 'err': None}
        try:
            self.__exec_cmd(cmd, self._ROUTE_ERROR_MAP, throws=True)
        except Exception as e:
            # __exec_cmd raises Exception(symbol, rc, err); anything else
            # (e.g. an OSError) is reported through 'err' only.
            if len(e.args) == 3:
                symbol, rc, err = e.args
            else:
                symbol, rc, err = None, None, str(e)
            return {'ok': False, 'symbol': symbol, 'rc': rc, 'err': err}
        return {'ok': True, 'symbol': None, 'rc': 0, 'err': None}

    def log(self, message):
        """Write a timestamped log line to stderr."""
        ts = datetime.today()
        sys.stderr.write("{0} {1} \n".format(ts.strftime("%y-%m-%d %H:%M:%S"), message))

    def get_url(self, url):
        """Fetch `url` with curl; returns the __exec_cmd result dict."""
        return self.__exec_cmd("curl -s {0}".format(url))

    def find_routes_with_instance(self, env, region, instance):
        """Return the routes in `region` whose InstanceId equals `instance`.

        Each row is a copy of the route record augmented with RouteTableId,
        VpcId, RouteTableName and RouteTableEnv from its parent table (the
        last two are None when the table lacks the tag). When `env` is
        non-empty, only tables whose env tag equals it are searched.
        """
        data = self.__fetch_routes_raw(region)
        rows = []
        for rt in data['RouteTables']:
            tags = rt.get('Tags') or {}
            # Resolved per table, so nothing leaks from the previous table.
            route_table_name = tags.get('Name')
            route_table_env = tags.get('Env')
            # NOTE(review): an '__INSTANCE_PREFS' tag overrides the env value;
            # kept from the original code - confirm this is intended.
            if '__INSTANCE_PREFS' in tags:
                route_table_env = tags['__INSTANCE_PREFS']
            if env and route_table_env != env:
                continue
            for route in rt.get('Routes') or []:
                if 'InstanceId' not in route or route['InstanceId'] != instance:
                    continue
                row = copy.copy(route)
                row['RouteTableId'] = rt['RouteTableId']
                row['VpcId'] = rt['VpcId']
                row['RouteTableName'] = route_table_name
                row['RouteTableEnv'] = route_table_env
                rows.append(row)
        return rows

    def show_routes_with_instance(self, env, region, instance):
        """Command: print (stdout) the JSON of routes that reference `instance`."""
        rows = self.find_routes_with_instance(env, region, instance)
        json.dump(rows, sys.stdout, indent=4)
        sys.stdout.write("\n")

    def replace_route(self, region, to_instance, route_table_id, dest_cidr, dry_run=False):
        """Point an existing route at `to_instance` (swap healthy for unhealthy).

        Returns { ok, symbol, rc, err }; symbol is 'InvalidRoute' when the
        route does not exist and 'InvalidInstanceId' for an unknown instance.
        """
        cmd = ("aws ec2 replace-route --route-table-id {0} --destination-cidr-block {1} "
               "--instance-id {2} --region {3}").format(
                   route_table_id, dest_cidr, to_instance, region)
        return self.__run_route_cmd(cmd, dry_run)

    def create_route(self, region, to_instance, route_table_id, dest_cidr, dry_run=False):
        """Create a route via `to_instance`.

        Used on startup when a route named by --rtb_cidr does not exist yet.
        Returns { ok, symbol, rc, err }.
        """
        cmd = ("aws ec2 create-route --route-table-id {0} --destination-cidr-block {1} "
               "--instance-id {2} --region {3}").format(
                   route_table_id, dest_cidr, to_instance, region)
        return self.__run_route_cmd(cmd, dry_run)

    def swap_routes_with_instance(self, env, region, instance, to_instance, dry_run=False):
        """Change every route that references `instance` to use `to_instance`."""
        for route in self.find_routes_with_instance(env, region, instance):
            res = self.replace_route(region, to_instance, route['RouteTableId'],
                                     route['DestinationCidrBlock'], dry_run)
            if not res['ok']:
                # Previously failures were dropped silently; report and carry on.
                self.log("Problem replacing route {0} {1}: err={2}".format(
                    route['RouteTableId'], route['DestinationCidrBlock'], res['err']))

    def __resolve_own_instance_id(self, to_instance):
        """Return this server's InstanceId, fetching it from instance metadata if unset."""
        if to_instance:
            self.log("Using my instance id passed in from command-line arg --to_instance")
        else:
            self.log("Fetching my instance id from AWS (curl)...")
            reply = self.get_url("http://169.254.169.254/latest/meta-data/instance-id")
            to_instance = reply['out'].strip()
            if to_instance == "":
                self.log("Unable to automatically determine my AWS instanceId (script must be running on an AWS instance)")
                sys.exit(1)
        self.log("My AWS InstanceId = {0}".format(to_instance))
        return to_instance

    def __claim_owned_routes(self, region, to_instance, rtb_cidr, dry_run):
        """Startup recovery: set every route named by --rtb_cidr to this instance."""
        owned = self.__validate_and_format_rtb_cidr(rtb_cidr)
        if not owned:
            self.log("No --rtb_cidr(s) specified, so this instance acts as a 'hot idle spare'")
            return
        self.log("Starting up and setting --rtb_cidr(s) to have us as the instance (i.e. for recovery).")
        dry_run_pre = "DRY RUN " if dry_run else ""
        for entry in owned:
            rtb_id = entry['routeTableInstanceId']
            cidr = entry['cidr']
            self.log("{0}Startup Recovery: setting route {1} {2} {3} with my instance_id = {4}".format(
                dry_run_pre, region, rtb_id, cidr, to_instance))
            if dry_run:
                continue
            out = self.replace_route(region, to_instance, rtb_id, cidr, dry_run)
            if out['ok']:
                continue
            if out['symbol'] == 'InvalidRoute':
                self.log("Route doesn't exist, creating...")
                out = self.create_route(region, to_instance, rtb_id, cidr, dry_run)
                if not out['ok']:
                    self.log("Problem with creating route. Continuing... err={0}".format(out['err']))
            else:
                self.log("Problem with replacing route. Possible bad startup params, pleast check and fix. Continuing...")

    def __resolve_peer_ip(self, region, instance, ip):
        """Return the peer IP to ping, looking up its PrivateIpAddress if unset."""
        if ip:
            self.log("Using peer IP addressed passed in with --ip")
        else:
            self.log("Fetching Private IP for peer from AWS...")
            data = self.__describe_instance_unwrapped(region, instance)
            if not data:
                self.log("ERROR: Couldn't find Private IP for peer instance {0}".format(instance))
            elif 'PrivateIpAddress' not in data:
                self.log("ERROR: Peer instance {0} has no PrivateIpAddress!".format(instance))
            else:
                ip = data['PrivateIpAddress']
        if not ip:
            self.log("ERROR: Couldn't determine Peer IP for pingtest. Aborting...")
            sys.exit(1)
        self.log("Peer IP for ping test = {0}".format(ip))
        return ip

    def __ping_ok(self, ip):
        """Return True when at least one of the pings to `ip` got a reply."""
        # N.B. -W ping_timeout omitted as this seems to fail on the Mac
        cmd = "ping -c {0} {1} | grep time= | wc -l".format(self._NUM_PINGS, ip)
        reply = self.__exec_cmd(cmd)
        try:
            return int(reply['out']) > 0
        except ValueError:
            # No usable count from the pipeline: treat as a failed ping test.
            return False

    def __peer_state(self, region, instance):
        """Return the peer's State Name, or None when it cannot be determined."""
        try:
            data = self.__describe_instance_unwrapped(region, instance)
        except Exception as e:
            # A failed API call must not kill the monitor in mid-recovery.
            self.log("ERROR: Couldn't describe instance {0}: {1}".format(instance, e))
            return None
        if not data:
            return None
        return (data.get('State') or {}).get('Name')

    def __recover_peer(self, env, region, instance, to_instance, ip, dry_run):
        """Take over the peer's routes, then stop it and start it again.

        Loops until the peer has been seen in state 'stopped' and a start has
        been issued (or would have been, in dry-run mode).
        """
        unhealthy_routes = self.find_routes_with_instance(env, region, instance)
        stopping_nat = False
        while True:
            self.log("RECOVERY INITIATED")
            # Swap routes
            if unhealthy_routes:
                msg = "Found {0} unhealthy routes with instance {1}. Swapping " + \
                      "InstanceId to instance {2}..."
                self.log(msg.format(len(unhealthy_routes), instance, to_instance))
                self.swap_routes_with_instance(env, region, instance, to_instance, dry_run)
                self.log("Unhealthy routes swapped to instance {0}".format(to_instance))
                # For now, let's say it works
                unhealthy_routes = []
            else:
                self.log("No unhealthy routes found with instance {0} in {1} {2}".format(instance, env, region))

            self.log("Checking state for {0} ...".format(ip))
            state = self.__peer_state(region, instance)
            self.log("State for {0}, State = {1}".format(ip, state))

            if state == 'stopped':
                self.log("Instance {0} at {1} stopped, restarting...".format(instance, ip))
                if not dry_run:
                    self.__start_instance(region, instance)
                self.log("Waiting {0} seconds for restart of Instance {1}...".format(
                    self._WAIT_FOR_INSTANCE_START, instance))
                time.sleep(self._WAIT_FOR_INSTANCE_START)
                return

            # Only issue a stop when the state is actually known, and only once.
            if state is not None and not stopping_nat:
                self.log("Instance {0} at {1} is not stopped, so stopping...".format(instance, ip))
                if not dry_run:
                    self.__stop_instance(region, instance)
                stopping_nat = True
            self.log("Waiting {0} seconds for Instance {1} to stop...".format(
                self._WAIT_FOR_INSTANCE_STOP, instance))
            time.sleep(self._WAIT_FOR_INSTANCE_STOP)

    def run_failover_monitor(self, env, region, instance, rtb_cidr=None, ip=None,
                             to_instance=None, dry_run=False, verbose=False):
        """Run the failover monitor: an infinite ping check of the peer instance.

        If this server detects a problem with the peer server (using ping), it
        replaces the routes that use the unhealthy server's InstanceId with its
        own, stops the unhealthy server, and restarts it. On startup this
        server first resets the routes it owns (--rtb_cidr) to its own id.

        env, region  -- selection of the route tables to act on
        instance     -- InstanceId of the peer to monitor
        rtb_cidr     -- optional list of "<rtb-id>,<cidr>" routes owned by this server
        ip           -- peer IP to ping (looked up from AWS when omitted)
        to_instance  -- this server's InstanceId (fetched from metadata when omitted)
        dry_run      -- log the changes instead of making them
        verbose      -- also log successful ping tests
        """
        self.log("AWS Peered Instance Failover Monitor Starting...")
        self.log("Peered instance to monitor is AWS instance = {0}".format(instance))

        to_instance = self.__resolve_own_instance_id(to_instance)
        self.__claim_owned_routes(region, to_instance, rtb_cidr, dry_run)
        ip = self.__resolve_peer_ip(region, instance, ip)

        self.log("AWS Peered Instance Failover Monitor Initialized and Running...")
        healthy_ping_count = 0
        while True:
            if self.__ping_ok(ip):
                # Healthy
                healthy_ping_count = healthy_ping_count + 1
                if verbose:
                    self.log("Pingtest SUCCESS {0} for Peer at {1}".format(healthy_ping_count, ip))
                time.sleep(self._WAIT_BETWEEN_PINGS)
                continue
            # Unhealthy
            self.log("FAILURE DETECTED - Pingtest failed for Peer at {0}".format(ip))
            healthy_ping_count = 0
            self.__recover_peer(env, region, instance, to_instance, ip, dry_run)
def main():
    """Parse the command line and dispatch to the selected App command.

    Three sub-commands are defined: swap-instance, show-affected-routes and
    run-failover-monitor. Each one binds an App method through
    set_defaults(func=...); the remaining parsed options are passed to that
    method as keyword arguments, so option names must match its parameters.
    """
    app = App()
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(
        help="aws-nat-monitor is a command-line tool and an "
             "automated failover monitor process. It supports several commands useful for NAT (or other) "
             "failover, as well as a command that sets it into failover monitor mode.\nCommands:")

    # Subparser: swap-instance
    swap_instance = subparsers.add_parser(
        'swap-instance',
        help="Swap the existing (NAT) instance (--instance) (AWS instance-id, e.g. i-301223ca) to a different "
             "(healthy NAT) instance (--to_instance) in every route that we find the instance listed. Performs an "
             "ec2-replace-route on potentially many routes in a single --region.\nExample: "
             "aws-instance-monitor swap-instance --env prod --region us-east-1 --instance i-301223ca --to_instance i-57684fad")
    swap_instance.set_defaults(func=app.swap_routes_with_instance)
    # Subparser: swap-instance, options
    swap_instance.add_argument(
        '--dry_run', action='store_true',
        help="Run command in no write mode as a test")
    swap_instance.add_argument(
        '--env', type=str, required=True,
        help="env (Tag 'Env' on Route Table) to be affected (e.g. prod, dev)")
    swap_instance.add_argument(
        '--region', type=str, required=True,
        help="AWS region to be affected (e.g. us-east-1, us-west-2)")
    swap_instance.add_argument(
        "--instance", type=str, required=True,
        help="AWS InstanceId (of NAT) to SWAP")
    swap_instance.add_argument(
        "--to_instance", type=str, required=True,
        help="AWS InstanceId (of NAT) to change to")

    # Subparser: show-affected-routes
    show_affected = subparsers.add_parser(
        'show-affected-routes',
        help="Shows metadata for routes that have the --instance. These will be routes that "
             "would be affected by the 'swap-instance' --instance command.\nExample: "
             "aws-nat-monitor show-affected-routes --env prod --region us-east-1 --instance i-301223ca")
    show_affected.set_defaults(func=app.show_routes_with_instance)
    # Subparser: show-affected-routes, options
    show_affected.add_argument(
        '--env', type=str, required=True,
        help="env (Tag 'Env' on Route Table) to be affected (e.g. prod, dev). Use '' empty string to specify 'any' env.")
    show_affected.add_argument(
        '--region', type=str, required=True,
        help="AWS region to be affected (e.g. us-east-1, us-west-2)")
    show_affected.add_argument(
        "--instance", type=str, required=True,
        help="AWS InstanceId (of NAT) to change")

    # Subparser: run-failover-monitor
    run_failover = subparsers.add_parser(
        'run-failover-monitor',
        help="Starts this script up as a failover monitor, that uses a pingcheck to determine the "
             "availability of a peered machine, which also should be running the same program, with "
             "this machine as the peer. When this script determines the peer is unhealthy, it invokes "
             "the 'swap-instance' method, which changes all the entries in the routing tables which "
             "reference the unhealthy server, to this server. This implements a hot/hot NAT failover system. "
             "The switching technique and this script is general enough to be used by any user instance, "
             "acting as a gateway.\nExample: aws-instance-monitor run-failover-monitor --instance i-57684fad "
             "--region us-east-1 --env prod")
    run_failover.set_defaults(func=app.run_failover_monitor)
    # Subparser: run-failover-monitor, options
    run_failover.add_argument(
        '--dry_run', action='store_true',
        help="Run command in no write mode as a test")
    run_failover.add_argument(
        '--verbose', action='store_true',
        help="More output (e.g. ping test results)")
    run_failover.add_argument(
        '--env', type=str, required=True,
        help="env (Tag 'Env' on Route Table) to be affected (e.g. prod, dev). Use '' empty string to specify 'any' env.")
    run_failover.add_argument(
        '--region', type=str, required=True,
        help="AWS region to be affected (e.g. us-east-1, us-west-2)")
    run_failover.add_argument(
        "--instance", type=str, required=True,
        help="AWS InstanceId of peer (NAT) to monitor")
    run_failover.add_argument(
        "--rtb_cidr", type=str, nargs='+',
        help="1 to N route table and CIDR (dest), of format <rtb>,<cidr> (e.g. rtb-be2005db,0.0.0.0/0), each represents "
             "a route that the monitor 'owns' (will set instance to itself on startup)")
    run_failover.add_argument(
        "--to_instance", type=str,
        help="AWS InstanceId of this server (NAT) to change to (will be fetched from AWS if not specified).")
    run_failover.add_argument(
        "--ip", type=str,
        help="IP address of the peer to use for the ping test (the peer's private IP will be fetched "
             "from AWS if not specified).")

    args = parser.parse_args()
    # Convert argparse namespace to dict, remove func from args
    arg_dict = vars(args)
    func = arg_dict.pop('func', None)
    if func is None:
        # Python 3 does not require a sub-command by default, so 'func' can be
        # missing; report a usage error instead of failing with a KeyError.
        parser.error("a command is required: swap-instance, show-affected-routes "
                     "or run-failover-monitor")
    # Call by unpacking dict to function call
    func(**arg_dict)
# Script entry point: run the CLI and use main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment