rafaelfelix/nat_monitor.sh

## nat_monitor.sh
#!/bin/sh
# This script will monitor another NAT instance and take over its routes
# if communication with the other instance fails

# NAT instance variables
# Every setting below keeps a non-empty value already present in the
# environment and otherwise falls back to the default written here, so the
# script can be configured without editing it.

# Instance ID of the other NAT instance (its private IP is looked up and
# pinged, and the instance is stopped/started when it stops answering)
NAT_ID=${NAT_ID:-}
# Route table whose default route to grab if the other node goes down
NAT_RT_ID=${NAT_RT_ID:-}

# My route table, whose default route I grab when I come back up
My_RT_ID=${My_RT_ID:-}

# Specify the EC2 region that this will be running in (e.g. us-east-1)
REGION=${REGION:-}

# Health Check variables
# Number of echo requests sent per health check (ping -c)
Num_Pings=${Num_Pings:-3}
# Value handed to ping -W (how long to wait for a reply)
Ping_Timeout=${Ping_Timeout:-1}
# Seconds to sleep after a health check that got at least one reply
Wait_Between_Pings=${Wait_Between_Pings:-2}
# Seconds to sleep between state checks while the other NAT is not yet stopped
Wait_for_Instance_Stop=${Wait_for_Instance_Stop:-60}
# Seconds to sleep after asking the other NAT instance to start
Wait_for_Instance_Start=${Wait_for_Instance_Start:-300}

# Run aws-apitools-common.sh to set up default environment variables and to
# leverage AWS security credentials provided by EC2 roles
. /etc/profile.d/aws-apitools-common.sh

# Determine the other NAT instance's private IP so we can ping it, take over
# its route, and restart it (stop, then start).  Requires EC2 DescribeInstances,
# CreateRoute, ReplaceRoute, StartInstances, and StopInstances permissions.
# The following example EC2 Roles policy will authorize these commands:
# {
#  "Statement": [
#    {
#      "Action": [
#        "ec2:DescribeInstances",
#        "ec2:CreateRoute",
#        "ec2:ReplaceRoute",
#        "ec2:StartInstances",
#        "ec2:StopInstances"
#      ],
#      "Effect": "Allow",
#      "Resource": "*"
#    }
#  ]
# }

# Get this instance's ID from the EC2 instance metadata endpoint. --fail makes
# curl print nothing (instead of an HTTP error page) when the request is
# rejected, so a failed lookup shows up as an empty value.
Instance_ID=$(/usr/bin/curl --silent --fail http://169.254.169.254/latest/meta-data/instance-id)
# Get the other NAT instance's private IP; jq -r prints the bare string
# without the surrounding JSON quotes
NAT_IP=$(/usr/bin/aws ec2 describe-instances --instance-ids "$NAT_ID" --region "$REGION" | jq -r '.Reservations[]|.Instances[]|.PrivateIpAddress')

# Without our own instance ID no route can be pointed at this instance
if [ -z "$Instance_ID" ]; then
  echo "$(date) -- Could not read this instance's ID from instance metadata, exiting" >&2
  exit 1
fi

echo "$(date) -- Starting NAT monitor"
echo "$(date) -- Adding this instance to $My_RT_ID default route on start"
# If replace-route fails, the route might not exist yet and may need to be
# created instead
if ! /usr/bin/aws ec2 replace-route --route-table-id "$My_RT_ID" --destination-cidr-block 0.0.0.0/0 --instance-id "$Instance_ID" --region "$REGION"; then
  /usr/bin/aws ec2 create-route --route-table-id "$My_RT_ID" --destination-cidr-block 0.0.0.0/0 --instance-id "$Instance_ID" --region "$REGION"
fi

# An empty or "null" peer address makes every ping fail, which the monitor
# loop below treats as the other NAT being down (route takeover plus a
# stop/start of that instance). Refuse to monitor rather than act on a
# failed lookup.
case "$NAT_IP" in
  '' | null)
    echo "$(date) -- Could not determine the private IP of NAT instance '$NAT_ID' (check NAT_ID and REGION), exiting" >&2
    exit 1
    ;;
esac

# Main monitoring loop: ping the other NAT instance forever. When a whole
# round of pings goes unanswered, take over its default route and stop/start
# the instance to recover it, then go back to pinging.
while true; do
  # Check health of other NAT instance: count the reply lines (each answered
  # echo request prints a line containing "time=")
  pingresult=$(ping -c "$Num_Pings" -W "$Ping_Timeout" "$NAT_IP" | grep -c 'time=')
  # Check to see if any of the health checks succeeded, if not start recovery.
  # "=" is the string comparison POSIX test defines; "==" only works when
  # /bin/sh happens to be bash.
  if [ "$pingresult" = "0" ]; then
    # Set HEALTHY variables to unhealthy (0)
    ROUTE_HEALTHY=0
    NAT_HEALTHY=0
    STOPPING_NAT=0
    while [ "$NAT_HEALTHY" = "0" ]; do
      # NAT instance is unhealthy, loop while we try to fix it
      if [ "$ROUTE_HEALTHY" = "0" ]; then
        echo "$(date) -- Other NAT heartbeat failed, taking over $NAT_RT_ID default route"
        # Only mark the route as handled when the takeover succeeded, so a
        # failed call is retried on the next pass of this loop
        if /usr/bin/aws ec2 replace-route --route-table-id "$NAT_RT_ID" --destination-cidr-block 0.0.0.0/0 --instance-id "$Instance_ID" --region "$REGION"; then
          ROUTE_HEALTHY=1
        else
          echo "$(date) -- Taking over $NAT_RT_ID default route failed, will retry" >&2
        fi
      fi
      # Check NAT state to see if we should stop it or start it again
      NAT_STATE=$(/usr/bin/aws ec2 describe-instances --instance-ids "$NAT_ID" --region "$REGION" | jq -r '.Reservations[]|.Instances[]|.State.Name')
      if [ "$NAT_STATE" = "stopped" ]; then
        echo "$(date) -- Other NAT instance stopped, starting it back up"
        /usr/bin/aws ec2 start-instances --instance-ids "$NAT_ID" --region "$REGION"
        # Leave the recovery loop; the outer loop resumes pinging after the
        # start-up wait below
        NAT_HEALTHY=1
        sleep "$Wait_for_Instance_Start"
      else
        # Ask for the stop only once, then keep polling until the state
        # reads "stopped"
        if [ "$STOPPING_NAT" = "0" ]; then
          echo "$(date) -- Other NAT instance $NAT_STATE, attempting to stop for reboot"
          /usr/bin/aws ec2 stop-instances --instance-ids "$NAT_ID" --region "$REGION"
          STOPPING_NAT=1
        fi
        sleep "$Wait_for_Instance_Stop"
      fi
    done
  else
    sleep "$Wait_Between_Pings"
  fi
done
	#!/bin/sh
	# This script will monitor another NAT instance and take over its routes
	# if communication with the other instance fails

	# NAT instance variables
	# Every setting below keeps a non-empty value already present in the
	# environment and otherwise falls back to the default written here, so the
	# script can be configured without editing it.

	# Instance ID of the other NAT instance (its private IP is looked up and
	# pinged, and the instance is stopped/started when it stops answering)
	NAT_ID=${NAT_ID:-}
	# Route table whose default route to grab if the other node goes down
	NAT_RT_ID=${NAT_RT_ID:-}

	# My route table, whose default route I grab when I come back up
	My_RT_ID=${My_RT_ID:-}

	# Specify the EC2 region that this will be running in (e.g. us-east-1)
	REGION=${REGION:-}

	# Health Check variables
	# Number of echo requests sent per health check (ping -c)
	Num_Pings=${Num_Pings:-3}
	# Value handed to ping -W (how long to wait for a reply)
	Ping_Timeout=${Ping_Timeout:-1}
	# Seconds to sleep after a health check that got at least one reply
	Wait_Between_Pings=${Wait_Between_Pings:-2}
	# Seconds to sleep between state checks while the other NAT is not yet stopped
	Wait_for_Instance_Stop=${Wait_for_Instance_Stop:-60}
	# Seconds to sleep after asking the other NAT instance to start
	Wait_for_Instance_Start=${Wait_for_Instance_Start:-300}

	# Run aws-apitools-common.sh to set up default environment variables and to
	# leverage AWS security credentials provided by EC2 roles
	. /etc/profile.d/aws-apitools-common.sh

	# Determine the other NAT instance's private IP so we can ping it, take over
	# its route, and restart it (stop, then start). Requires EC2 DescribeInstances,
	# CreateRoute, ReplaceRoute, StartInstances, and StopInstances permissions.
	# The following example EC2 Roles policy will authorize these commands:
	# {
	# "Statement": [
	# {
	# "Action": [
	# "ec2:DescribeInstances",
	# "ec2:CreateRoute",
	# "ec2:ReplaceRoute",
	# "ec2:StartInstances",
	# "ec2:StopInstances"
	# ],
	# "Effect": "Allow",
	# "Resource": "*"
	# }
	# ]
	# }

	# Get this instance's ID from the EC2 instance metadata endpoint. --fail makes
	# curl print nothing (instead of an HTTP error page) when the request is
	# rejected, so a failed lookup shows up as an empty value.
	Instance_ID=$(/usr/bin/curl --silent --fail http://169.254.169.254/latest/meta-data/instance-id)
	# Get the other NAT instance's private IP. The pipes must be real, unescaped
	# pipes: an escaped "\|" hands a literal "|" to aws as an argument instead of
	# feeding its output to jq. jq -r prints the bare string without JSON quotes.
	NAT_IP=$(/usr/bin/aws ec2 describe-instances --instance-ids "$NAT_ID" --region "$REGION" | jq -r '.Reservations[]|.Instances[]|.PrivateIpAddress')

	# Without our own instance ID no route can be pointed at this instance
	if [ -z "$Instance_ID" ]; then
	  echo "$(date) -- Could not read this instance's ID from instance metadata, exiting" >&2
	  exit 1
	fi

	echo "$(date) -- Starting NAT monitor"
	echo "$(date) -- Adding this instance to $My_RT_ID default route on start"
	# If replace-route fails, the route might not exist yet and may need to be
	# created instead
	if ! /usr/bin/aws ec2 replace-route --route-table-id "$My_RT_ID" --destination-cidr-block 0.0.0.0/0 --instance-id "$Instance_ID" --region "$REGION"; then
	  /usr/bin/aws ec2 create-route --route-table-id "$My_RT_ID" --destination-cidr-block 0.0.0.0/0 --instance-id "$Instance_ID" --region "$REGION"
	fi

	# An empty or "null" peer address cannot be pinged meaningfully, and the
	# monitor loop below acts on ping results (route takeover plus a stop/start
	# of the other instance). Refuse to monitor rather than act on a failed lookup.
	case "$NAT_IP" in
	  '' | null)
	    echo "$(date) -- Could not determine the private IP of NAT instance '$NAT_ID' (check NAT_ID and REGION), exiting" >&2
	    exit 1
	    ;;
	esac

	# Main monitoring loop: ping the other NAT instance forever. When a whole
	# round of pings goes unanswered, take over its default route and stop/start
	# the instance to recover it, then go back to pinging.
	while true; do
	  # Check health of other NAT instance: count the reply lines (each answered
	  # echo request prints a line containing "time="). The pipe must be a real,
	  # unescaped pipe so that grep actually receives ping's output.
	  pingresult=$(ping -c "$Num_Pings" -W "$Ping_Timeout" "$NAT_IP" | grep -c 'time=')
	  # Check to see if any of the health checks succeeded, if not start recovery.
	  # "=" is the string comparison POSIX test defines; "==" only works when
	  # /bin/sh happens to be bash.
	  if [ "$pingresult" = "0" ]; then
	    # Set HEALTHY variables to unhealthy (0)
	    ROUTE_HEALTHY=0
	    NAT_HEALTHY=0
	    STOPPING_NAT=0
	    while [ "$NAT_HEALTHY" = "0" ]; do
	      # NAT instance is unhealthy, loop while we try to fix it
	      if [ "$ROUTE_HEALTHY" = "0" ]; then
	        echo "$(date) -- Other NAT heartbeat failed, taking over $NAT_RT_ID default route"
	        # Only mark the route as handled when the takeover succeeded, so a
	        # failed call is retried on the next pass of this loop
	        if /usr/bin/aws ec2 replace-route --route-table-id "$NAT_RT_ID" --destination-cidr-block 0.0.0.0/0 --instance-id "$Instance_ID" --region "$REGION"; then
	          ROUTE_HEALTHY=1
	        else
	          echo "$(date) -- Taking over $NAT_RT_ID default route failed, will retry" >&2
	        fi
	      fi
	      # Check NAT state to see if we should stop it or start it again
	      NAT_STATE=$(/usr/bin/aws ec2 describe-instances --instance-ids "$NAT_ID" --region "$REGION" | jq -r '.Reservations[]|.Instances[]|.State.Name')
	      if [ "$NAT_STATE" = "stopped" ]; then
	        echo "$(date) -- Other NAT instance stopped, starting it back up"
	        /usr/bin/aws ec2 start-instances --instance-ids "$NAT_ID" --region "$REGION"
	        # Leave the recovery loop; the outer loop resumes pinging after the
	        # start-up wait below
	        NAT_HEALTHY=1
	        sleep "$Wait_for_Instance_Start"
	      else
	        # Ask for the stop only once, then keep polling until the state
	        # reads "stopped"
	        if [ "$STOPPING_NAT" = "0" ]; then
	          echo "$(date) -- Other NAT instance $NAT_STATE, attempting to stop for reboot"
	          /usr/bin/aws ec2 stop-instances --instance-ids "$NAT_ID" --region "$REGION"
	          STOPPING_NAT=1
	        fi
	        sleep "$Wait_for_Instance_Stop"
	      fi
	    done
	  else
	    sleep "$Wait_Between_Pings"
	  fi
	done