Last active
August 28, 2017 14:26
-
-
Save rafaelfelix/5e170b3b732d0b595993 to your computer and use it in GitHub Desktop.
Newer version of Steve Morad's nat_monitor.sh script (reference: http://stevemorad.s3.amazonaws.com/reInvent/articles/nat_monitor_files/nat_monitor.sh). Depends on jq (https://stedolan.github.io/jq/) and awscli (https://aws.amazon.com/cli/)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# This script will monitor another NAT instance and take over its routes
# if communication with the other instance fails.
#
# Written for POSIX sh (no bash-only syntax). External commands used:
# aws (awscli), jq, curl, ping, grep, date and sleep.

# NAT instance variables
# Other instance's ID (its private IP is looked up and pinged) and the route
# table to grab if the other node goes down
NAT_ID=
NAT_RT_ID=
# My route to grab when I come back up
My_RT_ID=
# Specify the EC2 region that this will be running in (e.g. us-east-1)
REGION=

# Health Check variables
Num_Pings=3
Ping_Timeout=1
Wait_Between_Pings=2
Wait_for_Instance_Stop=60
Wait_for_Instance_Start=300

# Refuse to run unconfigured: with any of these empty, every AWS call below
# would be malformed and the health check could never succeed.
: "${NAT_ID:?NAT_ID must be set to the ID of the other NAT instance}"
: "${NAT_RT_ID:?NAT_RT_ID must be set to the route table of the other NAT instance}"
: "${My_RT_ID:?My_RT_ID must be set to the route table of this instance}"
: "${REGION:?REGION must be set to the EC2 region (e.g. us-east-1)}"

# Run aws-apitools-common.sh to set up default environment variables and to
# leverage AWS security credentials provided by EC2 roles
. /etc/profile.d/aws-apitools-common.sh

# Determine the NAT instance private IP so we can ping the other NAT instance, take over
# its route, and stop/start it. Requires EC2 DescribeInstances, CreateRoute, ReplaceRoute,
# and Start/StopInstances permissions. The following example EC2 Roles policy will
# authorize these commands:
# {
#   "Statement": [
#     {
#       "Action": [
#         "ec2:DescribeInstances",
#         "ec2:CreateRoute",
#         "ec2:ReplaceRoute",
#         "ec2:StartInstances",
#         "ec2:StopInstances"
#       ],
#       "Effect": "Allow",
#       "Resource": "*"
#     }
#   ]
# }

# Print a timestamped log line on stdout (the stream the script has always
# logged to, so existing output redirections keep capturing everything).
log() {
  printf '%s -- %s\n' "$(date)" "$*"
}

# Get this instance's ID. --fail makes curl return an error instead of handing
# back an HTTP error page as if it were the instance ID.
Instance_ID=$(/usr/bin/curl --silent --fail http://169.254.169.254/latest/meta-data/instance-id)
if [ -z "$Instance_ID" ]; then
  log "Unable to determine this instance's ID from the metadata service, exiting"
  exit 1
fi

log "Starting NAT monitor"
log "Adding this instance to $My_RT_ID default route on start"
# If replace-route fails, then the route might not exist and may need to be created instead
if ! /usr/bin/aws ec2 replace-route --route-table-id "$My_RT_ID" --destination-cidr-block 0.0.0.0/0 \
    --instance-id "$Instance_ID" --region "$REGION"; then
  /usr/bin/aws ec2 create-route --route-table-id "$My_RT_ID" --destination-cidr-block 0.0.0.0/0 \
    --instance-id "$Instance_ID" --region "$REGION"
fi

# Get the other NAT instance's IP (jq -r prints the raw string, no quotes)
NAT_IP=$(/usr/bin/aws ec2 describe-instances --instance-ids "$NAT_ID" --region "$REGION" \
  | jq -r '.Reservations[].Instances[].PrivateIpAddress')
# Without a usable IP every ping below would "fail", and this script would
# then take over the route of, and stop, a peer that may be perfectly healthy.
if [ -z "$NAT_IP" ] || [ "$NAT_IP" = "null" ]; then
  log "Unable to determine the private IP of other NAT instance $NAT_ID, exiting"
  exit 1
fi

while :; do
  # Check health of other NAT instance: count the ping replies received
  pingresult=$(ping -c "$Num_Pings" -W "$Ping_Timeout" "$NAT_IP" | grep -c 'time=')

  # At least one health check succeeded: wait and check again
  if [ "$pingresult" != "0" ]; then
    sleep "$Wait_Between_Pings"
    continue
  fi

  # None of the health checks succeeded: set HEALTHY variables to unhealthy (0)
  ROUTE_HEALTHY=0
  NAT_HEALTHY=0
  STOPPING_NAT=0
  while [ "$NAT_HEALTHY" = "0" ]; do
    # NAT instance is unhealthy, loop while we try to fix it
    if [ "$ROUTE_HEALTHY" = "0" ]; then
      log "Other NAT heartbeat failed, taking over $NAT_RT_ID default route"
      # Only mark the route as taken over when the call succeeded, so a
      # failed takeover is retried on the next pass of this loop.
      if /usr/bin/aws ec2 replace-route --route-table-id "$NAT_RT_ID" --destination-cidr-block 0.0.0.0/0 \
          --instance-id "$Instance_ID" --region "$REGION"; then
        ROUTE_HEALTHY=1
      else
        log "Failed to take over $NAT_RT_ID default route, will retry"
      fi
    fi

    # Check NAT state to see if we should stop it or start it again
    NAT_STATE=$(/usr/bin/aws ec2 describe-instances --instance-ids "$NAT_ID" --region "$REGION" \
      | jq -r '.Reservations[].Instances[].State.Name')
    if [ "$NAT_STATE" = "stopped" ]; then
      log "Other NAT instance stopped, starting it back up"
      /usr/bin/aws ec2 start-instances --instance-ids "$NAT_ID" --region "$REGION"
      # Leave the repair loop; the outer loop resumes pinging after the
      # start-up grace period.
      NAT_HEALTHY=1
      sleep "$Wait_for_Instance_Start"
    else
      # Request the stop only once, then keep polling until it is "stopped"
      if [ "$STOPPING_NAT" = "0" ]; then
        log "Other NAT instance $NAT_STATE, attempting to stop for reboot"
        /usr/bin/aws ec2 stop-instances --instance-ids "$NAT_ID" --region "$REGION"
        STOPPING_NAT=1
      fi
      sleep "$Wait_for_Instance_Stop"
    fi
  done
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi @rafaelfelix !
I know it's been a long time since you published this, but do you happen to remember why you created this newer version? Was there a specific problem you were trying to fix?
I've had some cases where both NAT instances end up in a "STOPPED" state with the original script, so I'm trying to understand if this was an attempt to fix a similar issue...
Thanks in advance!