@kevinkarwaski
Last active March 26, 2018 06:04
import boto3
import json
import logging
import time
logger = logging.getLogger()
logger.setLevel(logging.INFO)
def notify_on_error(message):
    logger.error(message)
    try:
        sns_client = boto3.client('sns')
        sns_response = sns_client.publish(
            TopicArn='arn:aws:sns:us-east-1:xxxxxxxxxxxx:Lambda-Errors',
            Message=message,
            Subject='ASG/ECS Lifecycle Termination Error'
        )
        logger.info("SNS Publish HTTP Response: %s" % sns_response[u'ResponseMetadata'][u'HTTPStatusCode'])
    except Exception as e:
        logger.error(e)
def deregister_from_ecs_cluster(instanceid, cluster_name):
    try:
        ecs_client = boto3.client('ecs')
        containerarn = None
        ecsclusterinstances = ecs_client.list_container_instances(
            cluster=cluster_name
        )
        for containerarns in ecsclusterinstances[u'containerInstanceArns']:
            response = ecs_client.describe_container_instances(
                cluster=cluster_name,
                containerInstances=[
                    containerarns
                ]
            )
            if response[u'containerInstances'][0][u'ec2InstanceId'] == instanceid:
                containerarn = containerarns
        if containerarn:
            logger.info("%s found registered with container ARN (%s) in ECS cluster. "
                        "Attempting to deregister from cluster." % (instanceid, containerarn))
            ecs_response = ecs_client.deregister_container_instance(
                cluster=cluster_name,
                containerInstance=containerarn,
                force=True
            )
            logger.info("ECS Deregistration HTTP Response: %s" % ecs_response[u'ResponseMetadata'][u'HTTPStatusCode'])
        else:
            logger.info('%s is NOT registered with the Cluster.' % instanceid)
            return
    except Exception as e:
        notify_on_error(str(e))
def wait_for_alb(alb):
    try:
        alb_client = boto3.client('elbv2')
        target_groups = alb_client.describe_target_groups(
            LoadBalancerArn=alb
        )
        for target_group in target_groups[u'TargetGroups']:
            for timer in range(0, 24):
                healthy_count = 0
                alb_response = alb_client.describe_target_health(
                    TargetGroupArn=target_group[u'TargetGroupArn']
                )
                logger.info(json.dumps(alb_response))
                for target in alb_response[u'TargetHealthDescriptions']:
                    instanceid = target[u'Target'][u'Id']
                    if target[u'TargetHealth'][u'State'] == "healthy":
                        healthy_count += 1
                        logger.info("%s is healthy, bringing total healthy to %s" % (instanceid, healthy_count))
                        logger.info("Description: %s" % target)
                        time.sleep(5)
                    else:
                        logger.info("%s is not in a healthy state. It is in State: %s." % (instanceid, target[u'TargetHealth'][u'State']))
                if healthy_count >= 2:
                    logger.info("Total healthy instances now at %s.. moving on" % healthy_count)
                    return
            else:
                logger.info("Whoops! %s took more than 2 minutes to get 2 healthy instances attached to the ALB! "
                            "Giving up on being graceful!" % instanceid)
                return
    except Exception as e:
        notify_on_error(str(e))
def wait_for_elb(elb):
    try:
        elb_client = boto3.client('elb')
        for timer in range(0, 60):
            elb_response = elb_client.describe_load_balancers(
                LoadBalancerNames=[elb]
            )
            if len(elb_response[u'LoadBalancerDescriptions'][0][u'Instances']) <= 2:
                logger.info("There are two or fewer instances registered with the ELB, "
                            "waiting for another instance to register before moving on. (5 min timeout)")
                logger.info(elb_response[u'LoadBalancerDescriptions'][0][u'Instances'])
                time.sleep(5)
            else:
                logger.info("Sweet! We have more than two instances in the ELB, "
                            "moving forward with termination request.")
                logger.info(elb_response[u'LoadBalancerDescriptions'][0][u'Instances'])
                break
        else:
            logger.info("Whoops! It took more than 5 minutes waiting for more instances to register in the ELB! "
                        "Giving up on being graceful!")
            return
    except Exception as e:
        notify_on_error(str(e))
def wait_for_tasks(ecscluster, wait=60):
    logger.info("Looking up status of tasks on %s" % ecscluster)
    ecs_client = boto3.client('ecs')
    if wait == 0:
        logger.info("Whoops! It took more than 5 minutes waiting for all the tasks to be up in the cluster! "
                    "Giving up on being graceful!")
        return
    ecs_tasks = ecs_client.list_tasks(cluster=ecscluster)
    if not ecs_tasks[u'taskArns']:
        logger.info("No Tasks found.. Waiting for service to spawn tasks..")
        time.sleep(5)
        wait_for_tasks(ecscluster, wait - 1)
    else:
        task_descs = ecs_client.describe_tasks(cluster=ecscluster, tasks=ecs_tasks[u'taskArns'])
        status = []
        for task in task_descs[u'tasks']:
            status.append(task[u'lastStatus'])
        if 'PENDING' in status:
            logger.info("Status of Tasks %s .. Waiting for RUNNING status.." % status)
            time.sleep(5)
            wait_for_tasks(ecscluster, wait - 1)
        else:
            logger.info("Great! Status of Tasks is %s .. moving on" % status)
            return
def complete_asg_lifecycle(hookname, asg, actiontoken, instanceid):
    try:
        asg_client = boto3.client('autoscaling')
        asg_response = asg_client.complete_lifecycle_action(
            LifecycleHookName=hookname,
            AutoScalingGroupName=asg,
            LifecycleActionToken=actiontoken,
            LifecycleActionResult='CONTINUE',
            InstanceId=instanceid
        )
        logger.info("ASG Complete Lifecycle Action Response: %s" % asg_response[u'ResponseMetadata'][u'HTTPStatusCode'])
    except Exception as e:
        notify_on_error(str(e))
def lambda_handler(event, context):
    logger.info(json.dumps(event))
    message = json.loads(event[u'Records'][0][u'Sns'][u'Message'])
    logger.info(message)
    # Parse SNS message for required data.
    ec2instanceid = message['EC2InstanceId']
    stackname = message['NotificationMetadata']
    asgname = message['AutoScalingGroupName']
    lifecycleactiontoken = message['LifecycleActionToken']
    lifecyclehookname = message['LifecycleHookName']
    logger.info("EC2 Instance ID: %s" % ec2instanceid)
    logger.info("CF Stack Name: %s" % stackname)
    # Create CloudFormation connection object.
    cf_client = boto3.client('cloudformation')
    # Get stack details via describe_stacks.
    stackdetails = cf_client.describe_stacks(StackName=stackname)
    # Initialize ALB var to None.
    albarn = None
    # Initialize ELB var to None.
    elbname = None
    # Parse the ALB ARN, ELB Name and ECS Cluster Name from the stack outputs.
    for output in stackdetails[u'Stacks'][0][u'Outputs']:
        if output[u'OutputKey'] == 'alb':
            logger.info("ALB ARN: %s" % output[u'OutputValue'])
            albarn = output[u'OutputValue']
        if output[u'OutputKey'] == 'elb':
            logger.info("ELB NAME: %s" % output[u'OutputValue'])
            elbname = output[u'OutputValue']
        if output[u'OutputKey'] == 'ecscluster':
            logger.info("ECS CLUSTER NAME: %s" % output[u'OutputValue'])
            ecsclustername = output[u'OutputValue']
    # Deregister the EC2 instance from the ECS Cluster to
    # prevent new tasks from being launched on it.
    deregister_from_ecs_cluster(ec2instanceid, ecsclustername)
    # Check on the health state of instances registered with
    # the target group in the ALB.
    if albarn:
        wait_for_alb(albarn)
    else:
        logger.info("No ALB found in stackdetails; skipping de-registration.")
    # Deregister the EC2 instance from the ELB to initiate
    # connection draining before allowing the ASG to terminate.
    if elbname:
        wait_for_elb(elbname)
    else:
        logger.info("No ELB found in stackdetails; skipping de-registration.")
    wait_for_tasks(ecsclustername)
    # Notify the ASG to complete the lifecycle action; don't wait for timeout.
    complete_asg_lifecycle(lifecyclehookname, asgname, lifecycleactiontoken, ec2instanceid)
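
For quick testing outside Lambda, a minimal event shaped like the SNS-wrapped lifecycle notification the handler expects might look like the sketch below; every ID, name and token here is a placeholder for illustration, not a value from this gist.

if __name__ == '__main__':
    # Hypothetical test harness: fabricate the SNS envelope the ASG
    # lifecycle hook would deliver and feed it to lambda_handler.
    sample_event = {
        u'Records': [{
            u'Sns': {
                u'Message': json.dumps({
                    'EC2InstanceId': 'i-0123456789abcdef0',
                    'NotificationMetadata': 'example-cf-stack',
                    'AutoScalingGroupName': 'example-asg',
                    'LifecycleActionToken': '00000000-0000-0000-0000-000000000000',
                    'LifecycleHookName': 'example-termination-hook'
                })
            }
        }]
    }
    lambda_handler(sample_event, None)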
danbf commented Dec 22, 2016

I'm not really sure this code, https://gist.github.com/kevinkarwaski/bc78df9eb5afa87b659df561f9746275#file-asg-graceful-lifecycle-termination-py-L86-L103, does anything. The AWS API reference, https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_DeregisterContainerInstance.html, says:

Any containers in orphaned service tasks that are registered with a Classic load balancer or an Application load balancer target group are deregistered, and they will begin connection draining according to the settings on the load balancer or target group.

So it seems the call to DeregisterContainerInstance already starts ELB de-registration on its own, independent of the code in this Lambda function that attempts to manage the ELB de-registration itself.
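
If the function should still block until that draining finishes rather than rely on the lifecycle hook timeout, a minimal sketch along these lines could poll the target group directly. It assumes an ALB target group that registers instances by instance ID; the helper name and parameters are hypothetical and not part of the gist.

import time
import boto3

def wait_for_draining(target_group_arn, instance_id, timeout=300, interval=5):
    # Hypothetical helper: poll describe_target_health until the
    # terminating instance is no longer in the 'draining' state.
    elbv2 = boto3.client('elbv2')
    for _ in range(0, timeout, interval):
        health = elbv2.describe_target_health(
            TargetGroupArn=target_group_arn,
            Targets=[{'Id': instance_id}]
        )
        states = [t[u'TargetHealth'][u'State'] for t in health[u'TargetHealthDescriptions']]
        # 'unused' or 'unavailable' (or no matching entries) means the
        # target has finished draining or is already deregistered.
        if 'draining' not in states:
            return True
        time.sleep(interval)
    return False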
