@kevinkarwaski
Last active March 26, 2018 06:04
import boto3
import json
import logging
import time
logger = logging.getLogger()
logger.setLevel(logging.INFO)
def notify_on_error(message):
    logger.error(message)
    try:
        sns_client = boto3.client('sns')
        sns_response = sns_client.publish(
            TopicArn='arn:aws:sns:us-east-1:xxxxxxxxxxxx:Lambda-Errors',
            Message=message,
            Subject='ASG/ECS Lifecycle Termination Error'
        )
        logger.info("SNS Publish HTTP Response: %s" % sns_response[u'ResponseMetadata'][u'HTTPStatusCode'])
    except Exception as e:
        logger.error(e)
def deregister_from_ecs_cluster(instanceid, cluster_name):
    try:
        ecs_client = boto3.client('ecs')
        containerarn = None
        ecsclusterinstances = ecs_client.list_container_instances(
            cluster=cluster_name
        )
        for containerarns in ecsclusterinstances[u'containerInstanceArns']:
            response = ecs_client.describe_container_instances(
                cluster=cluster_name,
                containerInstances=[
                    containerarns
                ]
            )
            if response[u'containerInstances'][0][u'ec2InstanceId'] == instanceid:
                containerarn = containerarns
        if containerarn:
            logger.info("%s found registered with container ARN (%s) in ECS cluster. "
                        "Attempting to deregister from cluster." % (instanceid, containerarn))
            ecs_response = ecs_client.deregister_container_instance(
                cluster=cluster_name,
                containerInstance=containerarn,
                force=True
            )
            logger.info("ECS Deregistration HTTP Response: %s" % ecs_response[u'ResponseMetadata'][u'HTTPStatusCode'])
        else:
            logger.info('%s is NOT registered with the Cluster.' % instanceid)
            return
    except Exception as e:
        notify_on_error(str(e))
def wait_for_alb(alb):
    try:
        alb_client = boto3.client('elbv2')
        target_groups = alb_client.describe_target_groups(
            LoadBalancerArn=alb
        )
        for target_group in target_groups[u'TargetGroups']:
            for timer in range(0, 24):
                healthy_count = 0
                alb_response = alb_client.describe_target_health(
                    TargetGroupArn=target_group[u'TargetGroupArn']
                )
                logger.info(json.dumps(alb_response))
                for target in alb_response[u'TargetHealthDescriptions']:
                    instanceid = target[u'Target'][u'Id']
                    if target[u'TargetHealth'][u'State'] == "healthy":
                        healthy_count += 1
                        logger.info("%s is healthy, bringing total healthy to %s" % (instanceid, healthy_count))
                        logger.info("Description: %s" % target)
                        time.sleep(5)
                    else:
                        logger.info("%s is not in a healthy state. It is in State: %s." % (instanceid, target[u'TargetHealth'][u'State']))
                if healthy_count >= 2:
                    logger.info("Total healthy instances now at %s.. moving on" % healthy_count)
                    return
            else:
                logger.info("Whoops! %s took more than 2 minutes to get 2 healthy instances attached to the ALB! "
                            "Giving up on being graceful!" % instanceid)
                return
    except Exception as e:
        notify_on_error(str(e))
def wait_for_elb(elb):
    try:
        elb_client = boto3.client('elb')
        for timer in range(0, 60):
            elb_response = elb_client.describe_load_balancers(
                LoadBalancerNames=[elb]
            )
            if len(elb_response[u'LoadBalancerDescriptions'][0][u'Instances']) <= 2:
                logger.info("There are two or fewer instances registered with the ELB, "
                            "waiting for another instance to register before moving on. (5 min timeout)")
                logger.info(elb_response[u'LoadBalancerDescriptions'][0][u'Instances'])
                time.sleep(5)
            else:
                logger.info("Sweet! We have more than two instances in the ELB, "
                            "moving forward with termination request.")
                logger.info(elb_response[u'LoadBalancerDescriptions'][0][u'Instances'])
                break
        else:
            logger.info("Whoops! It took more than 5 minutes waiting for more instances to register in the ELB! "
                        "Giving up on being graceful!")
            return
    except Exception as e:
        notify_on_error(str(e))
def wait_for_tasks(ecscluster, wait=60):
    logger.info("Looking up status of tasks on %s" % ecscluster)
    ecs_client = boto3.client('ecs')
    if wait == 0:
        logger.info("Whoops! It took more than 5 minutes waiting for all the tasks to be up in the cluster! "
                    "Giving up on being graceful!")
        return
    ecs_tasks = ecs_client.list_tasks(cluster=ecscluster)
    if not ecs_tasks[u'taskArns']:
        logger.info("No Tasks found.. Waiting for service to spawn tasks..")
        time.sleep(5)
        wait_for_tasks(ecscluster, wait - 1)
    else:
        task_descs = ecs_client.describe_tasks(cluster=ecscluster, tasks=ecs_tasks[u'taskArns'])
        status = []
        for task in task_descs[u'tasks']:
            status.append(task[u'lastStatus'])
        if 'PENDING' in status:
            logger.info("Status of Tasks %s .. Waiting for RUNNING status.." % status)
            time.sleep(5)
            wait_for_tasks(ecscluster, wait - 1)
        else:
            logger.info("Great! Status of Tasks is %s .. moving on" % status)
            return
def complete_asg_lifecycle(hookname, asg, actiontoken, instanceid):
    try:
        asg_client = boto3.client('autoscaling')
        asg_response = asg_client.complete_lifecycle_action(
            LifecycleHookName=hookname,
            AutoScalingGroupName=asg,
            LifecycleActionToken=actiontoken,
            LifecycleActionResult='CONTINUE',
            InstanceId=instanceid
        )
        logger.info("ASG Complete Lifecycle Action Response: %s" % asg_response[u'ResponseMetadata'][u'HTTPStatusCode'])
    except Exception as e:
        notify_on_error(str(e))
def lambda_handler(event, context):
    logger.info(json.dumps(event))
    message = json.loads(event[u'Records'][0][u'Sns'][u'Message'])
    logger.info(message)
    # Parse SNS message for required data.
    ec2instanceid = message['EC2InstanceId']
    stackname = message['NotificationMetadata']
    asgname = message['AutoScalingGroupName']
    lifecycleactiontoken = message['LifecycleActionToken']
    lifecyclehookname = message['LifecycleHookName']
    logger.info("EC2 Instance ID: %s" % ec2instanceid)
    logger.info("CF Stack Name: %s" % stackname)
    # Create CloudFormation connection object.
    cf_client = boto3.client('cloudformation')
    # Get stack details via describe_stacks.
    stackdetails = cf_client.describe_stacks(StackName=stackname)
    # Initialize ALB var to None.
    albarn = None
    # Initialize ELB var to None.
    elbname = None
    # Parse the ALB ARN, ELB Name and ECS Cluster Name from the stack outputs.
    for output in stackdetails[u'Stacks'][0][u'Outputs']:
        if output[u'OutputKey'] == 'alb':
            logger.info("ALB ARN: %s" % output[u'OutputValue'])
            albarn = output[u'OutputValue']
        if output[u'OutputKey'] == 'elb':
            logger.info("ELB NAME: %s" % output[u'OutputValue'])
            elbname = output[u'OutputValue']
        if output[u'OutputKey'] == 'ecscluster':
            logger.info("ECS CLUSTER NAME: %s" % output[u'OutputValue'])
            ecsclustername = output[u'OutputValue']
    # Deregister the EC2 instance from the ECS Cluster to
    # prevent new tasks from being launched on it.
    deregister_from_ecs_cluster(ec2instanceid, ecsclustername)
    # Check on the health state of instances registered with
    # the target group in the ALB.
    if albarn:
        wait_for_alb(albarn)
    else:
        logger.info("No ALB found in stackdetails; skipping de-registration.")
    # Deregister the EC2 instance from the ELB to initiate
    # connection draining before allowing the ASG to terminate.
    if elbname:
        wait_for_elb(elbname)
    else:
        logger.info("No ELB found in stackdetails; skipping de-registration.")
    wait_for_tasks(ecsclustername)
    # Notify the ASG to complete the lifecycle action; don't wait for timeout.
    complete_asg_lifecycle(lifecyclehookname, asgname, lifecycleactiontoken, ec2instanceid)
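
For quick testing outside Lambda, a minimal event shaped like the SNS-wrapped lifecycle notification the handler expects might look like the sketch below; every ID, name and token here is a placeholder for illustration, not a value from this gist.

if __name__ == '__main__':
    # Hypothetical test harness: fabricate the SNS envelope the ASG
    # lifecycle hook would deliver and feed it to lambda_handler.
    sample_event = {
        u'Records': [{
            u'Sns': {
                u'Message': json.dumps({
                    'EC2InstanceId': 'i-0123456789abcdef0',
                    'NotificationMetadata': 'example-cf-stack',
                    'AutoScalingGroupName': 'example-asg',
                    'LifecycleActionToken': '00000000-0000-0000-0000-000000000000',
                    'LifecycleHookName': 'example-termination-hook'
                })
            }
        }]
    }
    lambda_handler(sample_event, None)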
danbf commented Dec 22, 2016

I'm not really sure this code, https://gist.github.com/kevinkarwaski/bc78df9eb5afa87b659df561f9746275#file-asg-graceful-lifecycle-termination-py-L86-L103, does anything. The AWS API reference, https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_DeregisterContainerInstance.html, says:

Any containers in orphaned service tasks that are registered with a Classic load balancer or an Application load balancer target group are deregistered, and they will begin connection draining according to the settings on the load balancer or target group.

So it seems the call to DeregisterContainerInstance already starts ELB de-registration on its own, independent of the code in this Lambda function that attempts to manage the ELB de-registration itself.
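
If the function should still block until that draining finishes rather than rely on the lifecycle hook timeout, a minimal sketch along these lines could poll the target group directly. It assumes an ALB target group that registers instances by instance ID; the helper name and parameters are hypothetical and not part of the gist.

import time
import boto3

def wait_for_draining(target_group_arn, instance_id, timeout=300, interval=5):
    # Hypothetical helper: poll describe_target_health until the
    # terminating instance is no longer in the 'draining' state.
    elbv2 = boto3.client('elbv2')
    for _ in range(0, timeout, interval):
        health = elbv2.describe_target_health(
            TargetGroupArn=target_group_arn,
            Targets=[{'Id': instance_id}]
        )
        states = [t[u'TargetHealth'][u'State'] for t in health[u'TargetHealthDescriptions']]
        # 'unused' or 'unavailable' (or no matching entries) means the
        # target has finished draining or is already deregistered.
        if 'draining' not in states:
            return True
        time.sleep(interval)
    return False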
