Last active
March 26, 2018 06:04
-
-
Save kevinkarwaski/bc78df9eb5afa87b659df561f9746275 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import boto3 | |
import json | |
import logging | |
import time | |
# Module-wide logger: the root logger, with its threshold set to INFO so the
# progress messages logged by the functions below are emitted.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
def notify_on_error(message):
    """Log *message* as an error and publish it to the Lambda-Errors SNS topic.

    Publishing is best-effort: any exception raised while creating the SNS
    client or publishing is logged and not re-raised.
    """
    logger.error(message)
    publish_args = {
        'TopicArn': 'arn:aws:sns:us-east-1:xxxxxxxxxxxx:Lambda-Errors',
        'Message': message,
        'Subject': 'ASG/ECS Lifecycle Termination Error',
    }
    try:
        reply = boto3.client('sns').publish(**publish_args)
        http_status = reply[u'ResponseMetadata'][u'HTTPStatusCode']
        logger.info("SNS Publish HTTP Response: %s" % http_status)
    except Exception as err:
        logger.error(err)
def deregister_from_ecs_cluster(instanceid, cluster_name):
    """Force-deregister the EC2 instance *instanceid* from an ECS cluster.

    Looks up the container instance whose ``ec2InstanceId`` equals
    *instanceid* in *cluster_name* and, if one is found, deregisters it with
    ``force=True``. If none is found this is logged and nothing else happens.

    Any exception is reported through ``notify_on_error`` and not re-raised.
    """
    try:
        ecs_client = boto3.client('ecs')
        containerarn = None

        # list_container_instances is a paginated API; walk every page so an
        # instance beyond the first page is still found.
        paginator = ecs_client.get_paginator('list_container_instances')
        for page in paginator.paginate(cluster=cluster_name):
            page_arns = page[u'containerInstanceArns']
            if not page_arns:
                continue

            # Describe the whole page in one call instead of one call per ARN.
            described = ecs_client.describe_container_instances(
                cluster=cluster_name,
                containerInstances=page_arns
            )
            for container_instance in described[u'containerInstances']:
                if container_instance.get(u'ec2InstanceId') == instanceid:
                    containerarn = container_instance[u'containerInstanceArn']
                    break
            if containerarn:
                # Found it; no need to fetch further pages.
                break

        if containerarn:
            logger.info("%s found registered with container ARN (%s) in ECS cluster. "
                        "Attempting to deregister from cluster." % (instanceid, containerarn))
            ecs_response = ecs_client.deregister_container_instance(
                cluster=cluster_name,
                containerInstance=containerarn,
                force=True
            )
            logger.info("ECS Deregistration HTTP Response: %s"
                        % ecs_response[u'ResponseMetadata'][u'HTTPStatusCode'])
        else:
            logger.info('%s is NOT registered with the Cluster.' % instanceid)
    except Exception as e:
        notify_on_error(str(e))
def wait_for_alb(alb):
    """Wait until a target group of the ALB reports at least two healthy targets.

    *alb* is the load balancer ARN passed to ``describe_target_groups``.
    Target health is polled up to 24 times, 5 seconds apart (about two
    minutes). The function returns as soon as two or more targets are
    ``healthy``, or after logging a timeout message.

    NOTE(review): both the success path and the timeout path return, so only
    the first target group returned for the ALB is ever waited on. Confirm
    whether multi-target-group ALBs need to be handled.

    Any exception is reported through ``notify_on_error`` and not re-raised.
    """
    try:
        alb_client = boto3.client('elbv2')
        target_groups = alb_client.describe_target_groups(
            LoadBalancerArn=alb
        )
        for target_group in target_groups[u'TargetGroups']:
            for _attempt in range(0, 24):
                healthy_count = 0
                alb_response = alb_client.describe_target_health(
                    TargetGroupArn=target_group[u'TargetGroupArn']
                )
                # Logged rather than printed: the `print` statement form is a
                # syntax error on Python 3.
                logger.info(json.dumps(alb_response))
                for target in alb_response[u'TargetHealthDescriptions']:
                    instanceid = target[u'Target'][u'Id']
                    state = target[u'TargetHealth'][u'State']
                    if state == "healthy":
                        healthy_count += 1
                        logger.info("%s is healthy, bringing total healthy to %s" % (instanceid, healthy_count))
                        logger.info("Description: %s" % target)
                    else:
                        logger.info("%s is not in a healthy state. It is in State: %s." % (instanceid, state))
                if healthy_count >= 2:
                    logger.info("Total healthy instances now at %s.. moving on" % healthy_count)
                    return
                # Sleep once per poll, whatever the targets' states were, so
                # the 24 attempts really span about two minutes.
                time.sleep(5)
            else:
                # Report the ALB itself: there may have been no targets at
                # all, so no per-target instance id is guaranteed to exist.
                logger.info("Whoops! %s took more than 2 minutes to get 2 healthy instances attached to the ALB!"
                            " Giving up on being being graceful!" % alb)
                return
    except Exception as e:
        notify_on_error(str(e))
def wait_for_elb(elb):
    """Wait until the classic ELB named *elb* has more than two instances.

    The load balancer is described up to 60 times with a 5 second pause
    after each unsuccessful check. The function returns once more than two
    instances are registered, or after logging a timeout message.

    Any exception is reported through ``notify_on_error`` and not re-raised.
    """
    try:
        client = boto3.client('elb')
        attempts_left = 60
        while attempts_left > 0:
            attempts_left -= 1
            description = client.describe_load_balancers(
                LoadBalancerNames=[elb]
            )
            registered = description[u'LoadBalancerDescriptions'][0][u'Instances']
            if len(registered) > 2:
                logger.info("Sweet! We have more than two instances in the ELB, "
                            "moving forward with termination request.")
                logger.info(registered)
                return
            logger.info("There are two or fewer instances registered with the ELB, "
                        "waiting for another instance to register before moving on. (5 min timeout)")
            logger.info(registered)
            time.sleep(5)
        # Every attempt was used without seeing a third instance.
        logger.info("Whoops! It took more than 5 minutes waiting for more instances to register in the ELB! "
                    "Giving up on being graceful!")
    except Exception as err:
        notify_on_error(str(err))
def wait_for_tasks(ecscluster, wait=60):
    """Wait until the cluster has tasks and none of them is ``PENDING``.

    *ecscluster* is the cluster passed to ``list_tasks``/``describe_tasks``.
    *wait* is the maximum number of polls; polls are 5 seconds apart, so the
    default of 60 gives about five minutes. A *wait* of zero or less gives
    up immediately.

    Returns ``None`` both on success and on timeout. Exceptions raised by
    the ECS client are not caught here and propagate to the caller.
    """
    ecs_client = boto3.client('ecs')
    remaining = wait
    while True:
        logger.info("Looking up status of tasks on %s" % ecscluster)
        # `<=` rather than an identity test against 0: also stops on a
        # negative budget instead of polling forever.
        if remaining <= 0:
            logger.info("Whoops! It took more than 5 minutes waiting for all the tasks to be up in the cluster! "
                        "Giving up on being graceful!")
            return

        task_arns = ecs_client.list_tasks(cluster=ecscluster)[u'taskArns']
        if not task_arns:
            logger.info("No Tasks found.. Waiting for service to spawn tasks..")
        else:
            task_descs = ecs_client.describe_tasks(cluster=ecscluster, tasks=task_arns)
            status = [task[u'lastStatus'] for task in task_descs[u'tasks']]
            if 'PENDING' not in status:
                logger.info("Great! Status of Tasks is %s .. moving on" % status)
                return
            logger.info("Status of Tasks %s .. Waiting for RUNNING status.." % status)

        time.sleep(5)
        remaining -= 1
def complete_asg_lifecycle(hookname, asg, actiontoken, instanceid):
    """Complete the Auto Scaling lifecycle action with result ``CONTINUE``.

    The hook name, group name, action token and instance id are passed
    straight through to ``complete_lifecycle_action``. Any exception is
    reported through ``notify_on_error`` and not re-raised.
    """
    try:
        request = dict(
            LifecycleHookName=hookname,
            AutoScalingGroupName=asg,
            LifecycleActionToken=actiontoken,
            LifecycleActionResult='CONTINUE',
            InstanceId=instanceid,
        )
        reply = boto3.client('autoscaling').complete_lifecycle_action(**request)
        status_code = reply[u'ResponseMetadata'][u'HTTPStatusCode']
        logger.info("ASG Complete Lifecycle Action Response: %s" % status_code)
    except Exception as err:
        notify_on_error(str(err))
def lambda_handler(event, context):
    """Handle an SNS-delivered Auto Scaling lifecycle notification.

    The SNS message is parsed for the instance id, Auto Scaling group,
    lifecycle hook/token and the CloudFormation stack name (carried in
    ``NotificationMetadata``). The stack's outputs ``alb``, ``elb`` and
    ``ecscluster`` decide which steps run:

    1. deregister the instance from the ECS cluster,
    2. wait for the ALB and/or ELB to report enough instances,
    3. wait for the cluster's tasks to leave ``PENDING``,
    4. complete the lifecycle action.

    Steps whose stack output is missing are skipped; the lifecycle action is
    completed regardless.
    """
    logger.info(json.dumps(event))
    message = json.loads(event[u'Records'][0][u'Sns'][u'Message'])
    logger.info(message)

    # Parse SNS message for required data.
    ec2instanceid = message['EC2InstanceId']
    stackname = message['NotificationMetadata']
    asgname = message['AutoScalingGroupName']
    lifecycleactiontoken = message['LifecycleActionToken']
    lifecyclehookname = message['LifecycleHookName']
    logger.info("EC2 Instance ID: %s" % ec2instanceid)
    logger.info("CF Stack Name: %s" % stackname)

    # Get stack details via describe_stacks.
    cf_client = boto3.client('cloudformation')
    stackdetails = cf_client.describe_stacks(StackName=stackname)

    # Every optional stack output starts as None so a missing output skips
    # its step instead of raising on an unbound name.
    albarn = None
    elbname = None
    ecsclustername = None

    # Parse the ALB ARN, ELB name and ECS cluster name from the outputs.
    for output in stackdetails[u'Stacks'][0].get(u'Outputs', []):
        key = output[u'OutputKey']
        value = output[u'OutputValue']
        if key == 'alb':
            logger.info("ALB ARN: %s" % value)
            albarn = value
        elif key == 'elb':
            logger.info("ELB NAME: %s" % value)
            elbname = value
        elif key == 'ecscluster':
            logger.info("ECS CLUSTER NAME: %s" % value)
            ecsclustername = value

    # Deregister the EC2 instance from the ECS cluster to
    # prevent new tasks from being launched on it.
    if ecsclustername:
        deregister_from_ecs_cluster(ec2instanceid, ecsclustername)
    else:
        logger.info("No ECS cluster found in stackdetails; skipping ECS de-registration.")

    # Wait for the ALB target group to report enough healthy targets.
    if albarn:
        wait_for_alb(albarn)
    else:
        logger.info("No ALB found in stackdetails; skipping de-registration.")

    # Wait for the classic ELB to have enough registered instances.
    if elbname:
        wait_for_elb(elbname)
    else:
        logger.info("No ELB found in stackdetails; skipping de-registration.")

    # Wait for the cluster's tasks to be up before letting the instance go.
    if ecsclustername:
        wait_for_tasks(ecsclustername)

    # Notify ASG to complete lifecycle; don't wait for timeout.
    complete_asg_lifecycle(lifecyclehookname, asgname, lifecycleactiontoken, ec2instanceid)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I'm not really sure that this code (https://gist.github.com/kevinkarwaski/bc78df9eb5afa87b659df561f9746275#file-asg-graceful-lifecycle-termination-py-L86-L103) does anything. The AWS API reference, https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_DeregisterContainerInstance.html, says:
So it seems that the call to DeregisterContainerInstance starts the ELB de-registration on its own, independent of the code in this Lambda function that attempts to manage the ELB de-registration.