Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@kevinmehall
Last active February 20, 2022 23:24
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save kevinmehall/f3e53a0be9d59f0759d8fbba872bc8cf to your computer and use it in GitHub Desktop.
Save kevinmehall/f3e53a0be9d59f0759d8fbba872bc8cf to your computer and use it in GitHub Desktop.
Script to drain and replace EC2 instances in an ECS cluster auto-scaling group after changing the AMI or instance type
#!/usr/bin/env python3
#
# Script to replace EC2 instances in an ECS cluster's auto-scaling group after
# changing the AMI or instance type in the launch template. It
# checks for instances with the incorrect AMI or type, scales up the
# auto-scaling group with replacement instances, then drains the tasks
# from the old instances.
#
# Usage: aws-vault exec profile-name -- python3 replace_ecs_cluster_instances.py --group=asg-name --cluster=ecs-cluster-name --count=3
#
# The count is specified so that it knows what the "real" desired count is in
# case it is interrupted and restarted after increasing the desired count.
#
# License: ISC
# Copyright 2019 3D Robotics
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
# RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE
# USE OR PERFORMANCE OF THIS SOFTWARE.
#
import boto3
import argparse
import time
# One boto3 client per AWS service the script talks to. No region or
# credentials are passed here, so boto3's default session supplies them.
autoscaling, ec2, ecs = (
    boto3.client(service_name)
    for service_name in ('autoscaling', 'ec2', 'ecs')
)
# Command-line interface. All three options are required.
parser = argparse.ArgumentParser(description='Update the instances in the autoscaling group.')
parser.add_argument('--group', metavar='NAME', required=True, help='Autoscaling group name')
parser.add_argument('--cluster', metavar='NAME', required=True, help='ECS cluster name')
# The placeholder is 'N' (an integer), not 'ARN': --count is parsed with type=int.
parser.add_argument('--count', metavar='N', type=int, required=True, help='Desired count of instances in autoscaling group')
args = parser.parse_args()

# A negative instance count can never be a meaningful target; reject it
# before any AWS call is made.
if args.count < 0:
    parser.error('--count must not be negative')

group_name = args.group
cluster_name = args.cluster
# The "real" steady-state size of the group, i.e. the size it should return
# to once the old instances have been terminated.
desired_count = args.count
# Look up the auto-scaling group. An unknown name yields an empty result
# list rather than an API error, so check before indexing.
matching_groups = autoscaling.describe_auto_scaling_groups(
    AutoScalingGroupNames=[group_name],
)['AutoScalingGroups']
if not matching_groups:
    raise SystemExit(f"Auto-scaling group {group_name!r} not found")
describe_group = matching_groups[0]
# Explicit check instead of `assert`, which is stripped under `python -O`.
if describe_group['AutoScalingGroupName'] != group_name:
    raise SystemExit(
        f"Expected auto-scaling group {group_name!r} "
        f"but got {describe_group['AutoScalingGroupName']!r}"
    )

# Only groups configured with a top-level launch template are supported;
# the key is absent for other configurations.
target_launch_template = describe_group.get('LaunchTemplate')
if target_launch_template is None:
    raise SystemExit(
        f"Auto-scaling group {group_name!r} has no top-level launch template "
        "(it may use a launch configuration or a mixed instances policy); "
        "this script only supports groups with a launch template"
    )

asg_instances = [instance['InstanceId'] for instance in describe_group['Instances']]
# Capacity of the group before this run changes anything.
prev_desired_count = describe_group['DesiredCapacity']

print(f"Target launch template {target_launch_template['LaunchTemplateId']} {target_launch_template['Version']}")

# Resolve the template version the group points at to get the AMI and
# instance type that up-to-date instances are expected to have.
describe_launch_template = ec2.describe_launch_template_versions(
    LaunchTemplateId=target_launch_template['LaunchTemplateId'],
    Versions=[target_launch_template['Version']],
)['LaunchTemplateVersions'][0]

target_ami = describe_launch_template['LaunchTemplateData']['ImageId']
target_instance_type = describe_launch_template['LaunchTemplateData']['InstanceType']

print(f"Target AMI {target_ami} on {target_instance_type}")
# Compare every instance currently in the group against the target AMI and
# instance type, collecting the IDs of those that are out of date.
#
# An empty InstanceIds list does not mean "no instances": the request would
# be unfiltered and EC2 would describe every instance visible to the caller.
# Skip the call entirely when the group has no instances.
if asg_instances:
    describe_instances_reservations = ec2.describe_instances(InstanceIds=asg_instances)['Reservations']
else:
    describe_instances_reservations = []

instances_to_replace = []

for reservation in describe_instances_reservations:
    for instance in reservation['Instances']:
        instance_id = instance['InstanceId']
        instance_ami = instance['ImageId']
        instance_type = instance['InstanceType']
        instance_launched = instance['LaunchTime']

        # An instance is stale if either its AMI or its type differs.
        needs_replace = instance_ami != target_ami or instance_type != target_instance_type
        if needs_replace:
            instances_to_replace.append(instance_id)

        print(f"Instance {instance_id}, created {instance_launched.ctime()}, type {instance_type}, AMI {instance_ami} -- {'REPLACE' if needs_replace else 'OK'}")
# Surge capacity: room for every stale instance's replacement on top of the
# steady-state count. max() keeps an already-raised capacity (e.g. from an
# interrupted earlier run) from being lowered.
new_desired_count = max(prev_desired_count, desired_count + len(instances_to_replace))

# The group cannot be scaled beyond its MaxSize. Stop with an actionable
# message before changing anything rather than failing inside the API call.
max_size = describe_group['MaxSize']
if new_desired_count > max_size:
    raise SystemExit(
        f"Replacing {len(instances_to_replace)} instance(s) needs a temporary "
        f"capacity of {new_desired_count}, but auto-scaling group "
        f"{group_name!r} has MaxSize {max_size}; raise the group's MaxSize "
        "and re-run"
    )

print(f"Temporarily scaling cluster from {prev_desired_count} to {new_desired_count} instances")
autoscaling.set_desired_capacity(AutoScalingGroupName=group_name, DesiredCapacity=new_desired_count)
# Poll the ECS cluster every 10 seconds until enough replacement instances
# are ACTIVE and every stale instance has been drained of running tasks.
#
# Each pass:
#   1. lists and describes all container instances in the cluster,
#   2. counts ACTIVE instances that are not marked for replacement,
#   3. once at least `desired_count` of those exist, sets the stale ACTIVE
#      instances to DRAINING,
#   4. exits when the stale instances report zero running tasks.

# Membership is tested for every container instance on every pass.
stale_instance_ids = set(instances_to_replace)
list_paginator = ecs.get_paginator('list_container_instances')

while True:
    print('\n----\n')

    # Page through the listing so clusters larger than one page are fully
    # covered. Each page holds at most 100 ARNs, which is also the most
    # describe_container_instances accepts per call; an empty page is
    # skipped because that call rejects an empty list.
    container_instances = []
    for page in list_paginator.paginate(cluster=cluster_name, PaginationConfig={'PageSize': 100}):
        page_arns = page['containerInstanceArns']
        if page_arns:
            container_instances.extend(
                ecs.describe_container_instances(
                    cluster=cluster_name,
                    containerInstances=page_arns,
                )['containerInstances']
            )
    container_instances.sort(key=lambda ci: ci['registeredAt'])

    available_instances = 0  # ACTIVE instances not marked for replacement
    remaining_tasks = 0      # tasks still running on stale instances
    to_drain = []            # ARNs of stale instances that are still ACTIVE

    for ci in container_instances:
        ci_ec2_id = ci['ec2InstanceId']
        ci_arn = ci['containerInstanceArn']
        running_tasks = ci['runningTasksCount']
        status = ci['status']

        print(f"{ci_ec2_id} {status}, {running_tasks} tasks")

        if ci_ec2_id in stale_instance_ids:
            remaining_tasks += running_tasks
            if status == 'ACTIVE':
                to_drain.append(ci_arn)
        elif status == 'ACTIVE':
            available_instances += 1

    if available_instances < desired_count:
        print("Waiting for new instances to boot")
    elif to_drain:
        print("Draining instances:", to_drain)
        # The state-update call takes at most 10 container instances.
        for start in range(0, len(to_drain), 10):
            ecs.update_container_instances_state(
                cluster=cluster_name,
                containerInstances=to_drain[start:start + 10],
                status='DRAINING',
            )
    elif remaining_tasks == 0:
        break
    else:
        print("Waiting for instances to drain")

    time.sleep(10)
# Final pass: ask for confirmation before terminating each stale instance.
# Only the exact answer "y" terminates; anything else leaves the instance
# running. Termination also decrements the group's desired capacity.
for old_instance_id in instances_to_replace:
    answer = input(f"Terminate instance {old_instance_id}? (y/n) ")
    if answer != "y":
        print("Not terminating this instance")
        continue
    autoscaling.terminate_instance_in_auto_scaling_group(
        InstanceId=old_instance_id,
        ShouldDecrementDesiredCapacity=True,
    )
    print("Terminated instance")

print("Done")
@briancurt
Copy link

@kevinmehall Thanks for sharing the script. I had a case where the temporary new_desired_count exceeded the maximum capacity for the autoscaling group. I fixed that by adding

max_capacity = describe_group['MaxSize']
if new_desired_count > max_capacity:
  print(f"Temporarily setting autoscaling group max size to {new_desired_count} instances")
  autoscaling.update_auto_scaling_group(AutoScalingGroupName=group_name, MaxSize=new_desired_count)

after line 82. Then autoscaling.update_auto_scaling_group(AutoScalingGroupName=group_name, MaxSize=max_capacity) to bring the autoscaling max size back to normal at the end.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment