tommyblue/replace_instances.py Secret

## 159 changes: 159 additions & 0 deletions replace_instances.py
@@ -0,0 +1,159 @@

    #!/usr/bin/env python3
#!/usr/bin/env python3

    """
"""

    Replaces all instances of a cluster in a region.
Replaces all instances of a cluster in a region.

    This only works if all instances are managed by scaling group
This only works if all instances are managed by scaling group

    """
"""

    import argparse
import argparse

    import functools
import functools

    import logging
import logging

    import os
import os

    import time
import time


    import boto3
import boto3


    ### START CONFIGURATIONS
### START CONFIGURATIONS
# Fill in all three values before running the script; they are read as
# module-level globals by every helper below.
CLUSTER = ""   # Name of the ECS cluster whose instances are replaced
REGION = ""    # AWS region where the cluster is running
ASG_NAME = ""  # Name of the autoscaling group that manages the instances
### END CONFIGURATIONS


    def main():
def main():
    """Replace all instances of CLUSTER by scaling the ASG out and back in.

    Steps, in order:
      1. read the desired capacity of ASG_NAME and check it matches the
         number of container instances currently registered in the cluster;
      2. set the registered container instances to DRAINING;
      3. double the desired capacity and wait for the new instances;
      4. wait until the draining instances run no tasks;
      5. restore the original desired capacity and wait for the instance
         count to come back down.

    Raises:
        SystemExit: with exit code 1 when the configuration is empty, when
            an autoscaling activity appears to be ongoing, or when any of
            the waiting steps times out.
    """
    # The root logger defaults to WARNING; without this every progress
    # message below would be silently dropped.
    logging.basicConfig(level=logging.INFO)

    if not (CLUSTER and REGION and ASG_NAME):
        logging.critical("CLUSTER, REGION and ASG_NAME must be configured")
        raise SystemExit(1)

    ecs_client = boto3.client("ecs", REGION)
    asg_client = boto3.client("autoscaling", REGION)

    logging.info("Finding the number of desired instances in the autoscaling group")
    desired_instances = _get_desired_capacity(asg_client)

    # Can't replace if desired instances is different from running
    if desired_instances != _get_running_instances():
        logging.critical("Can't replace instances if autoscaling activity is ongoing")
        raise SystemExit(1)

    logging.info("Set the running instances status as 'DRAINING'")
    _set_running_instances_as_draining(ecs_client)

    logging.info("Modifying the autoscaling group doubling the desired instances")
    _set_desired_capacity(asg_client, desired_instances * 2)

    logging.info("Waiting for the new instances to be launched")
    if not _wait_instances(desired_instances * 2):
        raise SystemExit(1)

    logging.info("Waiting all tasks in the draining instances to be stopped")
    # Scaling in while tasks are still running on the draining instances
    # would kill them, so a timeout here must abort the procedure.
    if not _wait_draining_instances_are_empty(ecs_client):
        raise SystemExit(1)

    logging.info("Bringing back the desired count in the asg to its initial value")
    # NOTE(review): which instances get terminated is decided by the ASG,
    # not by this script -- confirm the group's termination policy removes
    # the drained (older) instances.
    _set_desired_capacity(asg_client, desired_instances)

    logging.info("Waiting for the drained instances to be shutdown")
    if not _wait_instances(desired_instances):
        raise SystemExit(1)


    def _get_desired_capacity(asg_client) -> int:
def _get_desired_capacity(asg_client) -> int:
    """Return the DesiredCapacity of the autoscaling group named ASG_NAME.

    Args:
        asg_client: boto3 "autoscaling" client.

    Returns:
        The group's current desired capacity.

    Raises:
        SystemExit: with exit code 1 when the lookup does not return exactly
            one autoscaling group (none found, or more than one).
    """
    resp = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[ASG_NAME])
    groups = resp['AutoScalingGroups']
    if len(groups) != 1:
        # Covers both "no such group" and "several groups"; report the count
        # so the message is not misleading when nothing was found.
        logging.critical(
            "Expected exactly one ASG named %r, found %d: %s",
            ASG_NAME, len(groups), resp)
        raise SystemExit(1)
    return groups[0]["DesiredCapacity"]


    def _set_running_instances_as_draining(ecs_client):
def _set_running_instances_as_draining(ecs_client):
    """Set every container instance registered in CLUSTER to DRAINING.

    Args:
        ecs_client: boto3 "ecs" client.

    Does nothing when the cluster has no container instances.
    """
    # list_container_instances is paginated; collect every page so large
    # clusters are fully covered.
    instance_arns = []
    paginator = ecs_client.get_paginator("list_container_instances")
    for page in paginator.paginate(cluster=CLUSTER):
        instance_arns.extend(page['containerInstanceArns'])

    # UpdateContainerInstancesState accepts at most 10 container instances
    # per request, so send the ARNs in batches. With no ARNs the loop does
    # not run, which avoids sending an (invalid) empty list.
    batch_size = 10
    for start in range(0, len(instance_arns), batch_size):
        ecs_client.update_container_instances_state(
            cluster=CLUSTER,
            containerInstances=instance_arns[start:start + batch_size],
            status='DRAINING'
        )


    def _set_desired_capacity(asg_client, desired_instances):
def _set_desired_capacity(asg_client, desired_instances):
    """Set the desired capacity of the autoscaling group named ASG_NAME.

    Args:
        asg_client: boto3 "autoscaling" client.
        desired_instances: new DesiredCapacity value for the group.
    """
    asg_client.set_desired_capacity(
        AutoScalingGroupName=ASG_NAME,
        DesiredCapacity=desired_instances,
        # Apply the change immediately instead of waiting for any cooldown
        # period to complete.
        HonorCooldown=False
    )


    @with_sleep(sleep_time=30, max_attempts=20)
def _wait_instances(desired_instances):
    """Poll until the cluster has exactly `desired_instances` instances.

    Args:
        desired_instances: expected number of container instances.

    Returns:
        True once the count matches, or False when the retry budget of
        `with_sleep` (30 seconds between attempts, 20 attempts) runs out.
    """
    def _instances_match():
        # Returning None tells with_sleep to keep retrying.
        if _get_running_instances() == desired_instances:
            logging.info("Done!")
            return True
        return None

    # with_sleep is defined further down in this module, so it cannot be
    # used as a decorator here (decorators run at import time, before that
    # definition exists). Applying it at call time avoids the NameError.
    poll = with_sleep(sleep_time=30, max_attempts=20)(_instances_match)
    return poll()


    def _get_running_instances() -> int:
def _get_running_instances() -> int:
    """Return how many container instances are registered in CLUSTER.

    The count is the length of the 'containerInstances' list returned by
    `_describe_container_instances`; it is not filtered by instance status.
    """
    described = _describe_container_instances()
    return len(described['containerInstances'])


    @with_sleep(sleep_time=30, max_attempts=20)
def _wait_draining_instances_are_empty(ecs_client):
    """Poll until no task is left on the DRAINING container instances.

    Args:
        ecs_client: boto3 "ecs" client.

    Returns:
        True once the draining instances run zero tasks in total, or False
        when the retry budget of `with_sleep` (30 seconds between attempts,
        20 attempts) runs out.
    """
    def _draining_instances_are_empty():
        tasks_per_instance = _get_tasks_per_instance(ecs_client, status=["DRAINING"])
        if sum(tasks_per_instance.values()) == 0:
            return True
        # Returning None tells with_sleep to keep retrying.
        return None

    # with_sleep is defined further down in this module, so it cannot be
    # used as a decorator here (decorators run at import time, before that
    # definition exists). Applying it at call time avoids the NameError.
    poll = with_sleep(sleep_time=30, max_attempts=20)(_draining_instances_are_empty)
    return poll()


    def _get_tasks_per_instance(ecs_client, status=None):
def _get_tasks_per_instance(ecs_client, status=None):
    """Count the tasks listed in CLUSTER, grouped by EC2 instance id.

    Args:
        ecs_client: boto3 "ecs" client.
        status: container-instance statuses to keep, forwarded unchanged to
            `_tasks_per_instance`.

    Returns:
        dict mapping EC2 instance id to the number of tasks on it, as built
        by `_tasks_per_instance`.
    """
    # list_tasks is paginated; a single call would miss tasks beyond the
    # first page and make draining instances look empty too early.
    task_arns = []
    paginator = ecs_client.get_paginator("list_tasks")
    for page in paginator.paginate(cluster=CLUSTER):
        task_arns.extend(page['taskArns'])

    # DescribeTasks accepts at most 100 tasks per request, so hand the ARNs
    # over in chunks and add up the per-instance counts. The max(..., 1)
    # keeps one call even with no tasks, so instances running zero tasks
    # are still reported.
    chunk_size = 100
    totals = {}
    for start in range(0, max(len(task_arns), 1), chunk_size):
        partial = _tasks_per_instance(
            ecs_client, task_arns[start:start + chunk_size], status=status)
        for instance_id, count in partial.items():
            totals[instance_id] = totals.get(instance_id, 0) + count
    return totals


    def _describe_container_instances():
def _describe_container_instances():
    """Describe every container instance registered in CLUSTER.

    Creates its own boto3 "ecs" client for REGION.

    Returns:
        dict with a 'containerInstances' list (one description per
        instance) and a 'failures' list, merged across all API calls.
        Both lists are empty when the cluster has no container instances.
    """
    ecs_client = boto3.client("ecs", REGION)

    # list_container_instances is paginated; collect every page.
    instance_arns = []
    paginator = ecs_client.get_paginator("list_container_instances")
    for page in paginator.paginate(cluster=CLUSTER):
        instance_arns.extend(page['containerInstanceArns'])

    # The describe call must go to the ECS client (the function previously
    # called itself with keyword arguments it does not accept).
    # DescribeContainerInstances accepts at most 100 instances per request,
    # and an empty list is skipped entirely because the loop does not run.
    chunk_size = 100
    cluster_instances = {'containerInstances': [], 'failures': []}
    for start in range(0, len(instance_arns), chunk_size):
        response = ecs_client.describe_container_instances(
            cluster=CLUSTER,
            containerInstances=instance_arns[start:start + chunk_size])
        cluster_instances['containerInstances'].extend(response['containerInstances'])
        cluster_instances['failures'].extend(response.get('failures', []))

    return cluster_instances


    def _tasks_per_instance(ecs_client, tasks_list: list, status=None) -> dict:
def _tasks_per_instance(ecs_client, tasks_list: list, status=None) -> dict:
    """Count how many of the given tasks run on each container instance.

    Args:
        ecs_client: boto3 "ecs" client.
        tasks_list: task ARNs to count; may be empty.
        status: container-instance statuses to keep in the result.
            Defaults to ["ACTIVE"].

    Returns:
        dict mapping EC2 instance id to the number of tasks from
        `tasks_list` placed on that instance, restricted to instances whose
        status is in `status`. Instances without tasks map to 0.
    """
    if status is None:
        status = ["ACTIVE"]

    # Start from every container instance ARN with a count of 0, so that
    # instances running no tasks still appear in the result.
    # NOTE(review): only the first page of list_container_instances is
    # read here -- confirm that is enough for the clusters this targets.
    listed = ecs_client.list_container_instances(cluster=CLUSTER)
    instances = {arn: 0 for arn in listed['containerInstanceArns']}
    if not instances:
        return {}

    # DescribeTasks accepts at most 100 tasks per request; with no tasks
    # the loop does not run, which avoids sending an empty list.
    chunk_size = 100
    for start in range(0, len(tasks_list), chunk_size):
        tasks_desc = ecs_client.describe_tasks(
            cluster=CLUSTER, tasks=tasks_list[start:start + chunk_size])
        for task in tasks_desc['tasks']:
            # Skip tasks that report no container instance, or one that is
            # not in the listing above, instead of raising KeyError.
            instance_arn = task.get('containerInstanceArn')
            if instance_arn in instances:
                instances[instance_arn] += 1

    response = ecs_client.describe_container_instances(
        cluster=CLUSTER, containerInstances=list(instances))

    instances_as_ids = {
        i['ec2InstanceId']: instances[i['containerInstanceArn']]
        for i in response['containerInstances'] if i['status'] in status
    }

    return instances_as_ids


    def with_sleep(sleep_time=5, max_attempts=3):
def with_sleep(sleep_time=5, max_attempts=3):
    """Build a decorator that retries a function until it returns non-None.

    The wrapped function is called up to `max_attempts` times, sleeping
    `sleep_time` seconds before every attempt (including the first one).

    Args:
        sleep_time: seconds to sleep before each attempt.
        max_attempts: maximum number of calls to the wrapped function.

    Returns:
        A decorator. The decorated function returns the first non-None
        value produced by the wrapped function, or False when every
        attempt returned None.
    """
    def sleep_decorator(func):
        @functools.wraps(func)
        def wrapper_sleep(*args, **kwargs):
            # Exactly max_attempts tries, so the total wait matches the
            # sleep_time * max_attempts figure reported on failure.
            for _ in range(max_attempts):
                time.sleep(sleep_time)

                ret = func(*args, **kwargs)
                if ret is not None:
                    return ret

            logging.error(
                "Still not ready after %s seconds, please investigate.",
                sleep_time * max_attempts)
            return False
        return wrapper_sleep
    return sleep_decorator


    if __name__ == '__main__':
# Run the replacement procedure only when executed as a script, not when
# the module is imported.
if __name__ == '__main__':
    main()