Skip to content

Instantly share code, notes, and snippets.

@fredsig
Last active May 15, 2020 12:41
Lambda function that drains tasks from an ECS container instance. It is meant to be triggered by an EC2 Spot Instance Interruption Warning event.
import re
import boto3
from slack import Slack
# Pin the default boto3 session to eu-west-1; every client created below
# inherits this region. NOTE(review): the region is hard-coded -- confirm it
# matches the region your cluster runs in.
boto3.setup_default_session(region_name='eu-west-1')
# Module-level AWS clients used by the functions in this file.
cw = boto3.client('cloudwatch')
ec2 = boto3.client('ec2')
ecs = boto3.client('ecs')
def get_cw_autoscale_alarm(ecs_cluster):
    """Report whether the cluster's spot scale-in alarm is currently firing.

    Queries CloudWatch for the alarm named ``<ecs_cluster>-spot-autoscale-in``,
    restricted to alarms whose state is ``ALARM``.

    Args:
        ecs_cluster: ECS cluster name, used as the alarm-name prefix.

    Returns:
        True if CloudWatch returns a matching metric alarm in ``ALARM``
        state, False otherwise.
    """
    alarm_name = ecs_cluster + "-spot-autoscale-in"
    # We're not using ec2.describe_spot_fleet_request_history because of a 30s delay.
    firing = cw.describe_alarms(AlarmNames=[alarm_name], StateValue='ALARM')['MetricAlarms']
    return bool(firing)
def get_ecs_cluster(instance_id):
    """Derive the ECS cluster name from an EC2 instance's ``Name`` tag.

    The tag value is expected to look like
    ``<prefix>-ecs-cluster-<suffix>-ec2-instance-spot``; the cluster name is
    then ``<prefix>-<suffix>``.

    Args:
        instance_id: EC2 instance ID to look up.

    Returns:
        The cluster name as a string, or False when the instance cannot be
        found, has no ``Name`` tag, or the tag does not match the pattern.
    """
    reservations = ec2.describe_instances(InstanceIds=[instance_id])['Reservations']
    instances = reservations[0]['Instances'] if reservations else []
    # The 'Tags' key is absent from the response for untagged instances.
    tags = instances[0].get('Tags', []) if instances else []
    for tag in tags:
        if tag['Key'] != 'Name':
            continue
        match = re.search(r'(.+?)-ecs-cluster-(.+?)-ec2-instance-spot', tag['Value'])
        if match is None:
            break
        return match.group(1) + '-' + match.group(2)
    print("ECS spot fleet instance not found.")
    return False
def describe_ecs_instance_tasks(ecs_cluster, ecs_instance, msg):
    """Append a per-group count of the tasks on a container instance to *msg*.

    Args:
        ecs_cluster: ECS cluster name.
        ecs_instance: ARN of the container instance whose tasks are listed.
        msg: List of message lines; it is extended in place and also returned.

    Returns:
        *msg*, with an "Affected Services:" header followed by one
        ``<group>:<count>`` line per task group (in first-seen order), or
        "No tasks running." when the instance has no tasks.
    """
    from collections import Counter

    msg += ["Affected Services:"]

    # list_tasks is paginated; walk every page so no task is missed.
    task_arns = []
    pages = ecs.get_paginator('list_tasks').paginate(
        cluster=ecs_cluster, containerInstance=ecs_instance)
    for page in pages:
        task_arns.extend(page['taskArns'])

    if not task_arns:
        msg += ["No tasks running."]
        return msg

    group_counts = Counter()
    # describe_tasks accepts at most 100 task ARNs per call.
    for start in range(0, len(task_arns), 100):
        batch = task_arns[start:start + 100]
        tasks = ecs.describe_tasks(cluster=ecs_cluster, tasks=batch)['tasks']
        group_counts.update(task['group'] for task in tasks)

    for group, count in group_counts.items():
        msg += [group + ':' + str(count)]
    return msg
def get_ecs_instance(ecs_cluster, instance_id, msg):
    """Find the ECS container instance backed by a given EC2 instance.

    Args:
        ecs_cluster: ECS cluster name to search.
        instance_id: EC2 instance ID to match against ``ec2InstanceId``.
        msg: List of message lines; it is extended in place and also returned.

    Returns:
        A ``(result, msg)`` tuple. ``result`` is a one-element list holding
        the container instance ARN when the instance is found and ``ACTIVE``;
        otherwise it is False and *msg* explains why.
    """
    # list_container_instances is paginated; each page is passed to
    # describe_container_instances separately so no call receives more ARNs
    # than one page holds, and an empty cluster never sends an empty list.
    pages = ecs.get_paginator('list_container_instances').paginate(cluster=ecs_cluster)
    for page in pages:
        arns = page['containerInstanceArns']
        if not arns:
            continue
        described = ecs.describe_container_instances(
            cluster=ecs_cluster, containerInstances=arns)['containerInstances']
        for instance in described:
            if instance.get('ec2InstanceId') != instance_id:
                continue
            arn = instance['containerInstanceArn']
            status = instance['status']
            running = instance['runningTasksCount']
            msg += ["ECS instance: " + arn + ", Running Tasks: " + str(running) + ", Status: " + status]
            if status == 'ACTIVE':
                return [arn], msg
            msg += ["Instance not in ACTIVE status. Nothing to do."]
            return False, msg

    msg += ["Something went wrong, couldn't find ECS instance ARN"]
    return False, msg
def drain_ecs_instance(ecs_cluster, ecs_instance, msg):
    """Request that the given container instance(s) be set to DRAINING.

    Args:
        ecs_cluster: ECS cluster name.
        ecs_instance: List of container instance ARNs to drain.
        msg: List of message lines; it is extended in place and also returned.

    Returns:
        *msg*, with lines describing the request and whether it succeeded.
    """
    msg += ["Sending a DRAINING request..."]
    drain = ecs.update_container_instances_state(
        cluster=ecs_cluster, containerInstances=ecs_instance, status='DRAINING')
    # A rejected update can come back with an empty 'containerInstances' list,
    # so guard before inspecting the per-instance status.
    updated = drain.get('containerInstances') or []
    if updated and all(instance['status'] == 'DRAINING' for instance in updated):
        msg += ["Successfully set container instance status to DRAINING."]
    else:
        msg += ["Something went wrong."]
        # Dump the raw response so the failure can be diagnosed from the logs.
        print(drain)
    return msg
def slack(msg, slack_n=False, channel='alerts-monitor-info'):
    """Print the collected message lines and optionally post them to Slack.

    Args:
        msg: Iterable of strings; joined with newlines into one message.
        slack_n: When true, the message is also sent through ``Slack``.
        channel: Slack channel to post to.
    """
    message = '\n'.join(msg)
    # Always print, so the message is kept even when Slack is skipped.
    print(message)
    if not slack_n:
        return
    Slack().message(channel, message)
def lambda_handler(event, context):
    """Lambda entry point: drain the ECS instance named in a spot interruption event.

    Only events whose ``detail['instance-action']`` is ``'terminate'`` are
    acted on. The instance's cluster is resolved from its Name tag, the
    matching container instance is set to DRAINING, and a summary is printed.
    The summary is also posted to Slack unless the cluster's scale-in alarm
    is firing.

    Args:
        event: The triggering event dict; ``detail`` and ``resources`` are read.
        context: Lambda context object (unused).

    Returns:
        An empty dict in every case.
    """
    print("Starting ec2-spot-ecs-drainer")
    print(event)

    # Tolerate events that lack the expected keys instead of raising KeyError.
    detail = event.get('detail') or {}
    instance_id = detail.get('instance-id')
    if detail.get('instance-action') != 'terminate' or not instance_id:
        print("Nothing to do.")
        return {}

    ecs_cluster = get_ecs_cluster(instance_id)
    if not ecs_cluster:
        print("Nothing to do.")
        return {}

    # Notify Slack only when the scale-in alarm is NOT firing.
    slack_n = not get_cw_autoscale_alarm(ecs_cluster)

    msg = ["*Termination Notification ECS cluster:* " + ecs_cluster]
    ecs_instance, msg = get_ecs_instance(ecs_cluster, instance_id, msg)
    if ecs_instance:
        # Fall back to the bare instance ID if the event carries no resources.
        resources = event.get('resources') or [instance_id]
        msg += ["EC2 instance: " + resources[0]]
        msg = describe_ecs_instance_tasks(ecs_cluster, ecs_instance[0], msg)
        msg = drain_ecs_instance(ecs_cluster, ecs_instance, msg)

    # Report even when the container instance was not found or not ACTIVE,
    # so the explanation collected in msg is not lost.
    slack(msg, slack_n)
    return {}
import requests
import json
class Slack:
    """Minimal client for posting messages to a Slack incoming webhook."""

    def __init__(self, webhook_url=None, timeout=5):
        """Store the webhook configuration.

        Args:
            webhook_url: Incoming-webhook URL. Defaults to the placeholder
                value, which must be replaced before real use.
            timeout: Seconds to wait for Slack before giving up.
        """
        self.slack_webhook_url = webhook_url or '<replace_me_with_my_slack_webhook_url>'
        self.timeout = timeout

    def message(self, channel, message):
        """Post *message* to *channel* as a single coloured attachment.

        Delivery is best-effort: any failure is printed, never raised, so a
        Slack problem cannot break the caller.
        """
        headers = {'Content-Type': 'application/json'}
        payload = {'channel': channel, 'attachments': [{'text': message, 'color': '#f7a311'}]}
        try:
            # Without a timeout, an unresponsive endpoint would block the caller.
            response = requests.post(
                self.slack_webhook_url,
                data=json.dumps(payload),
                headers=headers,
                timeout=self.timeout,
            )
            # Surface HTTP error statuses instead of ignoring them.
            response.raise_for_status()
        except Exception as e:
            print('[Slack] Unexpected error: %s' % e)
# CloudWatch Events rule that captures "EC2 Spot Instance Interruption Warning"
# events whose source is aws.ec2.
resource "aws_cloudwatch_event_rule" "ec2-instance-termination" {
name = "capture-ec2-spot-instance-termination"
description = "Capture EC2 Spot Instance Interruption Warning"
# JSON event pattern: matches on both the event source and the detail-type.
event_pattern = <<PATTERN
{
"source": [
"aws.ec2"
],
"detail-type": [
"EC2 Spot Instance Interruption Warning"
]
}
PATTERN
}
# Sends events matched by the ec2-instance-termination rule to the target ARN.
resource "aws_cloudwatch_event_target" "trigger-ec2-spot-ecs-drainer-lambda" {
rule = "${aws_cloudwatch_event_rule.ec2-instance-termination.name}"
# Placeholder: replace with a reference to your own Lambda function's ARN.
arn = "${replace-with-my-lambda.arn}"
target_id = "ec2-spot-ecs-drainer"
}
# Grants events.amazonaws.com permission to invoke the Lambda, limited to
# invocations that originate from the ec2-instance-termination rule.
resource "aws_lambda_permission" "lambda-permission" {
statement_id = "AllowExecutionFromCloudWatch"
action = "lambda:InvokeFunction"
# Hard-coded function name; presumably must match your deployed Lambda -- verify.
function_name = "ec2-spot-ecs-drainer"
principal = "events.amazonaws.com"
source_arn = "${aws_cloudwatch_event_rule.ec2-instance-termination.arn}"
}
# Inline IAM policy for the drainer Lambda's role. It grants only the API
# actions the function calls: EC2 DescribeInstances (to read the Name tag),
# the ECS list/describe/update calls, and CloudWatch DescribeAlarms.
resource "aws_iam_role_policy" "lambda-ec2-spot-ecs-drainer_role-policy" {
  name = "lambda-ec2-spot-ecs-drainer_role-policy"
  # Placeholder: replace with a reference to your own Lambda execution role.
  role = "${replace-with-my-lambda-role.id}"
  policy = <<EOF
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Action": [
        "ec2:DescribeInstances",
        "ecs:DescribeContainerInstances",
        "ecs:ListContainerInstances",
        "ecs:ListTasks",
        "ecs:DescribeTasks",
        "ecs:UpdateContainerInstancesState",
        "cloudwatch:DescribeAlarms"
      ],
      "Resource": "*"
    }
  ]
}
EOF
}
@fredsig
Copy link
Author

fredsig commented Sep 21, 2018

Notes:

  1. We only post to Slack when the cluster's autoscaling (scale-in) alarm is not in the ALARM state. If you want a Slack notification every time, remove the call to get_cw_autoscale_alarm(). Edit slack.py to set your webhook URL, and edit the slack() function to post to your own channel.
  2. We find the ECS cluster that a given EC2 instance belongs to via the instance's Name tag. Each instance is expected to have a Name tag of the form <prefix>-ecs-cluster-<suffix>-ec2-instance-spot, from which the cluster name <prefix>-<suffix> is derived. If your naming differs, replace the regular expression in get_ecs_cluster().
  3. terraform-lambda-trigger.tf contains a sample Terraform configuration that captures the EC2 Spot Instance Interruption Warning with CloudWatch Events and triggers your Lambda. It assumes you have already created the Lambda and uploaded the code (ecs_spot_drainer.py and slack.py). It also includes a sample aws_iam_role_policy resource to attach to the Lambda's role.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment