Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Lambda function that drains tasks from an ECS instance. To be triggered by EC2 Spot Instance Interruption Warning.
import re
import boto3
from slack import Slack
cw = boto3.client('cloudwatch')
ec2 = boto3.client('ec2')
ecs = boto3.client('ecs')
def get_cw_autoscale_alarm(ecs_cluster):
    """Report whether the cluster's spot scale-in CloudWatch alarm is firing.

    The alarm looked up is named ``<ecs_cluster>-spot-autoscale-in`` and the
    query is restricted to alarms currently in the ``ALARM`` state.

    Args:
        ecs_cluster: ECS cluster name used as the alarm-name prefix.

    Returns:
        True if at least one matching alarm was returned, otherwise False.
    """
    # CloudWatch is queried rather than
    # ec2.describe_spot_fleet_request_history because the latter has a 30s delay.
    response = cw.describe_alarms(
        AlarmNames=[ecs_cluster + "-spot-autoscale-in"],
        StateValue='ALARM',
    )
    return bool(response['MetricAlarms'])
def get_ecs_cluster(instance_id):
    """Derive the ECS cluster name from an EC2 instance's ``Name`` tag.

    The ``Name`` tag is matched against
    ``(.+?)-ecs-cluster-(.+?)-ec2-instance-spot`` and the two captured parts
    are joined with a hyphen.

    Args:
        instance_id: EC2 instance id taken from the interruption event.

    Returns:
        The cluster name string, or False when the instance has no ``Name``
        tag or the tag does not match the expected pattern.
    """
    reservations = ec2.describe_instances(InstanceIds=[instance_id])['Reservations']
    # An instance without any tags has no 'Tags' key at all.
    tags = reservations[0]['Instances'][0].get('Tags', [])

    for tag in tags:
        if tag['Key'] != 'Name':
            continue
        match = re.search(r'(.+?)-ecs-cluster-(.+?)-ec2-instance-spot', tag['Value'])
        if match is None:
            break
        # NOTE(review): the cluster name is assumed to be
        # "<group 1>-<group 2>" of the pattern above -- confirm against the
        # naming scheme of your clusters.
        return match.group(1) + '-' + match.group(2)

    print("ECS spot fleet instance not found.")
    return False
def describe_ecs_instance_tasks(ecs_cluster, ecs_instance, msg):
    """Append a per-service count of the tasks on a container instance.

    Args:
        ecs_cluster: ECS cluster name.
        ecs_instance: ARN of the container instance whose tasks are listed.
        msg: List of message lines; extended in place and also returned.

    Returns:
        ``msg`` with an "Affected Services:" header followed by either one
        ``<service>:<count>`` line per distinct service or "No tasks running.".
    """
    msg += ["Affected Services:"]

    task_arns = ecs.list_tasks(cluster=ecs_cluster, containerInstance=ecs_instance)['taskArns']
    if not task_arns:
        msg += ["No tasks running."]
        return msg

    tasks = ecs.describe_tasks(cluster=ecs_cluster, tasks=task_arns)['tasks']

    # Tally tasks per service in one pass instead of calling list.count()
    # once per distinct service.
    counts = {}
    for task in tasks:
        # NOTE(review): tasks are labelled by their 'group' field, falling
        # back to the task definition ARN -- confirm this is the label you
        # want reported.
        label = task.get('group') or task.get('taskDefinitionArn', 'unknown')
        counts[label] = counts.get(label, 0) + 1

    # Sorted so the report order is stable between invocations.
    for service in sorted(counts):
        msg += [service + ':' + str(counts[service])]
    return msg
def get_ecs_instance(ecs_cluster, instance_id, msg):
    """Find the ECS container instance backed by a given EC2 instance.

    Args:
        ecs_cluster: ECS cluster name.
        instance_id: EC2 instance id to look for.
        msg: List of message lines; extended in place and also returned.

    Returns:
        A ``(result, msg)`` tuple. ``result`` is a one-element list holding
        the container instance ARN when the instance was found in ``ACTIVE``
        status, otherwise False (instance not found, or found but not ACTIVE).
    """
    arns = ecs.list_container_instances(cluster=ecs_cluster)['containerInstanceArns']

    # An empty ARN list is not a valid describe_container_instances request,
    # so a cluster with no registered instances skips the call entirely.
    described = []
    if arns:
        described = ecs.describe_container_instances(
            cluster=ecs_cluster, containerInstances=arns
        )['containerInstances']

    for instance in described:
        if instance.get('ec2InstanceId') != instance_id:
            continue

        ecs_instance = instance['containerInstanceArn']
        status = instance['status']
        tasks = instance['runningTasksCount']
        msg += ["ECS instance: " + ecs_instance + ", Running Tasks: " + str(tasks) + ", Status: " + status]

        if status == 'ACTIVE':
            return [ecs_instance], msg
        msg += ["Instance not in ACTIVE status. Nothing to do."]
        return False, msg

    msg += ["Something went wrong, couldn't find ECS instance ARN"]
    return False, msg
def drain_ecs_instance(ecs_cluster, ecs_instance, msg):
    """Ask ECS to set a container instance to ``DRAINING`` and record the outcome.

    Args:
        ecs_cluster: ECS cluster name.
        ecs_instance: List of container instance ARNs to drain (only the
            first entry of the response is inspected).
        msg: List of message lines; extended in place and also returned.

    Returns:
        ``msg`` with the request line and either a success or a failure line.
    """
    msg += ["Sending a DRAINING request..."]
    drain = ecs.update_container_instances_state(
        cluster=ecs_cluster, containerInstances=ecs_instance, status='DRAINING'
    )

    # Guard against an empty 'containerInstances' list so a rejected request
    # is reported as a failure instead of raising IndexError.
    updated = drain.get('containerInstances') or []
    if updated and updated[0]['status'] == 'DRAINING':
        msg += ["Successfully set container instance status to DRAINING."]
    else:
        msg += ["Something went wrong."]
    return msg
def slack(msg, slack_n=False):
    """Post the collected message lines to Slack when notification is enabled.

    Args:
        msg: List of message lines; joined with newlines into one message.
        slack_n: When truthy the message is sent to the
            ``alerts-monitor-info`` channel; otherwise nothing is sent.
    """
    text = '\n'.join(msg)
    if not slack_n:
        return
    Slack().message('alerts-monitor-info', text)
def lambda_handler(event, context):
    """Lambda entry point: drain the ECS instance named in a spot interruption event.

    For a ``terminate`` instance action the handler resolves the ECS cluster
    from the EC2 instance, finds the matching container instance, reports the
    tasks running on it, sets it to ``DRAINING`` and optionally notifies Slack.

    Args:
        event: Event payload; reads ``detail.instance-action``,
            ``detail.instance-id`` and ``resources[0]``.
        context: Lambda context object (unused).

    Returns:
        An empty dict.
    """
    print("Starting ec2-spot-ecs-drainer")

    detail = event['detail']
    ecs_cluster = False
    if detail['instance-action'] == 'terminate':
        instance_id = detail['instance-id']
        ecs_cluster = get_ecs_cluster(instance_id)

    if not ecs_cluster:
        # NOTE(review): "Nothing to do." is printed both for non-terminate
        # actions and for instances that do not map to an ECS cluster --
        # confirm that is the intended logging.
        print("Nothing to do.")
        return {}

    # Slack is only notified when the scale-in alarm is NOT firing, i.e. the
    # interruption was not caused by our own autoscaling.
    slack_n = not get_cw_autoscale_alarm(ecs_cluster)

    msg = ["*Termination Notification ECS cluster:* " + ecs_cluster]
    ecs_instance, msg = get_ecs_instance(ecs_cluster, instance_id, msg)
    if ecs_instance:
        msg += ["EC2 instance: " + event['resources'][0]]
        msg = describe_ecs_instance_tasks(ecs_cluster, ecs_instance[0], msg)
        msg = drain_ecs_instance(ecs_cluster, ecs_instance, msg)

    slack(msg, slack_n)
    return {}
import requests
import json
class Slack:
    """Minimal client for posting messages to a Slack incoming webhook."""

    # Seconds to wait for Slack before giving up, so a stalled request cannot
    # hold the caller until its own timeout expires.
    timeout = 10

    def __init__(self, webhook_url=None):
        """Create the client.

        Args:
            webhook_url: Slack incoming-webhook URL. When omitted, the
                placeholder value is kept and must be replaced before use.
        """
        self.slack_webhook_url = webhook_url or '<replace_me_with_my_slack_webhook_url>'

    def message(self, channel, message):
        """Post ``message`` to ``channel`` as a single coloured attachment.

        Delivery is best-effort: any failure is printed and swallowed so that
        a Slack outage never breaks the caller.

        Args:
            channel: Slack channel name.
            message: Text of the attachment.
        """
        headers = {'Content-Type': 'application/json'}
        payload = {'channel': channel, 'attachments': [{'text': message, 'color': '#f7a311'}]}
        try:
            response = requests.post(
                self.slack_webhook_url,
                data=json.dumps(payload),
                headers=headers,
                timeout=self.timeout,
            )
            # Surface HTTP-level rejections (bad webhook, bad channel) in the log.
            response.raise_for_status()
        except Exception as e:
            # Deliberately broad: notification failures must not propagate.
            print('[Slack] Unexpected error: %s' % e)
# Matches the two-minute EC2 Spot Instance Interruption Warning event.
resource "aws_cloudwatch_event_rule" "ec2-instance-termination" {
  name        = "capture-ec2-spot-instance-termination"
  description = "Capture EC2 Spot Instance Interruption Warning"

  event_pattern = <<PATTERN
{
  "source": [
    "aws.ec2"
  ],
  "detail-type": [
    "EC2 Spot Instance Interruption Warning"
  ]
}
PATTERN
}
# Routes the interruption-warning rule to the drainer Lambda.
resource "aws_cloudwatch_event_target" "trigger-ec2-spot-ecs-drainer-lambda" {
  rule      = "${aws_cloudwatch_event_rule.ec2-instance-termination.name}"
  arn       = "${replace-with-my-lambda.arn}"
  target_id = "ec2-spot-ecs-drainer"
}
# Allows the CloudWatch Events rule to invoke the drainer Lambda.
resource "aws_lambda_permission" "lambda-permission" {
  statement_id  = "AllowExecutionFromCloudWatch"
  action        = "lambda:InvokeFunction"
  function_name = "ec2-spot-ecs-drainer"
  principal     = "events.amazonaws.com"
  source_arn    = "${aws_cloudwatch_event_rule.ec2-instance-termination.arn}"
}
# Permissions for the drainer Lambda. The CloudWatch/EC2/ECS actions are the
# API calls the handler makes; the logs actions let its print() output reach
# CloudWatch Logs.
# NOTE(review): replace the role reference with your Lambda's IAM role and
# narrow "Resource" where possible.
resource "aws_iam_role_policy" "lambda-ec2-spot-ecs-drainer_role-policy" {
  name = "lambda-ec2-spot-ecs-drainer_role-policy"
  role = "${replace-with-my-lambda-role.id}"

  policy = <<EOF
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Action": [
        "cloudwatch:DescribeAlarms",
        "ec2:DescribeInstances",
        "ecs:ListContainerInstances",
        "ecs:DescribeContainerInstances",
        "ecs:ListTasks",
        "ecs:DescribeTasks",
        "ecs:UpdateContainerInstancesState",
        "logs:CreateLogGroup",
        "logs:CreateLogStream",
        "logs:PutLogEvents"
      ],
      "Resource": "*"
    }
  ]
}
EOF
}

This comment has been minimized.

Copy link
Owner Author

@fredsig fredsig commented Sep 21, 2018


  1. We only log to Slack when no autoscaling alarm is in the ALARM state. If you always want a notification, remove the call to get_cw_autoscale_alarm(). Set your own webhook URL in the Slack class, and change the channel name in the slack() function to post to a specific channel.
  2. We find the ECS cluster that a given EC2 instance belongs to via the instance's Name tag. Each instance has a Name tag of the form ecs-cluster-name-ec2-instance-spot. If your naming differs, replace the regular expression in get_ecs_cluster().
  3. The Terraform file contains a sample configuration for capturing the EC2 Spot Instance Interruption Warning with CloudWatch and triggering your Lambda. This assumes you have already created the Lambda and uploaded the code (the handler module and the Slack helper module). There is also a sample aws_iam_role_policy resource to be used with this Lambda.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment