Skip to content

Instantly share code, notes, and snippets.

@fredsig
Last active May 15, 2020 12:41
Show Gist options
  • Save fredsig/989a02b85206d48067fd75faba70a804 to your computer and use it in GitHub Desktop.
Save fredsig/989a02b85206d48067fd75faba70a804 to your computer and use it in GitHub Desktop.
Lambda function that drains tasks from an ECS instance. To be triggered by EC2 Spot Instance Interruption Warning.
import re
from collections import Counter

import boto3

from slack import Slack
# Configure boto3's default session so the clients created below use eu-west-1.
boto3.setup_default_session(region_name='eu-west-1')
# Module-level AWS clients shared by the helper functions in this file.
cw = boto3.client('cloudwatch')
ec2 = boto3.client('ec2')
ecs = boto3.client('ecs')
def get_cw_autoscale_alarm(ecs_cluster):
    """Report whether the cluster's spot scale-in alarm is currently firing.

    Queries CloudWatch for the alarm named ``<ecs_cluster>-spot-autoscale-in``,
    restricted to alarms in the ``ALARM`` state.

    Args:
        ecs_cluster: Cluster name used as the alarm-name prefix.

    Returns:
        True if CloudWatch returns at least one matching alarm, else False.
    """
    # CloudWatch is used rather than ec2.describe_spot_fleet_request_history
    # because the latter has a 30s delay.
    response = cw.describe_alarms(
        AlarmNames=[ecs_cluster + "-spot-autoscale-in"],
        StateValue='ALARM',
    )
    return bool(response['MetricAlarms'])
def get_ecs_cluster(instance_id):
    """Derive the ECS cluster name from an EC2 instance's ``Name`` tag.

    The tag value is expected to look like
    ``<prefix>-ecs-cluster-<suffix>-ec2-instance-spot``; the cluster name is
    ``<prefix>-<suffix>``.

    Args:
        instance_id: EC2 instance id to look up.

    Returns:
        The cluster name as a string, or False when the instance cannot be
        found in the response, has no ``Name`` tag, or the tag does not match
        the expected pattern.
    """
    reservations = ec2.describe_instances(InstanceIds=[instance_id])['Reservations']
    instances = reservations[0]['Instances'] if reservations else []
    # Use .get(): the 'Tags' key may be missing for an untagged instance.
    tags = instances[0].get('Tags', []) if instances else []

    for tag in tags:
        if tag['Key'] != 'Name':
            continue
        match = re.search(r'(.+?)-ecs-cluster-(.+?)-ec2-instance-spot', tag['Value'])
        if match:
            return match.group(1) + '-' + match.group(2)
        # The Name tag exists but is not a spot-fleet ECS instance name.
        break

    print("ECS spot fleet instance not found.")
    return False
def describe_ecs_instance_tasks(ecs_cluster, ecs_instance, msg):
    """Append a per-group count of the tasks on a container instance.

    Args:
        ecs_cluster: Name of the ECS cluster.
        ecs_instance: Container instance ARN whose tasks are listed.
        msg: List of message lines; extended in place.

    Returns:
        The same ``msg`` list, with an "Affected Services:" header followed
        by one ``<group>:<count>`` line per task group, or "No tasks running."
        when no tasks are found.
    """
    msg += ["Affected Services:"]

    group_counts = Counter()
    paginator = ecs.get_paginator('list_tasks')
    for page in paginator.paginate(cluster=ecs_cluster, containerInstance=ecs_instance):
        task_arns = page['taskArns']
        if not task_arns:
            continue
        # Describe page by page: each list_tasks page stays within the number
        # of tasks describe_tasks accepts in a single call.
        described = ecs.describe_tasks(cluster=ecs_cluster, tasks=task_arns)['tasks']
        group_counts.update(task['group'] for task in described)

    if group_counts:
        # Counter keeps first-seen order, so the report is deterministic.
        msg += [group + ':' + str(count) for group, count in group_counts.items()]
    else:
        msg += ["No tasks running."]
    return msg
def get_ecs_instance(ecs_cluster, instance_id, msg):
    """Find the ECS container instance backed by a given EC2 instance.

    Args:
        ecs_cluster: Name of the ECS cluster to search.
        instance_id: EC2 instance id to match against ``ec2InstanceId``.
        msg: List of message lines; extended in place.

    Returns:
        A ``(result, msg)`` tuple. ``result`` is a one-element list holding
        the container instance ARN when the instance is found in ``ACTIVE``
        status, otherwise False (instance not found, or found in another
        status). ``msg`` is the same list with status lines appended.
    """
    paginator = ecs.get_paginator('list_container_instances')
    for page in paginator.paginate(cluster=ecs_cluster):
        arns = page['containerInstanceArns']
        if not arns:
            # describe_container_instances rejects an empty ARN list.
            continue
        described = ecs.describe_container_instances(
            cluster=ecs_cluster, containerInstances=arns
        )['containerInstances']

        for instance in described:
            if instance.get('ec2InstanceId') != instance_id:
                continue

            arn = instance['containerInstanceArn']
            status = instance['status']
            msg += [
                "ECS instance: " + arn
                + ", Running Tasks: " + str(instance['runningTasksCount'])
                + ", Status: " + status
            ]
            if status == 'ACTIVE':
                return [arn], msg
            msg += ["Instance not in ACTIVE status. Nothing to do."]
            return False, msg

    msg += ["Something went wrong, couldn't find ECS instance ARN"]
    return False, msg
def drain_ecs_instance(ecs_cluster, ecs_instance, msg):
    """Ask ECS to put the given container instance(s) into DRAINING state.

    Args:
        ecs_cluster: Name of the ECS cluster.
        ecs_instance: List of container instance ARNs to drain.
        msg: List of message lines; extended in place.

    Returns:
        The same ``msg`` list with the outcome of the request appended.
    """
    msg += ["Sending a DRAINING request..."]
    drain = ecs.update_container_instances_state(
        cluster=ecs_cluster, containerInstances=ecs_instance, status='DRAINING'
    )

    # A rejected instance is reported under 'failures' and is absent from
    # 'containerInstances', so never index into the list unconditionally.
    updated = drain.get('containerInstances') or []
    failures = drain.get('failures') or []

    if updated and not failures and all(i['status'] == 'DRAINING' for i in updated):
        msg += ["Successfully set container instance status to DRAINING."]
    else:
        msg += ["Something went wrong."]
        msg += [
            "Failure: " + str(f.get('arn')) + " (" + str(f.get('reason')) + ")"
            for f in failures
        ]

    # Keep the raw API response in the function log for troubleshooting.
    print(drain)
    return msg
def slack(msg, slack_n=False):
    """Print the collected message lines and optionally post them to Slack.

    Args:
        msg: Iterable of strings; joined with newlines into one message.
        slack_n: When truthy, the message is also sent to the
            ``alerts-monitor-info`` Slack channel.
    """
    text = '\n'.join(msg)
    print(text)
    if not slack_n:
        return
    Slack().message('alerts-monitor-info', text)
def lambda_handler(event, context):
    """Lambda entry point: drain the ECS instance named in a spot warning.

    Only events whose ``detail.instance-action`` is ``terminate`` are acted
    on. The instance's ECS cluster is resolved from its Name tag, the
    matching container instance is set to DRAINING, and a summary is printed
    and, unless the cluster's scale-in alarm is firing, posted to Slack.

    Args:
        event: The CloudWatch event payload (a dict).
        context: Lambda context object; unused.

    Returns:
        An empty dict in all cases.
    """
    print("Starting ec2-spot-ecs-drainer")
    print(event)

    detail = event.get('detail') or {}
    instance_id = detail.get('instance-id')
    if detail.get('instance-action') != 'terminate' or not instance_id:
        print("Nothing to do.")
        return {}

    ecs_cluster = get_ecs_cluster(instance_id)
    if not ecs_cluster:
        print("Nothing to do.")
        return {}

    msg = ["*Termination Notification ECS cluster:* " + ecs_cluster]
    ecs_instance, msg = get_ecs_instance(ecs_cluster, instance_id, msg)
    if ecs_instance:
        # Fall back to the bare instance id if the event lists no resources.
        resources = event.get('resources') or [instance_id]
        msg += ["EC2 instance: " + resources[0]]
        msg = describe_ecs_instance_tasks(ecs_cluster, ecs_instance[0], msg)
        msg = drain_ecs_instance(ecs_cluster, ecs_instance, msg)

    # The alarm lookup only decides whether to notify Slack, so it runs after
    # the drain request: the drain is the time-critical step and should not
    # wait on, or be blocked by, a CloudWatch call.
    slack_n = not get_cw_autoscale_alarm(ecs_cluster)
    slack(msg, slack_n)
    return {}
import requests
import json
class Slack:
    """Minimal client for posting messages to a Slack incoming webhook."""

    def __init__(self, timeout=5):
        """Create the client.

        Args:
            timeout: Seconds to wait for Slack before giving up, so a slow
                or unreachable webhook cannot stall the caller.
        """
        self.slack_webhook_url = '<replace_me_with_my_slack_webhook_url>'
        self.timeout = timeout

    def message(self, channel, message):
        """Post ``message`` to ``channel`` as a coloured attachment.

        Delivery is best-effort: any error (network failure, timeout,
        non-2xx response) is printed and swallowed.
        """
        payload = {'channel': channel, 'attachments': [{'text': message, 'color': '#f7a311'}]}
        headers = {'Content-Type': 'application/json'}
        try:
            response = requests.post(
                self.slack_webhook_url,
                data=json.dumps(payload),
                headers=headers,
                timeout=self.timeout,
            )
            # Surface HTTP error statuses instead of ignoring them silently.
            response.raise_for_status()
        except Exception as e:
            # Deliberately broad: a failed notification must never abort the caller.
            print('[Slack] Unexpected error: %s' % e)
# CloudWatch Events rule that captures EC2 Spot Instance Interruption Warnings.
resource "aws_cloudwatch_event_rule" "ec2-instance-termination" {
name = "capture-ec2-spot-instance-termination"
description = "Capture EC2 Spot Instance Interruption Warning"
# Matches events with source "aws.ec2" and the spot interruption detail-type.
event_pattern = <<PATTERN
{
"source": [
"aws.ec2"
],
"detail-type": [
"EC2 Spot Instance Interruption Warning"
]
}
PATTERN
}
# Routes events matched by the ec2-instance-termination rule to the drainer Lambda.
resource "aws_cloudwatch_event_target" "trigger-ec2-spot-ecs-drainer-lambda" {
rule = "${aws_cloudwatch_event_rule.ec2-instance-termination.name}"
# Placeholder: replace with a reference to your Lambda function's ARN.
arn = "${replace-with-my-lambda.arn}"
target_id = "ec2-spot-ecs-drainer"
}
# Allows the events.amazonaws.com principal to invoke the Lambda, limited to
# invocations originating from the ec2-instance-termination rule.
resource "aws_lambda_permission" "lambda-permission" {
statement_id = "AllowExecutionFromCloudWatch"
action = "lambda:InvokeFunction"
# NOTE(review): hard-coded name; presumably must match the Lambda referenced
# by the event target's arn placeholder - confirm when adapting this sample.
function_name = "ec2-spot-ecs-drainer"
principal = "events.amazonaws.com"
source_arn = "${aws_cloudwatch_event_rule.ec2-instance-termination.arn}"
}
# Inline IAM policy for the drainer Lambda's role. It grants the EC2, ECS and
# CloudWatch actions listed in the policy document below.
# NOTE(review): the statement applies to "Resource": "*"; consider scoping it
# to specific clusters/alarms when adapting this sample.
resource "aws_iam_role_policy" "lambda-ec2-spot-ecs-drainer_role-policy" {
name = "lambda-ec2-spot-ecs-drainer_role-policy"
# Placeholder: replace with a reference to your Lambda execution role's id.
role = "${replace-with-my-lambda-role.id}"
policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"ec2:Describe*",
"ecs:DescribeContainerInstances",
"ecs:ListContainerInstances",
"ecs:ListTasks",
"ecs:DescribeTasks",
"ecs:UpdateContainerInstancesState",
"cloudwatch:DescribeAlarms"
],
"Resource": "*"
}
]
}
EOF
}
@fredsig
Copy link
Author

fredsig commented Sep 21, 2018

Notes:

  1. We only log to Slack if no autoscaling alarm is triggered. You may want to remove the call to get_cw_autoscale_alarm(). Change slack.py to use your own webhook URL, and change the slack() function to post to your own channel.
  2. We find the ECS cluster a given EC2 instance belongs to via the instance's Name tag. Each instance has a Name tag of the form <prefix>-ecs-cluster-<suffix>-ec2-instance-spot, and the cluster name is derived as <prefix>-<suffix>. If you need to change this, just replace the regular expression in get_ecs_cluster().
  3. terraform-lambda-trigger.tf contains a sample Terraform configuration that captures the EC2 Spot Instance Interruption Warning with CloudWatch and triggers your Lambda. It assumes you have already created the Lambda and uploaded the code (ecs_spot_drainer.py and slack.py). There is also a sample aws_iam_role_policy resource to be used with this Lambda.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment