Skip to content

Instantly share code, notes, and snippets.

@bcambel
Forked from fredsig/ec2-spot-ecs-drainer.png
Created October 28, 2018 07:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bcambel/edfcd61d4f749642c8cd9ab0a67239d2 to your computer and use it in GitHub Desktop.
Save bcambel/edfcd61d4f749642c8cd9ab0a67239d2 to your computer and use it in GitHub Desktop.
Lambda function that drains tasks from an ECS container instance. It is meant to be triggered by an EC2 Spot Instance Interruption Warning event.
import re
import boto3
from slack import Slack
# Pin the default boto3 session to eu-west-1, then create the CloudWatch, EC2
# and ECS clients once at import time; the functions in this module use them
# as module-level globals.
boto3.setup_default_session(region_name='eu-west-1')
cw = boto3.client('cloudwatch')
ec2 = boto3.client('ec2')
ecs = boto3.client('ecs')
def get_cw_autoscale_alarm(ecs_cluster):
    """Report whether the cluster's spot scale-in alarm is currently firing.

    The alarm is looked up by name, ``<ecs_cluster>-spot-autoscale-in``, and
    the query is restricted to alarms whose state is ``ALARM``.

    Args:
        ecs_cluster: Name of the ECS cluster.

    Returns:
        True if the alarm exists and is in ``ALARM`` state, False otherwise.
    """
    # We're not using ec2.describe_spot_fleet_request_history because of a 30s delay.
    response = cw.describe_alarms(
        AlarmNames=[ecs_cluster + "-spot-autoscale-in"],
        StateValue='ALARM',
    )
    return bool(response['MetricAlarms'])
def get_ecs_cluster(instance_id):
    """Derive the ECS cluster name from an EC2 instance's ``Name`` tag.

    The tag value must look like
    ``<prefix>-ecs-cluster-<suffix>-ec2-instance-spot``; the cluster name is
    then ``<prefix>-<suffix>``.

    Args:
        instance_id: ID of the EC2 instance to inspect.

    Returns:
        The cluster name, or False when the instance has no ``Name`` tag or
        the tag value does not match the expected pattern.
    """
    reservations = ec2.describe_instances(InstanceIds=[instance_id])['Reservations']
    instance = reservations[0]['Instances'][0]
    # The 'Tags' key may be missing for an instance without tags, so fall
    # back to an empty list instead of raising KeyError.
    for tag in instance.get('Tags', []):
        if tag['Key'] != 'Name':
            continue
        match = re.search(r'(.+?)-ecs-cluster-(.+?)-ec2-instance-spot', tag['Value'])
        if match is None:
            print("ECS spot fleet instance not found.")
            return False
        return match.group(1) + '-' + match.group(2)
    # No Name tag at all: report "not an ECS spot instance" explicitly.
    return False
def describe_ecs_instance_tasks(ecs_cluster, ecs_instance, msg):
    """Append a per-group task count for a container instance to ``msg``.

    Args:
        ecs_cluster: Name of the ECS cluster.
        ecs_instance: ARN of the container instance whose tasks are listed.
        msg: List of message lines; it is extended in place.

    Returns:
        The same ``msg`` list, with an "Affected Services:" header followed
        by one ``<group>:<count>`` line per distinct task group.
    """
    from collections import Counter

    msg += ["Affected Services:"]
    task_arns = ecs.list_tasks(cluster=ecs_cluster, containerInstance=ecs_instance)['taskArns']
    if not task_arns:
        # describe_tasks rejects an empty task list; nothing to summarise.
        return msg
    tasks = ecs.describe_tasks(cluster=ecs_cluster, tasks=task_arns)['tasks']
    # Counter keeps first-seen order, so every group is reported exactly once
    # (the previous loop repeated the same line once per task in the group).
    group_counts = Counter(task['group'] for task in tasks)
    msg += [group + ':' + str(count) for group, count in group_counts.items()]
    return msg
def get_ecs_instance(ecs_cluster, instance_id, msg):
    """Find the ECS container instance backed by a given EC2 instance.

    Args:
        ecs_cluster: Name of the ECS cluster to search.
        instance_id: EC2 instance ID to look for.
        msg: List of message lines; it is extended in place.

    Returns:
        A ``([container_instance_arn], msg)`` tuple when the instance is
        ACTIVE and has running tasks, otherwise ``(False, msg)``.
    """
    # Walk every page so clusters with many container instances are covered.
    arns = []
    paginator = ecs.get_paginator('list_container_instances')
    for page in paginator.paginate(cluster=ecs_cluster):
        arns.extend(page['containerInstanceArns'])

    # describe_container_instances accepts at most 100 ARNs per call and
    # rejects an empty list; batching handles both (no ARNs -> no calls).
    batch_size = 100
    for start in range(0, len(arns), batch_size):
        described = ecs.describe_container_instances(
            cluster=ecs_cluster,
            containerInstances=arns[start:start + batch_size],
        )['containerInstances']
        for instance in described:
            if instance['ec2InstanceId'] != instance_id:
                continue
            ecs_instance = instance['containerInstanceArn']
            status = instance['status']
            tasks = instance['runningTasksCount']
            msg += ["ECS instance: " + ecs_instance + ", Running Tasks: " + str(tasks) + ", Status: " + status]
            if status == 'ACTIVE' and tasks > 0:
                return [ecs_instance], msg
            msg += ["Instance not in ACTIVE status and/or has no running tasks. Nothing to do."]
            return False, msg

    msg += ["Something went wrong, couldn't find ECS instance ARN"]
    return False, msg
def drain_ecs_instance(ecs_cluster, ecs_instance, msg):
    """Ask ECS to put a container instance into DRAINING state.

    Args:
        ecs_cluster: Name of the ECS cluster.
        ecs_instance: List of container instance ARNs to drain; only the
            first entry of the response is checked.
        msg: List of message lines; it is extended in place.

    Returns:
        The same ``msg`` list with the outcome of the request appended.
    """
    msg += ["Sending a DRAINING request..."]
    response = ecs.update_container_instances_state(
        cluster=ecs_cluster,
        containerInstances=ecs_instance,
        status='DRAINING',
    )
    # A rejected request can come back with no container instances at all
    # (only failures), so guard the [0] lookup instead of raising IndexError.
    updated = response.get('containerInstances') or []
    if updated and updated[0]['status'] == 'DRAINING':
        msg += ["Successfully set container instance status to DRAINING."]
    else:
        msg += ["Something went wrong."]
        print(response)
    return msg
def slack(msg, slack_n=False):
    """Print the message lines and optionally forward them to Slack.

    Args:
        msg: Iterable of strings, joined with newlines into one message.
        slack_n: When true, the message is also posted to the
            ``alerts-monitor-info`` channel.
    """
    message = '\n'.join(msg)
    print(message)
    if not slack_n:
        return
    Slack().message('alerts-monitor-info', message)
def lambda_handler(event, context):
    """Lambda entry point: drain the ECS instance named in the incoming event.

    Only events whose ``detail.instance-action`` is ``terminate`` are acted
    on. The cluster is derived from the instance's tags, the matching
    container instance is set to DRAINING, and a summary is printed and
    optionally sent to Slack.

    Args:
        event: Event dict; ``detail.instance-action``, ``detail.instance-id``
            and ``resources[0]`` are read from it.
        context: Lambda context object (unused).

    Returns:
        An empty dict in every case.
    """
    print("Starting ec2-spot-ecs-drainer")
    print(event)
    if event['detail']['instance-action'] != 'terminate':
        return {}

    instance_id = event['detail']['instance-id']
    ecs_cluster = get_ecs_cluster(instance_id)
    if not ecs_cluster:
        print("Nothing to do.")
        return {}

    # Slack is notified only while the cluster's scale-in alarm is NOT firing.
    slack_n = not get_cw_autoscale_alarm(ecs_cluster)
    msg = ["*Termination Notification ECS cluster:* " + ecs_cluster]
    ecs_instance, msg = get_ecs_instance(ecs_cluster, instance_id, msg)
    if ecs_instance:
        msg += ["EC2 instance: " + event['resources'][0]]
        msg = describe_ecs_instance_tasks(ecs_cluster, ecs_instance[0], msg)
        msg = drain_ecs_instance(ecs_cluster, ecs_instance, msg)
    # NOTE(review): the summary is reported whenever a cluster was resolved,
    # so lookup failures recorded in msg stay visible — confirm this matches
    # the intended nesting of the original handler.
    slack(msg, slack_n)
    return {}
import requests
import json
class Slack:
    """Small helper that posts messages to a Slack webhook URL."""

    def __init__(self, webhook_url='<replace_me_with_my_slack_webhook_url>', timeout=5):
        """Store the webhook URL and the HTTP timeout.

        Args:
            webhook_url: URL the messages are POSTed to. The default is a
                placeholder that must be replaced with a real webhook URL.
            timeout: Seconds to wait for Slack before giving up.
        """
        self.slack_webhook_url = webhook_url
        self.timeout = timeout

    def message(self, channel, message):
        """Post ``message`` to ``channel`` as a single coloured attachment.

        Delivery is best-effort: any failure is printed, never raised, so a
        Slack outage cannot break the caller.
        """
        payload = {'channel': channel, 'attachments': [{'text': message, 'color': '#f7a311'}]}
        headers = {'Content-Type': 'application/json'}
        try:
            # Without a timeout a stalled connection would block the caller
            # indefinitely.
            response = requests.post(
                self.slack_webhook_url,
                data=json.dumps(payload),
                headers=headers,
                timeout=self.timeout,
            )
            # Surface non-2xx answers in the log instead of ignoring them.
            response.raise_for_status()
        except Exception as e:
            print('[Slack] Unexpected error: %s' % e)
# Event rule that matches events with source "aws.ec2" and detail-type
# "EC2 Spot Instance Interruption Warning".
resource "aws_cloudwatch_event_rule" "ec2-instance-termination" {
name = "capture-ec2-spot-instance-termination"
description = "Capture EC2 Spot Instance Interruption Warning"
event_pattern = <<PATTERN
{
"source": [
"aws.ec2"
],
"detail-type": [
"EC2 Spot Instance Interruption Warning"
]
}
PATTERN
}
# Sends events matched by the ec2-instance-termination rule to the target
# given by arn. The arn value is a placeholder: replace it with a reference
# to the drainer Lambda's ARN.
resource "aws_cloudwatch_event_target" "trigger-ec2-spot-ecs-drainer-lambda" {
rule = "${aws_cloudwatch_event_rule.ec2-instance-termination.name}"
arn = "${replace-with-my-lambda.arn}"
target_id = "ec2-spot-ecs-drainer"
}
# Lets events.amazonaws.com invoke the "ec2-spot-ecs-drainer" function, with
# source_arn limiting the grant to the ec2-instance-termination rule.
resource "aws_lambda_permission" "lambda-permission" {
statement_id = "AllowExecutionFromCloudWatch"
action = "lambda:InvokeFunction"
function_name = "ec2-spot-ecs-drainer"
principal = "events.amazonaws.com"
source_arn = "${aws_cloudwatch_event_rule.ec2-instance-termination.arn}"
}
# Inline IAM policy allowing the listed EC2 describe, ECS list/describe/update
# and CloudWatch DescribeAlarms actions on all resources. The role value is a
# placeholder: replace it with a reference to the Lambda's execution role.
resource "aws_iam_role_policy" "lambda-ec2-spot-ecs-drainer_role-policy" {
name = "lambda-ec2-spot-ecs-drainer_role-policy"
role = "${replace-with-my-lambda-role.id}"
policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"ec2:Describe*",
"ecs:DescribeContainerInstances",
"ecs:ListContainerInstances",
"ecs:ListTasks",
"ecs:DescribeTasks",
"ecs:UpdateContainerInstancesState",
"cloudwatch:DescribeAlarms"
],
"Resource": "*"
}
]
}
EOF
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment