Created
April 20, 2017 20:54
-
-
Save meetupwayne/79341a87ddb68517f7f4f6cd101ae6ec to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import boto3 | |
import json | |
import urllib, urllib2 | |
import os | |
from datetime import datetime | |
from dateutil import tz | |
import time | |
# AWS service handles, created once when the module is imported.
ecs = boto3.client("ecs")
sns = boto3.client("sns")
cloudformation = boto3.resource("cloudformation")

# PagerDuty endpoint that publish_to_pagerduty posts events to.
PD_ENDPOINT = "https://events.pagerduty.com/generic/2010-04-15/create_event.json"

# ECS console URL prefix; publish_to_pagerduty appends the cluster name to it.
CLUSTER_BASE_URL = "https://console.aws.amazon.com/ecs/home?region=us-east-1#/clusters/"
def convert_to_et(date_string):
    """Format a UTC timestamp string as a readable America/New_York time.

    Args:
        date_string: timestamp shaped like ``2017-04-20T20:54:00.123Z``
            (fractional seconds are required by the parse format).

    Returns:
        A string such as ``Thursday, April 20 2017 04:54PM``.

    Raises:
        ValueError: if ``date_string`` does not match the expected format.
    """
    utc_zone = tz.gettz('UTC')
    eastern_zone = tz.gettz('America/New_York')
    # The parsed value is naive; tag it as UTC before converting zones.
    parsed = datetime.strptime(date_string, '%Y-%m-%dT%H:%M:%S.%fZ')
    eastern_time = parsed.replace(tzinfo=utc_zone).astimezone(eastern_zone)
    return eastern_time.strftime("%A, %B %d %Y %I:%M%p")
def convert_to_ts(date_string):
    """Convert a UTC timestamp string to Unix epoch seconds.

    The input is UTC (trailing ``Z``), so the conversion uses
    ``calendar.timegm``. ``time.mktime`` would read the time tuple as local
    time and give a wrong epoch on any host whose timezone is not UTC.

    Args:
        date_string: timestamp shaped like ``2017-04-20T20:54:00.123Z``.

    Returns:
        Epoch seconds as a float. Fractional seconds are dropped because the
        time tuple carries no sub-second field.

    Raises:
        ValueError: if ``date_string`` does not match the expected format.
    """
    import calendar  # function-scope import keeps this block self-contained

    parsed = datetime.strptime(date_string, '%Y-%m-%dT%H:%M:%S.%fZ')
    return float(calendar.timegm(parsed.timetuple()))
def get_running_time(startedAt,stoppedAt):
    """Return the whole number of seconds between two UTC timestamp strings.

    Args:
        startedAt: start timestamp shaped like ``2017-04-20T20:54:00.123Z``.
        stoppedAt: stop timestamp in the same format.

    Returns:
        Elapsed seconds as an int, truncated toward zero.

    Raises:
        ValueError: if either argument does not match the expected format.
    """
    fmt = '%Y-%m-%dT%H:%M:%S.%fZ'
    elapsed = datetime.strptime(stoppedAt, fmt) - datetime.strptime(startedAt, fmt)
    # timedelta.seconds holds only the sub-day remainder; total_seconds()
    # includes whole days, so tasks running longer than 24h report correctly.
    return int(elapsed.total_seconds())
def _is_positive(value):
    """Evaluate ``value > 0`` with Python 2's mixed-type ordering on any version.

    Python 2 orders ``None`` below every number and other non-numeric objects
    (such as the ``''`` default for a missing exit code) above them. Python 3
    raises TypeError for those comparisons, so that ordering is spelled out.
    """
    try:
        return value > 0
    except TypeError:
        return value is not None


def get_reasons(detail):
    """Summarise why a task's containers stopped and decide who to notify.

    NOTE(review): the branch nesting was reconstructed from a copy of this
    file that had lost its indentation; confirm it against the deployed code.

    Args:
        detail: the ``detail`` dict of a task event. Reads the optional keys
            ``containers`` (list of dicts) and ``stoppedReason``.

    Returns:
        A ``(reasons, should_alert, should_slack)`` tuple. ``reasons`` is one
        ``* name status exit-code reason`` line per container, joined with
        newlines. The two flags are ``0`` or ``1``.
    """
    import fnmatch  # function-scope import keeps this block self-contained

    supress_alert_reasons = ['Scaling activity initiated']
    # Entries may contain fnmatch wildcards (see 'Host EC2*terminated.').
    trigger_alert_reasons = ['CannotPullContainerError', 'OutOfMemoryError: Container killed due to memory usage', 'Host EC2*terminated.']

    reasons = []
    should_alert = 0
    should_slack = 0
    stoppedreason = detail.get('stoppedReason', '')

    # A missing or null 'containers' entry yields an empty summary, not a crash.
    for container in detail.get('containers') or []:
        name = container.get('name')
        lastStatus = container.get('lastStatus', ' ')
        exitCode = container.get('exitCode', '')
        reason = container.get('reason', '')

        # Reset per container so a clean exit does not inherit the previous
        # container's exit-code text.
        exitCodeMsg = ''
        if exitCode != 0:
            exitCodeMsg = 'with exit code: %s' % exitCode
            should_slack = 1

        if _is_positive(exitCode):
            # Positive (or missing) exit code: alert unless the reason is suppressed.
            suppressed = any(text in reason for text in supress_alert_reasons)
            should_alert = 0 if suppressed else 1
        elif exitCode == 0 or exitCode:
            # NOTE(review): this replaces a non-empty container reason with the
            # task-level stoppedReason; `if not reason` may have been intended.
            if reason:
                reason = stoppedreason
            for pattern in trigger_alert_reasons:
                # Wildcard-aware "contains" test; a plain `in` can never match
                # the literal '*' in 'Host EC2*terminated.'.
                if fnmatch.fnmatchcase(reason, '*%s*' % pattern):
                    should_alert = 1
                    should_slack = 1
        else:
            should_alert = 1

        reasons.append('* %s %s %s %s' % (name, lastStatus, exitCodeMsg, reason))

    return "\n".join(reasons), should_alert, should_slack
def publish_to_slack(detail,webhook_url,task_containers):
    """Post a "task has stopped" attachment to a Slack webhook.

    Args:
        detail: the ``detail`` dict of an ECS task event. Reads ``stoppedAt``,
            ``clusterArn``, ``taskDefinitionArn``, ``startedAt`` (falling back
            to ``createdAt``) and the optional ``stoppedReason``.
        webhook_url: URL the payload is POSTed to.
        task_containers: preformatted per-container summary text, as returned
            by ``get_reasons``.

    Raises:
        KeyError: if a required key is missing from ``detail``.
        IOError: if the HTTP request fails.
    """
    # startedAt may be absent (presumably for tasks that stop before they ever
    # run - confirm against the ECS event reference), so fall back to the
    # creation time instead of raising KeyError.
    started_at = detail.get('startedAt') or detail['createdAt']
    stopped_at = detail['stoppedAt']

    starttime = convert_to_et(started_at)
    stoptime = convert_to_et(stopped_at)
    timestamp = convert_to_ts(stopped_at)
    task_running_time = get_running_time(started_at, stopped_at)
    cluster = detail['clusterArn'].split('/')[1]
    stoppedReason = detail.get('stoppedReason','')
    taskname = detail['taskDefinitionArn'].split('/')[1].split(':')[0]
    title = "%s has stopped running: %s" % ( taskname, stoppedReason)
    info = 'Task: %s \n Started: %s \n Stopped: %s \n Duration: %s seconds \nTask Containers:\n %s \n Reason: %s' % (taskname, starttime, stoptime, task_running_time, task_containers ,stoppedReason)
    # NOTE(review): "username" and "icon_emoji" sit inside the attachment;
    # Slack presumably only honours them at the top level of the payload -
    # confirm before moving them.
    payload = {
        "attachments":[
            {
                "fallback":"Task Failure Encountered",
                "color":"#D00000",
                "ts": timestamp,
                "username": cluster,
                "icon_emoji": ":skull:",
                "fields":[
                    {
                        "title": title,
                        "value": info
                    }
                ]
            }
        ]
    }
    # Make the request and log both sides so a failed delivery can be diagnosed.
    response = urllib.urlopen(webhook_url, urllib.urlencode({'payload':json.dumps(payload)}))
    try:
        print("%s %s %s" % (webhook_url, payload, detail))
        print(response.read())
    finally:
        # Always release the HTTP connection, even if logging fails.
        response.close()
def publish_to_pagerduty(service_key,detail):
    """Trigger a PagerDuty incident describing a stopped ECS task.

    Args:
        service_key: PagerDuty integration key placed in the payload.
        detail: the ``detail`` dict of an ECS task event. Reads ``createdAt``,
            ``stoppedAt``, ``clusterArn``, ``taskDefinitionArn``, ``startedAt``
            (falling back to ``createdAt``) and the optional ``stoppedReason``.

    Raises:
        KeyError: if a required key is missing from ``detail``.
        urllib2.HTTPError: for any non-403 HTTP error, or a 403 that persists
            after every retry.
        urllib2.URLError: if the endpoint cannot be reached.
    """
    time_format = '%Y-%m-%dT%H:%M:%S.%fZ'
    max_attempts = 5  # upper bound so a persistent 403 cannot loop forever

    task_containers = get_reasons(detail)[0]
    starttime = convert_to_et(detail['createdAt'])
    stoptime = convert_to_et(detail['stoppedAt'])
    stoppedReason = detail.get('stoppedReason','')
    taskname = detail['taskDefinitionArn'].split('/')[1].split(':')[0]
    description = "%s has stopped running: %s" % ( taskname, stoppedReason)
    cluster = detail['clusterArn'].split('/')[1]
    client_url = "%s%s/" % (CLUSTER_BASE_URL, cluster)

    # startedAt may be absent (presumably for tasks that never ran - confirm);
    # measure from createdAt in that case rather than raising KeyError.
    started_at = detail.get('startedAt') or detail['createdAt']
    elapsed = (datetime.strptime(detail['stoppedAt'], time_format)
               - datetime.strptime(started_at, time_format))
    # total_seconds() includes whole days; timedelta.seconds would drop them.
    duration = "%.2f seconds" % elapsed.total_seconds()

    pd_payload = json.dumps({
        "service_key": service_key,
        "event_type": "trigger",
        "description": description,
        "client": cluster,
        "client_url": client_url,
        "details": {
            "Cluster":cluster,
            "Task": taskname,
            "Started": starttime,
            "Stopped":stoptime,
            "Task Containers":task_containers,
            "Reason":stoppedReason,
            "Duration":duration
        },
        "contexts":[
            {
                "type": "link",
                "href": "http://meetup.pagerduty.com",
                "text": "View the incident on PagerDuty"
            }
        ]
    })

    for attempt in range(1, max_attempts + 1):
        try:
            urllib2.urlopen(url=PD_ENDPOINT,data=pd_payload,timeout=5).read()
            break
        except urllib2.HTTPError as err:
            # The HTTP status is in .code; .errno is not the status, so the
            # old check never matched. A 403 is retried after a short pause
            # (presumably throttling - confirm against the PagerDuty docs).
            if err.code != 403 or attempt == max_attempts:
                raise
            time.sleep(1)
def get_cluster_topic(detail):
    """Look up the physical id of the cluster's ``TaskStoppedTopic`` resource.

    The CloudFormation stack name is derived from the cluster name in
    ``detail['clusterArn']`` by replacing ``-cluster`` with ``-ce-rule``.

    Args:
        detail: dict containing a ``clusterArn`` entry.

    Returns:
        The ``physical_resource_id`` of the stack's ``TaskStoppedTopic``.
    """
    cluster_name = detail['clusterArn'].split('/')[1]
    stack_name = cluster_name.replace('-cluster','-ce-rule')
    topic_resource = cloudformation.StackResource(
        stack_name=stack_name,
        logical_id='TaskStoppedTopic',
    )
    return topic_resource.physical_resource_id
def lambda_handler(event, context):
    """Lambda entry point: notify Slack and/or PagerDuty when an ECS task stops.

    Only events whose ``detail-type`` is ``ECS Task State Change`` and whose
    ``detail.lastStatus`` is ``STOPPED`` are acted on; all others are ignored.
    ``get_reasons`` decides which of the two channels is notified.

    Args:
        event: CloudWatch event dict.
        context: Lambda context object (unused).

    Raises:
        ValueError: if the ``Custom`` environment variable is not of the form
            ``<slack webhook url>|<pagerduty service key>``, or if ``event``
            has no ``detail-type`` key.
    """
    print("starting")

    # Both secrets arrive in one pipe-separated variable; validate it up front
    # so a misconfiguration fails with a readable message.
    custom = os.getenv('Custom')
    settings = custom.split('|') if custom else []
    if len(settings) < 2:
        raise ValueError("ERROR: environment variable 'Custom' must be '<slack webhook url>|<pagerduty service key>'")
    webhook_url, service_key = settings[0], settings[1]

    print(json.dumps(event))

    if "detail-type" not in event:
        raise ValueError("ERROR: event object is not a valid CloudWatch event")
    if event["detail-type"] != "ECS Task State Change":
        return

    detail = event["detail"]
    if detail["lastStatus"] != "STOPPED":
        return

    task_containers, should_alert, should_slack = get_reasons(detail)
    if should_slack == 1:
        publish_to_slack(detail,webhook_url,task_containers)
    if should_alert == 1:
        publish_to_pagerduty(service_key,detail)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment