Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@meetupwayne
Created April 20, 2017 20:54
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save meetupwayne/79341a87ddb68517f7f4f6cd101ae6ec to your computer and use it in GitHub Desktop.
Save meetupwayne/79341a87ddb68517f7f4f6cd101ae6ec to your computer and use it in GitHub Desktop.
import boto3
import json
import urllib, urllib2
import os
from datetime import datetime
from dateutil import tz
import time
# AWS service handles, created once when the module is imported.
# NOTE(review): `ecs` and `sns` are not referenced by any function visible in
# this file -- confirm whether they are still needed.
ecs = boto3.client('ecs')
sns = boto3.client('sns')
# Resource-style CloudFormation handle; get_cluster_topic() reads a stack
# resource through it.
cloudformation = boto3.resource('cloudformation')
# PagerDuty endpoint that publish_to_pagerduty() POSTs its trigger event to.
PD_ENDPOINT="https://events.pagerduty.com/generic/2010-04-15/create_event.json"
# Prefix of the ECS console URL for a cluster; publish_to_pagerduty() appends
# the cluster name. The region is hard-coded to us-east-1.
CLUSTER_BASE_URL="https://console.aws.amazon.com/ecs/home?region=us-east-1#/clusters/"
def convert_to_et(date_string):
    """Render a UTC timestamp string as America/New_York wall-clock text.

    ``date_string`` must match ``%Y-%m-%dT%H:%M:%S.%fZ`` (fractional seconds
    are required by the parse format), e.g. ``2017-04-20T20:54:00.123Z``.
    The result is formatted with ``"%A, %B %d %Y %I:%M%p"``, e.g.
    ``Thursday, April 20 2017 04:54PM``.
    """
    utc = tz.gettz('UTC')
    eastern = tz.gettz('America/New_York')
    parsed = datetime.strptime(date_string, '%Y-%m-%dT%H:%M:%S.%fZ')
    # The parsed value is naive; tag it as UTC before shifting zones.
    localized = parsed.replace(tzinfo=utc).astimezone(eastern)
    return localized.strftime("%A, %B %d %Y %I:%M%p")
def convert_to_ts(date_string):
    """Convert a UTC timestamp string to seconds since the Unix epoch.

    ``date_string`` must match ``%Y-%m-%dT%H:%M:%S.%fZ``. The fractional
    seconds are parsed but dropped, so the result is a whole number of
    seconds returned as a float (the type ``time.mktime`` used to return).

    The input is UTC, so it is converted with ``calendar.timegm``.
    ``time.mktime`` would interpret the same fields as *local* time and give
    a shifted epoch value whenever the process time zone is not UTC.
    """
    # Local import keeps this block self-contained; the file does not import
    # calendar at module level.
    import calendar

    parsed = datetime.strptime(date_string, '%Y-%m-%dT%H:%M:%S.%fZ')
    return float(calendar.timegm(parsed.timetuple()))
def get_running_time(startedAt,stoppedAt):
    """Return the whole number of seconds between two UTC timestamp strings.

    Both arguments must match ``%Y-%m-%dT%H:%M:%S.%fZ``.

    Uses ``timedelta.total_seconds()`` rather than ``timedelta.seconds``:
    ``.seconds`` is only the sub-day remainder, so a task that ran for more
    than 24 hours would otherwise be reported with its days dropped.
    """
    stamp_format = '%Y-%m-%dT%H:%M:%S.%fZ'
    started = datetime.strptime(startedAt, stamp_format)
    stopped = datetime.strptime(stoppedAt, stamp_format)
    return int((stopped - started).total_seconds())
def get_reasons(detail):
    """Summarise why each container of a task stopped and pick notifications.

    Args:
        detail: dict read for ``containers`` (a list of dicts with optional
            ``name``, ``lastStatus``, ``exitCode`` and ``reason`` keys) and
            ``stoppedReason``.

    Returns:
        A ``(reasons, should_alert, should_slack)`` tuple. ``reasons`` is one
        ``'* name lastStatus exit-message reason'`` line per container joined
        with newlines; the two flags are ``0`` or ``1``.

    Per container:
      * any exit code other than ``0`` (including a missing one) sets
        ``should_slack``;
      * an explicit ``None`` exit code sets ``should_alert``;
      * a positive or missing exit code sets ``should_alert`` unless the
        container reason contains a suppress phrase, in which case it is
        cleared (this also clears a flag set by an earlier container, as the
        original loop did);
      * a zero or negative exit code sets both flags only when the reason
        matches one of the trigger patterns.

    NOTE(review): the indentation of the original source was lost; the branch
    nesting above is the most plausible reading -- confirm against the
    deployed copy.
    """
    # Local import keeps this block self-contained; the file does not import
    # fnmatch at module level.
    import fnmatch

    suppress_alert_reasons = ['Scaling activity initiated']
    # Glob patterns ('*' matches any run of characters). The last entry was
    # previously tested with `in`, which treats '*' literally and so could
    # never match a real "Host EC2 (...) terminated." message.
    trigger_alert_patterns = [
        'CannotPullContainerError',
        'OutOfMemoryError: Container killed due to memory usage',
        'Host EC2*terminated.',
    ]

    stopped_reason = detail.get('stoppedReason') or ''
    lines = []
    should_alert = 0
    should_slack = 0

    # A missing/None container list yields an empty summary instead of a crash.
    for container in detail.get('containers') or []:
        name = container.get('name')
        last_status = container.get('lastStatus', ' ')
        exit_code = container.get('exitCode', '')
        reason = container.get('reason') or ''

        # Reset per container so a failed container's exit-code text is not
        # repeated on the lines of containers that exited cleanly.
        exit_code_msg = ''
        if exit_code != 0:
            exit_code_msg = 'with exit code: %s' % exit_code
            should_slack = 1

        if exit_code is None:
            should_alert = 1
        elif not isinstance(exit_code, (int, float)) or exit_code > 0:
            # A non-numeric exit code (the '' default for a missing key) is
            # treated as a failure. That is what Python 2's "any str > any
            # int" ordering did implicitly; spelling it out also keeps the
            # comparison from raising TypeError on Python 3.
            suppressed = any(
                phrase in reason for phrase in suppress_alert_reasons)
            should_alert = 0 if suppressed else 1
        else:
            # NOTE(review): this replaces a non-empty container reason with
            # the task-level stoppedReason, so on this path the trigger
            # patterns are never tested against the container's own reason.
            # It looks inverted -- confirm intent before changing it.
            if reason:
                reason = stopped_reason
            if any(fnmatch.fnmatchcase(reason, '*%s*' % pattern)
                   for pattern in trigger_alert_patterns):
                should_alert = 1
                should_slack = 1

        lines.append(
            '* %s %s %s %s' % (name, last_status, exit_code_msg, reason))

    return "\n".join(lines), should_alert, should_slack
def publish_to_slack(detail,webhook_url,task_containers):
    """Post a "task has stopped" attachment to a Slack incoming webhook.

    Args:
        detail: task detail dict; reads ``stoppedAt``, ``startedAt`` (or
            ``createdAt``), ``clusterArn``, ``taskDefinitionArn`` and
            ``stoppedReason``.
        webhook_url: URL the payload is POSTed to.
        task_containers: pre-formatted per-container summary text, inserted
            into the message body.

    The request, payload and response body are printed for debugging.
    """
    stopped_at = detail['stoppedAt']
    # A task that stops before it ever reaches RUNNING has no 'startedAt';
    # fall back to 'createdAt' (then to the stop time) so those failures
    # are reported instead of dying on a KeyError.
    started_at = (detail.get('startedAt') or detail.get('createdAt')
                  or stopped_at)

    starttime = convert_to_et(started_at)
    stoptime = convert_to_et(stopped_at)
    timestamp = convert_to_ts(stopped_at)

    # Computed with total_seconds() so runs longer than a day are not
    # truncated to their sub-day remainder.
    stamp_format = '%Y-%m-%dT%H:%M:%S.%fZ'
    elapsed = (datetime.strptime(stopped_at, stamp_format)
               - datetime.strptime(started_at, stamp_format))
    task_running_time = int(elapsed.total_seconds())

    cluster = detail['clusterArn'].split('/')[1]
    stoppedReason = detail.get('stoppedReason','')
    # "…:task-definition/<family>:<revision>" -> "<family>"
    taskname = detail['taskDefinitionArn'].split('/')[1].split(':')[0]

    title = "%s has stopped running: %s" % ( taskname, stoppedReason)
    info = 'Task: %s \n Started: %s \n Stopped: %s \n Duration: %s seconds \nTask Containers:\n %s \n Reason: %s' % (taskname, starttime, stoptime, task_running_time, task_containers ,stoppedReason)
    payload = {
        "attachments":[
            {
                "fallback":"Task Failure Encountered",
                "color":"#D00000",
                "ts": timestamp,
                "username": cluster,
                "icon_emoji": ":skull:",
                "fields":[
                    {
                        "title": title,
                        "value": info
                    }
                ]
            }
        ]
    }

    # Form-encoded POST with the JSON document in the 'payload' field.
    response = urllib.urlopen(webhook_url, urllib.urlencode({'payload':json.dumps(payload)}))
    try:
        # NOTE(review): this writes the webhook URL to the function's
        # output; if the URL is treated as a secret, drop it from the log.
        print("%s %s %s" % (webhook_url, payload, detail))
        print(response.read())
    finally:
        # Release the connection even if reading the body fails.
        response.close()
def publish_to_pagerduty(service_key,detail):
    """Send a PagerDuty "trigger" event describing a stopped task.

    Args:
        service_key: PagerDuty integration key placed in the payload.
        detail: task detail dict; reads ``createdAt``, ``stoppedAt``,
            ``startedAt`` (optional), ``clusterArn``, ``taskDefinitionArn``,
            ``stoppedReason`` and whatever ``get_reasons`` reads.

    Raises:
        urllib2.HTTPError: for any non-403 HTTP error, or when the endpoint
            still answers 403 after the final retry.
    """
    # Only the formatted container summary is needed here.
    task_containers = get_reasons(detail)[0]

    created_at = detail['createdAt']
    stopped_at = detail['stoppedAt']
    starttime = convert_to_et(created_at)
    stoptime = convert_to_et(stopped_at)
    stoppedReason = detail.get('stoppedReason','')
    # "…:task-definition/<family>:<revision>" -> "<family>"
    taskname = detail['taskDefinitionArn'].split('/')[1].split(':')[0]
    description = "%s has stopped running: %s" % ( taskname, stoppedReason)
    cluster = detail['clusterArn'].split('/')[1]
    client_url="%s%s/" % (CLUSTER_BASE_URL, cluster)

    # A task that never reached RUNNING has no 'startedAt'; measure from
    # creation in that case rather than raising KeyError. total_seconds()
    # keeps whole days that `.seconds` would silently drop.
    stamp_format = '%Y-%m-%dT%H:%M:%S.%fZ'
    t1 = datetime.strptime(stopped_at, stamp_format)
    t0 = datetime.strptime(detail.get('startedAt') or created_at, stamp_format)
    duration = "%.2f seconds" % (t1 - t0).total_seconds()

    pd_payload = {
        "service_key": service_key,
        "event_type": "trigger",
        "description": description,
        "client": cluster,
        "client_url": client_url,
        "details": {
            "Cluster":cluster,
            "Task": taskname,
            "Started": starttime,
            "Stopped":stoptime,
            "Task Containers":task_containers,
            "Reason":stoppedReason,
            "Duration":duration
        },
        "contexts":[
            {
                "type": "link",
                "href": "http://meetup.pagerduty.com",
                "text": "View the incident on PagerDuty"
            }
        ]
    }
    pd_payload=json.dumps(pd_payload)

    # Retry only on HTTP 403, pausing one second between attempts. The HTTP
    # status lives in HTTPError.code (the old check read `.errno`, so the
    # retry never fired). The loop is bounded so a persistent 403 surfaces
    # as an error instead of spinning until the caller times out.
    max_attempts = 5
    for attempt in range(1, max_attempts + 1):
        try:
            response = urllib2.urlopen(url=PD_ENDPOINT,data=pd_payload,timeout=5)
            try:
                response.read()
            finally:
                response.close()
            return
        except urllib2.HTTPError as err:
            if err.code != 403 or attempt == max_attempts:
                raise
            time.sleep(1)
def get_cluster_topic(detail):
    """Return the physical ID of the ``TaskStoppedTopic`` stack resource.

    The stack name is derived from ``detail['clusterArn']``: the segment
    after the first ``/`` with every ``-cluster`` replaced by ``-ce-rule``.
    """
    cluster_name = detail['clusterArn'].split('/')[1]
    stack_name = cluster_name.replace('-cluster', '-ce-rule')
    resource = cloudformation.StackResource(
        stack_name=stack_name, logical_id='TaskStoppedTopic')
    return resource.physical_resource_id
def lambda_handler(event, context):
    """Entry point: notify Slack and/or PagerDuty when an ECS task stops.

    Configuration comes from the ``Custom`` environment variable, formatted
    as ``<slack webhook url>|<pagerduty service key>``.

    Args:
        event: event dict; must contain ``detail-type``. Only events whose
            ``detail-type`` is ``"ECS Task State Change"`` and whose
            ``detail.lastStatus`` is ``"STOPPED"`` are acted on.
        context: unused.

    Raises:
        ValueError: if ``Custom`` is unset or lacks the ``|`` separator, or
            if ``event`` has no ``detail-type`` key.
    """
    print("starting")

    # Fail with a readable message instead of the AttributeError/IndexError
    # that a missing or malformed setting used to produce.
    custom = os.getenv('Custom')
    parts = custom.split('|') if custom else []
    if len(parts) < 2:
        raise ValueError("ERROR: environment variable 'Custom' must be set to '<slack webhook url>|<pagerduty service key>'")
    webhook_url = parts[0]
    service_key = parts[1]

    print(json.dumps(event))

    if "detail-type" not in event:
        raise ValueError("ERROR: event object is not a valid CloudWatch event")
    if event["detail-type"] != "ECS Task State Change":
        return

    detail = event["detail"]
    if detail["lastStatus"] != "STOPPED":
        return

    # get_reasons decides, per container exit code/reason, which channels
    # (if any) should hear about this stop.
    task_containers, should_alert, should_slack = get_reasons(detail)
    if should_slack == 1:
        publish_to_slack(detail,webhook_url,task_containers)
    if should_alert == 1:
        publish_to_pagerduty(service_key,detail)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment