Skip to content

Instantly share code, notes, and snippets.

@jhovell
Created February 1, 2017 21:52
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jhovell/e6639a0dceecf903193d37e181124110 to your computer and use it in GitHub Desktop.
Save jhovell/e6639a0dceecf903193d37e181124110 to your computer and use it in GitHub Desktop.
ECS BurstBalance monitoring, alerting and healthcheck
{
"AWSTemplateFormatVersion" : "2010-09-09",
"Description" : "Alerts on ECS burst balance and terminates unhealthy hosts",
"Parameters" : {
"ClusterName": {
"Type": "String",
"Description": "ECS Cluster name"
},
"AlertingSnsTopicArn": {
"Description": "ARN for alerting",
"Type":"String"
}
},
"Resources" : {
"EcsBurstBalanceRule": {
"Type" : "AWS::Events::Rule",
"Properties" : {
"Description" : "Rule to check if ECS hosts have low BurstBalance on gp2 volumes",
"Name" : { "Fn::Join" : [ "", [ { "Ref": "ClusterName" }, "EcsBurstBalanceRule" ]]},
"ScheduleExpression" : "cron(*/5 * * * ? *)",
"State" : "ENABLED",
"Targets" : [ {
"Arn" : { "Fn::GetAtt": ["EcsBurstBalanceLambda", "Arn"] },
"Id" : "TargetFunctionV1"
}
]
}
},
"EcsBurstBalanceLambda": {
"Type": "AWS::Lambda::Function",
"Properties": {
"Description": "Checks on on ECS burst balance and terminates unhealthy hosts",
"Handler": "index.handler",
"MemorySize": 128,
"Runtime": "nodejs4.3",
"Timeout": "10",
"Role": { "Fn::GetAtt" : ["EcsBurstBalanceLambdaRole", "Arn"] },
"Code": {
"ZipFile": { "Fn::Join": ["\n", [
"'use strict';",
"var AWS = require('aws-sdk');",
"var cloudwatch = new AWS.CloudWatch();",
"var ecs = new AWS.ECS();",
"var ec2 = new AWS.EC2();",
"var autoscaling = new AWS.AutoScaling();",
"",
"exports.handler = (event, context, callback) => {",
" var params = {",
{ "Fn::Join" : [ "", [ " cluster: '", { "Ref": "ClusterName" }, "'"] ] },
" };",
" ecs.listContainerInstances(params, describeContainers);",
" callback(null, 'Completed burst balance check');",
"};",
"",
"var describeContainers = function(err, data) {",
" if (err) {",
" console.log(err, err.stack);",
" } else {",
" var params = {",
{ "Fn::Join" : [ "", [ " cluster: '", { "Ref": "ClusterName" }, "',"] ] },
" containerInstances: data.containerInstanceArns",
" };",
" ecs.describeContainerInstances(params, getInstanceId);",
" }",
"};",
"",
"var getInstanceId = function(err, data) {",
" if (err) {",
" console.log(err, err.stack);",
" } else {",
" var instanceIds = data.containerInstances.map(i => i.ec2InstanceId);",
" instanceIds.forEach(describeInstance);",
" }",
"};",
"",
"var describeInstance = function(instanceId) {",
" var params = {",
" Attribute: 'blockDeviceMapping', ",
" InstanceId: instanceId",
" };",
" ec2.describeInstanceAttribute(params, findVolume);",
"};",
"",
"var findVolume = function(err, data) {",
" if (err) {",
" console.log(err, err.stack);",
" } else {",
" data.BlockDeviceMappings.filter(b => b.DeviceName === '/dev/xvdcz')",
" .map(b => b.Ebs.VolumeId).forEach(getBurstBalance(data.InstanceId));",
" }",
"};",
"",
"var getBurstBalance = function(instanceId) {",
" return function(volumeId) {",
" var params = {",
" EndTime: new Date(),",
" MetricName: 'BurstBalance',",
" Namespace: 'AWS/EBS',",
" Period: 60,",
" StartTime: ((new Date().valueOf() / 1000) - 60* 15), // fifteen minutes",
" Dimensions: [",
" {",
" Name: 'VolumeId',",
" Value: volumeId",
" }",
" ],",
" Statistics: [",
" 'Minimum'",
" ],",
" Unit: 'Percent'",
" };",
" cloudwatch.getMetricStatistics(params, getResult(volumeId, instanceId));",
" };",
"};",
"",
"var getResult = function(volumeId, instanceId) {",
" return function(err, data) {",
" if (err) {",
" console.log(err, err.stack);",
" } else {",
" var minimum = data.Datapoints.map(d => d.Minimum).sort(function(x,y) { return x < y })[0];",
" console.log('minimum for ' + volumeId + ' and instanceId ' + instanceId + ' is ' + minimum);",
" reportBurstBalance(minimum);",
" if (minimum < 10) {",
" var params = {",
" InstanceId: instanceId,",
" HealthStatus: 'Unhealthy'",
" };",
" autoscaling.setInstanceHealth(params, function(err, data) {",
" if (err) {",
" console.log(err, err.stack);",
" }",
" });",
" }",
" }",
" };",
"};",
"",
"var reportBurstBalance = function(minimum) {",
" var now = new Date().valueOf() / 1000;",
" var params = {",
" MetricData: [",
" {",
" MetricName: 'BurstBalance',",
" Dimensions: [",
" {",
" Name: 'ClusterName',",
{ "Fn::Join" : [ "", [ " Value: '", { "Ref": "ClusterName" }, "'"] ] },
" }",
" ],",
" Timestamp: now,",
" Unit: 'Percent',",
" Value: minimum",
" }",
" ],",
" Namespace: 'AWS/ECS'",
" };",
" cloudwatch.putMetricData(params, function(err, data) {",
" if (err) {",
" console.log(err, err.stack);",
" }",
" });",
"};"
]]}
}
}
},
"LambdaInvokePermission": {
"Type": "AWS::Lambda::Permission",
"Properties": {
"FunctionName" : { "Fn::GetAtt" : ["EcsBurstBalanceLambda", "Arn"] },
"Action": "lambda:InvokeFunction",
"Principal": "events.amazonaws.com",
"SourceArn": { "Fn::GetAtt": ["EcsBurstBalanceRule", "Arn"] }
}
},
"EcsBurstBalanceLambdaRole": {
"Type": "AWS::IAM::Role",
"Properties": {
"AssumeRolePolicyDocument": {
"Version": "2012-10-17",
"Statement": [{
"Effect": "Allow",
"Principal": {"Service": ["lambda.amazonaws.com"]},
"Action": ["sts:AssumeRole"]
}]
},
"Path": "/",
"Policies": [{
"PolicyName": "root",
"PolicyDocument": {
"Version": "2012-10-17",
"Statement": [{
"Effect": "Allow",
"Action": ["logs:CreateLogGroup","logs:CreateLogStream","logs:PutLogEvents"],
"Resource": "arn:aws:logs:*:*:*"
},
{
"Sid": "StmtBurstBalanceGlobal",
"Effect": "Allow",
"Action": [
"autoscaling:SetInstanceHealth",
"cloudwatch:GetMetricStatistics",
"cloudwatch:PutMetricData",
"ec2:DescribeInstanceAttribute",
"ecs:DescribeContainerInstances"
],
"Resource": [
"*"
]
},
{
"Sid": "StmtEcsCluster",
"Action": [
"ecs:ListContainerInstances"
],
"Effect": "Allow",
"Resource": { "Fn::Join" : [ "", [ "arn:aws:ecs:", { "Ref": "AWS::Region" }, ":", { "Ref": "AWS::AccountId" }, ":cluster/", { "Ref": "ClusterName" } ] ] }
}
]
}
}]
}
},
"EcsBurstBalanceAlarm": {
"Type" : "AWS::CloudWatch::Alarm",
"Properties" : {
"ActionsEnabled" : true,
"AlarmActions" : [ { "Ref": "AlertingSnsTopicArn" } ],
"AlarmDescription" : "Alerts on low burst balance on ECS hosts",
"AlarmName" : { "Fn::Join": ["-", [{ "Ref": "ClusterName" }, "EcsBurstBalanceAlarm"]] },
"ComparisonOperator" : "LessThanThreshold",
"Dimensions" : [
{
"Name": "ClusterName",
"Value": { "Ref": "ClusterName" }
}
],
"EvaluationPeriods" : "2",
"InsufficientDataActions" : [ { "Ref": "AlertingSnsTopicArn" } ],
"MetricName" : "BurstBalance",
"Namespace" : "AWS/ECS",
"OKActions" : [ { "Ref": "AlertingSnsTopicArn" } ],
"Period" : "300",
"Statistic" : "Minimum",
"Threshold" : "30",
"Unit" : "Percent"
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment