Skip to content

Instantly share code, notes, and snippets.

@djenriquez
Last active October 10, 2016 09:34
Show Gist options
  • Save djenriquez/68ed05b648cbf7d5897e to your computer and use it in GitHub Desktop.
Save djenriquez/68ed05b648cbf7d5897e to your computer and use it in GitHub Desktop.
AWS ECS Container Autoscale Lambda function: Create a CloudWatch CPU metric for high and low alarms. Create SNS topics to trigger from each alarm. Subscribe the lambda function to the SNS topics. Let magic happen.
//Top-level statement: logged when this module is loaded, before any handler call
console.log('Loading event');
//Import AWS-SDK (provides the AWS.ECS and AWS.CloudWatch clients used by the handler)
var AWS = require('aws-sdk');
/**
 * Reads the CPU value declared on a task definition.
 * Only the first container definition is consulted.
 *
 * @param {Object} taskDefinitionData - object exposing taskDefinition.containerDefinitions
 * @returns {*} the `cpu` property of the first container definition
 */
var fetchRequiredCPU = function(taskDefinitionData)
{
    var firstContainer = taskDefinitionData.taskDefinition.containerDefinitions[0];
    return firstContainer.cpu;
};
/**
 * Reads the memory value declared on a task definition.
 * Only the first container definition is consulted.
 *
 * @param {Object} taskDefinitionData - object exposing taskDefinition.containerDefinitions
 * @returns {*} the `memory` property of the first container definition
 */
var fetchRequiredMemory = function(taskDefinitionData)
{
    var firstContainer = taskDefinitionData.taskDefinition.containerDefinitions[0];
    return firstContainer.memory;
};
/**
 * Extracts the measured utilisation from an alarm's state-reason text.
 * The value is the first parenthesised number following
 * "Threshold Crossed: N data...".
 *
 * @param {string} newStateReason - alarm reason text
 * @returns {number} the measured value truncated to a 32-bit integer
 * @throws {TypeError} when the text does not match the expected pattern
 */
var fetchUtilFromReason = function(newStateReason)
{
    var match = /Threshold Crossed: \d+ data\S+ \(([0-9\.E-]+)\)/g.exec(newStateReason);
    console.log("Alarm CPUUtil:", match);
    // `| 0` drops the fractional part (and maps NaN to 0)
    return parseFloat(match[1]) | 0;
};
/**
 * Extracts the alarm threshold (the utilisation goal) from an alarm's
 * state-reason text. It is the second captured number: the parenthesised
 * value that follows the measured one.
 *
 * @param {string} newStateReason - alarm reason text
 * @returns {number} the threshold truncated to a 32-bit integer
 * @throws {TypeError} when the text does not match the expected pattern
 */
var fetchDesiredUtilFromReason = function(newStateReason)
{
    var match = /Threshold Crossed: \d+ data\S+ \(([0-9\.E-]+)\).+\(([0-9\.]+)\)/g.exec(newStateReason);
    console.log("Desired CPUUtil:", match);
    // `| 0` drops the fractional part (and maps NaN to 0)
    return parseFloat(match[2]) | 0;
};
/**
 * Computes the task count that would bring utilisation to the goal:
 * (runningCount * alarmCPUUtil) / desiredCPUUtil, truncated to an integer
 * and never lower than 1.
 *
 * @param {number} desiredCPUUtil - utilisation goal
 * @param {Object} serviceData - service description; only `runningCount` is read
 * @param {string} ecsServiceName - name of the ECS service (not used in the calculation)
 * @param {number} alarmCPUUtil - utilisation reported by the alarm
 * @returns {number} the target task count (>= 1)
 */
var calculateDesiredCount = function(desiredCPUUtil, serviceData, ecsServiceName, alarmCPUUtil)
{
    var totalLoad = serviceData.runningCount * alarmCPUUtil;
    // `| 0` truncates; non-finite results (e.g. division by zero) become 0
    var scaledCount = (totalLoad / desiredCPUUtil) | 0;
    return scaledCount <= 0 ? 1 : scaledCount;
};
/**
 * Callback for the ECS updateService call.
 *
 * On failure the error is logged and rethrown. The Lambda `context` object
 * is not in scope in this module-level function, so the previous
 * `context.fail()` call raised a ReferenceError that hid the real error.
 * Rethrowing keeps the "invocation fails" outcome while surfacing the
 * actual cause.
 *
 * @param {*} err - error reported by the SDK, or a falsy value on success
 * @param {*} data - response payload on success
 * @throws {Error} the reported error (wrapped in an Error if it is not one)
 */
var updateServiceDefinitionResp = function(err, data)
{
    if (err)
    {
        console.log(err);
        throw (err instanceof Error) ? err : new Error(String(err));
    }
    console.log("Successfully updated service definition:", data);
};
/**
 * Callback for the CloudWatch setAlarmState call.
 *
 * On failure the error is logged and rethrown. The Lambda `context` object
 * is not in scope in this module-level function, so the previous
 * `context.fail()` call raised a ReferenceError that hid the real error.
 * Rethrowing keeps the "invocation fails" outcome while surfacing the
 * actual cause.
 *
 * @param {*} err - error reported by the SDK, or a falsy value on success
 * @param {*} data - response payload on success
 * @throws {Error} the reported error (wrapped in an Error if it is not one)
 */
var changeAlarmStateResp = function(err, data)
{
    if (err)
    {
        console.log(err);
        throw (err instanceof Error) ? err : new Error(String(err));
    }
    console.log("Successfully changed alarm state:", data);
};
exports.handler = function(event, context) {
var ecsService = "";
var ecsCluster = "";
var message = JSON.parse(event.Records[0].Sns.Message);
console.log("Received message:", message);
var msgDimensions = message.Trigger.Dimensions;
var ecsRegion = message.Region.toLowerCase();
var alarmCPUUtil = fetchUtilFromReason(message.NewStateReason);
var desiredCPUUtil = fetchDesiredUtilFromReason(message.NewStateReason);
var ecs = new AWS.ECS({region: ecsRegion});
var cws = new AWS.CloudWatch({region: ecsRegion});
//Fetch ECS Service name and ECS Cluster name
for(var i = 0; i < msgDimensions.length; i++)
{
switch(msgDimensions[i].name)
{
case "ServiceName":
ecsService = msgDimensions[i].value;
console.log("Parsing service:", ecsService);
break;
case "ClusterName":
ecsCluster = msgDimensions[i].value;
console.log("Parsing cluster:", ecsCluster);
break;
default:
break;
}
}
//If unable to get ECS service, log failure
if(ecsService == "" || ecsCluster == "")
{
console.log("Unable to retrieve service name and/or cluster name from SNS event: ", message);
context.fail("Unable to retrieve service name and/or cluster name from SNS event: ", message);
}
//Grab service metadata
ecs.describeServices({services:[ecsService], cluster: ecsCluster}, function(err, data)
{
if (err)
{
console.log("Unable to retrieve service definition for:", ecsService);
context.fail(err, err.stack);
}
else
{
var serviceDefinition = data.services[0];
console.log("Retrieved service definition:", serviceDefinition);
var serviceCpuReq = 0;
var serviceMemReq = 0;
//Grab task definition required CPU and Memory
ecs.describeTaskDefinition({taskDefinition: serviceDefinition.taskDefinition}, function(err2, taskDefinition)
{
if (err2)
{
console.log("Unable to retrieve task definition for:", serviceDefinition.taskDefinition, err2, err2.stack);
context.fail(err2, err2.stack);
}
else
{
console.log("Retrieved task definition:", taskDefinition);
serviceCpuReq = fetchRequiredCPU(taskDefinition);
serviceMemReq = fetchRequiredMemory(taskDefinition);
}
});
//Find desired count required to equalize load
var desiredCount = calculateDesiredCount(desiredCPUUtil, serviceDefinition, ecsService, alarmCPUUtil);
console.log("Calculated desired count of", desiredCount);
//Calculate the total required cluster resources
var clusterReqCPU = serviceCpuReq * desiredCount;
var clusterReqMemory = serviceMemReq * desiredCount;
//Verify that any instance has enough resources
//Fetch all instances ARNs for this cluster
ecs.listContainerInstances({cluster: ecsCluster}, function(err3, instance_arns)
{
if (err3)
{
console.log("Unable to fetch container instances: ", err3);
}
else
{
console.log("Retrieved container instance ARNs:", instance_arns);
//Fetch instance metadata
ecs.describeContainerInstances({containerInstances: instance_arns.containerInstanceArns, cluster: ecsCluster}, function(err4, instance_metadata)
{
if(err4)
{
console.log("Unable to describe container instances: ", err4, err);
}
else
{
console.log("Retrieved container instance data: ", instance_metadata);
var clusterCPU = 0;
var clusterMemory = 0;
var instanceCPU = 0;
var instanceMemory = 0;
var isThereEnoughResources = false;
//loop through each instance and check if any have
//enough resources to run the service
for(var i = 0; i < instance_metadata.containerInstances.length; i++)
{
resources = instance_metadata.containerInstances[i].remainingResources;
for(var j = 0; j < resources.length; j++)
{
switch(resources[j].name)
{
case "CPU":
instanceCPU = resources[j].integerValue;
console.log(instance_metadata.containerInstances[i].containerInstanceArn, "CPU:", instanceCPU);
break;
case "MEMORY":
instanceMemory = resources[j].integerValue;
console.log(instance_metadata.containerInstances[i].containerInstanceArn, "Memory:", instanceMemory);
break;
default:
break;
}
if(instanceCPU >= serviceCpuReq && instanceMemory >= serviceMemReq)
{
clusterCPU = clusterCPU + instanceCPU;
clusterMemory = clusterMemory + instanceMemory;
}
}
//If there are not enough resources, maximize the cluster and let the ASG increment the instance count
if(clusterCPU < clusterReqCPU || clusterMemory < clusterReqMemory)
{
//Maximize the cluster
var maxContainersByCPU = (clusterCPU / instanceCPU) | 0;
var maxContainersByMemory = (clusterMemory / instanceMemory) | 0;
if(maxContainersByCPU < maxContainersByMemory)
desiredCount = maxContainersByCPU;
else
desiredCount = maxContainersByMemory;
}
//Call API to update service with new desired count
ecs.updateService({service: ecsService, cluster: ecsCluster, desiredCount: desiredCount, taskDefinition: serviceDefinition.taskDefinition}, updateServiceDefinitionResp);
console.log("Incrementing service", ecsService, "on cluster", ecsCluster, "task count to", desiredCount, " Task definition ARN:", serviceDefinition.taskDefinition);
//Call cloudwatch alarm state service to temporarily disable the alarm
cws.setAlarmState({AlarmName: message.AlarmName, StateReason: 'Temporarily disabling for container autoscaling script', StateValue: 'OK'}, changeAlarmStateResp);
console.log("Success.");
context.succeed();
}
}
});
}
});
}
});
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment