Skip to content

Instantly share code, notes, and snippets.

@kuzemkon
Last active June 18, 2024 09:47
Show Gist options
  • Save kuzemkon/03633f2e8aeb801053b18162db7f28e2 to your computer and use it in GitHub Desktop.
Save kuzemkon/03633f2e8aeb801053b18162db7f28e2 to your computer and use it in GitHub Desktop.
AWS ECS events CloudWatch metrics gathering Terraform
# Terraform settings: the AWS provider comes from the public registry and is
# constrained to the 4.x release line by the "~> 4.0" version constraint.
terraform {
  required_providers {
    aws = {
      version = "~> 4.0"
      source  = "hashicorp/aws"
    }
  }
}
# Default AWS provider configuration. The region here matches the region
# embedded in the ECS cluster ARN used by the EventBridge rule in this file.
provider "aws" {
  region = "us-east-1"
}
# CloudWatch Logs group that stores the ECS events forwarded by EventBridge.
resource "aws_cloudwatch_log_group" "ecs_events" {
  # Keep retention short: ECS produces a very large number of events.
  retention_in_days = 7

  # The name must start with "/aws/events/"; with any other prefix the
  # EventBridge -> CloudWatch Logs delivery does not work.
  name = "/aws/events/ecs"
}
# EventBridge rule matching every event whose source is "aws.ecs" and whose
# detail.clusterArn equals the single cluster ARN listed below.
resource "aws_cloudwatch_event_rule" "ecs_events" {
  name        = "ecs-events"
  description = "Capture all ECS events"

  # jsonencode renders the same JSON document regardless of whether keys are
  # written as quoted strings with ":" or as bare identifiers with "=".
  event_pattern = jsonencode({
    detail = {
      clusterArn = ["arn:aws:ecs:us-east-1:123456798098:cluster/cluster-name"]
    }
    source = ["aws.ecs"]
  })
}
# CloudWatch Logs resource policy that lets EventBridge write into the ECS
# events log group. The AWS console creates an equivalent policy automatically
# when a log group is chosen as a rule target; when the target is created
# through the API (as Terraform does) the policy has to be managed explicitly,
# otherwise matched events are never delivered to the log group.
resource "aws_cloudwatch_log_resource_policy" "ecs_events" {
  policy_name = "ecs-events-eventbridge-to-cloudwatch-logs"

  policy_document = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid    = "TrustEventsToStoreLogEvent"
        Effect = "Allow"
        Principal = {
          Service = [
            "events.amazonaws.com",
            "delivery.logs.amazonaws.com",
          ]
        }
        Action = [
          "logs:CreateLogStream",
          "logs:PutLogEvents",
        ]
        # ":*" covers every log stream inside the log group.
        Resource = "${aws_cloudwatch_log_group.ecs_events.arn}:*"
      }
    ]
  })
}

# AWS EventBridge target: sends every event matched by the "ecs-events" rule
# to the ECS events CloudWatch Logs group.
resource "aws_cloudwatch_event_target" "logs" {
  rule      = aws_cloudwatch_event_rule.ecs_events.name
  target_id = "send-to-cloudwatch"
  arn       = aws_cloudwatch_log_group.ecs_events.arn

  # Make sure the write permission exists before events start flowing.
  depends_on = [aws_cloudwatch_log_resource_policy.ecs_events]
}
# Metric filter that turns error-like ECS events in the log group into the
# "ECSErrors" metric (namespace "ECSEvents"), one data point of value 1 per
# matching log event, dimensioned by the event's $.detail.group field.
resource "aws_cloudwatch_log_metric_filter" "ecs_errors" {
  name           = "ECS Errors"
  log_group_name = aws_cloudwatch_log_group.ecs_events.name

  # Matches any of:
  #   1. an event with a detail.group whose stopCode is "TaskFailedToStart";
  #   2. an "ECS Service Action" event with one of three failure eventNames;
  #   3. an "ECS Task State Change" event with one of the listed stop reasons.
  # NOTE(review): in clause 3 the `$.detail.stoppedReason != ""` term appears to
  # make the two explicit stoppedReason comparisons redundant - confirm intent.
  # The "<<-" heredoc strips the common leading indentation, so the pattern
  # text sent to CloudWatch starts each line at column 0.
  pattern = <<-PATTERN
  {
  ($.detail.group = "*" && $.detail.stopCode = "TaskFailedToStart") ||
  ($.detail-type = "ECS Service Action" && ($.detail.eventName = "SERVICE_DEPLOYMENT_FAILED" || $.detail.eventName = "SERVICE_TASK_PLACEMENT_FAILURE" || $.detail.eventName = "SERVICE_STEADY_STATE_TIMEOUT")) ||
  ($.detail-type = "ECS Task State Change" && ($.detail.stoppedReason = "OutOfMemoryError" || $.detail.stoppedReason = "EssentialContainerExited" || $.detail.stoppedReason != "" || $.detail.stopCode = "TaskFailed"))
  }
  PATTERN

  metric_transformation {
    namespace = "ECSEvents"
    name      = "ECSErrors"
    unit      = "Count"
    value     = "1"

    # NOTE(review): events without a detail.group field (presumably the
    # "ECS Service Action" ones) may not produce a data point for this
    # dimension - verify against real events.
    dimensions = {
      group = "$.detail.group"
    }
  }
}
# Alarm on the "ECSErrors" metric for one service group: fires when at least
# one error data point is recorded within a single 300-second period and
# notifies the monitoring SNS topic on both ALARM and OK transitions.
resource "aws_cloudwatch_metric_alarm" "service_crashes" {
  alarm_name        = "ECS service is stopped with error"
  alarm_description = "crashes occured"

  # Which metric to watch; must match the metric filter's namespace, name
  # and dimension key.
  namespace   = "ECSEvents"
  metric_name = "ECSErrors"
  dimensions = {
    group = "service:our-ecs-service"
  }

  # Evaluation: SampleCount >= 1 over one 300-second period.
  statistic           = "SampleCount"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "1"

  # Periods with no data points are treated as healthy, so the alarm can
  # return to OK once errors stop being reported.
  treat_missing_data = "notBreaching"

  alarm_actions = [aws_sns_topic.monitoring.arn]
  ok_actions    = [aws_sns_topic.monitoring.arn]
}
# SNS topic that receives the alarm and OK notifications.
resource "aws_sns_topic" "monitoring" {
  name = "monitoring"

  # Delivery-status feedback for Lambda endpoints: the same IAM role is used
  # for successful and failed deliveries, and 100% of successes are sampled.
  lambda_failure_feedback_role_arn    = aws_iam_role.sns_delivery_status.arn
  lambda_success_feedback_role_arn    = aws_iam_role.sns_delivery_status.arn
  lambda_success_feedback_sample_rate = 100

  # NOTE(review): var.env is not declared in the visible part of this file;
  # presumably it is defined elsewhere - confirm before applying.
  tags = {
    environment = var.env
  }
}
# IAM role that the SNS service (sns.amazonaws.com) is allowed to assume;
# referenced by the monitoring topic as its delivery-status feedback role.
resource "aws_iam_role" "sns_delivery_status" {
  name = "sns-delivery-status"

  # Trust policy: only the SNS service principal may call sts:AssumeRole.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid    = ""
        Effect = "Allow"
        Action = "sts:AssumeRole"
        Principal = {
          Service = "sns.amazonaws.com"
        }
      }
    ]
  })
}
# IAM policy granting the CloudWatch Logs write/management actions listed
# below on all resources; attached to the "sns-delivery-status" role.
resource "aws_iam_policy" "sns_delivery_status" {
  name = "sns-delivery-status"

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Action = [
          "logs:CreateLogGroup",
          "logs:CreateLogStream",
          "logs:PutLogEvents",
          "logs:PutMetricFilter",
          "logs:PutRetentionPolicy"
        ]
        Resource = [
          "*"
        ]
      }
    ]
  })
}
# Attaches the "sns-delivery-status" policy to the "sns-delivery-status" role.
# NOTE(review): per the AWS provider docs, aws_iam_policy_attachment manages a
# policy's attachments exclusively; aws_iam_role_policy_attachment is the
# usual non-exclusive alternative - consider switching (changes the resource
# address, so plan the state migration).
resource "aws_iam_policy_attachment" "sns_delivery_status" {
  name       = "sns-delivery-status"
  policy_arn = aws_iam_policy.sns_delivery_status.arn
  roles      = [aws_iam_role.sns_delivery_status.name]
}
@WassimReg
Copy link

The alarm is always in the "In alarm" or "Insufficient data" state once deployed and doesn't go back to normal.

@kuzemkon
Copy link
Author

kuzemkon commented Mar 6, 2024

@WassimReg thank you for mentioning it. I've applied some fixes to the gist that will solve the issue:

  1. The name of the aws_cloudwatch_log_group resource is updated. It must follow this pattern to work with EventBridge: /aws/events/.*
  2. The treat_missing_data property of the aws_cloudwatch_metric_alarm resource is updated from breaching to notBreaching

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment