Last active
June 18, 2024 09:47
-
-
Save kuzemkon/03633f2e8aeb801053b18162db7f28e2 to your computer and use it in GitHub Desktop.
AWS ECS events CloudWatch metrics gathering Terraform
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Terraform settings: pull the AWS provider from the HashiCorp registry and
# accept any 4.x release.
terraform {
  required_providers {
    aws = {
      version = "~> 4.0"
      source  = "hashicorp/aws"
    }
  }
}
# Default AWS provider; every resource in this file is created in us-east-1.
provider "aws" {
  region = "us-east-1"
}
# CloudWatch Logs group that receives the ECS events forwarded by EventBridge.
resource "aws_cloudwatch_log_group" "ecs_events" {
  # ECS emits a large volume of events, so keep the retention window short.
  retention_in_days = 7

  # The name has to start with "/aws/events/"; otherwise delivery from
  # EventBridge does not work.
  name = "/aws/events/ecs"
}
# EventBridge rule matching every event ECS emits for one specific cluster.
resource "aws_cloudwatch_event_rule" "ecs_events" {
  name        = "ecs-events"
  description = "Capture all ECS events"

  # Match on the event source and on the cluster ARN carried in the event detail.
  event_pattern = jsonencode({
    source = ["aws.ecs"]
    detail = {
      clusterArn = ["arn:aws:ecs:us-east-1:123456798098:cluster/cluster-name"]
    }
  })
}
# CloudWatch Logs resource policy that lets EventBridge write into the ECS
# events log group. The AWS console creates an equivalent policy automatically
# when a log-group target is added there; with Terraform it must be declared
# explicitly, otherwise matched events are silently dropped.
resource "aws_cloudwatch_log_resource_policy" "ecs_events" {
  policy_name = "ecs-events-to-cloudwatch-logs"

  policy_document = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid    = "AllowEventBridgeToWriteEcsEvents"
        Effect = "Allow"
        Principal = {
          Service = ["events.amazonaws.com", "delivery.logs.amazonaws.com"]
        }
        Action = ["logs:CreateLogStream", "logs:PutLogEvents"]
        # The ":*" suffix covers the log streams inside the group.
        Resource = "${aws_cloudwatch_log_group.ecs_events.arn}:*"
      }
    ]
  })
}

# AWS EventBridge target: deliver every event matched by the rule to the
# CloudWatch Logs group.
resource "aws_cloudwatch_event_target" "logs" {
  rule      = aws_cloudwatch_event_rule.ecs_events.name
  target_id = "send-to-cloudwatch"
  arn       = aws_cloudwatch_log_group.ecs_events.arn

  # Make sure the write permission exists before events start flowing.
  depends_on = [aws_cloudwatch_log_resource_policy.ecs_events]
}
# CloudWatch Logs metric filter that turns ECS error events into the
# "ECSEvents/ECSErrors" metric, with one time series per `detail.group` value.
#
# The pattern counts three kinds of events:
#   1. any task that failed to start (stopCode = "TaskFailedToStart");
#   2. "ECS Service Action" events reporting a failed deployment, a task
#      placement failure or a steady-state timeout;
#   3. "ECS Task State Change" events whose stop reason / stop code marks a
#      failure.
#
# The previous pattern also contained `$.detail.stoppedReason != ""`, which
# matches every stopped task - including tasks stopped by a routine deployment
# or scale-in - and therefore raised the alarm on each deploy. That catch-all is
# removed. "EssentialContainerExited" is additionally matched against
# `stopCode`, the field ECS uses for that value.
#
# NOTE(review): the metric is only emitted with the `group` dimension below, so
# events that carry no `detail.group` field are presumably not counted - confirm
# against the CloudWatch Logs metric-filter dimension rules.
resource "aws_cloudwatch_log_metric_filter" "ecs_errors" {
  name           = "ECS Errors"
  log_group_name = aws_cloudwatch_log_group.ecs_events.name

  pattern = <<PATTERN
{
($.detail.group = "*" && $.detail.stopCode = "TaskFailedToStart") ||
($.detail-type = "ECS Service Action" && ($.detail.eventName = "SERVICE_DEPLOYMENT_FAILED" || $.detail.eventName = "SERVICE_TASK_PLACEMENT_FAILURE" || $.detail.eventName = "SERVICE_STEADY_STATE_TIMEOUT")) ||
($.detail-type = "ECS Task State Change" && ($.detail.stoppedReason = "OutOfMemoryError" || $.detail.stoppedReason = "EssentialContainerExited" || $.detail.stopCode = "EssentialContainerExited" || $.detail.stopCode = "TaskFailed"))
}
PATTERN

  metric_transformation {
    name      = "ECSErrors"
    namespace = "ECSEvents"
    # Each matching log event contributes one data point of value 1.
    value = "1"
    unit  = "Count"
    dimensions = {
      # e.g. "service:<service-name>" for tasks started by a service.
      group = "$.detail.group"
    }
  }
}
# AWS CloudWatch metric alarm: fires when at least one ECS error event was
# recorded for the service within a 5-minute window and notifies the
# monitoring SNS topic on both ALARM and OK transitions.
resource "aws_cloudwatch_metric_alarm" "service_crashes" {
  alarm_name        = "ECS service is stopped with error"
  alarm_description = "crashes occurred"

  # Take the metric identity from the metric filter so the two cannot drift
  # apart and Terraform creates the filter before the alarm.
  metric_name = aws_cloudwatch_log_metric_filter.ecs_errors.metric_transformation[0].name
  namespace   = aws_cloudwatch_log_metric_filter.ecs_errors.metric_transformation[0].namespace
  dimensions = {
    group = "service:our-ecs-service"
  }

  # One or more matching events in a single 300-second period triggers ALARM.
  statistic           = "SampleCount"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1
  period              = 300
  evaluation_periods  = 1

  # The metric only has data points when errors happen, so an absence of data
  # means "healthy" rather than "unknown".
  treat_missing_data = "notBreaching"

  alarm_actions = [aws_sns_topic.monitoring.arn]
  ok_actions    = [aws_sns_topic.monitoring.arn]
}
# SNS topic that receives the alarm notifications. Delivery-status logging for
# Lambda subscribers is enabled through the sns_delivery_status IAM role.
resource "aws_sns_topic" "monitoring" {
  name = "monitoring"

  # NOTE(review): var.env is not declared in the visible part of this file -
  # confirm a `variable "env"` block exists elsewhere.
  tags = {
    environment = var.env
  }

  # Log 100% of successful Lambda deliveries, using the same role for success
  # and failure feedback.
  lambda_success_feedback_sample_rate = 100
  lambda_success_feedback_role_arn    = aws_iam_role.sns_delivery_status.arn
  lambda_failure_feedback_role_arn    = aws_iam_role.sns_delivery_status.arn
}
# IAM role that the SNS service assumes to write delivery-status logs.
resource "aws_iam_role" "sns_delivery_status" {
  name = "sns-delivery-status"

  # Trust policy: only the SNS service principal may assume this role.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid    = ""
        Effect = "Allow"
        Action = "sts:AssumeRole"
        Principal = {
          Service = "sns.amazonaws.com"
        }
      }
    ]
  })
}
# Permissions policy for the SNS delivery-status role: create log groups and
# streams, write log events, and manage metric filters and retention.
resource "aws_iam_policy" "sns_delivery_status" {
  name = "sns-delivery-status"

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect   = "Allow"
        Resource = ["*"]
        Action = [
          "logs:CreateLogGroup",
          "logs:CreateLogStream",
          "logs:PutLogEvents",
          "logs:PutMetricFilter",
          "logs:PutRetentionPolicy",
        ]
      }
    ]
  })
}
# Attach the delivery-status policy to the SNS delivery-status role.
#
# `aws_iam_role_policy_attachment` is used instead of
# `aws_iam_policy_attachment`: the latter manages the policy's attachments
# exclusively and would detach it from any user, group or role that attached
# it outside of this resource.
resource "aws_iam_role_policy_attachment" "sns_delivery_status" {
  role       = aws_iam_role.sns_delivery_status.name
  policy_arn = aws_iam_policy.sns_delivery_status.arn
}
@WassimReg thank you for mentioning it. I've applied some fixes to the gist that will solve the issue:
- The name of the `aws_cloudwatch_log_group` resource is updated. It must follow this pattern to work with EventBridge: `/aws/events/.*`
- The `treat_missing_data` property of the `aws_cloudwatch_metric_alarm` resource is updated from `breaching` to `notBreaching`
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The metric is always in alarm when deployed; it shows insufficient data and doesn't go back to normal.