Skip to content

Instantly share code, notes, and snippets.

@toricls
Last active Apr 29, 2021
Embed
What would you like to do?
Resilient Fargate task scheduling with Step Functions and EventBridge
---
# Sample CloudFormation template: a Fargate-compatible ECS task definition
# with a single container named "fargate-app". The container either prints a
# greeting and exits (when IS_STANDALONE_EXECUTION is "1") or runs Apache in
# the foreground. The task definition ARN is exported as an output.
AWSTemplateFormatVersion: '2010-09-09'
Description: 'Sample Task Definition'
Resources:
  TaskDefinition:
    Type: AWS::ECS::TaskDefinition
    Properties:
      RequiresCompatibilities:
        - FARGATE
      # Quoted because the resource specification declares both as strings.
      Cpu: '256'
      Memory: '512'
      NetworkMode: awsvpc
      ContainerDefinitions:
        - Image: amazon/amazon-ecs-sample
          Name: fargate-app
          PortMappings:
            - ContainerPort: 80
              HostPort: 80
              Protocol: tcp
          Essential: true
          # The entry point already invokes a shell, so the command below is
          # passed to it directly instead of being wrapped in a second
          # '/bin/sh -c "..."'.
          EntryPoint:
            - sh
            - '-c'
          Command:
            # The variable is quoted and defaulted to "0" so the test is well
            # formed when it is unset, and the comparison uses the POSIX '='
            # operator rather than '==', which not every /bin/sh accepts.
            # This template does not use Fn::Sub, so the '${...}' reaches the
            # shell untouched.
            - >-
              if [ "${IS_STANDALONE_EXECUTION:-0}" = "1" ];
              then echo 'Hello from AWS Step Functions!';
              else /usr/sbin/apache2 -D FOREGROUND;
              fi
Outputs:
  TaskDefinitionArn:
    Value: !Ref TaskDefinition
---
# Sample CloudFormation template: creates one SNS topic with default settings
# and exports the value of its Ref as the SNSTopicArn output.
AWSTemplateFormatVersion: '2010-09-09'
# The description previously read 'Sample Task Definition', which did not
# match the only resource in this template.
Description: 'Sample SNS Topic'
Resources:
  SNSTopic:
    Type: AWS::SNS::Topic
Outputs:
  SNSTopicArn:
    Value: !Ref SNSTopic
---
# Runs an existing ECS task definition on Fargate every 15 minutes.
#
# An EventBridge rule starts a Step Functions state machine on a schedule.
# The state machine runs the task synchronously, retries on failure, and
# publishes the outcome to an SNS topic. Two CloudWatch alarms publish to the
# same topic when executions of the state machine fail or time out.
AWSTemplateFormatVersion: '2010-09-09'
Description: 'Creates once-per-15-min Fargate task with Step Functions and EventBridge'

Parameters:
  TaskDefinitionArn:
    Type: 'String'
    Description: 'Example: arn:aws:ecs:<region>:<account-id>:task-definition/<task-def-name>:<task-def-revision>'
  PublicSubnets:
    Type: 'List<AWS::EC2::Subnet::Id>'
    Description: "This CloudFormation template requires 'public' subnets to run Fargate task. See the comment for the 'StateMachine' resource's 'AssignPublicIp' property."
  SecurityGroups:
    Type: 'List<AWS::EC2::SecurityGroup::Id>'
  SNSTopicArn:
    Type: 'String'
    Description: 'An SNS topic ARN to notify ECS task success/failures, and also failures of the SFn state machine itself. Example: arn:aws:sns:<region>:<account-id>:<topic-name>'

Resources:
  # ECS cluster the scheduled task runs in, with Container Insights enabled.
  Cluster:
    Type: 'AWS::ECS::Cluster'
    Properties:
      ClusterSettings:
        - Name: containerInsights
          Value: enabled

  # Schedule: starts one state machine execution every 15 minutes.
  Rule:
    Type: 'AWS::Events::Rule'
    Properties:
      ScheduleExpression: 'rate(15 minutes)'
      State: ENABLED
      Targets:
        - Arn: !Ref StateMachine
          Id: statemachine
          RoleArn: !GetAtt 'RuleRole.Arn'

  # Role assumed by EventBridge; it may only start this state machine.
  RuleRole:
    Type: 'AWS::IAM::Role'
    Properties:
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service: 'events.amazonaws.com'
            Action: 'sts:AssumeRole'
      Policies:
        - PolicyName: EventRulePolicy
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              - Effect: Allow
                Action: 'states:StartExecution'
                Resource: !Ref StateMachine

  StateMachine:
    Type: 'AWS::StepFunctions::StateMachine'
    Properties:
      RoleArn: !GetAtt 'StateMachineRole.Arn'
      LoggingConfiguration:
        Destinations:
          - CloudWatchLogsLogGroup:
              LogGroupArn: !GetAtt StateMachineExecutionLogGroup.Arn
        # A real boolean rather than the string 'true'.
        IncludeExecutionData: true
        Level: 'FATAL'
      # Values substituted for the ${...} placeholders in DefinitionString.
      DefinitionSubstitutions:
        Cluster: !GetAtt Cluster.Arn
        TaskDefinition: !Ref TaskDefinitionArn
        # The lists are joined with '","' because the definition wraps each
        # placeholder as ["${...}"]; together they expand to a JSON array of
        # strings.
        Subnets: !Join
          - '","'
          - !Ref PublicSubnets
        AssignPublicIp: 'ENABLED'  # Should be DISABLED if we use private subnets
        SecurityGroups: !Join
          - '","'
          - !Ref SecurityGroups
        Timeout: 900  # seconds
        SNSTopicArn: !Ref SNSTopicArn
      # The state machine definition below is a bit outdated. If I ran this
      # today I would make the following changes:
      ## - Use 'CapacityProviderStrategy' instead of 'LaunchType'
      ## - Use a specific 'PlatformVersion' and not LATEST
      ## - Enable the long ARN format to use tag propagation
      #
      # Flow: "Run Fargate Task" runs the task and waits for it (.sync), with
      # the IS_STANDALONE_EXECUTION=1 override on the "fargate-app" container.
      # It retries States.TaskFailed and ECS.AmazonECSException up to three
      # times each. Success goes to "Notify Success"; any other error is
      # caught and routed to "Notify Failure". Both notify states publish to
      # SNS and end the execution.
      #
      # NOTE(review): "Notify Failure" is entered through Catch with no
      # ResultPath, so its input is presumably the error object rather than
      # the RunTask result. Confirm that the '$.Containers[...]' path used for
      # the ExitCode attribute resolves there.
      DefinitionString: |-
        {
          "Version": "1.0",
          "Comment": "Run AWS Fargate task",
          "TimeoutSeconds": ${Timeout},
          "StartAt": "Run Fargate Task",
          "States": {
            "Run Fargate Task": {
              "Type": "Task",
              "Resource": "arn:aws:states:::ecs:runTask.sync",
              "Parameters": {
                "LaunchType": "FARGATE",
                "Cluster": "${Cluster}",
                "TaskDefinition": "${TaskDefinition}",
                "Group.$": "$$.Execution.Name",
                "NetworkConfiguration": {
                  "AwsvpcConfiguration": {
                    "Subnets": ["${Subnets}"],
                    "AssignPublicIp": "${AssignPublicIp}",
                    "SecurityGroups": ["${SecurityGroups}"]
                  }
                },
                "Overrides": {
                  "ContainerOverrides": [
                    {
                      "Name": "fargate-app",
                      "Environment": [
                        {
                          "Name": "IS_STANDALONE_EXECUTION",
                          "Value": "1"
                        }
                      ]
                    }
                  ]
                }
              },
              "Retry": [
                {
                  "ErrorEquals": [
                    "States.TaskFailed"
                  ],
                  "IntervalSeconds": 3,
                  "MaxAttempts": 3,
                  "BackoffRate": 1.0
                },
                {
                  "ErrorEquals": [
                    "ECS.AmazonECSException"
                  ],
                  "IntervalSeconds": 10,
                  "MaxAttempts": 3,
                  "BackoffRate": 2.0
                }
              ],
              "Next": "Notify Success",
              "Catch": [
                {
                  "ErrorEquals": [
                    "States.ALL"
                  ],
                  "Next": "Notify Failure"
                }
              ]
            },
            "Notify Success": {
              "Type": "Task",
              "Resource": "arn:aws:states:::sns:publish",
              "Parameters": {
                "Message": "AWS Fargate Task started by Step Functions succeeded",
                "MessageAttributes": {
                  "ExitCode": {
                    "DataType": "String",
                    "StringValue.$": "$.Containers[?(@.Name=='fargate-app')].ExitCode"
                  },
                  "FullTaskResult": {
                    "DataType": "String",
                    "StringValue.$": "$"
                  }
                },
                "TopicArn": "${SNSTopicArn}"
              },
              "End": true
            },
            "Notify Failure": {
              "Type": "Task",
              "Resource": "arn:aws:states:::sns:publish",
              "Parameters": {
                "Message": "AWS Fargate Task started by Step Functions failed",
                "MessageAttributes": {
                  "ExitCode": {
                    "DataType": "String",
                    "StringValue.$": "$.Containers[?(@.Name=='fargate-app')].ExitCode"
                  },
                  "FullTaskResult": {
                    "DataType": "String",
                    "StringValue.$": "$"
                  }
                },
                "TopicArn": "${SNSTopicArn}"
              },
              "End": true
            }
          }
        }

  # Destination for the state machine's execution logs; kept for 7 days.
  StateMachineExecutionLogGroup:
    Type: 'AWS::Logs::LogGroup'
    Properties:
      RetentionInDays: 7

  # Role assumable by ECS tasks, carrying the AWS-managed task execution
  # policy. The state machine role below is allowed to pass it.
  TaskExecutionRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service: ecs-tasks.amazonaws.com
            Action: 'sts:AssumeRole'
      ManagedPolicyArns:
        - 'arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy'
      # If the task uses Secrets Manager integration
      # Policies:
      #   - PolicyDocument:
      #       Statement:
      #         - Effect: Allow
      #           Action: 'secretsmanager:GetSecretValue'
      #           Resource:
      #             - arn:aws:secretsmanager:<region>:<aws_account_id>:secret:secret_name
      #             - arn:aws:kms:<region>:<aws_account_id>:key/key_id

  # Role assumable by ECS tasks with no permissions attached; add policies
  # here for whatever the application itself needs to call.
  TaskRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service: ecs-tasks.amazonaws.com
            Action: 'sts:AssumeRole'
      # Policies:
      #   -

  # Role assumed by Step Functions while executing the state machine.
  StateMachineRole:
    Type: 'AWS::IAM::Role'
    Properties:
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service: 'states.amazonaws.com'
            Action: 'sts:AssumeRole'
      Policies:
        - PolicyName: StateMachine
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              # Allow handing the two task roles over to ECS.
              - Effect: Allow
                Action: 'iam:PassRole'
                Resource:
                  - !GetAtt TaskExecutionRole.Arn
                  - !GetAtt TaskRole.Arn
              # Run only the given task definition, and only in this cluster.
              - Effect: Allow
                Action: 'ecs:RunTask'
                Resource: !Ref TaskDefinitionArn
                Condition:
                  ArnEquals:
                    'ecs:cluster': !GetAtt Cluster.Arn
              # Stop and describe tasks, restricted to this cluster.
              - Effect: Allow
                Action:
                  - 'ecs:StopTask'
                  - 'ecs:DescribeTasks'
                Resource: '*'
                Condition:
                  ArnEquals:
                    'ecs:cluster': !GetAtt Cluster.Arn
              # Publish the success/failure notifications.
              - Effect: Allow
                Action:
                  - 'sns:Publish'
                Resource: !Ref SNSTopicArn
              # Deliver execution logs to CloudWatch Logs.
              - Effect: Allow
                Action:
                  - 'logs:CreateLogDelivery'
                  - 'logs:GetLogDelivery'
                  - 'logs:UpdateLogDelivery'
                  - 'logs:DeleteLogDelivery'
                  - 'logs:ListLogDeliveries'
                  - 'logs:PutResourcePolicy'
                  - 'logs:DescribeResourcePolicies'
                  - 'logs:DescribeLogGroups'
                Resource: '*'  # CWL doesn't support resource-level permissions
              # Manage the rule named StepFunctionsGetEventsForECSTaskRule.
              - Effect: Allow
                Action:
                  - 'events:PutTargets'
                  - 'events:PutRule'
                  - 'events:DescribeRule'
                Resource: !Sub 'arn:${AWS::Partition}:events:${AWS::Region}:${AWS::AccountId}:rule/StepFunctionsGetEventsForECSTaskRule'

  # Publishes to the SNS topic when at least one execution fails within a
  # 5-minute period. Missing data is treated as healthy.
  ExecutionsFailedAlarm:
    Type: 'AWS::CloudWatch::Alarm'
    Properties:
      AlarmDescription: 'Failure while executing scheduled task.'
      Namespace: 'AWS/States'
      MetricName: ExecutionsFailed
      Dimensions:
        - Name: StateMachineArn
          Value: !Ref StateMachine
      Statistic: Sum
      Period: 300
      DatapointsToAlarm: 1
      EvaluationPeriods: 1
      Threshold: 0
      TreatMissingData: notBreaching
      ComparisonOperator: GreaterThanThreshold
      AlarmActions:
        - !Ref SNSTopicArn

  # Publishes to the SNS topic when at least one execution times out within a
  # 5-minute period. Missing data is treated as healthy.
  ExecutionsTimeoutAlarm:
    Type: 'AWS::CloudWatch::Alarm'
    Properties:
      AlarmDescription: 'Executing scheduled task timed out.'
      Namespace: 'AWS/States'
      MetricName: ExecutionsTimedOut
      Dimensions:
        - Name: StateMachineArn
          Value: !Ref StateMachine
      Statistic: Sum
      Period: 300
      DatapointsToAlarm: 1
      EvaluationPeriods: 1
      Threshold: 0
      TreatMissingData: notBreaching
      ComparisonOperator: GreaterThanThreshold
      AlarmActions:
        - !Ref SNSTopicArn
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment