Skip to content

Instantly share code, notes, and snippets.

@chriskl
Created April 30, 2021 01:21
Show Gist options
  • Save chriskl/495ff766f9dfa8e8f6fcd00fae6e58f3 to your computer and use it in GitHub Desktop.
Save chriskl/495ff766f9dfa8e8f6fcd00fae6e58f3 to your computer and use it in GitHub Desktop.
Meltano Dockerfile
---
# CloudFormation template: a managed AWS Batch compute environment that runs
# jobs on Fargate, together with the service role and the security group the
# environment uses.
#
# Date-like scalars are quoted so that generic YAML loaders keep them as
# strings instead of converting them to date values.
AWSTemplateFormatVersion: "2010-09-09"
Description: An AWS Batch compute environment

Parameters:
  ComputeEnvironmentName:
    Type: String
    Description: Descriptive compute environment name
  VpcId:
    Type: AWS::EC2::VPC::Id
    Description: VPC in which to run the AWS instances
  Subnets:
    Type: List<AWS::EC2::Subnet::Id>
    Description: Subnets in which to launch AWS Batch instances
  MaxvCpus:
    Type: Number
    Description: Maximum amount of VCPUs that can be running in this compute environment at once

Resources:
  # Role assumed by the AWS Batch service (batch.amazonaws.com).
  BatchServiceRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service: batch.amazonaws.com
            Action: sts:AssumeRole
      ManagedPolicyArns:
        - arn:aws:iam::aws:policy/service-role/AWSBatchServiceRole

  # Security group attached to the compute resources; no ingress or egress
  # rules are declared here.
  SecurityGroup:
    Type: AWS::EC2::SecurityGroup
    Properties:
      GroupDescription: Security Group for Fargate task launched in the VPC by AWS Batch
      VpcId: !Ref VpcId

  ComputeEnvironment:
    Type: AWS::Batch::ComputeEnvironment
    Properties:
      ComputeEnvironmentName: !Ref ComputeEnvironmentName
      Type: MANAGED
      # Pass the full role ARN rather than the bare role name, so the reference
      # stays valid even if the role is later given a non-default path.
      ServiceRole: !GetAtt BatchServiceRole.Arn
      ComputeResources:
        Type: FARGATE
        MaxvCpus: !Ref MaxvCpus
        Subnets: !Ref Subnets
        SecurityGroupIds:
          - !Ref SecurityGroup

Outputs:
  ComputeEnvironmentArn:
    Description: Compute Environment Arn
    Value: !Ref ComputeEnvironment
---
# CloudFormation template: a single AWS Batch job queue that dispatches to one
# compute environment, which is supplied by ARN as a parameter.
AWSTemplateFormatVersion: '2010-09-09'
Description: An AWS Batch job queue

Parameters:
  JobQueueName:
    Description: Descriptive job queue name
    Type: String
  Priority:
    Description: Relative priority of the queue
    Type: Number
    Default: 1
  ComputeEnvironmentArn:
    Description: ARN of the compute environment that this queue uses
    Type: String

Resources:
  JobQueue:
    Type: AWS::Batch::JobQueue
    Properties:
      JobQueueName:
        Ref: JobQueueName
      Priority:
        Ref: Priority
      # Only one compute environment is attached, at position 1.
      ComputeEnvironmentOrder:
        - ComputeEnvironment:
            Ref: ComputeEnvironmentArn
          Order: 1

Outputs:
  JobQueueArn:
    Description: Job Queue Arn
    Value:
      Ref: JobQueue
---
# AWS CodeBuild buildspec: builds the Docker image from the repository root and
# pushes it to the registry named by $REPOSITORY_URI.
#
# Environment expected from the CodeBuild project:
#   REPOSITORY_URI      full image reference whose first path component is the
#                       registry host (everything before the first "/")
#   AWS_DEFAULT_REGION  region passed to the ECR login call
version: 0.2

env:
  secrets-manager:
    # Format is <secret-id>:<json-key>; both values come from the "docker-hub"
    # secret.
    DOCKER_HUB_USERNAME: docker-hub:username
    DOCKER_HUB_PASSWORD: docker-hub:password

phases:
  install:
    runtime-versions:
      python: 3.7
  pre_build:
    commands:
      - aws --version
      # Log in to Docker Hub first so base-image pulls are authenticated.
      - echo "$DOCKER_HUB_PASSWORD" | docker login --username "$DOCKER_HUB_USERNAME" --password-stdin
      # `aws ecr get-login` no longer exists in AWS CLI v2; use
      # get-login-password and hand the token to docker on stdin. The registry
      # host is REPOSITORY_URI with everything from the first "/" stripped.
      - >-
        aws ecr get-login-password --region "$AWS_DEFAULT_REGION"
        | docker login --username AWS --password-stdin "${REPOSITORY_URI%%/*}"
  build:
    commands:
      # The image is built directly under its final name, so no separate
      # `docker tag` step is needed.
      - docker build -t "$REPOSITORY_URI" .
  post_build:
    commands:
      - docker push "$REPOSITORY_URI"
---
# CloudFormation template: a CodeBuild project that builds a Docker image from
# a GitHub repository on every push to one branch, the ECR repository the image
# is stored in, and an SNS e-mail notification for failed builds.
AWSTemplateFormatVersion: "2010-09-09"
Description: "Builds an executable Docker image upon commit to a code project."

Parameters:
  BuildName:
    Type: String
    Description: Lowercase, hyphen-separated build name. Should be the same as the Github repo name, eg 'dbt'
  GithubRepoUrl:
    Type: String
    Default: ""
    Description: Enter the github repository url of your project e.g. https://github.com/myuser/dbt-batch.git
  GithubRepoBranch:
    Type: String
    Default: "refs/heads/main"
    Description: Enter full ref path to desired branch, eg. "refs/heads/main"
  DockerHubSecret:
    Type: String
    Default: docker-hub
    Description: Secret name containing the Docker Hub username and password
  BuildSpec:
    Type: String
    Default: "buildspec.yml"
    Description: Enter path relative to repo root to CodeBuild buildspec.yml
  MonitoringEmail:
    Type: String
    Description: Email address to which to send build failures

Resources:
  CodeBuildProject:
    Type: AWS::CodeBuild::Project
    Properties:
      Name: !Ref BuildName
      Description: !Sub "Builds an executable Docker image for ${GithubRepoUrl} upon commit to ${GithubRepoBranch} and stores it in ECR"
      ServiceRole: !GetAtt CodeBuildRole.Arn
      Artifacts:
        Type: "NO_ARTIFACTS"
      Environment:
        Type: LINUX_CONTAINER
        ComputeType: BUILD_GENERAL1_SMALL
        Image: aws/codebuild/amazonlinux2-x86_64-standard:3.0
        # Privileged mode is enabled because the build runs docker commands.
        PrivilegedMode: true
        EnvironmentVariables:
          # Same image reference as the EcrImageTag output below.
          - Name: REPOSITORY_URI
            Type: PLAINTEXT
            Value: !Sub "${AWS::AccountId}.dkr.ecr.${AWS::Region}.amazonaws.com/${BuildName}:latest"
          - Name: AWS_DEFAULT_REGION
            Type: PLAINTEXT
            Value: !Ref AWS::Region
      Source:
        BuildSpec: !Ref BuildSpec
        Location: !Ref GithubRepoUrl
        Type: GITHUB
      SourceVersion: !Ref GithubRepoBranch
      TimeoutInMinutes: 10
      Triggers:
        Webhook: true
        # One filter group; both conditions must hold: a PUSH event whose head
        # ref is exactly the configured branch.
        FilterGroups:
          - - Type: EVENT
              Pattern: PUSH
            - Type: HEAD_REF
              Pattern: !Sub "^${GithubRepoBranch}$"

  # Service role for the CodeBuild project: ECR push/pull via the managed
  # policy, plus read access to the Docker Hub credentials secret.
  CodeBuildRole:
    Type: AWS::IAM::Role
    Properties:
      Path: /
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - codebuild.amazonaws.com
            Action:
              - "sts:AssumeRole"
      ManagedPolicyArns:
        - arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryPowerUser
      Policies:
        - PolicyName: GetDockerSecret
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              - Effect: Allow
                Action:
                  - secretsmanager:GetSecretValue
                Resource:
                  # Trailing "-*" matches the suffix appended to secret ARNs.
                  - !Sub "arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:${DockerHubSecret}-*"

  # Lets the build write its logs to CloudWatch Logs.
  CodeBuildAccessPolicy:
    Type: AWS::IAM::ManagedPolicy
    Properties:
      PolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Action:
              - "logs:CreateLogGroup"
              - "logs:CreateLogStream"
              - "logs:PutLogEvents"
            Resource: "*"
      Roles:
        - !Ref CodeBuildRole

  # Role assumable by EventBridge. NOTE(review): no rule or target in this
  # template sets a RoleArn to it; kept so existing stacks are not changed.
  CloudWatchEventsCodeBuildRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - events.amazonaws.com
            Action: sts:AssumeRole

  CloudWatchBuildAccessPolicy:
    Type: AWS::IAM::ManagedPolicy
    Properties:
      PolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Action:
              - "codebuild:StartBuild"
            Resource: !GetAtt CodeBuildProject.Arn
      Roles:
        - !Ref CloudWatchEventsCodeBuildRole

  # See: https://docs.aws.amazon.com/codebuild/latest/userguide/sample-build-notifications.html#sample-build-notifications-ref
  # Publishes a message to ErrorsTopic when a build of this project fails.
  AWSBatchEventsRule:
    Type: AWS::Events::Rule
    Properties:
      Description: !Sub "Events Rule for CodeBuild Job ${BuildName}"
      EventPattern:
        source:
          - aws.codebuild
        detail-type:
          - CodeBuild Build State Change
        detail:
          # CodeBuild state-change events use hyphenated detail keys (the same
          # spelling the InputPathsMap below reads). The former camelCase keys
          # never matched, so no failure notification was ever sent.
          project-name:
            - !Ref BuildName
          build-status:
            - FAILED
      State: ENABLED
      Targets:
        - Arn: !Ref ErrorsTopic
          Id: cloudwatch-codebuild-eventrules
          InputTransformer:
            InputPathsMap:
              logDeepLink: "$.detail.additional-information.logs.deep-link"
              time: "$.time"
              projectName: "$.detail.project-name"
            InputTemplate: '"Your CodeBuild build <projectName> has failed at <time>. Please check the logs at <logDeepLink>."'

  ErrorsTopic:
    Type: AWS::SNS::Topic
    Properties:
      Subscription:
        - Endpoint: !Ref MonitoringEmail
          Protocol: "email"

  # Allows EventBridge to publish to the errors topic (and only that topic).
  ErrorsTopicPolicy:
    Type: AWS::SNS::TopicPolicy
    Properties:
      PolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service: events.amazonaws.com
            Action: sns:Publish
            Resource: !Ref ErrorsTopic
      Topics:
        - !Ref ErrorsTopic

  # ECR repository the build pushes to; named after the build.
  BatchProcessRepository:
    Type: AWS::ECR::Repository
    Properties:
      RepositoryName: !Ref BuildName
      ImageScanningConfiguration:
        ScanOnPush: true
      LifecyclePolicy:
        # The rule selects untagged images only.
        LifecyclePolicyText: |
          {
            "rules": [
              {
                "rulePriority": 1,
                "description": "Expire images older than 14 days",
                "selection": {
                  "tagStatus": "untagged",
                  "countType": "sinceImagePushed",
                  "countUnit": "days",
                  "countNumber": 14
                },
                "action": {
                  "type": "expire"
                }
              }
            ]
          }

Outputs:
  CodeBuildProjectName:
    Description: Code Build Project Name
    Value: !Ref CodeBuildProject
  CodeBuildProjectArn:
    Description: Code Build Project Arn
    Value: !GetAtt CodeBuildProject.Arn
  EcrImageTag:
    Description: ECR Image Tag
    Value: !Sub "${AWS::AccountId}.dkr.ecr.${AWS::Region}.amazonaws.com/${BuildName}:latest"
# Container image for a Meltano project: installs chamber, the project's Python
# requirements and its Meltano plugins on top of a configurable Meltano base
# image, then copies the project in.
ARG MELTANO_IMAGE=meltano/meltano:latest
FROM $MELTANO_IMAGE

WORKDIR /project

# Install chamber.
# -f makes curl fail on an HTTP error instead of piping an error page into
# bash; the apt package lists are removed afterwards to keep the layer small.
RUN curl -fsSL https://packagecloud.io/install/repositories/segment/chamber/script.deb.sh | /bin/bash \
    && apt-get install -y chamber \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install any additional requirements (no pip download cache kept in the image)
COPY ./requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Install all plugins into the `.meltano` directory
COPY ./meltano.yml .
RUN meltano install

# Pin `discovery.yml` manifest by copying cached version to project root.
# Best effort: `-n` never overwrites an existing file and `|| :` ignores a
# missing cache file.
RUN cp -n .meltano/cache/discovery.yml . 2>/dev/null || :

# Don't allow changes to containerized project files
ENV MELTANO_PROJECT_READONLY=1

# Tell Chamber to use the account default KMS key, and where to find it
ENV AWS_REGION=ap-southeast-2 \
    CHAMBER_KMS_KEY_ALIAS=aws/ssm

# Copy over remaining project files
COPY . .

# Expose default port used by `meltano ui`
EXPOSE 5000

# Clear any entrypoint inherited from the base image so the container runs
# exactly the command it is given.
ENTRYPOINT []
---
# CloudFormation template: a scheduled AWS Batch (Fargate) job that runs one
# Meltano extractor/loader pair, plus an SNS e-mail alert for failed runs.
#
# The container command is:
#   chamber exec meltano/<extractor> meltano/<loader> --
#     meltano elt <extractor> <loader> --job_id <JobName>
AWSTemplateFormatVersion: "2010-09-09"
Description: Template to set up a recurring AWS Batch that executes a single tap/target ELT using the specified meltano ECR image on a computing environment.

Parameters:
  JobName:
    Type: String
    Description: Descriptive job name
  JobQueueArn:
    Type: String
    Description: ARN of job queue
  EcrImageTag:
    Type: String
    Description: URI and tag of image the job will run
  MonitoringEmail:
    Type: String
    Default: ""
    Description: Email address that will receive monitoring alerts
  JobFrequency:
    Type: String
    Default: "0 4 * * ? *"
    Description: Frequency of the Meltano job in CRON format. Time will be in UTC timezone. For example, "0 4 * * ? *" (without quotes) will run the job everyday at 4:00 UTC.
  RequiredMemory:
    Type: String
    Default: "2048"
    Description: Memory requirement of Fargate task
  RequiredVCPU:
    Type: String
    Default: "1"
    Description: VCPU requirement of Fargate task
  MeltanoExtractor:
    Type: String
    Description: Name of tap. eg. tap-salesforce
  MeltanoLoader:
    Type: String
    Default: target-redshift
    Description: Name of target. eg. target-redshift

Conditions:
  # MonitoringEmail defaults to the empty string, which is not a usable e-mail
  # endpoint; the subscription is only created when an address is supplied.
  HasMonitoringEmail: !Not [!Equals [!Ref MonitoringEmail, ""]]

Resources:
  # Execution role of the Fargate task (passed as ExecutionRoleArn below).
  FargateTaskExecutionRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service: ecs-tasks.amazonaws.com
            Action: sts:AssumeRole
      ManagedPolicyArns:
        - arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy

  # Role of the job container itself (passed as JobRoleArn below): read access
  # to the SSM parameters under /meltano/<extractor>/ and /meltano/<loader>/.
  FargateJobRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service: ecs-tasks.amazonaws.com
            Action: sts:AssumeRole
      Policies:
        - PolicyName: GetParametersByPath
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              - Effect: Allow
                Action:
                  - ssm:GetParametersByPath
                Resource:
                  - !Sub "arn:aws:ssm:*:*:parameter/meltano/${MeltanoExtractor}/*"
                  - !Sub "arn:aws:ssm:*:*:parameter/meltano/${MeltanoLoader}/*"

  JobDefinition:
    Type: AWS::Batch::JobDefinition
    Properties:
      Type: container
      JobDefinitionName: !Ref JobName
      PlatformCapabilities:
        - FARGATE
      PropagateTags: true
      ContainerProperties:
        ExecutionRoleArn: !GetAtt FargateTaskExecutionRole.Arn
        JobRoleArn: !GetAtt FargateJobRole.Arn
        Image: !Ref EcrImageTag
        ResourceRequirements:
          - Type: MEMORY
            Value: !Ref RequiredMemory
          - Type: VCPU
            Value: !Ref RequiredVCPU
        Command:
          - chamber
          - exec
          - !Sub "meltano/${MeltanoExtractor}"
          - !Sub "meltano/${MeltanoLoader}"
          - "--"
          - meltano
          - elt
          - !Ref MeltanoExtractor
          - !Ref MeltanoLoader
          - "--job_id"
          - !Ref JobName
      RetryStrategy:
        Attempts: 1

  # Publishes a message to ErrorsTopic when a Batch job with this name reaches
  # the FAILED state.
  AWSBatchEventsRule:
    Type: AWS::Events::Rule
    Properties:
      Description: !Sub "Events Rule for AWS Batch Job ${JobName}"
      EventPattern:
        source:
          - aws.batch
        detail-type:
          - Batch Job State Change
        detail:
          jobName:
            - !Ref JobName
          status:
            - "FAILED"
      State: ENABLED
      Targets:
        - Arn: !Ref ErrorsTopic
          Id: cloudwatch-batch-eventrules
          InputTransformer:
            InputPathsMap:
              logStream: "$.detail.container.logStreamName"
              time: "$.time"
              jobName: "$.detail.jobName"
            InputTemplate: '"Your Meltano job <jobName> has failed at <time>. Please check the logs at https://console.aws.amazon.com/cloudwatch/home?#logEventViewer:group=/aws/batch/job;stream=<logStream>."'

  ErrorsTopic:
    Type: AWS::SNS::Topic
    Properties:
      # No subscription at all when MonitoringEmail was left empty.
      Subscription: !If
        - HasMonitoringEmail
        - - Endpoint: !Ref MonitoringEmail
            Protocol: "email"
        - !Ref AWS::NoValue

  # Allows EventBridge to publish to the errors topic (and only that topic).
  ErrorsTopicPolicy:
    Type: AWS::SNS::TopicPolicy
    Properties:
      PolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service: events.amazonaws.com
            Action: sns:Publish
            Resource: !Ref ErrorsTopic
      Topics:
        - !Ref ErrorsTopic

  # Schedule: submits the job definition to the queue on the JobFrequency cron.
  CronjobEvent:
    Type: AWS::Events::Rule
    Properties:
      Description: !Sub "CloudWatch Event that triggers AWS Batch Job ${JobName}"
      ScheduleExpression: !Sub "cron(${JobFrequency})"
      State: ENABLED
      RoleArn: !GetAtt CloudWatchRole.Arn
      Targets:
        - Arn: !Ref JobQueueArn
          Id: AWSBatchCronjob
          RoleArn: !GetAtt CloudWatchRole.Arn
          BatchParameters:
            JobDefinition: !Ref JobDefinition
            JobName: !Ref JobName

  # Role EventBridge assumes to submit the scheduled job.
  CloudWatchRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - events.amazonaws.com
            Action:
              - sts:AssumeRole

  CloudWatchBatchPolicy:
    Type: AWS::IAM::ManagedPolicy
    Properties:
      PolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Action:
              - batch:SubmitJob
            Resource:
              - !Ref JobDefinition
              - !Ref JobQueueArn
      Roles:
        - !Ref CloudWatchRole

Outputs:
  JobDefinitionArn:
    Description: Job Definition ARN
    Value: !Ref JobDefinition
@chriskl
Copy link
Author

chriskl commented Apr 30, 2021

For bonus points, a dbt job definition for AWS Batch:

---
# CloudFormation template: a scheduled AWS Batch (Fargate) job that runs a dbt
# image against Redshift, plus an SNS e-mail alert for failed runs.
#
# Connection settings are handed to the container as DBT_* environment
# variables; the password is injected through the container's Secrets list.
AWSTemplateFormatVersion: "2010-09-09"
Description: Template to set up a recurring AWS Batch that executes a specified dbt ECR image on a computing environment.

Parameters:

  JobName:
    Type: String
    Description: Descriptive job name
  JobQueueArn:
    Type: String
    Description: ARN of job queue
  # No default: parameter defaults are literal strings, so a default containing
  # ${AWS::AccountId}-style placeholders would be passed through unsubstituted
  # as the image name.
  EcrImageTag:
    Type: String
    Description: URI and tag of image the job will run
  MonitoringEmail:
    Type: String
    Default: ""
    Description: Email address that will receive monitoring alerts
  JobFrequency:
    Type: String
    Default: "0 4 * * ? *"
    Description: Frequency of the DBT job in CRON format. Time will be in UTC timezone. For example, "0 4 * * ? *" (without quotes) will run the job everyday at 4:00 UTC.
  DbtThreads:
    Type: String
    Default: "1"
    Description: No. dbt threads to run in parallel
  DbtMethod:
    Type: String
    Default: database
    AllowedValues:
      - database
      - iam
    Description: Redshift authentication method
  DbtClusterId:
    Type: String
    Default: testred
    Description: Redshift cluster ID
  DbtHost:
    Type: String
    Description: Redshift cluster hostname
  DbtPort:
    Type: String
    Default: "5439"
    Description: Redshift cluster port number
  DbtCredentialsSecret:
    Type: String
    Description: AWS Secret containing Redshift credentials.  Must contain JSON keys 'username' and 'password'.
  DbtDbname:
    Type: String
    Default: testred
    Description: Redshift database name
  DbtSchema:
    Type: String
    Description: Redshift schema name
  DbtCommand:
    Type: CommaDelimitedList
    Description: Command to pass to the container by AWS Batch
  RequiredMemory:
    Type: String
    Default: "2048"
    Description: Memory requirement of Fargate task
  RequiredVCPU:
    Type: String
    Default: "1"
    Description: VCPU requirement of Fargate task

Conditions:

  # MonitoringEmail defaults to the empty string, which is not a usable e-mail
  # endpoint; the subscription is only created when an address is supplied.
  HasMonitoringEmail: !Not [!Equals [!Ref MonitoringEmail, ""]]

Resources:

  # Execution role of the Fargate task; also allowed to read the Redshift
  # credentials secret referenced in the container's Secrets list.
  FargateTaskExecutionRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service: ecs-tasks.amazonaws.com
            Action: sts:AssumeRole
      ManagedPolicyArns:
        - arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy
      Policies:
        - PolicyName: GetRedshiftSecret
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              - Effect: Allow
                Action:
                  - secretsmanager:GetSecretValue
                Resource:
                  # Trailing "-*" matches the suffix appended to secret ARNs.
                  - !Sub "arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:${DbtCredentialsSecret}-*"

  JobDefinition:
    Type: AWS::Batch::JobDefinition
    Properties:
      Type: container
      JobDefinitionName: !Ref JobName
      PlatformCapabilities:
        - FARGATE
      PropagateTags: true
      ContainerProperties:
        ExecutionRoleArn: !GetAtt FargateTaskExecutionRole.Arn
        Image: !Ref EcrImageTag
        ResourceRequirements:
          - Type: MEMORY
            Value: !Ref RequiredMemory
          - Type: VCPU
            Value: !Ref RequiredVCPU
        Environment:
          - Name: DBT_THREADS
            Value: !Ref DbtThreads
          - Name: DBT_METHOD
            Value: !Ref DbtMethod
          - Name: DBT_CLUSTER_ID
            Value: !Ref DbtClusterId
          - Name: DBT_HOST
            Value: !Ref DbtHost
          - Name: DBT_PORT
            Value: !Ref DbtPort
          # Dynamic reference: CloudFormation resolves the username from the
          # secret when the stack is deployed.
          - Name: DBT_USER
            Value: !Sub "{{resolve:secretsmanager:${DbtCredentialsSecret}:SecretString:username}}"
          - Name: DBT_DBNAME
            Value: !Ref DbtDbname
          - Name: DBT_SCHEMA
            Value: !Ref DbtSchema
          - Name: DBT_PROFILES_DIR
            Value: "."
        Secrets:
          # Reads the "password" JSON key of the secret.
          - Name: DBT_PASS
            ValueFrom: !Sub "arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:${DbtCredentialsSecret}:password::"
        Command: !Ref DbtCommand
      RetryStrategy:
        Attempts: 1

  # Publishes a message to ErrorsTopic when a Batch job with this name reaches
  # the FAILED state.
  AWSBatchEventsRule:
    Type: AWS::Events::Rule
    Properties:
      Description: !Sub "Events Rule for AWS Batch Job ${JobName}"
      EventPattern:
        source:
          - aws.batch
        detail-type:
          - Batch Job State Change
        detail:
          jobName:
            - !Ref JobName
          status:
            - "FAILED"
      State: ENABLED
      Targets:
        - Arn: !Ref ErrorsTopic
          Id: cloudwatch-batch-eventrules
          InputTransformer:
            InputPathsMap:
              logStream: "$.detail.container.logStreamName"
              time: "$.time"
              jobName: "$.detail.jobName"
            InputTemplate: '"Your DBT job <jobName> has failed at <time>. Please check the logs at https://console.aws.amazon.com/cloudwatch/home?#logEventViewer:group=/aws/batch/job;stream=<logStream>."'

  ErrorsTopic:
    Type: AWS::SNS::Topic
    Properties:
      # No subscription at all when MonitoringEmail was left empty.
      Subscription: !If
        - HasMonitoringEmail
        - - Endpoint: !Ref MonitoringEmail
            Protocol: "email"
        - !Ref AWS::NoValue

  # Allows EventBridge to publish to the errors topic (and only that topic).
  ErrorsTopicPolicy:
    Type: AWS::SNS::TopicPolicy
    Properties:
      PolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service: events.amazonaws.com
            Action: sns:Publish
            Resource: !Ref ErrorsTopic
      Topics:
        - !Ref ErrorsTopic

  # Schedule: submits the job definition to the queue on the JobFrequency cron.
  CronjobEvent:
    Type: AWS::Events::Rule
    Properties:
      Description: !Sub "CloudWatch Event that triggers AWS Batch Job ${JobName}"
      ScheduleExpression: !Sub "cron(${JobFrequency})"
      State: ENABLED
      RoleArn: !GetAtt CloudWatchRole.Arn
      Targets:
        - Arn: !Ref JobQueueArn
          Id: AWSBatchCronjob
          RoleArn: !GetAtt CloudWatchRole.Arn
          BatchParameters:
            JobDefinition: !Ref JobDefinition
            JobName: !Ref JobName

  # Role EventBridge assumes to submit the scheduled job.
  CloudWatchRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - events.amazonaws.com
            Action:
              - sts:AssumeRole

  CloudWatchBatchPolicy:
    Type: AWS::IAM::ManagedPolicy
    Properties:
      PolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Action:
              - batch:SubmitJob
            Resource:
              - !Ref JobDefinition
              - !Ref JobQueueArn
      Roles:
        - !Ref CloudWatchRole

Outputs:

  JobDefinitionArn:
    Description: Job Definition ARN
    Value: !Ref JobDefinition

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment