Skip to content

Instantly share code, notes, and snippets.

@mdwint
Last active June 5, 2021 11:09
Show Gist options
  • Save mdwint/73ac6aa361fbba692c92177f06d7d918 to your computer and use it in GitHub Desktop.
Save mdwint/73ac6aa361fbba692c92177f06d7d918 to your computer and use it in GitHub Desktop.
import abc
import logging
import time
from dataclasses import dataclass
import boto3
# Root logger (getLogger is called without a name).
log = logging.getLogger()
# boto3 clients are created once at import time; AutoScalingGroup uses
# `autoscaling` and ECSService uses `ecs`.
autoscaling = boto3.client("autoscaling")
ecs = boto3.client("ecs")
@dataclass
class State:
    """Snapshot of a scaling target: the requested count versus the reported count."""

    desired: int  # number of units requested
    running: int  # number of units currently reported

    @property
    def is_stable(self) -> bool:
        """Whether the reported count equals the requested count."""
        return self.desired == self.running

    @property
    def is_enabled(self) -> bool:
        """Whether at least one unit is requested."""
        return 0 < self.desired
class Target(abc.ABC):
    """Abstract scaling target that can be switched on or off.

    Subclasses implement ``get_state`` and ``set_enabled``; ``toggle`` and
    ``wait_until_stable`` are built on top of those two hooks.
    """

    @abc.abstractmethod
    def get_state(self):
        """Return the target's current state.

        The returned object must expose ``is_enabled`` and ``is_stable``,
        which are the only attributes this base class reads.
        """

    @abc.abstractmethod
    def set_enabled(self, enabled: bool):
        """Switch the target on (``True``) or off (``False``)."""

    def toggle(self, enabled: bool, wait: bool = False):
        """Bring the target to the requested on/off state.

        Args:
            enabled: Desired on/off state.
            wait: When true, block (via ``wait_until_stable``) until the
                target reports a stable state.

        Raises:
            ValueError: Propagated from ``wait_until_stable`` when the target
                does not stabilize in time.
        """
        # Only issue an update when the current state differs from the request.
        if self.get_state().is_enabled != enabled:
            log.info(f"{'En' if enabled else 'Dis'}abling {self}")
            self.set_enabled(enabled)
        # State is fetched again because set_enabled may just have changed it.
        if wait and not self.get_state().is_stable:
            log.info(f"Waiting on {self}")
            self.wait_until_stable()

    def wait_until_stable(
        self,
        poll_interval: int = 5,
        max_attempts: int = 60,
        wait_after_stable: int = 30,
    ):
        """Poll ``get_state`` until it reports stable, then pause briefly.

        Args:
            poll_interval: Seconds to sleep between two polls.
            max_attempts: Maximum number of polls before giving up.
            wait_after_stable: Seconds to sleep once the state is stable.

        Raises:
            ValueError: If the state is still unstable after ``max_attempts``
                polls (also raised when ``max_attempts`` is zero or negative).
        """
        # Pre-bind so the error message below is well-defined even when the
        # loop body never runs (max_attempts <= 0).
        state = None
        remaining = max_attempts
        while remaining > 0:
            state = self.get_state()
            if state.is_stable:
                time.sleep(wait_after_stable)
                return
            remaining -= 1
            # No point sleeping after the final attempt; fail right away.
            if remaining > 0:
                time.sleep(poll_interval)
        raise ValueError(f"{self} did not stabilize: {state}")
@dataclass
class AutoScalingGroup(Target):
    """Target backed by an Auto Scaling group, identified by its name."""

    name: str

    def get_state(self) -> State:
        """Describe the group and report desired capacity vs. number of listed instances."""
        response = autoscaling.describe_auto_scaling_groups(
            AutoScalingGroupNames=[self.name]
        )
        # Only one name is requested, so the first entry is the one read.
        group = response["AutoScalingGroups"][0]
        desired = group["DesiredCapacity"]
        instance_count = len(group["Instances"])
        return State(desired=desired, running=instance_count)

    def set_enabled(self, enabled: bool):
        """Set both the desired capacity and the minimum size to 1 (on) or 0 (off)."""
        size = int(enabled)
        autoscaling.update_auto_scaling_group(
            AutoScalingGroupName=self.name,
            MinSize=size,
            DesiredCapacity=size,
        )
@dataclass
class ECSService(Target):
    """Target backed by an ECS service, identified by cluster and service name."""

    cluster: str
    name: str

    def get_state(self) -> State:
        """Describe the service and report its desired vs. running task counts."""
        response = ecs.describe_services(cluster=self.cluster, services=[self.name])
        # Only one service is requested, so the first entry is the one read.
        service = response["services"][0]
        desired = service["desiredCount"]
        running = service["runningCount"]
        return State(desired=desired, running=running)

    def set_enabled(self, enabled: bool):
        """Set the service's desired task count to 1 (on) or 0 (off)."""
        count = int(enabled)
        ecs.update_service(
            cluster=self.cluster,
            service=self.name,
            desiredCount=count,
        )
import json
import logging
from dataclasses import dataclass
from functools import wraps
import boto3
# Set up default logging configuration, then lower the root logger's
# threshold to INFO so the log.info() calls in this module are emitted.
logging.basicConfig()
log = logging.getLogger()
log.setLevel(logging.INFO)
@dataclass
class CloudWatchAlarm:
    """Fields of a CloudWatch alarm notification, as extracted by cloudwatch_alarm_handler."""

    name: str  # taken from the message's "AlarmName"
    descr: str  # taken from the message's "AlarmDescription"
    state: str  # taken from the message's "NewStateValue"
    reason: str  # taken from the message's "NewStateReason"
def cloudwatch_alarm_handler(func):
    """Decorate ``func(alarm, context)`` so it can be invoked as ``wrapper(event, context)``.

    The wrapper walks ``event["Records"]``, decodes the JSON in each record's
    ``Sns.Message`` into a ``CloudWatchAlarm``, logs it, and calls ``func``
    only for alarms whose new state is ``"ALARM"``. The wrapper itself
    returns nothing.
    """

    @wraps(func)
    def wrapper(event: dict, context):
        for record in event["Records"]:
            payload = json.loads(record["Sns"]["Message"])
            alarm = CloudWatchAlarm(
                payload["AlarmName"],
                payload["AlarmDescription"],
                payload["NewStateValue"],
                payload["NewStateReason"],
            )
            log.info(f"[{alarm.state}] {alarm.descr} (reason: {alarm.reason})")
            # Every notification is logged, but only ALARM transitions are forwarded.
            if alarm.state != "ALARM":
                continue
            func(alarm, context)

    return wrapper
@cloudwatch_alarm_handler
def reset_alarm(alarm, context):
    """Force two alarms back to ``OK``: the one named in the description, then the trigger.

    The alarm to reset is the last whitespace-separated word of
    ``alarm.descr``; afterwards the triggering alarm (``alarm.name``) is
    reset as well.
    """
    target_name = alarm.descr.rsplit(None, 1)[-1]
    message = f"Reset by alarm {alarm.name}"
    client = boto3.client("cloudwatch")
    for name in (target_name, alarm.name):
        client.set_alarm_state(
            AlarmName=name,
            StateValue="OK",
            StateReason=message,
        )
import logging
import os
from gorilla.hibernation.autoscaling import AutoScalingGroup, ECSService
from gorilla.hibernation.cloudwatch import cloudwatch_alarm_handler
# Set up default logging configuration and lower the root logger's
# threshold to INFO.
logging.basicConfig()
log = logging.getLogger()
log.setLevel(logging.INFO)
# Toggle the ECS services "<service_prefix>-scheduler" and "<service_prefix>-worker"
# in the cluster named by the `cluster` environment variable (both values are
# read from os.environ; a missing variable raises KeyError).
# The scheduler toggle only waits for stability when disabling (wait=not enabled).
# NOTE(review): indentation was lost in this copy, so it is unclear whether the
# final toggle_autoscaling_group(enabled) call belongs inside `if not enabled:`
# or runs unconditionally — confirm against the original file before editing.
def toggle_service(enabled: bool):
cluster = os.environ["cluster"]
prefix = os.environ["service_prefix"]
ECSService(cluster, f"{prefix}-scheduler").toggle(enabled, wait=not enabled)
if not enabled:
# NOTE: Workers are enabled by the scheduler's autoscaling
ECSService(cluster, f"{prefix}-worker").toggle(enabled)
toggle_autoscaling_group(enabled)
def toggle_autoscaling_group(enabled: bool):
    """Toggle the Auto Scaling group named by the ``autoscaling_group`` environment variable."""
    group = AutoScalingGroup(os.environ["autoscaling_group"])
    group.toggle(enabled)
@cloudwatch_alarm_handler
def hibernate(alarm, context):
    """Alarm handler that switches the service off; ``alarm`` and ``context`` are not read."""
    toggle_service(enabled=False)
@cloudwatch_alarm_handler
def wake(alarm, context):
    """Alarm handler that switches the service on; ``alarm`` and ``context`` are not read."""
    toggle_service(enabled=True)
def wake_on_deploy(event, context):
    """Handler that enables the Auto Scaling group; ``event`` and ``context`` are not read."""
    toggle_autoscaling_group(enabled=True)
# For an introduction to Serverless Framework, see:
# https://www.serverless.com/framework/docs/providers/aws/guide/intro/
service: gorilla-hibernation
# NOTE(review): the `#{AWS::Region}` / `#{AWS::AccountId}` placeholders used
# further down are presumably resolved by serverless-pseudo-parameters, and the
# `pythonRequirements` settings by serverless-python-requirements — confirm.
plugins:
- serverless-plugin-conditional-functions
- serverless-pseudo-parameters
- serverless-python-requirements
custom:
# Customize these to your needs:
# `prefix` is reused below as the ECS cluster name and as the prefix of the
# SNS topic and SQS queue names.
prefix: example
# `lambda_prefix` is used to build the Lambda function name watched by
# ResetJobsWaitingAlarm.
lambda_prefix: ${self:service}-${self:provider.stage}
# `service_prefix` is the common prefix of the ECS service names
# ("<service_prefix>-scheduler", "<service_prefix>-worker").
service_prefix: my-service
autoscaling_group: my-auto-scaling-group
# CPU threshold used in NoRecentActivityAlarm's `activity` expression.
cpu_low: 3 # percent
# NOTE(review): `idle_period` is referenced by NoRecentActivityAlarm but is not
# defined in the visible part of this section; presumably elided here.
# ...
pythonRequirements:
dockerizePip: non-linux
useDownloadCache: true
slim: true
provider:
name: aws
runtime: python3.7
memorySize: 256
# NOTE(review): Target.wait_until_stable defaults to 60 polls x 5 s plus a 30 s
# settle pause, which can exceed this 300 s limit when hibernating — confirm
# the intended budget.
timeout: 300
region: ${env:REGION, 'eu-west-1'}
stage: ${opt:stage, 'dev'}
# These variables are read via os.environ by the handlers in main.py.
environment:
cluster: ${self:custom.prefix}
service_prefix: ${self:custom.service_prefix}
autoscaling_group: ${self:custom.autoscaling_group}
iamRoleStatements:
# Needed by AutoScalingGroup.get_state (describe_auto_scaling_groups).
- Effect: Allow
Action:
- autoscaling:DescribeAutoScalingGroups
Resource: '*'
# Needed by AutoScalingGroup.set_enabled (update_auto_scaling_group);
# restricted to the configured group name.
- Effect: Allow
Action: autoscaling:UpdateAutoScalingGroup
Resource:
- arn:aws:autoscaling:#{AWS::Region}:#{AWS::AccountId}:autoScalingGroup:*:autoScalingGroupName/${self:custom.autoscaling_group}
# Needed by ECSService.get_state / set_enabled; covers every service in the
# cluster whose name starts with "<service_prefix>-".
- Effect: Allow
Action:
- ecs:DescribeServices
- ecs:UpdateService
Resource:
- arn:aws:ecs:#{AWS::Region}:#{AWS::AccountId}:service/${self:custom.prefix}/${self:custom.service_prefix}-*
# Needed by reset_alarm (set_alarm_state) for the two alarms it resets.
- Effect: Allow
Action: cloudwatch:SetAlarmState
Resource:
- !GetAtt JobsWaitingAlarm.Arn
- !GetAtt ResetJobsWaitingAlarm.Arn
# Exclude everything by default, then include only the gorilla/hibernation
# package that the function handlers below point into.
package:
exclude:
- '**'
include:
- gorilla/hibernation/**
functions:
# Subscribed to the "<prefix>-hibernate" topic, which NoRecentActivityAlarm
# notifies.
hibernate:
handler: gorilla/hibernation/main.hibernate
events:
- sns: ${self:custom.prefix}-hibernate
# Subscribed to the "<prefix>-wake" topic, which JobsWaitingAlarm notifies.
wake:
handler: gorilla/hibernation/main.wake
events:
- sns: ${self:custom.prefix}-wake
# Triggered by ECS "SERVICE_TASK_PLACEMENT_FAILURE" service-action events for
# the worker service.
# NOTE(review): the service ARN below has no cluster segment, unlike the ARN
# pattern used in the IAM statement for ecs:UpdateService — confirm which ARN
# format the events actually carry.
wake-on-deploy:
handler: gorilla/hibernation/main.wake_on_deploy
events:
- cloudwatchEvent:
event:
source: [aws.ecs]
detail-type: [ECS Service Action]
detail:
eventName: [SERVICE_TASK_PLACEMENT_FAILURE]
resources:
- arn:aws:ecs:#{AWS::Region}:#{AWS::AccountId}:service/${self:custom.service_prefix}-worker
# Subscribed to the "<prefix>-reset-alarm" topic, which ResetJobsWaitingAlarm
# notifies.
reset-alarm:
handler: gorilla/hibernation/cloudwatch.reset_alarm
events:
- sns: ${self:custom.prefix}-reset-alarm
resources:
Resources:
# Goes into ALARM when the `activity` expression stays below 1 for
# `idle_period` consecutive 60 s periods, and notifies "<prefix>-hibernate".
NoRecentActivityAlarm:
Type: AWS::CloudWatch::Alarm
Properties:
AlarmDescription: Cluster did not show any activity recently
Metrics:
# Input 1: maximum CPU utilization of the worker service per minute.
- Id: cpu
ReturnData: false
MetricStat:
Metric:
Namespace: AWS/ECS
MetricName: CPUUtilization
Dimensions:
- Name: ClusterName
Value: ${self:custom.prefix}
- Name: ServiceName
Value: ${self:custom.service_prefix}-worker
Stat: Maximum
Period: 60
# Input 2: messages published to the "<prefix>-notifications" topic per minute.
- Id: jobs_finished
ReturnData: false
MetricStat:
Metric:
Namespace: AWS/SNS
MetricName: NumberOfMessagesPublished
Dimensions:
- Name: TopicName
Value: ${self:custom.prefix}-notifications
Stat: Average
Period: 60
# Evaluated series: 1 when CPU is at or above `cpu_low` or any message was
# published, otherwise 0.
- Id: activity
Expression: IF(cpu >= ${self:custom.cpu_low} OR jobs_finished, 1, 0)
ComparisonOperator: LessThanThreshold
Threshold: 1
EvaluationPeriods: ${self:custom.idle_period}
# Periods without data count as idle.
TreatMissingData: breaching
AlarmActions:
- arn:aws:sns:#{AWS::Region}:#{AWS::AccountId}:${self:custom.prefix}-hibernate
# Goes into ALARM when the "<prefix>-jobs" queue averages at least one visible
# message over a single 60 s period, and notifies "<prefix>-wake".
JobsWaitingAlarm:
Type: AWS::CloudWatch::Alarm
Properties:
AlarmDescription: Jobs are waiting in the queue
Namespace: AWS/SQS
MetricName: ApproximateNumberOfMessagesVisible
Dimensions:
- Name: QueueName
Value: ${self:custom.prefix}-jobs
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 1
Period: 60
EvaluationPeriods: 1
AlarmActions:
- arn:aws:sns:#{AWS::Region}:#{AWS::AccountId}:${self:custom.prefix}-wake
# Goes into ALARM when the wake Lambda was invoked at least once within a 60 s
# period, and notifies "<prefix>-reset-alarm".
# NOTE(review): presumably this exists so JobsWaitingAlarm is forced back to OK
# and can fire again for later jobs — confirm the intent.
ResetJobsWaitingAlarm:
Type: AWS::CloudWatch::Alarm
Properties:
# reset_alarm takes the last whitespace-separated word of this description as
# the name of the alarm to reset, so the referenced alarm must stay at the end.
AlarmDescription: !Join [' ', ['Reset alarm', !Ref JobsWaitingAlarm]]
Namespace: AWS/Lambda
MetricName: Invocations
Dimensions:
- Name: FunctionName
Value: ${self:custom.lambda_prefix}-wake
Statistic: Sum
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 1
Period: 60
EvaluationPeriods: 1
# Periods with no invocations at all do not count towards the alarm.
TreatMissingData: notBreaching
AlarmActions:
- arn:aws:sns:#{AWS::Region}:#{AWS::AccountId}:${self:custom.prefix}-reset-alarm
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment