mdwint/autoscaling.py

## autoscaling.py
import abc
import logging
import time
from dataclasses import dataclass

import boto3

# Root logger (no name is passed to getLogger).
log = logging.getLogger()

# boto3 clients are created once, at import time, and used by the Target classes below.
autoscaling = boto3.client("autoscaling")
ecs = boto3.client("ecs")


@dataclass
class State:
    """Snapshot of a scaling target: requested count versus running count."""

    desired: int  # count requested for the target
    running: int  # count currently reported as running

    @property
    def is_stable(self) -> bool:
        """True when the running count equals the desired count."""
        return self.desired == self.running

    @property
    def is_enabled(self) -> bool:
        """True when the desired count is above zero."""
        return 0 < self.desired


class Target(abc.ABC):
    """Something with a desired/running count that can be switched on or off.

    Subclasses supply ``get_state`` and ``set_enabled``; ``toggle`` and
    ``wait_until_stable`` are built on top of those two.
    """

    @abc.abstractmethod
    def get_state(self):
        """Return the current state; it must expose ``is_enabled`` and ``is_stable``."""

    @abc.abstractmethod
    def set_enabled(self, enabled: bool):
        """Switch the target on (``True``) or off (``False``)."""

    def toggle(self, enabled: bool, wait: bool = False):
        """Bring the target to the requested on/off state, optionally waiting.

        ``set_enabled`` is only called when the current state differs from
        ``enabled``. With ``wait=True`` the state is read again and, if it is
        not stable yet, ``wait_until_stable`` blocks with its default settings.
        """
        if self.get_state().is_enabled != enabled:
            log.info(f"{'En' if enabled else 'Dis'}abling {self}")
            self.set_enabled(enabled)

        # Read the state again: set_enabled may just have changed the desired count.
        if wait and not self.get_state().is_stable:
            log.info(f"Waiting on {self}")
            self.wait_until_stable()

    def wait_until_stable(
        self,
        poll_interval: int = 5,
        max_attempts: int = 60,
        wait_after_stable: int = 30,
    ):
        """Poll ``get_state`` until it reports stable, then sleep a grace period.

        Args:
            poll_interval: Seconds to sleep between polls.
            max_attempts: Maximum number of polls; zero or less means no polls.
            wait_after_stable: Seconds to sleep once the state is stable.

        Raises:
            ValueError: If the state is not stable after ``max_attempts`` polls.
        """
        # Stays None when max_attempts <= 0 so the error message below still formats.
        state = None
        for attempt in range(max_attempts):
            state = self.get_state()
            if state.is_stable:
                time.sleep(wait_after_stable)
                return
            # No point sleeping after the final poll; fail straight away.
            if attempt + 1 < max_attempts:
                time.sleep(poll_interval)

        raise ValueError(f"{self} did not stabilize: {state}")


@dataclass
class AutoScalingGroup(Target):
    """Target backed by an Auto Scaling group, looked up by name."""

    name: str

    def get_state(self) -> State:
        """Describe the group and report its desired capacity against its instance count."""
        response = autoscaling.describe_auto_scaling_groups(
            AutoScalingGroupNames=[self.name]
        )
        group = response["AutoScalingGroups"][0]
        desired = group["DesiredCapacity"]
        instances = group["Instances"]
        return State(desired=desired, running=len(instances))

    def set_enabled(self, enabled: bool):
        """Set desired capacity and minimum size to ``int(enabled)`` (1 or 0 for a bool)."""
        capacity = int(enabled)
        autoscaling.update_auto_scaling_group(
            AutoScalingGroupName=self.name,
            MinSize=capacity,
            DesiredCapacity=capacity,
        )


@dataclass
class ECSService(Target):
    """Target backed by an ECS service, identified by cluster and service name."""

    cluster: str
    name: str

    def get_state(self) -> State:
        """Describe the service and report its desired count against its running count."""
        response = ecs.describe_services(cluster=self.cluster, services=[self.name])
        service = response["services"][0]
        desired = service["desiredCount"]
        running = service["runningCount"]
        return State(desired=desired, running=running)

    def set_enabled(self, enabled: bool):
        """Set the service's desired count to ``int(enabled)`` (1 or 0 for a bool)."""
        count = int(enabled)
        ecs.update_service(
            cluster=self.cluster,
            service=self.name,
            desiredCount=count,
        )

## cloudwatch.py
import json
import logging
from dataclasses import dataclass
from functools import wraps

import boto3

# Configure the root logger and set it to INFO so the log.info calls below are emitted.
logging.basicConfig()
log = logging.getLogger()
log.setLevel(logging.INFO)


@dataclass
class CloudWatchAlarm:
    """The fields of an alarm notification that the handlers in this module use."""

    name: str  # taken from "AlarmName"
    descr: str  # taken from "AlarmDescription"
    state: str  # taken from "NewStateValue"
    reason: str  # taken from "NewStateReason"


def cloudwatch_alarm_handler(func):
    """Wrap ``func(alarm, context)`` into an ``(event, context)`` handler.

    For every entry in ``event["Records"]`` the SNS message is parsed as JSON
    into a CloudWatchAlarm and logged; ``func`` is called only when the alarm's
    new state is "ALARM". The wrapper itself returns nothing.
    """

    @wraps(func)
    def wrapper(event: dict, context):
        for record in event["Records"]:
            payload = json.loads(record["Sns"]["Message"])
            fields = {
                "name": payload["AlarmName"],
                "descr": payload["AlarmDescription"],
                "state": payload["NewStateValue"],
                "reason": payload["NewStateReason"],
            }
            alarm = CloudWatchAlarm(**fields)
            log.info(f"[{alarm.state}] {alarm.descr} (reason: {alarm.reason})")
            if alarm.state != "ALARM":
                # Other state transitions are logged but not acted upon.
                continue
            func(alarm, context)

    return wrapper


@cloudwatch_alarm_handler
def reset_alarm(alarm, context):
    """Set two alarms back to OK: the one named in the description, then the trigger.

    The last whitespace-separated word of ``alarm.descr`` is taken as the name
    of the alarm to reset.
    """
    target_name = alarm.descr.split()[-1]
    state_reason = f"Reset by alarm {alarm.name}"

    client = boto3.client("cloudwatch")
    names = [target_name, alarm.name]
    for name in names:
        client.set_alarm_state(
            AlarmName=name,
            StateValue="OK",
            StateReason=state_reason,
        )

## main.py
import logging
import os

from gorilla.hibernation.autoscaling import AutoScalingGroup, ECSService
from gorilla.hibernation.cloudwatch import cloudwatch_alarm_handler

# Configure the root logger and set it to INFO.
logging.basicConfig()
log = logging.getLogger()
log.setLevel(logging.INFO)


def toggle_service(enabled: bool):
    """Toggle the scheduler service, then the Auto Scaling group.

    The cluster and service prefix come from the ``cluster`` and
    ``service_prefix`` environment variables. When disabling, the call waits
    for the scheduler to stabilise and also disables the worker service before
    the Auto Scaling group is toggled.
    """
    cluster = os.environ["cluster"]
    prefix = os.environ["service_prefix"]
    disabling = not enabled

    scheduler = ECSService(cluster, f"{prefix}-scheduler")
    scheduler.toggle(enabled, wait=disabling)

    if disabling:
        # NOTE: Workers are enabled by the scheduler's autoscaling
        worker = ECSService(cluster, f"{prefix}-worker")
        worker.toggle(enabled)

    toggle_autoscaling_group(enabled)


def toggle_autoscaling_group(enabled: bool):
    """Toggle the Auto Scaling group named by the ``autoscaling_group`` environment variable."""
    group = AutoScalingGroup(os.environ["autoscaling_group"])
    group.toggle(enabled)


@cloudwatch_alarm_handler
def hibernate(alarm, context):
    """Alarm handler: switch the services and the Auto Scaling group off."""
    toggle_service(enabled=False)


@cloudwatch_alarm_handler
def wake(alarm, context):
    """Alarm handler: switch the scheduler service and the Auto Scaling group on."""
    toggle_service(enabled=True)


def wake_on_deploy(event, context):
    """Enable the Auto Scaling group; neither ``event`` nor ``context`` is inspected."""
    toggle_autoscaling_group(enabled=True)

## serverless.yml
# For an introduction to Serverless Framework, see:
# https://www.serverless.com/framework/docs/providers/aws/guide/intro/
# Service name; also the first part of custom.lambda_prefix.
service: gorilla-hibernation

plugins:
- serverless-plugin-conditional-functions
# NOTE(review): presumably resolves the #{AWS::Region} / #{AWS::AccountId} placeholders used in the ARNs below — confirm.
- serverless-pseudo-parameters
# NOTE(review): presumably the consumer of custom.pythonRequirements — confirm.
- serverless-python-requirements

custom:
  # Customize these to your needs:
  # Used as the ECS cluster name and as the prefix of the SNS topic and SQS queue names.
  prefix: example
  # <service>-<stage>; ResetJobsWaitingAlarm uses it to build the Lambda function name it watches.
  lambda_prefix: ${self:service}-${self:provider.stage}
  # The ECS services are addressed as <service_prefix>-scheduler and <service_prefix>-worker.
  service_prefix: my-service
  autoscaling_group: my-auto-scaling-group
  # Worker CPU threshold used by the NoRecentActivityAlarm expression.
  cpu_low: 3  # percent
  # ...
  # NOTE(review): NoRecentActivityAlarm reads custom.idle_period, which is not defined in the
  # visible part of this block — presumably it is among the elided settings; confirm.
  pythonRequirements:
    dockerizePip: non-linux
    useDownloadCache: true
    slim: true

provider:
  name: aws
  # NOTE(review): python3.7 is an old Lambda runtime — confirm it is still deployable.
  runtime: python3.7
  memorySize: 256
  # NOTE(review): presumably seconds; Target.wait_until_stable defaults to 60 polls x 5 s plus a 30 s grace sleep — confirm the margin.
  timeout: 300

  region: ${env:REGION, 'eu-west-1'}
  stage: ${opt:stage, 'dev'}

  # Read by main.py through os.environ.
  environment:
    cluster: ${self:custom.prefix}
    service_prefix: ${self:custom.service_prefix}
    autoscaling_group: ${self:custom.autoscaling_group}

  iamRoleStatements:
  # Describing Auto Scaling groups is allowed on every resource.
  - Effect: Allow
    Action:
    - autoscaling:DescribeAutoScalingGroups
    Resource: '*'

  # Updating is limited to the configured Auto Scaling group.
  - Effect: Allow
    Action: autoscaling:UpdateAutoScalingGroup
    Resource:
    - arn:aws:autoscaling:#{AWS::Region}:#{AWS::AccountId}:autoScalingGroup:*:autoScalingGroupName/${self:custom.autoscaling_group}

  # ECS access is limited to services in the cluster whose name starts with <service_prefix>-.
  - Effect: Allow
    Action:
    - ecs:DescribeServices
    - ecs:UpdateService
    Resource:
    - arn:aws:ecs:#{AWS::Region}:#{AWS::AccountId}:service/${self:custom.prefix}/${self:custom.service_prefix}-*

  # reset_alarm sets the state of exactly these two alarms.
  - Effect: Allow
    Action: cloudwatch:SetAlarmState
    Resource:
    - !GetAtt JobsWaitingAlarm.Arn
    - !GetAtt ResetJobsWaitingAlarm.Arn

package:
  # Exclude everything, then add back only the hibernation package.
  exclude:
  - '**'
  include:
  - gorilla/hibernation/**

functions:
  # Triggered by the <prefix>-hibernate topic, which NoRecentActivityAlarm publishes to.
  hibernate:
    handler: gorilla/hibernation/main.hibernate
    events:
    - sns: ${self:custom.prefix}-hibernate

  # Triggered by the <prefix>-wake topic, which JobsWaitingAlarm publishes to.
  wake:
    handler: gorilla/hibernation/main.wake
    events:
    - sns: ${self:custom.prefix}-wake

  # Triggered when the worker service reports a task placement failure.
  wake-on-deploy:
    handler: gorilla/hibernation/main.wake_on_deploy
    events:
    - cloudwatchEvent:
        event:
          source: [aws.ecs]
          detail-type: [ECS Service Action]
          detail:
            eventName: [SERVICE_TASK_PLACEMENT_FAILURE]
          resources:
          # The service ARN carries the cluster segment, matching the ARN form used in iamRoleStatements.
          - arn:aws:ecs:#{AWS::Region}:#{AWS::AccountId}:service/${self:custom.prefix}/${self:custom.service_prefix}-worker

  # Triggered by the <prefix>-reset-alarm topic, which ResetJobsWaitingAlarm publishes to.
  reset-alarm:
    handler: gorilla/hibernation/cloudwatch.reset_alarm
    events:
    - sns: ${self:custom.prefix}-reset-alarm

resources:
  Resources:
    NoRecentActivityAlarm:
      # Alarms when the `activity` expression is below 1 for idle_period evaluation periods;
      # the alarm action publishes to the <prefix>-hibernate topic.
      Type: AWS::CloudWatch::Alarm
      Properties:
        AlarmDescription: Cluster did not show any activity recently
        Metrics:
        # Maximum CPU utilization of the worker service per 60-second period.
        - Id: cpu
          ReturnData: false
          MetricStat:
            Metric:
              Namespace: AWS/ECS
              MetricName: CPUUtilization
              Dimensions:
              - Name: ClusterName
                Value: ${self:custom.prefix}
              - Name: ServiceName
                Value: ${self:custom.service_prefix}-worker
            Stat: Maximum
            Period: 60
        # Messages published to the <prefix>-notifications topic per 60-second period.
        - Id: jobs_finished
          ReturnData: false
          MetricStat:
            Metric:
              Namespace: AWS/SNS
              MetricName: NumberOfMessagesPublished
              Dimensions:
              - Name: TopicName
                Value: ${self:custom.prefix}-notifications
            Stat: Average
            Period: 60
        # 1 when CPU reaches cpu_low or jobs_finished is non-zero, otherwise 0.
        - Id: activity
          Expression: IF(cpu >= ${self:custom.cpu_low} OR jobs_finished, 1, 0)
        ComparisonOperator: LessThanThreshold
        Threshold: 1
        EvaluationPeriods: ${self:custom.idle_period}
        # Missing data counts towards the alarm, i.e. is treated like inactivity.
        TreatMissingData: breaching
        AlarmActions:
        - arn:aws:sns:#{AWS::Region}:#{AWS::AccountId}:${self:custom.prefix}-hibernate

    JobsWaitingAlarm:
      # Alarms when the <prefix>-jobs queue averages at least one visible message over a
      # 60-second period; the alarm action publishes to the <prefix>-wake topic.
      Type: AWS::CloudWatch::Alarm
      Properties:
        AlarmDescription: Jobs are waiting in the queue
        Namespace: AWS/SQS
        MetricName: ApproximateNumberOfMessagesVisible
        Dimensions:
        - Name: QueueName
          Value: ${self:custom.prefix}-jobs
        Statistic: Average
        ComparisonOperator: GreaterThanOrEqualToThreshold
        Threshold: 1
        Period: 60
        EvaluationPeriods: 1
        AlarmActions:
        - arn:aws:sns:#{AWS::Region}:#{AWS::AccountId}:${self:custom.prefix}-wake

    ResetJobsWaitingAlarm:
      # Alarms when the wake function was invoked at least once in a 60-second period;
      # the alarm action publishes to the <prefix>-reset-alarm topic.
      Type: AWS::CloudWatch::Alarm
      Properties:
        # reset_alarm takes the last word of this description as the alarm to reset,
        # so the JobsWaitingAlarm reference has to stay at the end.
        AlarmDescription: !Join [' ', ['Reset alarm', !Ref JobsWaitingAlarm]]
        Namespace: AWS/Lambda
        MetricName: Invocations
        Dimensions:
        - Name: FunctionName
          Value: ${self:custom.lambda_prefix}-wake
        Statistic: Sum
        ComparisonOperator: GreaterThanOrEqualToThreshold
        Threshold: 1
        Period: 60
        EvaluationPeriods: 1
        TreatMissingData: notBreaching
        AlarmActions:
        - arn:aws:sns:#{AWS::Region}:#{AWS::AccountId}:${self:custom.prefix}-reset-alarm
	import abc
	import logging
	import time
	from dataclasses import dataclass

	import boto3

	# Root logger (no name is passed to getLogger).
	log = logging.getLogger()

	# boto3 clients are created once and used by the Target classes below.
	autoscaling = boto3.client("autoscaling")
	ecs = boto3.client("ecs")


	@dataclass
	class State:
	    """Snapshot of a scaling target: requested count versus running count."""

	    desired: int  # count requested for the target
	    running: int  # count currently reported as running

	    @property
	    def is_enabled(self) -> bool:
	        """True when the desired count is above zero."""
	        return self.desired > 0

	    @property
	    def is_stable(self) -> bool:
	        """True when the running count equals the desired count."""
	        return self.running == self.desired


	class Target(abc.ABC):
	    """Something with a desired/running count that can be switched on or off.

	    Subclasses supply ``get_state`` and ``set_enabled``; ``toggle`` and
	    ``wait_until_stable`` are built on top of those two.
	    """

	    @abc.abstractmethod
	    def get_state(self):
	        """Return the current state; it must expose ``is_enabled`` and ``is_stable``."""

	    @abc.abstractmethod
	    def set_enabled(self, enabled: bool):
	        """Switch the target on (``True``) or off (``False``)."""

	    def toggle(self, enabled: bool, wait: bool = False):
	        """Bring the target to the requested on/off state, optionally waiting.

	        ``set_enabled`` is only called when the current state differs from
	        ``enabled``. With ``wait=True`` the state is read again and, if it is
	        not stable yet, ``wait_until_stable`` blocks with its default settings.
	        """
	        if self.get_state().is_enabled != enabled:
	            log.info(f"{'En' if enabled else 'Dis'}abling {self}")
	            self.set_enabled(enabled)

	        # Read the state again: set_enabled may just have changed the desired count.
	        if wait and not self.get_state().is_stable:
	            log.info(f"Waiting on {self}")
	            self.wait_until_stable()

	    def wait_until_stable(
	        self,
	        poll_interval: int = 5,
	        max_attempts: int = 60,
	        wait_after_stable: int = 30,
	    ):
	        """Poll ``get_state`` until it reports stable, then sleep a grace period.

	        Args:
	            poll_interval: Seconds to sleep between polls.
	            max_attempts: Maximum number of polls; zero or less means no polls.
	            wait_after_stable: Seconds to sleep once the state is stable.

	        Raises:
	            ValueError: If the state is not stable after ``max_attempts`` polls.
	        """
	        # Stays None when max_attempts <= 0 so the error message below still formats.
	        state = None
	        for attempt in range(max_attempts):
	            state = self.get_state()
	            if state.is_stable:
	                time.sleep(wait_after_stable)
	                return
	            # No point sleeping after the final poll; fail straight away.
	            if attempt + 1 < max_attempts:
	                time.sleep(poll_interval)

	        raise ValueError(f"{self} did not stabilize: {state}")


	@dataclass
	class AutoScalingGroup(Target):
	    """Target backed by an Auto Scaling group, looked up by name."""

	    name: str

	    def get_state(self) -> State:
	        """Describe the group and report its desired capacity against its instance count."""
	        s = autoscaling.describe_auto_scaling_groups(AutoScalingGroupNames=[self.name])
	        s = s["AutoScalingGroups"][0]
	        return State(desired=s["DesiredCapacity"], running=len(s["Instances"]))

	    def set_enabled(self, enabled: bool):
	        """Set desired capacity and minimum size to ``int(enabled)`` (1 or 0 for a bool)."""
	        n = int(enabled)
	        autoscaling.update_auto_scaling_group(
	            AutoScalingGroupName=self.name, DesiredCapacity=n, MinSize=n
	        )


	@dataclass
	class ECSService(Target):
	    """Target backed by an ECS service, identified by cluster and service name."""

	    cluster: str
	    name: str

	    def get_state(self) -> State:
	        """Describe the service and report its desired count against its running count."""
	        s = ecs.describe_services(cluster=self.cluster, services=[self.name])
	        s = s["services"][0]
	        return State(desired=s["desiredCount"], running=s["runningCount"])

	    def set_enabled(self, enabled: bool):
	        """Set the service's desired count to ``int(enabled)`` (1 or 0 for a bool)."""
	        n = int(enabled)
	        ecs.update_service(cluster=self.cluster, service=self.name, desiredCount=n)
	import json
	import logging
	from dataclasses import dataclass
	from functools import wraps

	import boto3

	# Configure the root logger and set it to INFO.
	logging.basicConfig()
	log = logging.getLogger()
	log.setLevel(logging.INFO)


	@dataclass
	class CloudWatchAlarm:
	    """The fields of an alarm notification that the handlers in this module use."""

	    name: str  # taken from "AlarmName"
	    descr: str  # taken from "AlarmDescription"
	    state: str  # taken from "NewStateValue"
	    reason: str  # taken from "NewStateReason"


	def cloudwatch_alarm_handler(func):
	    """Wrap ``func(alarm, context)`` into an ``(event, context)`` handler.

	    For every entry in ``event["Records"]`` the SNS message is parsed as JSON
	    into a CloudWatchAlarm and logged; ``func`` is called only when the alarm's
	    new state is "ALARM". The wrapper itself returns nothing.
	    """

	    @wraps(func)
	    def wrapper(event: dict, context):
	        for record in event["Records"]:
	            msg = json.loads(record["Sns"]["Message"])
	            alarm = CloudWatchAlarm(
	                name=msg["AlarmName"],
	                descr=msg["AlarmDescription"],
	                state=msg["NewStateValue"],
	                reason=msg["NewStateReason"],
	            )
	            log.info(f"[{alarm.state}] {alarm.descr} (reason: {alarm.reason})")
	            # Other state transitions are logged but not acted upon.
	            if alarm.state == "ALARM":
	                func(alarm, context)

	    return wrapper


	@cloudwatch_alarm_handler
	def reset_alarm(alarm, context):
	    """Set two alarms back to OK: the one named in the description, then the trigger.

	    The last whitespace-separated word of ``alarm.descr`` is taken as the name
	    of the alarm to reset.
	    """
	    alarm_to_reset = alarm.descr.split()[-1]
	    reason = f"Reset by alarm {alarm.name}"

	    cw = boto3.client("cloudwatch")
	    for alarm_name in (alarm_to_reset, alarm.name):
	        cw.set_alarm_state(AlarmName=alarm_name, StateValue="OK", StateReason=reason)
	import logging
	import os

	from gorilla.hibernation.autoscaling import AutoScalingGroup, ECSService
	from gorilla.hibernation.cloudwatch import cloudwatch_alarm_handler

	# Configure the root logger and set it to INFO.
	logging.basicConfig()
	log = logging.getLogger()
	log.setLevel(logging.INFO)


	def toggle_service(enabled: bool):
	    """Toggle the scheduler service, then the Auto Scaling group.

	    The cluster and service prefix come from the ``cluster`` and
	    ``service_prefix`` environment variables. When disabling, the call waits
	    for the scheduler to stabilise and also disables the worker service before
	    the Auto Scaling group is toggled.
	    """
	    cluster = os.environ["cluster"]
	    prefix = os.environ["service_prefix"]

	    ECSService(cluster, f"{prefix}-scheduler").toggle(enabled, wait=not enabled)
	    if not enabled:
	        # NOTE: Workers are enabled by the scheduler's autoscaling
	        ECSService(cluster, f"{prefix}-worker").toggle(enabled)

	    toggle_autoscaling_group(enabled)


	def toggle_autoscaling_group(enabled: bool):
	    """Toggle the Auto Scaling group named by the ``autoscaling_group`` environment variable."""
	    asg = os.environ["autoscaling_group"]
	    AutoScalingGroup(asg).toggle(enabled)


	@cloudwatch_alarm_handler
	def hibernate(alarm, context):
	    """Alarm handler: switch the services and the Auto Scaling group off."""
	    toggle_service(False)


	@cloudwatch_alarm_handler
	def wake(alarm, context):
	    """Alarm handler: switch the scheduler service and the Auto Scaling group on."""
	    toggle_service(True)


	def wake_on_deploy(event, context):
	    """Enable the Auto Scaling group; neither ``event`` nor ``context`` is inspected."""
	    toggle_autoscaling_group(True)
	# For an introduction to Serverless Framework, see:
	# https://www.serverless.com/framework/docs/providers/aws/guide/intro/
	# Service name; also the first part of custom.lambda_prefix.
	service: gorilla-hibernation

	plugins:
	- serverless-plugin-conditional-functions
	- serverless-pseudo-parameters
	- serverless-python-requirements

	custom:
	  # Customize these to your needs:
	  # Used as the ECS cluster name and as the prefix of the SNS topic and SQS queue names.
	  prefix: example
	  # <service>-<stage>; ResetJobsWaitingAlarm uses it to build the Lambda function name it watches.
	  lambda_prefix: ${self:service}-${self:provider.stage}
	  # The ECS services are addressed as <service_prefix>-scheduler and <service_prefix>-worker.
	  service_prefix: my-service
	  autoscaling_group: my-auto-scaling-group
	  # Worker CPU threshold used by the NoRecentActivityAlarm expression.
	  cpu_low: 3 # percent
	  # ...
	  pythonRequirements:
	    dockerizePip: non-linux
	    useDownloadCache: true
	    slim: true

	provider:
	  name: aws
	  runtime: python3.7
	  memorySize: 256
	  timeout: 300

	  region: ${env:REGION, 'eu-west-1'}
	  stage: ${opt:stage, 'dev'}

	  # Read by main.py through os.environ.
	  environment:
	    cluster: ${self:custom.prefix}
	    service_prefix: ${self:custom.service_prefix}
	    autoscaling_group: ${self:custom.autoscaling_group}

	  iamRoleStatements:
	  # Describing Auto Scaling groups is allowed on every resource.
	  - Effect: Allow
	    Action:
	    - autoscaling:DescribeAutoScalingGroups
	    Resource: '*'

	  # Updating is limited to the configured Auto Scaling group.
	  - Effect: Allow
	    Action: autoscaling:UpdateAutoScalingGroup
	    Resource:
	    - arn:aws:autoscaling:#{AWS::Region}:#{AWS::AccountId}:autoScalingGroup:*:autoScalingGroupName/${self:custom.autoscaling_group}

	  # ECS access is limited to services in the cluster whose name starts with <service_prefix>-.
	  - Effect: Allow
	    Action:
	    - ecs:DescribeServices
	    - ecs:UpdateService
	    Resource:
	    - arn:aws:ecs:#{AWS::Region}:#{AWS::AccountId}:service/${self:custom.prefix}/${self:custom.service_prefix}-*

	  # reset_alarm sets the state of exactly these two alarms.
	  - Effect: Allow
	    Action: cloudwatch:SetAlarmState
	    Resource:
	    - !GetAtt JobsWaitingAlarm.Arn
	    - !GetAtt ResetJobsWaitingAlarm.Arn

	package:
	  # Exclude everything, then add back only the hibernation package.
	  exclude:
	  - '**'
	  include:
	  - gorilla/hibernation/**

	functions:
	  # Triggered by the <prefix>-hibernate topic.
	  hibernate:
	    handler: gorilla/hibernation/main.hibernate
	    events:
	    - sns: ${self:custom.prefix}-hibernate

	  # Triggered by the <prefix>-wake topic.
	  wake:
	    handler: gorilla/hibernation/main.wake
	    events:
	    - sns: ${self:custom.prefix}-wake

	  # Triggered when the worker service reports a task placement failure.
	  wake-on-deploy:
	    handler: gorilla/hibernation/main.wake_on_deploy
	    events:
	    - cloudwatchEvent:
	        event:
	          source: [aws.ecs]
	          detail-type: [ECS Service Action]
	          detail:
	            eventName: [SERVICE_TASK_PLACEMENT_FAILURE]
	          resources:
	          # The service ARN carries the cluster segment, matching the ARN form used in iamRoleStatements.
	          - arn:aws:ecs:#{AWS::Region}:#{AWS::AccountId}:service/${self:custom.prefix}/${self:custom.service_prefix}-worker

	  # Triggered by the <prefix>-reset-alarm topic.
	  reset-alarm:
	    handler: gorilla/hibernation/cloudwatch.reset_alarm
	    events:
	    - sns: ${self:custom.prefix}-reset-alarm

	resources:
	  Resources:
	    NoRecentActivityAlarm:
	      # Alarms when the `activity` expression is below 1 for idle_period evaluation periods;
	      # the alarm action publishes to the <prefix>-hibernate topic.
	      Type: AWS::CloudWatch::Alarm
	      Properties:
	        AlarmDescription: Cluster did not show any activity recently
	        Metrics:
	        # Maximum CPU utilization of the worker service per 60-second period.
	        - Id: cpu
	          ReturnData: false
	          MetricStat:
	            Metric:
	              Namespace: AWS/ECS
	              MetricName: CPUUtilization
	              Dimensions:
	              - Name: ClusterName
	                Value: ${self:custom.prefix}
	              - Name: ServiceName
	                Value: ${self:custom.service_prefix}-worker
	            Stat: Maximum
	            Period: 60
	        # Messages published to the <prefix>-notifications topic per 60-second period.
	        - Id: jobs_finished
	          ReturnData: false
	          MetricStat:
	            Metric:
	              Namespace: AWS/SNS
	              MetricName: NumberOfMessagesPublished
	              Dimensions:
	              - Name: TopicName
	                Value: ${self:custom.prefix}-notifications
	            Stat: Average
	            Period: 60
	        # 1 when CPU reaches cpu_low or jobs_finished is non-zero, otherwise 0.
	        - Id: activity
	          Expression: IF(cpu >= ${self:custom.cpu_low} OR jobs_finished, 1, 0)
	        ComparisonOperator: LessThanThreshold
	        Threshold: 1
	        EvaluationPeriods: ${self:custom.idle_period}
	        # Missing data counts towards the alarm, i.e. is treated like inactivity.
	        TreatMissingData: breaching
	        AlarmActions:
	        - arn:aws:sns:#{AWS::Region}:#{AWS::AccountId}:${self:custom.prefix}-hibernate

	    JobsWaitingAlarm:
	      # Alarms when the <prefix>-jobs queue averages at least one visible message over a
	      # 60-second period; the alarm action publishes to the <prefix>-wake topic.
	      Type: AWS::CloudWatch::Alarm
	      Properties:
	        AlarmDescription: Jobs are waiting in the queue
	        Namespace: AWS/SQS
	        MetricName: ApproximateNumberOfMessagesVisible
	        Dimensions:
	        - Name: QueueName
	          Value: ${self:custom.prefix}-jobs
	        Statistic: Average
	        ComparisonOperator: GreaterThanOrEqualToThreshold
	        Threshold: 1
	        Period: 60
	        EvaluationPeriods: 1
	        AlarmActions:
	        - arn:aws:sns:#{AWS::Region}:#{AWS::AccountId}:${self:custom.prefix}-wake

	    ResetJobsWaitingAlarm:
	      # Alarms when the wake function was invoked at least once in a 60-second period;
	      # the alarm action publishes to the <prefix>-reset-alarm topic.
	      Type: AWS::CloudWatch::Alarm
	      Properties:
	        # reset_alarm takes the last word of this description as the alarm to reset,
	        # so the JobsWaitingAlarm reference has to stay at the end.
	        AlarmDescription: !Join [' ', ['Reset alarm', !Ref JobsWaitingAlarm]]
	        Namespace: AWS/Lambda
	        MetricName: Invocations
	        Dimensions:
	        - Name: FunctionName
	          Value: ${self:custom.lambda_prefix}-wake
	        Statistic: Sum
	        ComparisonOperator: GreaterThanOrEqualToThreshold
	        Threshold: 1
	        Period: 60
	        EvaluationPeriods: 1
	        TreatMissingData: notBreaching
	        AlarmActions:
	        - arn:aws:sns:#{AWS::Region}:#{AWS::AccountId}:${self:custom.prefix}-reset-alarm