Last active
June 5, 2021 11:09
-
-
Save mdwint/73ac6aa361fbba692c92177f06d7d918 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import abc | |
import logging | |
import time | |
from dataclasses import dataclass | |
import boto3 | |
# Root logger (getLogger() without a name), used for all messages in this module.
log = logging.getLogger()

# boto3 clients are created once at import time and shared by the Target
# subclasses defined further down.
autoscaling = boto3.client("autoscaling")
ecs = boto3.client("ecs")
@dataclass
class State:
    """Snapshot of a scalable target: requested capacity versus what is running."""

    # Number of units (instances/tasks) that have been requested.
    desired: int
    # Number of units currently reported for the target.
    running: int

    @property
    def is_enabled(self) -> bool:
        """True when any capacity at all has been requested."""
        requested = self.desired
        return requested > 0

    @property
    def is_stable(self) -> bool:
        """True when the running count has caught up with the desired count."""
        running, desired = self.running, self.desired
        return running == desired
class Target(abc.ABC):
    """Base class for something that can be switched on/off and polled for state.

    Subclasses provide ``get_state`` and ``set_enabled``; ``toggle`` and
    ``wait_until_stable`` are built on top of those two primitives.
    """

    @abc.abstractmethod
    def get_state(self):
        """Return the current state, an object exposing ``is_enabled`` and ``is_stable``."""

    @abc.abstractmethod
    def set_enabled(self, enabled: bool):
        """Request that the target be switched on (True) or off (False)."""

    def toggle(self, enabled: bool, wait: bool = False):
        """Bring the target into the requested on/off state.

        ``set_enabled`` is only called when the current state differs from the
        requested one. When ``wait`` is true and the target is not yet stable,
        block in ``wait_until_stable`` (using its default timings).
        """
        if self.get_state().is_enabled != enabled:
            log.info(f"{'En' if enabled else 'Dis'}abling {self}")
            self.set_enabled(enabled)

        # State is fetched again on purpose: set_enabled may just have changed it.
        if wait and not self.get_state().is_stable:
            log.info(f"Waiting on {self}")
            self.wait_until_stable()

    def wait_until_stable(
        self,
        poll_interval: int = 5,
        max_attempts: int = 60,
        wait_after_stable: int = 30,
    ):
        """Poll ``get_state`` until it reports stable, then pause and return.

        Args:
            poll_interval: Seconds to sleep between two consecutive polls.
            max_attempts: Maximum number of polls before giving up.
            wait_after_stable: Seconds to sleep once the target is stable.

        Raises:
            ValueError: If the target is not stable after ``max_attempts``
                polls (including when ``max_attempts`` is zero or negative,
                in which case no poll is made and the reported state is None).

        NOTE(review): with the defaults this can block for roughly five and a
        half minutes; confirm the caller's timeout budget allows for that.
        """
        # Pre-bind so the error message below is well-defined even when the
        # loop body never runs (max_attempts <= 0).
        state = None
        for attempt in range(max_attempts):
            # Sleep only *between* polls; there is no point in sleeping after
            # the final failed poll just to raise afterwards.
            if attempt:
                time.sleep(poll_interval)
            state = self.get_state()
            if state.is_stable:
                time.sleep(wait_after_stable)
                return
        raise ValueError(f"{self} did not stabilize: {state}")
@dataclass
class AutoScalingGroup(Target):
    """An EC2 Auto Scaling group that is toggled between 0 and 1 instances."""

    # Name of the Auto Scaling group as passed to the autoscaling API.
    name: str

    def get_state(self) -> State:
        """Describe the group and return its desired capacity and instance count.

        Raises:
            IndexError: If the API returns no group with this name.
        """
        response = autoscaling.describe_auto_scaling_groups(
            AutoScalingGroupNames=[self.name]
        )
        groups = response["AutoScalingGroups"]
        if not groups:
            # Unknown names yield an empty list rather than an API error; fail
            # with a message that says which group was missing.
            raise IndexError(f"Auto Scaling group not found: {self.name}")
        group = groups[0]
        # "running" counts every instance attached to the group.
        return State(desired=group["DesiredCapacity"], running=len(group["Instances"]))

    def set_enabled(self, enabled: bool):
        """Set both DesiredCapacity and MinSize to 1 (enabled) or 0 (disabled)."""
        n = int(enabled)
        autoscaling.update_auto_scaling_group(
            AutoScalingGroupName=self.name, DesiredCapacity=n, MinSize=n
        )
@dataclass
class ECSService(Target):
    """An ECS service that is toggled between 0 and 1 desired tasks."""

    # Cluster identifier passed as ``cluster`` to the ECS API.
    cluster: str
    # Service identifier within that cluster.
    name: str

    def get_state(self) -> State:
        """Describe the service and return its desired and running task counts.

        Raises:
            IndexError: If the API returns no service for this cluster/name.
        """
        response = ecs.describe_services(cluster=self.cluster, services=[self.name])
        services = response["services"]
        if not services:
            # Missing services are reported via "failures" with an empty
            # "services" list; surface that instead of a bare index error.
            failures = response.get("failures", [])
            raise IndexError(
                f"ECS service not found: {self.cluster}/{self.name} (failures: {failures})"
            )
        service = services[0]
        return State(desired=service["desiredCount"], running=service["runningCount"])

    def set_enabled(self, enabled: bool):
        """Set the service's desiredCount to 1 (enabled) or 0 (disabled)."""
        n = int(enabled)
        ecs.update_service(cluster=self.cluster, service=self.name, desiredCount=n)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import logging | |
from dataclasses import dataclass | |
from functools import wraps | |
import boto3 | |
# basicConfig() does nothing when the root logger already has handlers, so the
# level is set explicitly on the root logger rather than via basicConfig(level=...).
logging.basicConfig()
log = logging.getLogger()
log.setLevel(logging.INFO)
@dataclass
class CloudWatchAlarm:
    """The fields of a CloudWatch alarm notification that the handlers use."""

    # Alarm name (taken from "AlarmName" in the notification).
    name: str
    # Alarm description (taken from "AlarmDescription").
    descr: str
    # New alarm state (taken from "NewStateValue"), e.g. "ALARM".
    state: str
    # Explanation for the state change (taken from "NewStateReason").
    reason: str
def cloudwatch_alarm_handler(func):
    """Decorate ``func(alarm, context)`` so it can be used as an SNS-triggered handler.

    The returned handler takes ``(event, context)``, decodes the JSON alarm
    message of every record in ``event["Records"]``, logs it, and calls
    ``func`` only for alarms whose new state is "ALARM". It returns None.
    """

    @wraps(func)
    def wrapper(event: dict, context):
        for sns_record in event["Records"]:
            payload = json.loads(sns_record["Sns"]["Message"])
            alarm = CloudWatchAlarm(
                name=payload["AlarmName"],
                descr=payload["AlarmDescription"],
                state=payload["NewStateValue"],
                reason=payload["NewStateReason"],
            )
            log.info(f"[{alarm.state}] {alarm.descr} (reason: {alarm.reason})")

            # Every notification is logged, but only ALARM transitions are acted on.
            if alarm.state != "ALARM":
                continue
            func(alarm, context)

    return wrapper
@cloudwatch_alarm_handler
def reset_alarm(alarm, context):
    """Force two alarms back to OK: the one named in the description, and this one.

    The alarm to reset is the last whitespace-separated word of the triggering
    alarm's description.
    """
    target_name = alarm.descr.split()[-1]
    state_reason = f"Reset by alarm {alarm.name}"

    cloudwatch = boto3.client("cloudwatch")
    # The triggering alarm is reset as well, after the target.
    for name in (target_name, alarm.name):
        cloudwatch.set_alarm_state(
            AlarmName=name, StateValue="OK", StateReason=state_reason
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging | |
import os | |
from gorilla.hibernation.autoscaling import AutoScalingGroup, ECSService | |
from gorilla.hibernation.cloudwatch import cloudwatch_alarm_handler | |
# basicConfig() does nothing when the root logger already has handlers, so the
# level is set explicitly on the root logger rather than via basicConfig(level=...).
logging.basicConfig()
log = logging.getLogger()
log.setLevel(logging.INFO)
def toggle_service(enabled: bool):
    """Switch the scheduler service (and, when disabling, the workers) plus the ASG.

    Reads the ``cluster`` and ``service_prefix`` environment variables. The
    scheduler is toggled first and waited on only when disabling.
    """
    env = os.environ
    cluster = env["cluster"]
    prefix = env["service_prefix"]

    scheduler = ECSService(cluster, f"{prefix}-scheduler")
    scheduler.toggle(enabled, wait=not enabled)

    if not enabled:
        # NOTE: Workers are enabled by the scheduler's autoscaling
        worker = ECSService(cluster, f"{prefix}-worker")
        worker.toggle(enabled)

    # NOTE(review): nesting was reconstructed from a paste that lost its
    # indentation; this call is assumed to run for both enable and disable.
    toggle_autoscaling_group(enabled)
def toggle_autoscaling_group(enabled: bool):
    """Toggle the Auto Scaling group named by the ``autoscaling_group`` env variable."""
    group_name = os.environ["autoscaling_group"]
    group = AutoScalingGroup(group_name)
    group.toggle(enabled)
@cloudwatch_alarm_handler
def hibernate(alarm, context):
    """Alarm handler: switch the service off. ``alarm`` and ``context`` are unused."""
    toggle_service(enabled=False)
@cloudwatch_alarm_handler
def wake(alarm, context):
    """Alarm handler: switch the service on. ``alarm`` and ``context`` are unused."""
    toggle_service(enabled=True)
def wake_on_deploy(event, context):
    """Event handler: enable only the Auto Scaling group. Both arguments are unused."""
    toggle_autoscaling_group(enabled=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# For an introduction to Serverless Framework, see:
# https://www.serverless.com/framework/docs/providers/aws/guide/intro/

service: gorilla-hibernation

plugins:
  - serverless-plugin-conditional-functions
  - serverless-pseudo-parameters
  - serverless-python-requirements
custom:
  # Customize these to your needs:
  prefix: example
  lambda_prefix: ${self:service}-${self:provider.stage}
  service_prefix: my-service
  autoscaling_group: my-auto-scaling-group
  cpu_low: 3  # percent
  # ...
  # NOTE: idle_period (used as EvaluationPeriods of NoRecentActivityAlarm)
  # must also be defined here; it is not among the keys shown.

  pythonRequirements:
    dockerizePip: non-linux
    useDownloadCache: true
    slim: true
provider:
  name: aws
  runtime: python3.7
  memorySize: 256
  # NOTE(review): the hibernate handler waits for the scheduler to stabilize
  # (by default up to 60 polls 5 s apart, plus 30 s); confirm 300 s suffices.
  timeout: 300
  region: ${env:REGION, 'eu-west-1'}
  stage: ${opt:stage, 'dev'}
  environment:
    cluster: ${self:custom.prefix}
    service_prefix: ${self:custom.service_prefix}
    autoscaling_group: ${self:custom.autoscaling_group}
  iamRoleStatements:
    - Effect: Allow
      Action:
        - autoscaling:DescribeAutoScalingGroups
      Resource: '*'
    - Effect: Allow
      Action: autoscaling:UpdateAutoScalingGroup
      Resource:
        - arn:aws:autoscaling:#{AWS::Region}:#{AWS::AccountId}:autoScalingGroup:*:autoScalingGroupName/${self:custom.autoscaling_group}
    - Effect: Allow
      Action:
        - ecs:DescribeServices
        - ecs:UpdateService
      Resource:
        - arn:aws:ecs:#{AWS::Region}:#{AWS::AccountId}:service/${self:custom.prefix}/${self:custom.service_prefix}-*
    - Effect: Allow
      Action: cloudwatch:SetAlarmState
      Resource:
        - !GetAtt JobsWaitingAlarm.Arn
        - !GetAtt ResetJobsWaitingAlarm.Arn
package:
  exclude:
    - '**'
  include:
    - gorilla/hibernation/**
functions:
  hibernate:
    handler: gorilla/hibernation/main.hibernate
    events:
      - sns: ${self:custom.prefix}-hibernate

  wake:
    handler: gorilla/hibernation/main.wake
    events:
      - sns: ${self:custom.prefix}-wake

  wake-on-deploy:
    handler: gorilla/hibernation/main.wake_on_deploy
    events:
      - cloudwatchEvent:
          event:
            source: [aws.ecs]
            detail-type: [ECS Service Action]
            detail:
              eventName: [SERVICE_TASK_PLACEMENT_FAILURE]
            resources:
              # NOTE(review): unlike the service ARN in iamRoleStatements, this
              # one has no cluster segment (service/<cluster>/<name>); confirm
              # it matches the ARN format of the emitted events.
              - arn:aws:ecs:#{AWS::Region}:#{AWS::AccountId}:service/${self:custom.service_prefix}-worker

  reset-alarm:
    handler: gorilla/hibernation/cloudwatch.reset_alarm
    events:
      - sns: ${self:custom.prefix}-reset-alarm
resources:
  Resources:
    NoRecentActivityAlarm:
      Type: AWS::CloudWatch::Alarm
      Properties:
        AlarmDescription: Cluster did not show any activity recently
        Metrics:
          - Id: cpu
            ReturnData: false
            MetricStat:
              Metric:
                Namespace: AWS/ECS
                MetricName: CPUUtilization
                Dimensions:
                  - Name: ClusterName
                    Value: ${self:custom.prefix}
                  - Name: ServiceName
                    Value: ${self:custom.service_prefix}-worker
              Stat: Maximum
              Period: 60
          - Id: jobs_finished
            ReturnData: false
            MetricStat:
              Metric:
                Namespace: AWS/SNS
                MetricName: NumberOfMessagesPublished
                Dimensions:
                  - Name: TopicName
                    Value: ${self:custom.prefix}-notifications
              Stat: Average
              Period: 60
          - Id: activity
            Expression: IF(cpu >= ${self:custom.cpu_low} OR jobs_finished, 1, 0)
        ComparisonOperator: LessThanThreshold
        Threshold: 1
        EvaluationPeriods: ${self:custom.idle_period}
        TreatMissingData: breaching
        AlarmActions:
          - arn:aws:sns:#{AWS::Region}:#{AWS::AccountId}:${self:custom.prefix}-hibernate
    JobsWaitingAlarm:
      Type: AWS::CloudWatch::Alarm
      Properties:
        AlarmDescription: Jobs are waiting in the queue
        Namespace: AWS/SQS
        MetricName: ApproximateNumberOfMessagesVisible
        Dimensions:
          - Name: QueueName
            Value: ${self:custom.prefix}-jobs
        Statistic: Average
        ComparisonOperator: GreaterThanOrEqualToThreshold
        Threshold: 1
        Period: 60
        EvaluationPeriods: 1
        AlarmActions:
          - arn:aws:sns:#{AWS::Region}:#{AWS::AccountId}:${self:custom.prefix}-wake
    ResetJobsWaitingAlarm:
      Type: AWS::CloudWatch::Alarm
      Properties:
        # The reset-alarm handler takes the last word of this description as
        # the name of the alarm to reset, so the !Ref must stay at the end.
        AlarmDescription: !Join [' ', ['Reset alarm', !Ref JobsWaitingAlarm]]
        Namespace: AWS/Lambda
        MetricName: Invocations
        Dimensions:
          - Name: FunctionName
            Value: ${self:custom.lambda_prefix}-wake
        Statistic: Sum
        ComparisonOperator: GreaterThanOrEqualToThreshold
        Threshold: 1
        Period: 60
        EvaluationPeriods: 1
        TreatMissingData: notBreaching
        AlarmActions:
          - arn:aws:sns:#{AWS::Region}:#{AWS::AccountId}:${self:custom.prefix}-reset-alarm
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment