Skip to content

Instantly share code, notes, and snippets.

@f9n
Last active August 15, 2022 08:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save f9n/a412f7c9e7fdd852d432bc5626b9388a to your computer and use it in GitHub Desktop.
Save f9n/a412f7c9e7fdd852d432bc5626b9388a to your computer and use it in GitHub Desktop.
Generic alarms for AWS CloudWatch
# In 2019
import os
import json
import sys
import time
import copy
# Extend the import search path with the "env" directory that sits next to
# this file, so the third-party imports that follow (yaml, boto3) can also be
# resolved from there. Presumably this is where dependencies are vendored for
# deployment -- confirm against the packaging script.
file_path = os.path.dirname(__file__)
module_path = os.path.join(file_path, "env")
sys.path.append(module_path)
import yaml
import boto3
from botocore.exceptions import ClientError, BotoCoreError
def get_configs():
    """Load and return the parsed contents of ``./config.yml``.

    The path is relative to the current working directory, not to this
    module's location.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file. The rest of this
        module indexes the result like a dict with ``globals`` and ``ec2``
        keys.

    Raises:
        OSError: If the file cannot be opened. Errors raised by the YAML
            parser also propagate unchanged.
    """
    print("[+] Get config")
    # Pin the encoding: relying on the locale default makes a config with
    # non-ASCII characters fail to decode on hosts running a POSIX/ASCII locale.
    with open("./config.yml", encoding="utf-8") as f:
        return yaml.safe_load(f)
# Configuration snapshot taken once at import time. It supplies the default
# ``sleep_duration`` for Alarms, the ``ec2.default`` alarm parameters and the
# ``globals.debug`` flag read by handle().
CONFIGS = get_configs()
class Ec2Instance:
    """Small wrapper around a boto3 EC2 ``Instance`` resource for tag lookups."""

    def __init__(self, region, instance_id):
        """Bind to one EC2 instance.

        Args:
            region: Region name handed to ``boto3.resource("ec2", ...)``.
            instance_id: Identifier of the instance to wrap.
        """
        self.ec2 = boto3.resource("ec2", region_name=region)
        self.instance = self.ec2.Instance(instance_id)

    def _tags(self):
        """Return the instance's tag list, or an empty list when it has none."""
        # ``tags`` can be None (e.g. an instance without any tags); iterating
        # None would raise TypeError in every tag helper below.
        return self.instance.tags or []

    def get_name(self):
        """Return a display name that always contains the instance id.

        The ``Name`` tag is used when present; otherwise the instance id
        itself. If the chosen name does not already contain the id, the id is
        appended after an underscore.

        Returns:
            The resulting name as a string.
        """
        print("[+] Ec2Instance.get_name()")
        instance_id = self.instance.id
        instance_name = self.get_tag_value(key_name="Name")
        print(f"[+] InstanceName: {instance_name}, InstanceId: {instance_id}")
        if instance_name is None:
            instance_name = instance_id
        if instance_id not in instance_name:
            instance_name = f"{instance_name}_{instance_id}"
        print(f"[+] InstanceName: {instance_name}")
        return instance_name

    def get_tag_value(self, key_name="Name"):
        """Return the value of the tag named ``key_name``, or None if absent.

        Args:
            key_name: Tag key to look up; defaults to ``"Name"``.
        """
        for tag in self._tags():
            if tag["Key"] == key_name:
                return tag["Value"]
        return None

    def convert_tags_to_dictionary(self):
        """Return the instance tags as a ``{key: value}`` dict (empty if untagged)."""
        tags_object = {tag["Key"]: tag["Value"] for tag in self._tags()}
        print(f"[+] Instace Tags: {tags_object}")
        return tags_object
class Alarms:
    """Creates and deletes the CloudWatch alarms configured for one EC2 instance.

    The alarm definitions come from the ``ec2`` section of the configuration:
    the entry whose ``select_by_tags`` shares the most tag pairs with the
    instance wins, and each of its metrics is layered over a base definition.
    """

    def __init__(
        self,
        region,
        account_id,
        instance_id,
        sleep_duration=CONFIGS["globals"]["sleep_duration"],
    ):
        """Record the target instance; AWS handles are built on first use.

        Args:
            region: Region name used for every boto3 client/resource.
            account_id: Account identifier; only used by ``__str__``.
            instance_id: Identifier of the EC2 instance to manage alarms for.
            sleep_duration: Seconds passed to ``time.sleep`` between creating
                the alarms and enabling their actions.
        """
        self.region = region
        self.account_id = account_id
        self.instance_id = instance_id
        self.sleep_duration = sleep_duration
        # Filled in lazily by the properties below.
        self.__ec2instance = None
        self.__cw_client = None
        self.__cw_resource = None

    @property
    def ec2instance(self):
        """``Ec2Instance`` wrapper for the target instance, built on first access."""
        if self.__ec2instance is not None:
            return self.__ec2instance
        self.__ec2instance = Ec2Instance(
            region=self.region, instance_id=self.instance_id
        )
        return self.__ec2instance

    @property
    def cw_client(self):
        """boto3 CloudWatch client for ``self.region``, built on first access."""
        if self.__cw_client is not None:
            return self.__cw_client
        self.__cw_client = boto3.client("cloudwatch", region_name=self.region)
        return self.__cw_client

    @property
    def cw_resource(self):
        """boto3 CloudWatch resource for ``self.region``, built on first access."""
        if self.__cw_resource is not None:
            return self.__cw_resource
        self.__cw_resource = boto3.resource("cloudwatch", region_name=self.region)
        return self.__cw_resource

    def __create_alarm(self, metric):
        """Send one alarm definition (a kwargs dict) to ``put_metric_alarm``."""
        print(f"[+] Create {metric['AlarmName']} alarm")
        self.cw_client.put_metric_alarm(**metric)

    def __find_metrics_by_instance(self):
        """Pick the configured metric list that best matches the instance tags.

        Every entry under ``ec2.alarms`` is scored by how many of its
        ``select_by_tags`` pairs equal a tag pair on the instance. The entry
        with the strictly highest non-zero score wins; ties keep the earlier
        entry.

        Returns:
            ``(metrics, matched)``: a deep copy of the winning entry's
            ``metrics`` list (empty if nothing matched) and a bool telling
            whether any entry matched.
        """
        instance_tags = self.ec2instance.convert_tags_to_dictionary()
        best_metrics = []
        best_score = 0
        found = False
        # The configuration is re-read from disk on every call rather than
        # taken from the module-level CONFIGS snapshot.
        for entry in get_configs()["ec2"]["alarms"]:
            print(f"[+] Config: {entry}")
            selectors = entry["select_by_tags"]
            candidate_metrics = entry["metrics"]
            score = sum(
                1
                for inst_key, inst_value in instance_tags.items()
                for sel_key, sel_value in selectors.items()
                if inst_key == sel_key and inst_value == sel_value
            )
            if score > best_score:
                best_score = score
                # Copy so later renaming of AlarmName never touches the config.
                best_metrics = copy.deepcopy(candidate_metrics)
                found = True
        print(f"[+] Temp Metrics: {best_metrics}")
        print(f"[+] Matched: {found}")
        return best_metrics, found

    def __get_metrics_by_instance(self):
        """Build the full ``put_metric_alarm`` kwargs for every matched metric.

        A base definition is overlaid with ``CONFIGS["ec2"]["default"]`` and
        then with each matched metric, whose ``AlarmName`` is rewritten to
        ``[<instance name>][<configured name>]``.

        Returns:
            ``(metrics, matched)``: the list of alarm definitions (empty when
            nothing matched) and the match flag.
        """
        print("[+] Alarms.__get_metrics_by_instsance")
        instance_name = self.ec2instance.get_name()
        base_metric = {
            "Namespace": "AWS/EC2",
            "AlarmName": "Shallow",
            "AlarmDescription": "",
            "MetricName": "CPUUtilization",
            "ComparisonOperator": "GreaterThanOrEqualToThreshold",
            "Statistic": "Average",
            "Threshold": 50,
            "EvaluationPeriods": 1,
            "Period": 5 * 60,
            "ActionsEnabled": False,
            "OKActions": [],
            "AlarmActions": [],
            "Dimensions": [
                {"Name": "InstanceId", "Value": self.ec2instance.instance.id}
            ],
        }
        base_metric.update(CONFIGS["ec2"]["default"])
        overrides, matched = self.__find_metrics_by_instance()
        metrics = []
        # ``overrides`` is empty whenever nothing matched, so no guard is needed.
        for override in overrides:
            short_name = override["AlarmName"]
            print(f"Old -> TempMetricAlarmName: {short_name}")
            override["AlarmName"] = f"[{instance_name}][{short_name}]"
            print(f"New -> TempMetricAlarmName: {override['AlarmName']}")
            alarm_spec = copy.deepcopy(base_metric)
            alarm_spec.update(override)
            metrics.append(alarm_spec)
        print(f"[+] Metrics: {metrics}")
        return (metrics, matched)

    def create(self):
        """Create the instance's alarms, wait, then enable their actions.

        After all alarms are created the method sleeps ``sleep_duration``
        seconds. It then enables actions on each alarm and resets any alarm
        whose state reads ``ALARM`` to ``INSUFFICIENT_DATA``.

        Raises:
            SystemExit: When no configuration entry matches the instance.
        """
        print("[+] Create all alarms")
        metrics, matched = self.__get_metrics_by_instance()
        if not matched:
            # NOTE(review): the message talks about falling back to default
            # configurations, but the code exits here without creating anything.
            instance_name = self.ec2instance.get_name()
            sys.exit(
                f"We can't find the metrics for this '{instance_name}' instance, so we will use default configurations."
            )

        for metric in metrics:
            self.__create_alarm(metric)

        time.sleep(self.sleep_duration)

        # Enable all notifications and move alarms out of the ALARM state.
        print("[+] Enable all notifications")
        for metric in metrics:
            alarm = self.cw_resource.Alarm(metric["AlarmName"])
            print(f"[+] Alarm: {alarm}, State: {alarm.state_value}")
            alarm.enable_actions()
            time.sleep(1)
            if alarm.state_value != "ALARM":
                continue
            alarm.set_state(
                StateValue="INSUFFICIENT_DATA",
                StateReason="Set state to INSUFFICIENT_DATA",
            )

    def delete(self):
        """Delete every alarm that ``create`` would have made for the instance.

        Raises:
            SystemExit: When no configuration entry matches the instance.
        """
        print("[+] Delete all alarms")
        metrics, matched = self.__get_metrics_by_instance()
        if not matched:
            instance_name = self.ec2instance.get_name()
            sys.exit(f"We can't find the metrics for this '{instance_name}' instance.")

        names_to_delete = [spec["AlarmName"] for spec in metrics]
        print(f"[+] Delete All Alarm Names: {names_to_delete}")
        self.cw_client.delete_alarms(AlarmNames=names_to_delete)

    def __str__(self):
        """Return a one-line summary with region, account id and instance id."""
        return (
            f"Alarms<Region: {self.region}, "
            f"AccountId: {self.account_id}, "
            f"InstanceId: {self.instance_id}>"
        )
def handle(event, context):
    """Create or delete an instance's alarms in response to a state event.

    Presumably the AWS Lambda entry point fed by EC2 instance state-change
    notifications -- confirm against the deployment configuration.

    Args:
        event: Mapping providing ``region``, ``account`` and a ``detail``
            mapping with ``instance-id`` and ``state``.
        context: Unused.

    Returns:
        A dict with ``statusCode`` 200 and a JSON-encoded ``body``.
    """
    alarms = Alarms(
        region=event["region"],
        account_id=event["account"],
        instance_id=event["detail"]["instance-id"],
    )
    state = event["detail"]["state"]

    if CONFIGS["globals"]["debug"]:
        for label, value in (("Event", event), ("State", state), ("Alarms", alarms)):
            print(f"[+] {label}: {value}")

    # Only these two states trigger work; anything else is just reported.
    if state not in ("running", "terminated"):
        print("[+] Undefined Ec2 Event State")
    elif state == "running":
        alarms.create()
    else:
        alarms.delete()

    return {"statusCode": 200, "body": json.dumps("Generic Alarms!")}
# Settings read by the Python handler itself (not alarm definitions).
globals:
# Seconds the handler sleeps after creating the alarms and before it enables
# their actions; used as the default ``sleep_duration`` of the Alarms class.
sleep_duration: 840
# When true, the handler prints the incoming event, the instance state and
# the Alarms object before acting.
debug: True
# NOTE(review): this key does not appear to be read by the Python code in this gist.
vars:
ec2:
default:
OKActions: ["arn:aws:sns:us-east-1:...:DatabaseAlerts"]
AlarmActions: ["arn:aws:sns:us-east-1:...:DatabaseAlerts"]
InsufficientDataActions: []
EvaluationPeriods: 1
Period: 300
alarms:
- select_by_tags:
Name: logstash_test_1
metrics:
- AlarmName: HighCPUUtilization
MetricName: CPUUtilization
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 70
- AlarmName: LowCPUUtilization
MetricName: CPUUtilization
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 40
- select_by_tags:
aws:autoscaling:groupName: app_example_prod_1
metrics:
- AlarmName: HighStatusCheckFailed
MetricName: StatusCheckFailed
Statistic: Maximum
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 1
- AlarmName: HighCPUUtilization
MetricName: CPUUtilization
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 30
- AlarmName: LowCPUUtilization
MetricName: CPUUtilization
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 20
- AlarmName: HighNetworkIn
MetricName: NetworkIn
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 20000000
- AlarmName: LowNetworkIn
MetricName: NetworkIn
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 2500000
- AlarmName: HighNetworkOut
MetricName: NetworkOut
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 14000000
- AlarmName: LowNetworkOut
MetricName: NetworkOut
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 1000000
- select_by_tags:
aws:autoscaling:groupName: app_example_prod_2
metrics:
- AlarmName: HighStatusCheckFailed
MetricName: StatusCheckFailed
Statistic: Maximum
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 1
- AlarmName: HighCPUUtilization
MetricName: CPUUtilization
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 85
- AlarmName: LowCPUUtilization
MetricName: CPUUtilization
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 20
- AlarmName: HighNetworkIn
MetricName: NetworkIn
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 110000000
- AlarmName: LowNetworkIn
MetricName: NetworkIn
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 25000000
- AlarmName: HighNetworkOut
MetricName: NetworkOut
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 100000000
- AlarmName: LowNetworkOut
MetricName: NetworkOut
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 20000000
- select_by_tags:
aws:autoscaling:groupName: app_example_prod_3
metrics:
- AlarmName: HighStatusCheckFailed
MetricName: StatusCheckFailed
Statistic: Maximum
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 1
- AlarmName: HighCPUUtilization
MetricName: CPUUtilization
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 85
- AlarmName: LowCPUUtilization
MetricName: CPUUtilization
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 30
- AlarmName: HighNetworkIn
MetricName: NetworkIn
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 55000000
- AlarmName: LowNetworkIn
MetricName: NetworkIn
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 10000000
- AlarmName: HighNetworkOut
MetricName: NetworkOut
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 35000000
- AlarmName: LowNetworkOut
MetricName: NetworkOut
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 10000000
- select_by_tags:
aws:autoscaling:groupName: app_example_prod_4
metrics:
- AlarmName: HighStatusCheckFailed
MetricName: StatusCheckFailed
Statistic: Maximum
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 1
- AlarmName: HighCPUUtilization
MetricName: CPUUtilization
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 70
- AlarmName: LowCPUUtilization
MetricName: CPUUtilization
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 30
- AlarmName: HighNetworkIn
MetricName: NetworkIn
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 100000000
- AlarmName: LowNetworkIn
MetricName: NetworkIn
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 25000000
- AlarmName: HighNetworkOut
MetricName: NetworkOut
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 40000000
- AlarmName: LowNetworkOut
MetricName: NetworkOut
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 10000000
- select_by_tags:
aws:autoscaling:groupName: nodes.k8s-prod-3
metrics:
- AlarmName: HighStatusCheckFailed
MetricName: StatusCheckFailed
Statistic: Maximum
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 1
- AlarmName: HighCPUUtilization
MetricName: CPUUtilization
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 65
- AlarmName: LowCPUUtilization
MetricName: CPUUtilization
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 10
- AlarmName: HighNetworkIn
MetricName: NetworkIn
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 350000000
- AlarmName: LowNetworkIn
MetricName: NetworkIn
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 150000000
- AlarmName: HighNetworkOut
MetricName: NetworkOut
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 300000000
- AlarmName: LowNetworkOut
MetricName: NetworkOut
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 120000000
- select_by_tags:
aws:autoscaling:groupName: nodes.k8s-beta-5
metrics:
- AlarmName: HighStatusCheckFailed
MetricName: StatusCheckFailed
Statistic: Maximum
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 1
- AlarmName: HighCPUUtilization
MetricName: CPUUtilization
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 70
- AlarmName: LowCPUUtilization
MetricName: CPUUtilization
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 5
- AlarmName: HighNetworkIn
MetricName: NetworkIn
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 5000000
- AlarmName: LowNetworkIn
MetricName: NetworkIn
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 2000000
- AlarmName: HighNetworkOut
MetricName: NetworkOut
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 5000000
- AlarmName: LowNetworkOut
MetricName: NetworkOut
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 1000000
- select_by_tags:
aws:autoscaling:groupName: app_example_prod_5
metrics:
- AlarmName: HighStatusCheckFailed
MetricName: StatusCheckFailed
Statistic: Maximum
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 1
- AlarmName: HighCPUUtilization
MetricName: CPUUtilization
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 55
- AlarmName: LowCPUUtilization
MetricName: CPUUtilization
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 10
- AlarmName: HighNetworkIn
MetricName: NetworkIn
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 45000000
- AlarmName: LowNetworkIn
MetricName: NetworkIn
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 15000000
- AlarmName: HighNetworkOut
MetricName: NetworkOut
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 25000000
- AlarmName: LowNetworkOut
MetricName: NetworkOut
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 10000000
- select_by_tags:
aws:autoscaling:groupName: master-us-east-1b.masters.k8s-prod-3
metrics:
- AlarmName: HighStatusCheckFailed
MetricName: StatusCheckFailed
Statistic: Maximum
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 1
- AlarmName: HighCPUUtilization
MetricName: CPUUtilization
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 3.5
- AlarmName: LowCPUUtilization
MetricName: CPUUtilization
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 2.5
- AlarmName: HighNetworkIn
MetricName: NetworkIn
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 500000
- AlarmName: LowNetworkIn
MetricName: NetworkIn
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 400000
- AlarmName: HighNetworkOut
MetricName: NetworkOut
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 10000000
- AlarmName: LowNetworkOut
MetricName: NetworkOut
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 9000000
- select_by_tags:
aws:autoscaling:groupName: master-us-east-1b.masters.k8s-beta-5
metrics:
- AlarmName: HighStatusCheckFailed
MetricName: StatusCheckFailed
Statistic: Maximum
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 1
- AlarmName: HighCPUUtilization
MetricName: CPUUtilization
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 15
- AlarmName: LowCPUUtilization
MetricName: CPUUtilization
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 10
- AlarmName: HighNetworkIn
MetricName: NetworkIn
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 400000
- AlarmName: LowNetworkIn
MetricName: NetworkIn
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 300000
- AlarmName: HighNetworkOut
MetricName: NetworkOut
Statistic: Average
ComparisonOperator: GreaterThanOrEqualToThreshold
Threshold: 2000000
- AlarmName: LowNetworkOut
MetricName: NetworkOut
Statistic: Average
ComparisonOperator: LessThanOrEqualToThreshold
Threshold: 1700000
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment