Simon KP eskp

## grafana-agent.yaml
# Grafana Agent config: info-level logging plus a metrics pipeline that
# scrapes once a minute and forwards samples via remote_write.
server:
  log_level: info

metrics:
  global:
    scrape_interval: 1m
    remote_write:
      # ${URL} / ${USERNAME} are placeholders filled in from the environment
      # (presumably via the agent's -config.expand-env flag — confirm).
      # Quoted so an empty, numeric or special-character expansion still
      # parses as a string.
      - url: "${URL}"
        basic_auth:
          username: "${USERNAME}"

## lambda-logs
...
/aws/lambda/drain-ecs-lambda 2019/03/14/[$LATEST]fbb61cba26b245de85f0d2c7a59b118f [INFO]	2019-03-14T23:50:23.638Z	ad7258ee-7bc7-40f2-8721-105d99001367	Starting execution
/aws/lambda/drain-ecs-lambda 2019/03/14/[$LATEST]fbb61cba26b245de85f0d2c7a59b118f [INFO]	2019-03-14T23:50:23.638Z	ad7258ee-7bc7-40f2-8721-105d99001367	Found container instance id arn:aws:ecs:ap-southeast-2:677050795231:container-instance/d48a81e1-3d92-4803-b2d5-5c45a1cc4bd6 in cluster arn:aws:ecs:ap-southeast-2:677050795231:cluster/demo-cluster
/aws/lambda/drain-ecs-lambda 2019/03/14/[$LATEST]fbb61cba26b245de85f0d2c7a59b118f [INFO]	2019-03-14T23:50:23.638Z	ad7258ee-7bc7-40f2-8721-105d99001367	Getting tasks running on arn:aws:ecs:ap-southeast-2:677050795231:container-instance/d48a81e1-3d92-4803-b2d5-5c45a1cc4bd6...
/aws/lambda/drain-ecs-lambda 2019/03/14/[$LATEST]fbb61cba26b245de85f0d2c7a59b118f [INFO]	2019-03-14T23:50:23.639Z	ad7258ee-7bc7-40f2-8721-105d99001367	Resetting dropped connection: ecs.ap-southeast-2.amazonaws.com
/aws/lambda/dr

## update_lambda
# Pulls in the ecs-cluster-update-lambda module from the xero-oss GitHub repo
# (the "//src" suffix selects the src/ subdirectory) and passes it the region.
module "ecs_update_lambdas" {
  source = "git::https://github.com/xero-oss/ecs-cluster-update-lambda.git//src"
  region = "${var.region}"
}
# Send notifications to the SNS topic created by ecs_update_lambdas module on all important Auto Scaling events
resource "aws_autoscaling_notification" "asg-terminate" {
  group_names = [
    "${aws_autoscaling_group.ecs-autoscaling-group.name}"
  ]

## ecs-cluster
# ECS cluster whose name is supplied by the ecs_cluster input variable.
resource "aws_ecs_cluster" "demo-ecs-cluster" {
    name = "${var.ecs_cluster}"
}
# TODO convert to aws_launch_template https://www.terraform.io/docs/providers/aws/r/launch_template.html
resource "aws_launch_configuration" "ecs-launch-configuration" {
    # Notice create_before_destroy lifecycle setting and name_prefix. As we can’t create a new resource with
    # the same name as the old one, we don’t hard-code the name and only specify the prefix. Terraform adds
    # a random postfix to it, so the new configuration doesn’t clash with the old one before it is destroyed.
    name_prefix                 = "demo-cluster-lc"
    # image_id                    = "${data.aws_ami.latest_ecs.id}"

## alertmanager.yml
    # Slack receiver settings: post to #devops and also notify on resolution.
    slack_configs:
      # The list item is indented so that api_url lines up with its sibling
      # keys; with the dash at the parent's column the keys below sat deeper
      # than api_url and the mapping did not parse.
      - api_url: "<slack_auth url here>"
        channel: '#devops'
        send_resolved: true
        # Go-template fields are filled in by Alertmanager per notification.
        title: "{{ .GroupLabels.alertname }} alert is {{ .Status|toUpper }} in {{ .CommonLabels.env }}"
        # Literal block scalar: line breaks are kept in the message body.
        text: |
            Affected instances {{ range .Alerts }}{{ .Labels.instance }}/{{ .Labels.job }} {{ .Labels.target }} {{ end }}
            {{ .CommonAnnotations.description }}
            See http://docs.airtame.cloud/alerts/{{ .GroupLabels.alertname }}

## curl.sh
# POST the JSON payload held in $alerts to Alertmanager's v1 alerts endpoint
# on port 9093. Replace the <alertmanager address here> placeholder with a
# real host before running; the angle brackets are not valid as written.
curl -XPOST -d"$alerts" http://<alertmanager address here>:9093/api/v1/alerts

## send_alert.sh
alerts='[
    {
        "status": "resolved",
        "labels": {
            "alertname": "InstanceHighCpu",
            "instance": "dev-foo",
            "env": "dev",
            "job": "ec2_instances"
        },
        "annotations": {

## prom.alerts
    # Fires when an instance's CPU usage — 100 minus the per-instance average
    # of irate(node_cpu{mode="idle"}[5m]) scaled to a percentage — stays
    # above 90 for 20 minutes.
    ALERT InstanceHighCpu
    IF 100 - (avg by (instance) (irate(node_cpu{mode="idle"}[5m])) * 100) > 90
    FOR 20m
    ANNOTATIONS {
        summary = "High CPU Usage on {{ $labels.instance }}",
        description = "CPU usage exceeds threshold (currently {{ $value|humanize }}% in use)",
    }

## main_prom.yml
    relabel_configs:
      # Keep only targets whose EC2 Name tag matches the regex below.
      - source_labels: [__meta_ec2_tag_Name]
        regex: prod-instance.*
        action: keep

## main_prom.yml
    relabel_configs:
      # Use the EC2 Name tag as the value of the instance label.
      - source_labels: [__meta_ec2_tag_Name]
        target_label: instance
	server:
	log_level: info

	metrics:
	global:
	scrape_interval: 1m
	remote_write:
	- url: ${URL}
	basic_auth:
	username: ${USERNAME}
	...
	/aws/lambda/drain-ecs-lambda 2019/03/14/[$LATEST]fbb61cba26b245de85f0d2c7a59b118f [INFO] 2019-03-14T23:50:23.638Z ad7258ee-7bc7-40f2-8721-105d99001367 Starting execution
	/aws/lambda/drain-ecs-lambda 2019/03/14/[$LATEST]fbb61cba26b245de85f0d2c7a59b118f [INFO] 2019-03-14T23:50:23.638Z ad7258ee-7bc7-40f2-8721-105d99001367 Found container instance id arn:aws:ecs:ap-southeast-2:677050795231:container-instance/d48a81e1-3d92-4803-b2d5-5c45a1cc4bd6 in cluster arn:aws:ecs:ap-southeast-2:677050795231:cluster/demo-cluster
	/aws/lambda/drain-ecs-lambda 2019/03/14/[$LATEST]fbb61cba26b245de85f0d2c7a59b118f [INFO] 2019-03-14T23:50:23.638Z ad7258ee-7bc7-40f2-8721-105d99001367 Getting tasks running on arn:aws:ecs:ap-southeast-2:677050795231:container-instance/d48a81e1-3d92-4803-b2d5-5c45a1cc4bd6...
	/aws/lambda/drain-ecs-lambda 2019/03/14/[$LATEST]fbb61cba26b245de85f0d2c7a59b118f [INFO] 2019-03-14T23:50:23.639Z ad7258ee-7bc7-40f2-8721-105d99001367 Resetting dropped connection: ecs.ap-southeast-2.amazonaws.com
	/aws/lambda/dr
	module "ecs_update_lambdas" {
	source = "git::https://github.com/xero-oss/ecs-cluster-update-lambda.git//src"
	region = "${var.region}"
	}
	# Send notifications to the SNS topic created by ecs_update_lambdas module on all important Auto Scaling events
	resource "aws_autoscaling_notification" "asg-terminate" {
	group_names = [
	"${aws_autoscaling_group.ecs-autoscaling-group.name}"
	]
	resource "aws_ecs_cluster" "demo-ecs-cluster" {
	name = "${var.ecs_cluster}"
	}
	# TODO convert to aws_launch_template https://www.terraform.io/docs/providers/aws/r/launch_template.html
	resource "aws_launch_configuration" "ecs-launch-configuration" {
	# Notice create_before_destroy lifecycle setting and name_prefix. As we can’t create a new resource with
	# the same name as the old one, we don’t hard-code the name and only specify the prefix. Terraform adds
	# a random postfix to it, so the new configuration doesn’t clash with the old one before it is destroyed.
	name_prefix = "demo-cluster-lc"
	# image_id = "${data.aws_ami.latest_ecs.id}"
	slack_configs:
	- api_url: "<slack_auth url here>"
	channel: '#devops'
	send_resolved: true
	title: "{{ .GroupLabels.alertname }} alert is {{ .Status|toUpper }} in {{ .CommonLabels.env }}"
	text: |
	Affected instances {{ range .Alerts }}{{ .Labels.instance }}/{{ .Labels.job }} {{ .Labels.target }} {{ end }}
	{{ .CommonAnnotations.description }}
	See http://docs.airtame.cloud/alerts/{{ .GroupLabels.alertname }}
	alerts='[
	{
	"status": "resolved",
	"labels": {
	"alertname": "InstanceHighCpu",
	"instance": "dev-foo",
	"env": "dev",
	"job": "ec2_instances"
	},
	"annotations": {
	ALERT InstanceHighCpu
	IF 100 - (avg by (instance) (irate(node_cpu{mode="idle"}[5m])) * 100) > 90
	FOR 20m
	ANNOTATIONS {
	summary = "High CPU Usage on {{ $labels.instance }}",
	description = "CPU usage exceeds threshold (currently {{ $value|humanize }}% in use)",
	}
	relabel_configs:
	# Only monitor instances with a Name starting with the regex
	- source_labels: [__meta_ec2_tag_Name]
	regex: prod-instance.*
	action: keep
	relabel_configs:
	# Use the instance tag as the instance label
	- source_labels: [__meta_ec2_tag_Name]
	target_label: instance