Skip to content

Instantly share code, notes, and snippets.

@bogue1979
Last active May 1, 2022 11:15
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bogue1979/54726aa0d5a2a60f33efa21bac0e0a61 to your computer and use it in GitHub Desktop.
Save bogue1979/54726aa0d5a2a60f33efa21bac0e0a61 to your computer and use it in GitHub Desktop.
Nomad Agent autoscaling
# SNS topic that receives the termination notifications: the ASG lifecycle
# hooks and the EventBridge spot-interruption rule both target this topic.
resource "aws_sns_topic" "nomad_graceful_termination_topic" {
  name = "${local.instance_prefix}-nomad_graceful_termination_topic"
}
# Attaches the access policy rendered by the
# nomad_graceful_termination_topic_policy document to the termination topic.
resource "aws_sns_topic_policy" "nomad_graceful_termination_topic" {
  policy = data.aws_iam_policy_document.nomad_graceful_termination_topic_policy.json
  arn    = aws_sns_topic.nomad_graceful_termination_topic.arn
}
# Access policy for the termination topic. It has two statements:
#   1. a copy of the default topic statement, limited to this account, and
#   2. permission for EventBridge to publish spot-interruption events.
data "aws_iam_policy_document" "nomad_graceful_termination_topic_policy" {
  policy_id = "__default_policy_ID"

  # Copy of the existing default statement.
  statement {
    sid       = "__default_statement_ID"
    effect    = "Allow"
    resources = [aws_sns_topic.nomad_graceful_termination_topic.arn]

    actions = [
      "SNS:GetTopicAttributes",
      "SNS:SetTopicAttributes",
      "SNS:AddPermission",
      "SNS:RemovePermission",
      "SNS:DeleteTopic",
      "SNS:Subscribe",
      "SNS:ListSubscriptionsByTopic",
      "SNS:Publish",
      "SNS:Receive"
    ]

    principals {
      type        = "AWS"
      identifiers = ["*"]
    }

    # The wildcard principal above is narrowed to the calling account here.
    condition {
      test     = "StringEquals"
      variable = "AWS:SourceOwner"
      values   = [data.aws_caller_identity.self.account_id]
    }
  }

  # AWS EventBridge permissions for spot instance events.
  statement {
    sid       = "__events_statement_ID"
    effect    = "Allow"
    actions   = ["SNS:Publish"]
    resources = [aws_sns_topic.nomad_graceful_termination_topic.arn]

    principals {
      type        = "Service"
      identifiers = ["events.amazonaws.com"]
    }
  }
}
# Subscribes the graceful-shutdown Lambda to the termination topic, so every
# message published there invokes the function.
resource "aws_sns_topic_subscription" "nomad_graceful_termination_subscription" {
  protocol  = "lambda"
  endpoint  = aws_lambda_function.nomad_graceful_shutdown.arn
  topic_arn = aws_sns_topic.nomad_graceful_termination_topic.arn
}
# IAM role shared by the graceful-shutdown Lambda (its execution role) and by
# the ASG termination lifecycle hook (its role_arn), hence two trusted services.
resource "aws_iam_role" "nomad_autoscaling_role" {
  name = "${local.instance_prefix}-nomad-autoscaling_role"

  # Built with jsonencode instead of a hand-written heredoc so that a syntax
  # slip is caught at plan time rather than rejected by IAM at apply time.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid    = ""
        Effect = "Allow"
        Action = "sts:AssumeRole"
        Principal = {
          Service = ["autoscaling.amazonaws.com", "lambda.amazonaws.com"]
        }
      }
    ]
  })
}
# Attaches the AWS-managed AWSLambdaBasicExecutionRole policy to the shared
# role used by the graceful-shutdown Lambda.
resource "aws_iam_role_policy_attachment" "nomad_autoscaling_role-policy-attach" {
  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
  role       = aws_iam_role.nomad_autoscaling_role.name
}
# Inline policy on the shared role. The grouping comments below are a reading
# aid; the granted actions and resources are the same as before.
resource "aws_iam_role_policy" "lifecycle_hook_autoscaling_policy" {
  name = "${local.instance_prefix}-lifecycle_hook_autoscaling_policy"
  role = aws_iam_role.nomad_autoscaling_role.id

  # Built with jsonencode instead of a hand-written heredoc so that malformed
  # JSON is detected at plan time.
  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid    = ""
        Effect = "Allow"
        Action = [
          # Lifecycle-hook notification delivery and completion.
          "sns:Publish",
          "autoscaling:CompleteLifecycleAction",
          # Instance lookup.
          "ec2:DescribeInstances",
          "ec2:DescribeTags",
          # Network-interface management; presumably required because the
          # Lambda is attached to a VPC -- confirm before narrowing.
          "ec2:CreateNetworkInterface",
          "ec2:DescribeNetworkInterfaces",
          "ec2:DeleteNetworkInterface",
          "ec2:DescribeSecurityGroups",
          "ec2:DescribeSubnets",
          "ec2:DescribeVpcs"
        ]
        Resource = ["*"]
      }
    ]
  })
}
# AWS EventBridge rule matching EC2 spot instance interruption warnings.
resource "aws_cloudwatch_event_rule" "spot-instance-termination" {
  name        = "${local.instance_prefix}-spot-instance-termination-event"
  description = "${local.instance_prefix}-spot-instance-termination-event"

  # Built with jsonencode instead of a hand-written heredoc so that malformed
  # JSON is detected at plan time.
  event_pattern = jsonencode({
    source        = ["aws.ec2"]
    "detail-type" = ["EC2 Spot Instance Interruption Warning"]
  })
}
# Forwards events matched by the spot-instance-termination rule to the
# nomad_graceful_termination_topic SNS topic.
resource "aws_cloudwatch_event_target" "spot-instance-termination-sns" {
  target_id = "${local.instance_prefix}-SendSpotTerminationToSNS"
  rule      = aws_cloudwatch_event_rule.spot-instance-termination.name
  arn       = aws_sns_topic.nomad_graceful_termination_topic.arn
}
# Lambda
# 1. check if Spot instance event or ASG-lifecycle event
# 2. get aws-instance-id from event
# 3. find ip of aws-instance-id
# 4. connect to nomad agent of ip and drain node
# 5. Notify ASG to continue if everything is fine
# Vault secret holding Slack webhook URLs; the "platform-alerts" entry is
# injected into the Lambda source template.
data "vault_generic_secret" "platform_slack_webhook_url" {
  path = "secret/accounts/devops/slack/webhooks"
}
# Renders the Go source of the drain-node Lambda from its template, filling in
# the Slack webhook URL read from Vault.
# NOTE(review): the rendered file therefore contains the webhook URL in plain
# text on disk -- confirm this is acceptable for the environment running
# Terraform.
resource "local_file" "nomad_drain_node-source" {
  filename = "lambda/nomad_drain_node/main.go"

  content = templatefile(
    "${path.module}/lambda/nomad_drain_node/code/main.tpl.go",
    {
      webhook_url = data.vault_generic_secret.platform_slack_webhook_url.data["platform-alerts"]
    }
  )

  file_permission      = "0644"
  directory_permission = "0755"
}
# Compiles the rendered Go source into a static Linux binary. The build is
# re-run whenever the rendered source content changes.
resource "null_resource" "compile-nomad_drain_node" {
  triggers = {
    gocode = local_file.nomad_drain_node-source.content
  }

  provisioner "local-exec" {
    working_dir = "lambda/nomad_drain_node"
    command     = "CGO_ENABLED=0 GOOS=linux go build -a -ldflags '-extldflags \"-static\"' main.go"
  }
}
# Packages the compiled binary into the zip archive uploaded to Lambda.
data "archive_file" "lambda_nomad_drain_node_zip" {
  # The binary must exist before it can be archived.
  depends_on = [null_resource.compile-nomad_drain_node]

  type        = "zip"
  source_file = "lambda/nomad_drain_node/main"
  output_path = "lambda/nomad_drain_node.zip"
}
# Lambda function that handles messages from the termination topic. It runs
# inside the VPC's private subnets.
# NOTE(review): AWS has deprecated the go1.x runtime. Moving to a provided.*
# runtime would also require changing the handler/binary name in the build and
# archive steps -- confirm and plan that migration separately.
resource "aws_lambda_function" "nomad_graceful_shutdown" {
  function_name = "${local.instance_prefix}-nomad-graceful-shutdown"
  role          = aws_iam_role.nomad_autoscaling_role.arn

  runtime = "go1.x"
  handler = "main"
  timeout = 900

  # Deployment package; the hash forces a redeploy when the archive changes.
  filename         = data.archive_file.lambda_nomad_drain_node_zip.output_path
  source_code_hash = data.archive_file.lambda_nomad_drain_node_zip.output_base64sha256

  vpc_config {
    security_group_ids = [
      aws_vpc.myvpc.default_security_group_id,
      aws_security_group.myvpc-internal.id
    ]
    subnet_ids = [
      aws_subnet.myvpc-private-a.id,
      aws_subnet.myvpc-private-b.id,
      aws_subnet.myvpc-private-c.id
    ]
  }

  environment {
    variables = {
      AWS_VPC_ID = aws_vpc.myvpc.id
    }
  }
}
# Allows SNS to invoke the graceful-shutdown Lambda, but only for messages
# coming from the termination topic.
resource "aws_lambda_permission" "trigger_nomad_graceful_shutdown_with_sns" {
  statement_id  = "AllowExecutionFromSNS"
  principal     = "sns.amazonaws.com"
  action        = "lambda:InvokeFunction"
  source_arn    = aws_sns_topic.nomad_graceful_termination_topic.arn
  function_name = aws_lambda_function.nomad_graceful_shutdown.function_name
}
# ASGs
# Most recent self-owned x86_64 AMI whose name starts with
# "my-nomad-agent-base"; used as the image for the agent launch template.
data "aws_ami" "nomad_agent" {
  owners      = ["self"]
  most_recent = true

  filter {
    name   = "architecture"
    values = ["x86_64"]
  }

  filter {
    name   = "name"
    values = ["my-nomad-agent-base*"]
  }
}
# Launch template shared by all Nomad agent auto scaling groups.
resource "aws_launch_template" "nomad-agent" {
  name                   = "${local.instance_prefix}-nomad-agent"
  update_default_version = true

  # Instance definition.
  image_id                = data.aws_ami.nomad_agent.id
  instance_type           = "m5d.large"
  ebs_optimized           = true
  key_name                = "deployer"
  disable_api_termination = false

  vpc_security_group_ids = [
    aws_vpc.myvpc.default_security_group_id,
    aws_security_group.myvpc-internal.id
  ]

  iam_instance_profile {
    arn = aws_iam_instance_profile.myprofile.arn
  }

  # Root volume; its size is configurable and it is removed with the instance.
  block_device_mappings {
    device_name = "/dev/sda1"

    ebs {
      volume_size           = var.agent_root_disk_size
      volume_type           = "gp2"
      delete_on_termination = true
    }
  }
}
locals {
  # Node class => instance count. One auto scaling group (and one pair of
  # lifecycle hooks) is created per entry.
  agents = {
    general = var.nomad_agent_general_instance_count
    foo     = var.nomad_agent_foo_instance_count
    bar     = var.nomad_agent_bar_instance_count
  }

  # Node class => list of nomad_meta_* tags. Each class gets one tag per known
  # class, with the value "True" for its own class and "False" for the others.
  agent_tags = {
    for node_class in ["general", "foo", "bar"] : node_class => [
      for meta in ["general", "foo", "bar"] : {
        key                 = "nomad_meta_${meta}"
        value               = meta == node_class ? "True" : "False"
        propagate_at_launch = true
      }
    ]
  }
}
# One auto scaling group per Nomad node class in local.agents. The configured
# count is used as both minimum and desired capacity; the maximum is twice that.
resource "aws_autoscaling_group" "nomad-agents" {
  for_each = local.agents

  name = "${local.instance_prefix}-nomad-agent-${each.key}"

  # Placement is defined by the subnets alone. `availability_zones` was removed
  # because the AWS provider treats it as conflicting with
  # `vpc_zone_identifier`; the zones follow from the subnets.
  vpc_zone_identifier = [
    aws_subnet.myvpc-private-a.id,
    aws_subnet.myvpc-private-b.id,
    aws_subnet.myvpc-private-c.id
  ]

  desired_capacity = each.value
  min_size         = each.value
  max_size         = each.value * 2

  health_check_grace_period = 300
  health_check_type         = "ELB"
  force_delete              = true

  mixed_instances_policy {
    launch_template {
      launch_template_specification {
        launch_template_id = aws_launch_template.nomad-agent.id
        version            = "$Latest"
      }

      override {
        instance_type = "m5d.large"
      }
      override {
        instance_type = "m5ad.large"
      }
      override {
        instance_type = "r5d.large"
      }
      override {
        instance_type = "r5ad.large"
      }
    }

    instances_distribution {
      # Production runs fully on-demand; other environments split 50/50.
      on_demand_percentage_above_base_capacity = var.environment == "prd" ? 100 : 50
    }
  }

  # Common tags plus the per-class nomad_meta_* tags. Emitted as `tag` blocks
  # because the list-style `tags` argument is deprecated in the AWS provider.
  dynamic "tag" {
    for_each = concat(
      [
        { key = "Env", value = var.environment, propagate_at_launch = true },
        { key = "Region", value = var.region, propagate_at_launch = true },
        { key = "monitored", value = "true", propagate_at_launch = true },
        { key = "NOMAD_REGION", value = var.region, propagate_at_launch = true },
        { key = "NOMAD_NODE_CLASS", value = each.key, propagate_at_launch = true },
        {
          key                 = "Name"
          value               = "${local.instance_prefix}-nomad-agent-${each.key}"
          propagate_at_launch = true
        },
      ],
      local.agent_tags[each.key]
    )

    content {
      key                 = tag.value.key
      value               = tag.value.value
      propagate_at_launch = tag.value.propagate_at_launch
    }
  }

  depends_on = [
    aws_autoscaling_group.nomad-server
  ]
}
# Termination hook for each agent ASG: the terminating instance is held for up
# to 900 seconds while a notification is published to the termination topic.
resource "aws_autoscaling_lifecycle_hook" "nomad_graceful_shutdown_asg_hook" {
  for_each = local.agents

  name = "${local.instance_prefix}-nomad-agent-${each.key}-graceful-shutdown"

  # Reference the ASG resource rather than rebuilding its name from a string,
  # so Terraform knows the group must exist before the hook is created.
  autoscaling_group_name = aws_autoscaling_group.nomad-agents[each.key].name

  lifecycle_transition = "autoscaling:EC2_INSTANCE_TERMINATING"
  heartbeat_timeout    = 900
  default_result       = "ABANDON"

  notification_target_arn = aws_sns_topic.nomad_graceful_termination_topic.arn
  role_arn                = aws_iam_role.nomad_autoscaling_role.arn
}
# During startup the Nomad agent reads its tags, starts Nomad with settings
# based on those tags, and notifies this lifecycle hook to continue if
# everything is fine. Without that signal the launch is abandoned after the
# 900 second heartbeat timeout.
# NOTE(review): hooks defined as separate resources are only added after the
# ASG exists, so the group's very first instances may launch without this hook;
# consider `initial_lifecycle_hook` on the ASG if that matters -- confirm.
resource "aws_autoscaling_lifecycle_hook" "nomad_startup_asg_hook" {
  for_each = local.agents

  name = "${local.instance_prefix}-nomad-agent-${each.key}-startup"

  # Reference the ASG resource rather than rebuilding its name from a string,
  # so Terraform knows the group must exist before the hook is created.
  autoscaling_group_name = aws_autoscaling_group.nomad-agents[each.key].name

  lifecycle_transition = "autoscaling:EC2_INSTANCE_LAUNCHING"
  heartbeat_timeout    = 900
  default_result       = "ABANDON"
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment