Last active
May 1, 2022 11:15
-
-
Save bogue1979/54726aa0d5a2a60f33efa21bac0e0a61 to your computer and use it in GitHub Desktop.
Nomad Agent autoscaling
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SNS topic that lifecycle-hook notifications and spot-interruption events
# are delivered to before being forwarded to the drain Lambda.
resource "aws_sns_topic" "nomad_graceful_termination_topic" {
  name = "${local.instance_prefix}-nomad_graceful_termination_topic"
}
# Attaches the rendered access policy document to the termination topic.
resource "aws_sns_topic_policy" "nomad_graceful_termination_topic" {
  policy = data.aws_iam_policy_document.nomad_graceful_termination_topic_policy.json
  arn    = aws_sns_topic.nomad_graceful_termination_topic.arn
}
# Access policy for the termination topic: the stock owner statement plus a
# statement that lets EventBridge publish to the topic.
data "aws_iam_policy_document" "nomad_graceful_termination_topic_policy" {
  policy_id = "__default_policy_ID"

  # Copy of the existing default statement. The wildcard principal is
  # restricted by the AWS:SourceOwner condition to the calling account.
  statement {
    sid    = "__default_statement_ID"
    effect = "Allow"

    principals {
      type        = "AWS"
      identifiers = ["*"]
    }

    actions = [
      "SNS:GetTopicAttributes",
      "SNS:SetTopicAttributes",
      "SNS:AddPermission",
      "SNS:RemovePermission",
      "SNS:DeleteTopic",
      "SNS:Subscribe",
      "SNS:ListSubscriptionsByTopic",
      "SNS:Publish",
      "SNS:Receive"
    ]

    resources = [aws_sns_topic.nomad_graceful_termination_topic.arn]

    condition {
      test     = "StringEquals"
      variable = "AWS:SourceOwner"
      values   = [data.aws_caller_identity.self.account_id]
    }
  }

  # AWS EventBridge permissions for spot instance events.
  statement {
    sid    = "__events_statement_ID"
    effect = "Allow"

    principals {
      type        = "Service"
      identifiers = ["events.amazonaws.com"]
    }

    actions   = ["SNS:Publish"]
    resources = [aws_sns_topic.nomad_graceful_termination_topic.arn]
  }
}
# Subscribes the graceful-shutdown Lambda to the termination topic, so every
# message published there invokes the function.
resource "aws_sns_topic_subscription" "nomad_graceful_termination_subscription" {
  protocol  = "lambda"
  endpoint  = aws_lambda_function.nomad_graceful_shutdown.arn
  topic_arn = aws_sns_topic.nomad_graceful_termination_topic.arn
}
# IAM role shared by the graceful-shutdown Lambda (execution role) and the
# ASG lifecycle hook (role_arn). Both services are therefore allowed to
# assume it.
resource "aws_iam_role" "nomad_autoscaling_role" {
  name = "${local.instance_prefix}-nomad-autoscaling_role"

  # jsonencode keeps the trust policy syntax-checked by Terraform and
  # independent of heredoc indentation.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid    = ""
        Effect = "Allow"
        Principal = {
          Service = ["autoscaling.amazonaws.com", "lambda.amazonaws.com"]
        }
        Action = "sts:AssumeRole"
      }
    ]
  })
}
# Grants the role the AWS-managed basic Lambda execution policy.
resource "aws_iam_role_policy_attachment" "nomad_autoscaling_role-policy-attach" {
  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
  role       = aws_iam_role.nomad_autoscaling_role.name
}
# Inline policy for the shared role: publishing to SNS, completing lifecycle
# actions, looking up instances/tags, and the network-interface and VPC
# describe/create/delete actions.
resource "aws_iam_role_policy" "lifecycle_hook_autoscaling_policy" {
  name = "${local.instance_prefix}-lifecycle_hook_autoscaling_policy"
  role = aws_iam_role.nomad_autoscaling_role.id

  # jsonencode keeps the policy syntax-checked by Terraform and independent
  # of heredoc indentation.
  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid    = ""
        Effect = "Allow"
        Action = [
          "sns:Publish",
          "autoscaling:CompleteLifecycleAction",
          "ec2:DescribeInstances",
          "ec2:DescribeTags",
          "ec2:CreateNetworkInterface",
          "ec2:DescribeNetworkInterfaces",
          "ec2:DeleteNetworkInterface",
          "ec2:DescribeSecurityGroups",
          "ec2:DescribeSubnets",
          "ec2:DescribeVpcs"
        ]
        Resource = ["*"]
      }
    ]
  })
}
# AWS EventBridge rule matching EC2 spot instance interruption warnings.
resource "aws_cloudwatch_event_rule" "spot-instance-termination" {
  name        = "${local.instance_prefix}-spot-instance-termination-event"
  description = "${local.instance_prefix}-spot-instance-termination-event"

  # jsonencode keeps the pattern syntax-checked by Terraform and independent
  # of heredoc indentation.
  event_pattern = jsonencode({
    source        = ["aws.ec2"]
    "detail-type" = ["EC2 Spot Instance Interruption Warning"]
  })
}
# Routes events matched by the spot-termination rule to the
# nomad_graceful_termination_topic SNS topic.
resource "aws_cloudwatch_event_target" "spot-instance-termination-sns" {
  target_id = "${local.instance_prefix}-SendSpotTerminationToSNS"
  rule      = aws_cloudwatch_event_rule.spot-instance-termination.name
  arn       = aws_sns_topic.nomad_graceful_termination_topic.arn
}
# Lambda | |
# 1. check if Spot instance event or ASG-lifecycle event | |
# 2. get aws-instance-id from event | |
# 3. find ip of aws-instance-id | |
# 4. connect to nomad agent of ip and drain node | |
# 5. Notify ASG to continue if everything is fine
# Reads the Slack webhook secret from Vault; the "platform-alerts" key of
# this secret is templated into the Lambda source.
data "vault_generic_secret" "platform_slack_webhook_url" {
  path = "secret/accounts/devops/slack/webhooks"
}
# Renders the Go source of the drain Lambda from its template, injecting the
# Slack webhook URL read from Vault.
# NOTE(review): the template is read relative to path.module while the output
# filename is relative to the directory Terraform is run from — confirm both
# resolve to the same tree.
resource "local_file" "nomad_drain_node-source" {
  content = templatefile(
    "${path.module}/lambda/nomad_drain_node/code/main.tpl.go",
    {
      webhook_url = data.vault_generic_secret.platform_slack_webhook_url.data["platform-alerts"]
    }
  )

  filename             = "lambda/nomad_drain_node/main.go"
  file_permission      = "0644"
  directory_permission = "0755"
}
# Builds the Lambda binary locally with the Go toolchain. The build is re-run
# whenever the rendered source content changes.
resource "null_resource" "compile-nomad_drain_node" {
  triggers = {
    gocode = local_file.nomad_drain_node-source.content
  }

  provisioner "local-exec" {
    working_dir = "lambda/nomad_drain_node"
    command     = "CGO_ENABLED=0 GOOS=linux go build -a -ldflags '-extldflags \"-static\"' main.go"
  }
}
# Zips the compiled binary for upload. depends_on makes the archive wait for
# the compile step.
data "archive_file" "lambda_nomad_drain_node_zip" {
  depends_on = [null_resource.compile-nomad_drain_node]

  type        = "zip"
  source_file = "lambda/nomad_drain_node/main"
  output_path = "lambda/nomad_drain_node.zip"
}
# Lambda function invoked through the termination topic. It is deployed from
# the zipped Go binary and attached to the three myvpc-private-* subnets.
# NOTE(review): AWS has deprecated the go1.x runtime; migrating to a
# provided.* runtime also requires changing the handler/binary name — confirm
# before changing.
resource "aws_lambda_function" "nomad_graceful_shutdown" {
  function_name = "${local.instance_prefix}-nomad-graceful-shutdown"
  role          = aws_iam_role.nomad_autoscaling_role.arn

  runtime = "go1.x"
  handler = "main"
  timeout = 900

  # Deployment package; the hash forces a code update when the zip changes.
  filename         = data.archive_file.lambda_nomad_drain_node_zip.output_path
  source_code_hash = data.archive_file.lambda_nomad_drain_node_zip.output_base64sha256

  environment {
    variables = {
      AWS_VPC_ID = aws_vpc.myvpc.id
    }
  }

  vpc_config {
    security_group_ids = [
      aws_vpc.myvpc.default_security_group_id,
      aws_security_group.myvpc-internal.id
    ]
    subnet_ids = [
      aws_subnet.myvpc-private-a.id,
      aws_subnet.myvpc-private-b.id,
      aws_subnet.myvpc-private-c.id
    ]
  }
}
# Allows SNS to invoke the Lambda, restricted to the termination topic as the
# source.
resource "aws_lambda_permission" "trigger_nomad_graceful_shutdown_with_sns" {
  statement_id  = "AllowExecutionFromSNS"
  principal     = "sns.amazonaws.com"
  source_arn    = aws_sns_topic.nomad_graceful_termination_topic.arn
  action        = "lambda:InvokeFunction"
  function_name = aws_lambda_function.nomad_graceful_shutdown.function_name
}
# ASGs
# Most recent self-owned x86_64 AMI whose name starts with
# "my-nomad-agent-base".
data "aws_ami" "nomad_agent" {
  owners      = ["self"]
  most_recent = true

  filter {
    name   = "architecture"
    values = ["x86_64"]
  }

  filter {
    name   = "name"
    values = ["my-nomad-agent-base*"]
  }
}
# Launch template used by every nomad agent autoscaling group. The instance
# type set here is overridden per ASG by the mixed instances policy.
resource "aws_launch_template" "nomad-agent" {
  name                   = "${local.instance_prefix}-nomad-agent"
  update_default_version = true

  image_id      = data.aws_ami.nomad_agent.id
  instance_type = "m5d.large"
  ebs_optimized = true
  key_name      = "deployer"

  disable_api_termination = false

  vpc_security_group_ids = [
    aws_vpc.myvpc.default_security_group_id,
    aws_security_group.myvpc-internal.id
  ]

  iam_instance_profile {
    arn = aws_iam_instance_profile.myprofile.arn
  }

  # Root volume, sized by var.agent_root_disk_size and removed with the
  # instance.
  block_device_mappings {
    device_name = "/dev/sda1"

    ebs {
      volume_size           = var.agent_root_disk_size
      volume_type           = "gp2"
      delete_on_termination = true
    }
  }
}
locals {
  # Instance count per nomad agent class; one ASG is created per key.
  agents = {
    general = var.nomad_agent_general_instance_count
    foo     = var.nomad_agent_foo_instance_count
    bar     = var.nomad_agent_bar_instance_count
  }

  # Per-class ASG tags: each class gets one nomad_meta_<name> tag per known
  # class, set to "True" for its own class and "False" for the others. The
  # inner list keeps the general, foo, bar ordering.
  agent_tags = {
    for node_class in ["general", "foo", "bar"] : node_class => [
      for meta_name in ["general", "foo", "bar"] : {
        key                 = "nomad_meta_${meta_name}"
        value               = meta_name == node_class ? "True" : "False"
        propagate_at_launch = true
      }
    ]
  }
}
# One autoscaling group per nomad agent class (see local.agents). The group
# starts at the configured count and may grow to twice that size.
resource "aws_autoscaling_group" "nomad-agents" {
  for_each = local.agents

  name = "${local.instance_prefix}-nomad-agent-${each.key}"

  # Placement is defined by the subnets alone. availability_zones is not set
  # because the AWS provider treats it as conflicting with
  # vpc_zone_identifier; the zones follow from the subnets.
  vpc_zone_identifier = [
    aws_subnet.myvpc-private-a.id,
    aws_subnet.myvpc-private-b.id,
    aws_subnet.myvpc-private-c.id
  ]

  desired_capacity = each.value
  min_size         = each.value
  max_size         = each.value * 2

  health_check_grace_period = 300
  health_check_type         = "ELB"
  force_delete              = true

  mixed_instances_policy {
    launch_template {
      launch_template_specification {
        launch_template_id = aws_launch_template.nomad-agent.id
        version            = "$Latest"
      }

      # Instance types the group may launch.
      override {
        instance_type = "m5d.large"
      }
      override {
        instance_type = "m5ad.large"
      }
      override {
        instance_type = "r5d.large"
      }
      override {
        instance_type = "r5ad.large"
      }
    }

    instances_distribution {
      # Fully on-demand in "prd"; a 50/50 on-demand/spot split elsewhere.
      on_demand_percentage_above_base_capacity = var.environment == "prd" ? 100 : 50
    }
  }

  # Common tags followed by the class-specific nomad_meta_* tags.
  tags = flatten([
    [
      {
        key                 = "Env"
        value               = var.environment
        propagate_at_launch = true
      },
      {
        key                 = "Region"
        value               = var.region
        propagate_at_launch = true
      },
      {
        key                 = "monitored"
        value               = "true"
        propagate_at_launch = true
      },
      {
        key                 = "NOMAD_REGION"
        value               = var.region
        propagate_at_launch = true
      },
      {
        key                 = "NOMAD_NODE_CLASS"
        value               = each.key
        propagate_at_launch = true
      },
      {
        key                 = "Name"
        value               = "${local.instance_prefix}-nomad-agent-${each.key}"
        propagate_at_launch = true
      },
    ],
    local.agent_tags[each.key]
  ])

  depends_on = [
    aws_autoscaling_group.nomad-server
  ]
}
# Termination lifecycle hook per agent ASG: terminating instances wait up to
# heartbeat_timeout seconds while a notification is sent to the termination
# topic using the shared role.
resource "aws_autoscaling_lifecycle_hook" "nomad_graceful_shutdown_asg_hook" {
  for_each = local.agents

  name = "${local.instance_prefix}-nomad-agent-${each.key}-graceful-shutdown"

  # Reference the ASG resource instead of rebuilding its name, so Terraform
  # creates the group before the hook.
  autoscaling_group_name = aws_autoscaling_group.nomad-agents[each.key].name

  lifecycle_transition = "autoscaling:EC2_INSTANCE_TERMINATING"
  default_result       = "ABANDON"
  heartbeat_timeout    = 900

  notification_target_arn = aws_sns_topic.nomad_graceful_termination_topic.arn
  role_arn                = aws_iam_role.nomad_autoscaling_role.arn
}
# during startup of nomad agent get Tags
# start nomad with settings based on tags
# if everything is fine notify ASG-Lifecycle-Hook to continue
resource "aws_autoscaling_lifecycle_hook" "nomad_startup_asg_hook" {
  for_each = local.agents

  name = "${local.instance_prefix}-nomad-agent-${each.key}-startup"

  # Reference the ASG resource instead of rebuilding its name, so Terraform
  # creates the group before the hook.
  autoscaling_group_name = aws_autoscaling_group.nomad-agents[each.key].name

  lifecycle_transition = "autoscaling:EC2_INSTANCE_LAUNCHING"
  default_result       = "ABANDON"
  heartbeat_timeout    = 900
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment