@keymon
Last active September 29, 2021 23:28
Replace SG in ELBs in EKS with a shared one without downtime
#!/bin/bash
# AWS sets a limit on how many rules can be added per Security Group.
# This limit is dynamic and depends on the hard limit of 1000 rules per ENI; it is computed as 1000 / (max number of SGs per ENI).
#
# In EKS this is a problem, as each ELB gets its own SG that is added to the nodes SG. For instance,
# if you want up to 3 SGs per ENI, each SG can hold 1000 / 3 = ~333 rules, so the hard limit is ~333 loadbalancers.
#
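# Usage:
#   CONTEXT=<kube-context> NAMESPACE_SELECTOR=<namespace|all> ./<this-script> [action]
#
# Actions (default: all):
#   setup_sg                create the shared SG and authorize it in the nodes SG
#   check_load_balancers_sg print the SGs currently attached to each LB service's ELB
#   annotate_lbs            annotate LoadBalancer services with the shared SG
#   delete_orphan_sg        delete per-ELB "k8s-elb-*" SGs no longer attached to any ENI
#   all                     run all of the above in order
#   revert                  remove the annotations and delete the shared SG
#
# Set FORCE=1 to skip the confirmation prompt when deleting orphan SGs.
#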
set -e -o pipefail -u
SCRIPT_NAME="$0"
# Context to process
CONTEXT="${CONTEXT:-$(kubectl config current-context)}"
# Namespace whose LoadBalancer services will be updated ("all" for all namespaces)
NAMESPACE_SELECTOR="${NAMESPACE_SELECTOR:-all}"
EXTERNAL_SECURITY_GROUP_NAME="shared-lb"
EXTERNAL_SECURITY_GROUP_DESCRIPTION="Shared SG for loadbalancers"
panic() {
echo "ERROR: $@" 1>&2
exit 1
}
# extract AWS info from the nodes
get_k8s_aws_info() {
local instanceid
local region
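# providerID looks like "aws:///<az>/<instance-id>", e.g. "aws:///eu-west-1a/i-0123456789abcdef0"
# (illustrative values); the capture below strips the AZ letter to get the region and extracts the instance id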
read region instanceid < <(
kubectl --context "${CONTEXT}" get nodes -o json | \
jq -r '
.items[0].spec.providerID |
capture("aws:///(?<region>.*)./(?<id>.*)") |
[.region, .id] |
join(" ")
'
)
[ -n "${region}" -a -n "${instanceid}" ] || panic "Failed to get region/instance from nodes"
local vpcid
local sgid
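# NOTE: this assumes the instance has a security group literally named "node" (the nodes SG in this setup)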
read vpcid sgid < <(
aws ec2 describe-instances --region "${region}" --instance-ids "${instanceid}" | \
jq -r '
.Reservations[0].Instances[0] |
[
.VpcId,
(.SecurityGroups | map(select(.GroupName == "node"))[0] | .GroupId)
] | join(" ")
'
)
[ -n "${vpcid}" -a -n "${sgid}" ] || panic "Failed to get vpcid/nodes sg from nodes"
echo "${region} ${vpcid} ${sgid}"
}
# Find the shared group; return an error if not found
find_shared_security_group() {
local region="$1"
local vpcid="$2"
local group_name="${EXTERNAL_SECURITY_GROUP_NAME}"
aws ec2 describe-security-groups \
--region "${region}" \
--filters "Name=vpc-id,Values=${vpcid}" "Name=group-name,Values=${group_name}" | \
jq -re .SecurityGroups[0].GroupId
}
ensure_shared_security_group() {
local region="$1"
local vpcid="$2"
local sgid
# reuse it if it already exists
if sgid="$(find_shared_security_group "${region}" "${vpcid}")"; then
echo "Found existing SG named ${EXTERNAL_SECURITY_GROUP_NAME}: $sgid" 1>&2
echo "$sgid"
return
fi
echo "Creating new SG named ${EXTERNAL_SECURITY_GROUP_NAME}" 1>&2
sgid="$(
aws ec2 create-security-group \
--region "${region}" \
--group-name "${EXTERNAL_SECURITY_GROUP_NAME}" \
--description "${EXTERNAL_SECURITY_GROUP_DESCRIPTION}" \
--vpc-id "${vpcid}" | \
jq -r .GroupId
)"
# TODO: if this fails, it won't retry
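# NOTE: this opens all protocols from 0.0.0.0/0 on the shared SG, so any ELB using it is
# world-reachable at the SG level; restrict the CIDR/protocol here if that is not what you want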
aws ec2 authorize-security-group-ingress \
--region "${region}" \
--group-id "${sgid}" \
--protocol -1 \
--cidr 0.0.0.0/0 > /dev/null
echo "Creating new SG named ${EXTERNAL_SECURITY_GROUP_NAME}: $sgid" 1>&2
echo "$sgid"
}
ensure_authorize_shared_sg_in_nodes_sg() {
local region="$1"
local vpcid="$2"
local nodes_sgid="$3"
local shared_sgid="$4"
# Skip if already present
if aws ec2 describe-security-groups \
--region "${region}" \
--group-id "${nodes_sgid}" | \
jq -e --arg source_sg "${shared_sgid}" '
[.SecurityGroups[].IpPermissions[].UserIdGroupPairs[].GroupId] |
any(.==$source_sg)
' > /dev/null; then
echo "WARNING: Shared sg ${shared_sgid} is already authorized in nodes SG ${nodes_sgid}" 1>&2
return
fi
echo "Authorize sg ${shared_sgid} in nodes SG ${nodes_sgid}" 1>&2
aws ec2 authorize-security-group-ingress \
--region "${region}" \
--group-id "${nodes_sgid}" \
--protocol -1 \
--source-group "${shared_sgid}"
}
get_loadbalancer_services() {
local namespace="$1"
local namespace_selector
local loadbalancer_services
if [ "${namespace}" == "all" ]; then
namespace_selector="--all-namespaces"
else
namespace_selector="-n ${namespace}"
fi
kubectl --context "${CONTEXT}" get service ${namespace_selector} -o json | \
jq -r '
.items[] |
select(.spec.type == "LoadBalancer") |
"\(.metadata.namespace)/\(.metadata.name)"
'
}
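# Extract the ELB name from the service hostname. Kubernetes-provisioned classic ELBs get hostnames
# like "<elb-name>-<random>.<region>.elb.amazonaws.com" (optionally prefixed with "internal-");
# the capture below returns <elb-name>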
get_elb_from_service() {
local ns="$1"
local name="$2"
kubectl --context "${CONTEXT}" get service -n "${ns}" "${name}" -o json | \
jq -r '
.status.loadBalancer.ingress[0].hostname |
capture("^(internal-)?(?<elbname>[^\\.]*)-.*\\.") |.elbname
'
}
print_load_balancers_sg() {
local namespace="$1"
local region="$2"
local loadbalancer_services
loadbalancer_services="$(
get_loadbalancer_services "${namespace}"
)"
echo "ELBs for NAMESPACEs $namespace:" 1>&2
for lb in ${loadbalancer_services}; do
ns="${lb%/*}"
name="${lb#*/}"
elb_name="$(get_elb_from_service "${ns}" "${name}")"
if [ -z "${elb_name}" ]; then
echo "WARNING: cannot find ELB for $lb" 1>&2
else
elb_sgids="$(
aws elb describe-load-balancers \
--region "${region}" \
--load-balancer-names "${elb_name}" | \
jq -r '.LoadBalancerDescriptions[0].SecurityGroups | join(",")'
)"
echo "${lb}: ${elb_sgids}" 1>&2
fi
done
}
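# Annotate LoadBalancer services with service.beta.kubernetes.io/aws-load-balancer-security-groups,
# which makes the AWS cloud provider attach the shared SG to the ELB instead of the per-ELB
# "k8s-elb-*" SG it creates by default. Services that already carry the annotation are skipped.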
annotate_load_balancers() {
local namespace="$1"
local shared_sgid="$2"
local loadbalancer_services
loadbalancer_services="$(
get_loadbalancer_services "${namespace}"
)"
echo "Annotating LB services of NAMESPACE ${namespace} to use SG ${shared_sgid}" 1>&2
for lb in ${loadbalancer_services}; do
ns="${lb%/*}"
name="${lb#*/}"
if existing_annotation="$(
kubectl --context "${CONTEXT}" get service -n "${ns}" "${name}" -o json | \
jq -e '.metadata.annotations."service.beta.kubernetes.io/aws-load-balancer-security-groups"'
)"; then
echo "WARNING: ${lb} already annotated service.beta.kubernetes.io/aws-load-balancer-security-groups=${existing_annotation}. Skipping" 1>&2
continue
fi
kubectl --context "${CONTEXT}" annotate service -n "${ns}" "${name}" \
"service.beta.kubernetes.io/aws-load-balancer-security-groups=${shared_sgid}"
done
}
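# List the SGs the cloud provider created for ELBs in this VPC; they are named "k8s-elb-<elb-name>",
# which is what the startswith() filter below relies on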
get_elb_sg_in_vpc() {
local region="$1"
local vpcid="$2"
aws ec2 describe-security-groups \
--region "${region}" \
--filters Name=vpc-id,Values="${vpcid}" | \
jq -r '.SecurityGroups[] | select(.GroupName | startswith("k8s-elb-")) | .GroupId'
}
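# An SG is considered orphan when no ENI references it any more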
is_sg_orphan() {
local region="$1"
local sgid="$2"
aws ec2 describe-network-interfaces \
--region "${region}" \
--filters Name=group-id,Values="${sgid}" | \
jq -e '.NetworkInterfaces == []' > /dev/null
}
delete_orphan_sg() {
local region="$1"
local vpcid="$2"
local sg="$3"
local nodes_sgid="$4"
if is_sg_orphan "${region}" "${sg}" ; then
[ -n "${FORCE:-}" ] || read -p "$sg is orphan, delete?. (Ctrl+C to cancel)"
aws ec2 revoke-security-group-ingress --region "${region}" \
--group-id "${nodes_sgid}" \
--protocol -1 \
--source-group "${sg}" > /dev/null || true
aws ec2 delete-security-group --region "${region}" --group-id "${sg}" > /dev/null
echo "Security group $sg deleted" 1>&2
else
echo "${sg} is still in use: " 1>&2
aws ec2 describe-network-interfaces \
--filters Name=group-id,Values="${sg}" | \
jq -r '"- " + .NetworkInterfaces[].Description'
fi
}
delete_orphan_elb_sg() {
local region="$1"
local vpcid="$2"
local nodes_sgid="$3"
sgids="$(get_elb_sg_in_vpc "${region}" "${vpcid}")"
for sg in ${sgids}; do
delete_orphan_sg "${region}" "${vpcid}" "${sg}" "${nodes_sgid}"
done
}
revert() {
local region="$1"
local vpcid="$2"
local nodes_sgid="$3"
local shared_sgid="$4"
local namespace="$5"
echo "Removing annotation of LB services of NAMESPACE ${namespace} to not use SG ${shared_sgid}" 1>&2
local loadbalancer_services
loadbalancer_services="$(
get_loadbalancer_services "${namespace}"
)"
local removed=false
for lb in ${loadbalancer_services}; do
ns="${lb%/*}"
name="${lb#*/}"
if kubectl --context "${CONTEXT}" get service -n "${ns}" "${name}" -o json | \
jq -e --arg shared_sgid "${shared_sgid}" \
'.metadata.annotations."service.beta.kubernetes.io/aws-load-balancer-security-groups" == $shared_sgid' >/dev/null; then
kubectl --context "${CONTEXT}" annotate service -n "${ns}" "${name}" \
"service.beta.kubernetes.io/aws-load-balancer-security-groups-"
removed=true
else
echo "${lb} is not annotated" 1>&2
fi
done
if [ "${removed}" == "true" ]; then
echo "Waiting reconciliation" 1>&2
sleep 30
fi
delete_orphan_sg "${region}" "${vpcid}" "${shared_sgid}" "${nodes_sgid}"
}
#######################################
echo "Processing ${CONTEXT}..."
read region vpcid nodes_sgid < <(get_k8s_aws_info)
echo "Detected region=${region} vpc=${vpcid} nodes SG=${nodes_sgid}" 1>&2
main() {
action="$1"
case "${action}" in
"setup_sg")
#######################################
# 1. Create and retrieve the new shared SG
shared_sgid="$(
ensure_shared_security_group "${region}" "${vpcid}"
)"
#######################################
# 2. Authorize that in the nodes
#
# This is also done by the controller, but only after the LB SGs are updated,
# so authorizing it here, before adding the annotation, prevents any downtime
#
ensure_authorize_shared_sg_in_nodes_sg "${region}" "${vpcid}" "${nodes_sgid}" "${shared_sgid}"
;;
"annotate_lbs")
#######################################
# 3. Annotate the loadbalancers
#
if ! shared_sgid="$(
find_shared_security_group "${region}" "${vpcid}"
)"; then
echo "ERROR: shared SG not found" 1>&2
exit 1
fi
annotate_load_balancers "${NAMESPACE_SELECTOR}" "${shared_sgid}"
;;
"check_load_balancers_sg")
print_load_balancers_sg "${NAMESPACE_SELECTOR}" "${region}"
;;
"delete_orphan_sg")
#######################################
# 5. Clean up orphaned ELB security groups
delete_orphan_elb_sg "${region}" "${vpcid}" "${nodes_sgid}"
;;
"all")
main setup_sg
main check_load_balancers_sg
main annotate_lbs
echo "Waiting for reconciliation" 1>&2
sleep 30
main check_load_balancers_sg
main delete_orphan_sg
;;
"revert")
if ! shared_sgid="$(
find_shared_security_group "${region}" "${vpcid}"
)"; then
echo "ERROR: shared SG not found" 1>&2
exit 1
fi
revert "${region}" "${vpcid}" "${nodes_sgid}" "${shared_sgid}" "${NAMESPACE_SELECTOR}"
print_load_balancers_sg "${NAMESPACE_SELECTOR}" "${region}"
;;
*)
echo "Unknown action: ${action}" 1>&2
exit 1
;;
esac
}
main "${1:-all}"