@eedwards-sk · Last active June 17, 2019 15:52
concourse worker shutdown scripts
#!/usr/bin/env bash
# this script attempts to run the nursing-home (worker retirement) script
# by stopping systemd's concourse-worker-shutdown.service, whose ExecStop
# invokes that script
# this script is designed to be run by an aws ssm command
# which is triggered by a lambda function
# which is triggered by a cloudwatch event
# which is fired whenever an asg marks an instance for termination
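#
# a sketch of the lifecycle hook this script completes (the asg name
# "concourse-workers" is hypothetical; the heartbeat timeout should
# comfortably cover the unit's 65m stop timeout):
#
#   aws autoscaling put-lifecycle-hook \
#     --lifecycle-hook-name concourse-worker-shutdown \
#     --auto-scaling-group-name concourse-workers \
#     --lifecycle-transition autoscaling:EC2_INSTANCE_TERMINATING \
#     --heartbeat-timeout 4200 \
#     --default-result ABANDON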
# disable history expansion so exclamation marks don't break commands
set +H
# constants
# =============================================================================
readonly EC2_INSTANCE_METADATA_URL="http://169.254.169.254/latest/meta-data"
readonly EC2_INSTANCE_DYNAMIC_DATA_URL="http://169.254.169.254/latest/dynamic"
readonly LIFECYCLE_ACTION_OK='CONTINUE'
readonly LIFECYCLE_ACTION_ERROR='ABANDON'
readonly LIFECYCLE_HOOK_NAME='concourse-worker-shutdown'
readonly SYSTEMD_SERVICE_NAME='concourse-worker-shutdown.service'
# functions
# =============================================================================
lookup_path_in_instance_metadata() {
  local -r path="${1}"
  curl --silent --show-error --location "$EC2_INSTANCE_METADATA_URL/$path/"
}
lookup_path_in_instance_dynamic_data() {
  local -r path="${1}"
  curl --silent --show-error --location "$EC2_INSTANCE_DYNAMIC_DATA_URL/$path/"
}
get_instance_id() {
  lookup_path_in_instance_metadata "instance-id"
}
get_instance_region() {
  lookup_path_in_instance_dynamic_data "instance-identity/document" | jq -r ".region"
}
get_autoscaling_group_name() {
  local -r region="${1}"
  local -r instance_id="${2}"
  aws ec2 describe-tags \
    --region "${region}" \
    --filters "Name=resource-id,Values=${instance_id}" "Name=key,Values=aws:autoscaling:groupName" \
    --query "Tags[].Value" \
    --output text
}
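# (asgs tag their instances with aws:autoscaling:groupName automatically,
# which is what the describe-tags call above looks up)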
complete_lifecycle_action() {
  local -r lifecycle_action="${1}"
  local -r instance_region="${2:-}"
  local -r instance_id="${3:-}"
  local autoscaling_group_name
  if [[ -z "${instance_region}" || -z "${instance_id}" ]]
  then
    echo >&2 "cannot complete lifecycle action - instance_region or instance_id is empty or unset"
    return 64
  fi
  if autoscaling_group_name="$(get_autoscaling_group_name "${instance_region}" "${instance_id}")"
  then
    if [[ -n "${autoscaling_group_name}" ]]
    then
      aws autoscaling complete-lifecycle-action \
        --region "${instance_region}" \
        --auto-scaling-group-name "${autoscaling_group_name}" \
        --lifecycle-hook-name "${LIFECYCLE_HOOK_NAME}" \
        --instance-id "${instance_id}" \
        --lifecycle-action-result "${lifecycle_action}"
    else
      echo >&2 "instance ${instance_id} is not in an autoscaling group"
      return 64
    fi
  else
    echo >&2 "failed to look up the autoscaling group name for instance ${instance_id}"
    return 64
  fi
}
# main
# =============================================================================
echo "stopping ${SYSTEMD_SERVICE_NAME}"
if /bin/systemctl stop "${SYSTEMD_SERVICE_NAME}"
then
lifecycle_action="${LIFECYCLE_ACTION_OK}"
else
echo >&2 "warning: stop command failed or timed out for ${SYSTEMD_SERVICE_NAME}"
lifecycle_action="${LIFECYCLE_ACTION_ERROR}"
fi
instance_region="$(get_instance_region)"
instance_id="$(get_instance_id)"
if ! complete_lifecycle_action "${lifecycle_action}" "${instance_region}" "${instance_id}"
then
echo >&2 "error: failed to complete lifecycle action"
exit 64
fi
exit 0
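
# oneshot + RemainAfterExit keeps this unit "active" after boot so that
# stopping it runs ExecStop, which invokes the worker retirement script;
# ExecStart is a no-op. systemd stops units in reverse start order, so
# After=concourse-worker.service means this unit's ExecStop runs while the
# worker service is still up.
# the ${...} values below are assumed to be template placeholders that the
# provisioning tooling substitutes before the unit is installed.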
[Unit]
Description=retires the concourse worker on shutdown
Before=datadog-agent.service
After=network-online.target concourse-worker.service
[Service]
Type=oneshot
RemainAfterExit=yes
User=root
ExecStart=/bin/true
EnvironmentFile=${CONCOURSE_SERVICE_WORKER_ENV_FILE_PATH}
ExecStop=${CONCOURSE_WORKER_SHUTDOWN_SCRIPT_PATH}
TimeoutStartSec=5
TimeoutStopSec=65m
SyslogIdentifier=concourse-worker-shutdown
[Install]
WantedBy=multi-user.target
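
# lambda handler for the cloudwatch lifecycle event: validates the event,
# then uses ssm to kick off the shutdown script on the terminating instance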
import boto3
# constants
# =============================================================================
ASG_WORKER_LIFECYCLE_SHUTDOWN_SCRIPT_NAME: str = \
    'asg-worker-lifecycle-shutdown'
ASG_WORKER_LIFECYCLE_SHUTDOWN_SCRIPT_PATH: str = \
    f'/opt/concourse/bin/{ASG_WORKER_LIFECYCLE_SHUTDOWN_SCRIPT_NAME}'
# exceptions
# =============================================================================
class SSMCommandError(ValueError):
    pass
# private functions
# =============================================================================
def _check_cloudwatch_event(event: dict) -> None:
    # schema:
    # {
    #     "version": "0",
    #     "id": "12345678-1234-1234-1234-123456789012",
    #     "detail-type": "EC2 Instance-terminate Lifecycle Action",
    #     "source": "aws.autoscaling",
    #     "account": "123456789012",
    #     "time": "yyyy-mm-ddThh:mm:ssZ",
    #     "region": "us-west-2",
    #     "resources": [
    #         "auto-scaling-group-arn"
    #     ],
    #     "detail": {
    #         "LifecycleActionToken": "87654321-4321-4321-4321-210987654321",
    #         "AutoScalingGroupName": "my-asg",
    #         "LifecycleHookName": "my-lifecycle-hook",
    #         "EC2InstanceId": "i-1234567890abcdef0",
    #         "LifecycleTransition": "autoscaling:EC2_INSTANCE_TERMINATING",
    #         "NotificationMetadata": "additional-info"
    #     }
    # }
    # from https://docs.aws.amazon.com/autoscaling/ec2/userguide/
    # cloud-watch-events.html#terminate-lifecycle-action
    cloudwatch_event_source_autoscaling: str = 'aws.autoscaling'
    cloudwatch_event_source: str = event['source']
    if cloudwatch_event_source != cloudwatch_event_source_autoscaling:
        raise ValueError('expected source '
                         f'{cloudwatch_event_source_autoscaling} but'
                         f' received {cloudwatch_event_source}')
    cloudwatch_event_detail_type_ec2_terminate: str = \
        'EC2 Instance-terminate Lifecycle Action'
    cloudwatch_event_detail_type: str = event['detail-type']
    if (cloudwatch_event_detail_type !=
            cloudwatch_event_detail_type_ec2_terminate):
        raise ValueError('expected detail-type '
                         f'{cloudwatch_event_detail_type_ec2_terminate} but'
                         f' received {cloudwatch_event_detail_type}')
    lifecycle_transition_terminating: str = \
        'autoscaling:EC2_INSTANCE_TERMINATING'
    lifecycle_transition: str = event['detail']['LifecycleTransition']
    if lifecycle_transition != lifecycle_transition_terminating:
        raise ValueError('expected LifecycleTransition '
                         f'{lifecycle_transition_terminating} but'
                         f' received {lifecycle_transition}')
def _complete_lifecycle_action_with_error(
        region: str,
        autoscaling_group_name: str,
        instance_id: str,
        lifecycle_hook_name: str,
        lifecycle_action_token: str) -> None:
    autoscaling_client = boto3.client('autoscaling', region_name=region)
    autoscaling_client.complete_lifecycle_action(
        AutoScalingGroupName=autoscaling_group_name,
        InstanceId=instance_id,
        LifecycleHookName=lifecycle_hook_name,
        LifecycleActionToken=lifecycle_action_token,
        LifecycleActionResult='ABANDON'
    )
def _process_cloudwatch_event(event: dict) -> None:
    _check_cloudwatch_event(event)
    region: str = event['region']
    instance_id: str = event['detail']['EC2InstanceId']
    ssm_shutdown_command_response = \
        _trigger_ssm_shutdown_command(region, instance_id)
    try:
        _validate_shutdown_command_response(ssm_shutdown_command_response)
    except SSMCommandError:
        # abandon the lifecycle action so aws stops waiting and proceeds
        # with terminating the instance
        print('received an SSMCommandError, proceeding with termination')
        print('event details:')
        print(event)
        autoscaling_group_name = event['detail']['AutoScalingGroupName']
        lifecycle_hook_name = event['detail']['LifecycleHookName']
        lifecycle_action_token = event['detail']['LifecycleActionToken']
        _complete_lifecycle_action_with_error(
            region,
            autoscaling_group_name,
            instance_id,
            lifecycle_hook_name,
            lifecycle_action_token)
        raise
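
# the ssm command below uses nohup to background the shutdown script so the
# invocation returns well within its execution timeout, while the script's
# `systemctl stop` call blocks for up to the unit's 65m stop timeout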
def _trigger_ssm_shutdown_command(
        region: str,
        instance_id: str) -> dict:
    ssm_client = boto3.client('ssm', region_name=region)
    return ssm_client.send_command(
        InstanceIds=[instance_id],
        DocumentName='AWS-RunShellScript',
        TimeoutSeconds=45,
        Parameters={
            'commands': [
                f'echo "starting {ASG_WORKER_LIFECYCLE_SHUTDOWN_SCRIPT_NAME}'
                + ' in the background..."',
                f'nohup {ASG_WORKER_LIFECYCLE_SHUTDOWN_SCRIPT_PATH} '
                + f'1>{ASG_WORKER_LIFECYCLE_SHUTDOWN_SCRIPT_NAME}.stdout '
                + f'2>{ASG_WORKER_LIFECYCLE_SHUTDOWN_SCRIPT_NAME}.stderr '
                + '</dev/null &',
                f'echo "started {ASG_WORKER_LIFECYCLE_SHUTDOWN_SCRIPT_NAME}, '
                + 'exiting..."',
                'exit 0'
            ],
            'executionTimeout': ['300'],
            'workingDirectory': ['/tmp']
        }
    )
def _validate_shutdown_command_response(response: dict) -> None:
    ssm_command_ok_statuses: list = [
        'Pending',
        'InProgress',
        'Success'
    ]
    command_status = response['Command']['Status']
    if command_status not in ssm_command_ok_statuses:
        ok_statuses = ', '.join(ssm_command_ok_statuses)
        raise SSMCommandError(f'expected Command Status in {ok_statuses}'
                              f' but received {command_status}')
# public functions
# =============================================================================
# entry point for lambda
def process_lambda_event(event, context) -> None:
    _process_cloudwatch_event(event)
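
# quick local smoke test of the event validation only (a sketch; the values
# come from the documented example payload above, not from real resources)
if __name__ == '__main__':
    _check_cloudwatch_event({
        'source': 'aws.autoscaling',
        'detail-type': 'EC2 Instance-terminate Lifecycle Action',
        'region': 'us-west-2',
        'detail': {
            'LifecycleTransition': 'autoscaling:EC2_INSTANCE_TERMINATING',
            'EC2InstanceId': 'i-1234567890abcdef0'
        }
    })
    print('event validation passed')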
#!/usr/bin/env bash
# this script will retire the concourse worker, if found,
# and then wait for the worker process to exit
# it expects to be run via source-exec with the env file for the worker
# disable history expansion so exclamation marks don't break commands
set +H
# constants
# =============================================================================
readonly START_TIME="$(date +%s)"
readonly MAX_TIMEOUT_SECONDS=3600 # one hour
readonly CONCOURSE_INSTALL_DIR="/opt/concourse"
readonly CONCOURSE_BIN_PATH="${CONCOURSE_INSTALL_DIR}/bin/concourse"
readonly CONCOURSE_WORKER_RUN_FILE_PATH="${CONCOURSE_INSTALL_DIR}/run/worker.pid"
# functions
# =============================================================================
warn_and_exit() {
  echo >&2 "warning: cannot retire the concourse worker"
  exit 0
}
get_worker_name() {
  # concourse workers register under the machine hostname by default
  hostname
}
concourse_pid_file_exists() {
  [[ -e "${CONCOURSE_WORKER_RUN_FILE_PATH}" ]]
}
retire_worker() {
  local -r worker_name="${1}"
  echo "retiring concourse worker ${worker_name}"
  "${CONCOURSE_BIN_PATH}" retire-worker --name "${worker_name}"
}
ensure_worker_is_retired() {
  local -r worker_name="${1}"
  if ! concourse_pid_file_exists
  then
    echo >&2 "pid file not found at ${CONCOURSE_WORKER_RUN_FILE_PATH}"
    echo >&2 "concourse worker does not appear to be running"
    return 64
  else
    if ! retire_worker "${worker_name}"
    then
      echo >&2 "concourse retire worker command failed"
      return 64
    fi
    while concourse_pid_file_exists
    do
      local current_time
      current_time="$(date +%s)"
      local elapsed_time=$((current_time - START_TIME))
      if [[ $elapsed_time -lt $MAX_TIMEOUT_SECONDS ]]
      then
        echo >&2 "pid file exists, waiting 30 seconds and retrying... elapsed time: $elapsed_time/$MAX_TIMEOUT_SECONDS seconds"
        sleep 30
      else
        echo >&2 "timeout waiting for worker to retire"
        return 64
      fi
    done
    echo "concourse worker has retired"
    return 0
  fi
}
# main
# =============================================================================
if [[ -z "${CONCOURSE_TSA_HOST}" ]]
then
echo >&2 "CONCOURSE_TSA_HOST is empty or unset"
warn_and_exit
fi
if [[ -z "${CONCOURSE_TSA_PUBLIC_KEY}" ]]
then
echo >&2 "CONCOURSE_TSA_PUBLIC_KEY is empty or unset"
warn_and_exit
fi
if [[ -z "${CONCOURSE_TSA_WORKER_PRIVATE_KEY}" ]]
then
echo >&2 "CONCOURSE_TSA_WORKER_PRIVATE_KEY is empty or unset"
warn_and_exit
fi
if ! worker_name="$(get_worker_name)"
then
echo >&2 "failed to get worker name"
warn_and_exit
fi
if ! ensure_worker_is_retired "${worker_name}"
then
echo >&2 "failed to ensure worker was retired"
warn_and_exit
fi
#!/usr/bin/env bash
# this script will source a target file into the environment
# and then exec a target command with any provided args
# disable history expansion so exclamation marks don't break commands
set +H
source_file="${1:-}"
if [[ -z "${source_file}" ]]
then
echo >&2 "must provide path to source file"
exit 64
fi
shift
target_command="${1:-}"
if [[ -z "${target_command}" ]]
then
echo >&2 "must provide target command"
exit 64
fi
shift
# on older bash versions a failure inside `source` may not be caught here,
# but the check is worth having regardless
if ! source "${source_file}"
then
  echo >&2 "failed to source file ${source_file}"
  exit 64
fi
# ${@+"${@}"} expands to the remaining args, or to nothing at all (rather
# than a single empty string) when there are none
exec "${target_command}" ${@+"${@}"}