-
-
Save eedwards-sk/c91dc8e5d88b34d0a8eb1e77cbe867ac to your computer and use it in GitHub Desktop.
concourse worker shutdown scripts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Attempts to run the nursing-home script through systemd
# (concourse-worker-shutdown.service).
#
# Designed to be run by an AWS SSM command,
#   which is triggered by a lambda function,
#   which is triggered by a cloudwatch event,
#   which fires whenever an ASG marks an instance for termination.

# disable history expansion so exclamation marks don't break commands
set +H

# constants
# =============================================================================
EC2_INSTANCE_METADATA_URL="http://169.254.169.254/latest/meta-data"
EC2_INSTANCE_DYNAMIC_DATA_URL="http://169.254.169.254/latest/dynamic"
LIFECYCLE_ACTION_OK='CONTINUE'
LIFECYCLE_ACTION_ERROR='ABANDON'
LIFECYCLE_HOOK_NAME='concourse-worker-shutdown'
SYSTEMD_SERVICE_NAME='concourse-worker-shutdown.service'

# freeze every constant in one place
readonly \
  EC2_INSTANCE_METADATA_URL \
  EC2_INSTANCE_DYNAMIC_DATA_URL \
  LIFECYCLE_ACTION_OK \
  LIFECYCLE_ACTION_ERROR \
  LIFECYCLE_HOOK_NAME \
  SYSTEMD_SERVICE_NAME
# functions | |
# ============================================================================= | |
lookup_path_in_instance_metadata() {
  # Fetches a path beneath the EC2 instance metadata endpoint and prints the
  # response body on stdout.
  #   $1 - path relative to EC2_INSTANCE_METADATA_URL (e.g. "instance-id")
  # Returns curl's exit status.
  #
  # --fail makes curl exit non-zero on HTTP error responses instead of
  # printing the error page, so callers never mistake an error body for
  # metadata.
  local -r path="${1}"
  curl --silent --show-error --fail --location \
    "${EC2_INSTANCE_METADATA_URL}/${path}/"
}
lookup_path_in_instance_dynamic_data() {
  # Fetches a path beneath the EC2 instance dynamic-data endpoint and prints
  # the response body on stdout.
  #   $1 - path relative to EC2_INSTANCE_DYNAMIC_DATA_URL
  #        (e.g. "instance-identity/document")
  # Returns curl's exit status.
  #
  # --fail makes curl exit non-zero on HTTP error responses instead of
  # printing the error page, so callers never mistake an error body for
  # instance data.
  local -r path="${1}"
  curl --silent --show-error --fail --location \
    "${EC2_INSTANCE_DYNAMIC_DATA_URL}/${path}/"
}
get_instance_id() {
  # Prints this instance's id, as reported by the "instance-id" metadata path.
  local -r metadata_key="instance-id"
  lookup_path_in_instance_metadata "${metadata_key}"
}
get_instance_region() {
  # Prints the "region" field of the instance identity document.
  # The exit status is jq's, exactly as with a plain pipeline.
  jq --raw-output '.region' \
    < <(lookup_path_in_instance_dynamic_data "instance-identity/document")
}
get_autoscaling_group_name() {
  # Prints the value of the aws:autoscaling:groupName tag on an instance.
  #   $1 - aws region to query
  #   $2 - ec2 instance id
  # Output is whatever `aws ec2 describe-tags --output text` prints for the
  # matching tag values; the exit status is that of the aws command.
  local -r region="${1}"
  local -r instance_id="${2}"
  local -ra tag_filters=(
    "Name=resource-id,Values=${instance_id}"
    "Name=key,Values=aws:autoscaling:groupName"
  )

  aws ec2 describe-tags \
    --region "${region}" \
    --filters "${tag_filters[@]}" \
    --query "Tags[].Value" \
    --output text
}
complete_lifecycle_action() {
  # Completes the LIFECYCLE_HOOK_NAME lifecycle hook for an instance.
  #   $1 - lifecycle action result to report (e.g. CONTINUE or ABANDON)
  #   $2 - aws region of the instance
  #   $3 - ec2 instance id
  # Returns 64 when the region or instance id is empty, when the autoscaling
  # group lookup fails, or when the instance has no autoscaling group;
  # otherwise returns the exit status of the aws complete-lifecycle-action
  # call.
  local -r lifecycle_action="${1}"
  local -r instance_region="${2:-}"
  local -r instance_id="${3:-}"
  # declared separately so the variable stays function-local and the
  # assignment below still exposes the lookup's exit status
  local autoscaling_group_name

  if [[ -z "${instance_region}" || -z "${instance_id}" ]]; then
    echo >&2 "cannot complete lifecycle action - instance_region or instance_id is empty or unset"
    return 64
  fi

  if ! autoscaling_group_name="$(get_autoscaling_group_name "${instance_region}" "${instance_id}")"; then
    echo >&2 "failed to get autoscaling group name or instance ${instance_id} is not in an autoscaling group"
    return 64
  fi

  if [[ -z "${autoscaling_group_name}" ]]; then
    echo >&2 "instance ${instance_id} is not in an autoscaling group"
    return 64
  fi

  aws autoscaling complete-lifecycle-action \
    --region "${instance_region}" \
    --auto-scaling-group-name "${autoscaling_group_name}" \
    --lifecycle-hook-name "${LIFECYCLE_HOOK_NAME}" \
    --instance-id "${instance_id}" \
    --lifecycle-action-result "${lifecycle_action}"
}
# main
# =============================================================================
echo "stopping ${SYSTEMD_SERVICE_NAME}"

# report success unless the stop command says otherwise
lifecycle_action="${LIFECYCLE_ACTION_OK}"
if ! /bin/systemctl stop "${SYSTEMD_SERVICE_NAME}"; then
  echo >&2 "warning: stop command failed or timed out for ${SYSTEMD_SERVICE_NAME}"
  lifecycle_action="${LIFECYCLE_ACTION_ERROR}"
fi

instance_region="$(get_instance_region)"
instance_id="$(get_instance_id)"

# empty region/id values are rejected inside complete_lifecycle_action
complete_lifecycle_action "${lifecycle_action}" "${instance_region}" "${instance_id}" || {
  echo >&2 "error: failed to complete lifecycle action"
  exit 64
}

exit 0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Unit whose stop action retires the concourse worker.
# The ${...} values are placeholders; presumably they are substituted by a
# templating step before the unit is installed -- TODO confirm.
[Unit]
Description=retires the concourse worker on shutdown
# Start-up ordering. systemd applies the inverse order when stopping, so this
# unit is stopped after datadog-agent and before the worker and the network.
Before=datadog-agent.service
After=network-online.target concourse-worker.service
[Service]
# Starting the unit does nothing (/bin/true). RemainAfterExit keeps it
# "active" afterwards so that ExecStop runs when the unit is stopped.
Type=oneshot
RemainAfterExit=yes
User=root
ExecStart=/bin/true
EnvironmentFile=${CONCOURSE_SERVICE_WORKER_ENV_FILE_PATH}
ExecStop=${CONCOURSE_WORKER_SHUTDOWN_SCRIPT_PATH}
TimeoutStartSec=5
# NOTE(review): 65 minutes presumably leaves headroom over the one-hour
# MAX_TIMEOUT_SECONDS used by the worker retire script -- confirm.
TimeoutStopSec=65m
SyslogIdentifier=concourse-worker-shutdown
[Install]
WantedBy=multi-user.target
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import boto3 | |
# constants | |
# ============================================================================= | |
# File name of the shutdown script that the SSM command launches.
ASG_WORKER_LIFECYCLE_SHUTDOWN_SCRIPT_NAME: str = (
    'asg-worker-lifecycle-shutdown'
)
# Absolute path of that script, as used in the SSM command line.
ASG_WORKER_LIFECYCLE_SHUTDOWN_SCRIPT_PATH: str = (
    '/opt/concourse/bin/' + ASG_WORKER_LIFECYCLE_SHUTDOWN_SCRIPT_NAME
)
# exceptions | |
# ============================================================================= | |
class SSMCommandError(ValueError):
    """Raised when an SSM command response reports an unacceptable status."""
# private functions | |
# ============================================================================= | |
def _check_cloudwatch_event(event: dict) -> None: | |
# schema: | |
# { | |
# "version": "0", | |
# "id": "12345678-1234-1234-1234-123456789012", | |
# "detail-type": "EC2 Instance-terminate Lifecycle Action", | |
# "source": "aws.autoscaling", | |
# "account": "123456789012", | |
# "time": "yyyy-mm-ddThh:mm:ssZ", | |
# "region": "us-west-2", | |
# "resources": [ | |
# "auto-scaling-group-arn" | |
# ], | |
# "detail": { | |
# "LifecycleActionToken":"87654321-4321-4321-4321-210987654321", | |
# "AutoScalingGroupName":"my-asg", | |
# "LifecycleHookName":"my-lifecycle-hook", | |
# "EC2InstanceId":"i-1234567890abcdef0", | |
# "LifecycleTransition":"autoscaling:EC2_INSTANCE_TERMINATING", | |
# "NotificationMetadata":"additional-info" | |
# } | |
# } | |
# from https://docs.aws.amazon.com/autoscaling/ec2/userguide/ | |
# cloud-watch-events.html#terminate-lifecycle-action | |
cloudwatch_event_source_autoscaling: str = 'aws.autoscaling' | |
cloudwatch_event_source: str = event['source'] | |
if cloudwatch_event_source != cloudwatch_event_source_autoscaling: | |
raise ValueError('expected source ' | |
f'{cloudwatch_event_source_autoscaling} but' | |
f' received {cloudwatch_event_source}') | |
cloudwatch_event_detail_type_ec2_terminate: str = \ | |
'EC2 Instance-terminate Lifecycle Action' | |
cloudwatch_event_detail_type: str = event['detail-type'] | |
if (cloudwatch_event_detail_type != | |
cloudwatch_event_detail_type_ec2_terminate): | |
raise ValueError('expected detail-type ' | |
f'{cloudwatch_event_detail_type_ec2_terminate} but' | |
f' received {cloudwatch_event_detail_type}') | |
lifecycle_transition_terminating: str = \ | |
'autoscaling:EC2_INSTANCE_TERMINATING' | |
lifecycle_transition: str = event['detail']['LifecycleTransition'] | |
if lifecycle_transition != lifecycle_transition_terminating: | |
raise ValueError('expected LifecycleTransition ' | |
f'{lifecycle_transition_terminating} but' | |
f' received {lifecycle_transition}') | |
def _complete_lifecycle_action_with_error(
        region: str,
        autoscaling_group_name: str,
        instance_id: str,
        lifecycle_hook_name: str,
        lifecycle_action_token: str) -> None:
    """Complete the given lifecycle action with the result ``ABANDON``.

    Creates an autoscaling client for ``region`` and calls
    ``complete_lifecycle_action`` with the supplied identifiers.
    """
    request = {
        'AutoScalingGroupName': autoscaling_group_name,
        'InstanceId': instance_id,
        'LifecycleHookName': lifecycle_hook_name,
        'LifecycleActionToken': lifecycle_action_token,
        'LifecycleActionResult': 'ABANDON',
    }
    client = boto3.client('autoscaling', region_name=region)
    client.complete_lifecycle_action(**request)
def _process_cloudwatch_event(event: dict) -> None:
    """Validate the event, then trigger the shutdown command on the instance.

    If the SSM command response has an unacceptable status, the event is
    printed, the lifecycle action is completed through
    ``_complete_lifecycle_action_with_error`` and the ``SSMCommandError`` is
    re-raised.
    """
    _check_cloudwatch_event(event)

    region: str = event['region']
    detail: dict = event['detail']
    instance_id: str = detail['EC2InstanceId']

    response = _trigger_ssm_shutdown_command(region, instance_id)

    try:
        _validate_shutdown_command_response(response)
    except SSMCommandError:
        # tell aws to terminate the instance
        print('received an SSMCommandError, proceeding with termination')
        print('event details:')
        print(event)
        _complete_lifecycle_action_with_error(
            region=region,
            autoscaling_group_name=detail['AutoScalingGroupName'],
            instance_id=instance_id,
            lifecycle_hook_name=detail['LifecycleHookName'],
            lifecycle_action_token=detail['LifecycleActionToken'])
        raise
def _trigger_ssm_shutdown_command(
        region: str,
        instance_id: str) -> dict:
    """Send an AWS-RunShellScript SSM command that starts the shutdown script.

    The script is launched with ``nohup`` in the background, with stdout and
    stderr redirected to files in the command's working directory (``/tmp``),
    and the shell command itself then exits 0.

    Returns:
        The response of ``send_command``.
    """
    name = ASG_WORKER_LIFECYCLE_SHUTDOWN_SCRIPT_NAME
    path = ASG_WORKER_LIFECYCLE_SHUTDOWN_SCRIPT_PATH

    shell_commands = [
        f'echo "starting {name} in the background..."',
        f'nohup {path} 1>{name}.stdout 2>{name}.stderr </dev/null &',
        f'echo "started {name}, exiting..."',
        'exit 0',
    ]
    parameters = {
        'commands': shell_commands,
        'executionTimeout': ['300'],
        'workingDirectory': ['/tmp'],
    }

    ssm_client = boto3.client('ssm', region_name=region)
    return ssm_client.send_command(
        InstanceIds=[instance_id],
        DocumentName='AWS-RunShellScript',
        TimeoutSeconds=45,
        Parameters=parameters,
    )
def _validate_shutdown_command_response(response: dict) -> None: | |
ssm_command_ok_statuses: list = [ | |
'Pending', | |
'InProgress', | |
'Success' | |
] | |
command_status = response['Command']['Status'] | |
if command_status not in ssm_command_ok_statuses: | |
raise SSMCommandError(f'expected Command Status in ' | |
+ ', '.join(ssm_command_ok_statuses) | |
+ f' but received {command_status}') | |
# public functions | |
# ============================================================================= | |
# entry point for lambda
def process_lambda_event(event, context) -> None:
    """Lambda handler: hand the incoming event to the CloudWatch processor.

    ``context`` is accepted to satisfy the handler signature and is not used.
    """
    _process_cloudwatch_event(event)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Retires the concourse worker, if found, and then waits for the process
# to exit.
#
# Expects to be run from source-exec with the env file for the worker.

# disable history expansion so exclamation marks don't break commands
set +H

# constants
# =============================================================================
START_TIME="$(date +%s)"
MAX_TIMEOUT_SECONDS=3600 # one hour
CONCOURSE_INSTALL_DIR="/opt/concourse"
CONCOURSE_BIN_PATH="${CONCOURSE_INSTALL_DIR}/bin/concourse"
CONCOURSE_WORKER_RUN_FILE_PATH="${CONCOURSE_INSTALL_DIR}/run/worker.pid"

# freeze every constant in one place
readonly \
  START_TIME \
  MAX_TIMEOUT_SECONDS \
  CONCOURSE_INSTALL_DIR \
  CONCOURSE_BIN_PATH \
  CONCOURSE_WORKER_RUN_FILE_PATH
# functions | |
# ============================================================================= | |
warn_and_exit() {
  # Prints a warning on stderr and ends the script with status 0, so a
  # failure to retire the worker is never reported as a script failure.
  printf '%s\n' "warning: cannot retire the concourse worker" >&2
  exit 0
}
# Prints the worker name, which is the output of `hostname`.
get_worker_name() { hostname; }
concourse_pid_file_exists() {
  # Succeeds when something exists at CONCOURSE_WORKER_RUN_FILE_PATH.
  test -e "${CONCOURSE_WORKER_RUN_FILE_PATH}"
}
retire_worker() {
  # Announces the retirement and runs `concourse retire-worker` for a worker.
  #   $1 - name of the worker to retire
  # Returns the exit status of the concourse command.
  local -r name="${1}"
  printf '%s\n' "retiring concourse worker ${name}"
  "${CONCOURSE_BIN_PATH}" retire-worker --name "${name}"
}
ensure_worker_is_retired() {
  # Retires the worker and waits until its pid file disappears.
  #   $1 - name of the worker to retire
  # Returns 0 once the pid file is gone. Returns 64 when the pid file is
  # missing up front, when the retire command fails, or when
  # MAX_TIMEOUT_SECONDS have elapsed since START_TIME.
  local -r worker_name="${1}"
  local now elapsed

  if ! concourse_pid_file_exists; then
    echo >&2 "pid file not found at ${CONCOURSE_WORKER_RUN_FILE_PATH}"
    echo >&2 "concourse worker does not appear to be running"
    return 64
  fi

  if ! retire_worker "${worker_name}"; then
    echo >&2 "concourse retire worker command failed"
    return 64
  fi

  # poll every 30 seconds; elapsed time is measured from script start
  while concourse_pid_file_exists; do
    now="$(date +%s)"
    elapsed=$((now - START_TIME))
    if (( elapsed >= MAX_TIMEOUT_SECONDS )); then
      echo >&2 "timeout waiting for worker to retire"
      return 64
    fi
    echo >&2 "pid file exists, waiting 30 seconds and retrying... elapsed time: $elapsed/$MAX_TIMEOUT_SECONDS seconds"
    sleep 30
  done

  echo "concourse worker has retired"
  return 0
}
# main
# =============================================================================

# every one of these must be non-empty before the worker can be retired
for required_var in \
  CONCOURSE_TSA_HOST \
  CONCOURSE_TSA_PUBLIC_KEY \
  CONCOURSE_TSA_WORKER_PRIVATE_KEY
do
  if [[ -z "${!required_var}" ]]; then
    echo >&2 "${required_var} is empty or unset"
    warn_and_exit
  fi
done

worker_name="$(get_worker_name)" || {
  echo >&2 "failed to get worker name"
  warn_and_exit
}

ensure_worker_is_retired "${worker_name}" || {
  echo >&2 "failed to ensure worker was retired"
  warn_and_exit
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Sources a target file into the environment and then execs a target command
# with any provided args.
#
#   $1   - path of the file to source
#   $2   - command to exec
#   $3.. - arguments passed through to the command

# disable history expansion so exclamation marks don't break commands
set +H

source_file="${1:-}"
[[ -n "${source_file}" ]] || {
  echo >&2 "must provide path to source file"
  exit 64
}
shift

target_command="${1:-}"
[[ -n "${target_command}" ]] || {
  echo >&2 "must provide target command"
  exit 64
}
shift

# catching the error usually doesn't work on early bash versions,
# but it is worth having here regardless
source "${source_file}" || {
  echo >&2 "failed to source file ${source_file}"
  exit 64
}

# ${@+"${@}"} expands to nothing when there are no remaining arguments
exec "${target_command}" ${@+"${@}"}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment