hectorj2f/gist:6c7d21c92238d239e5165d522fff7337

## gistfile1.txt
#!/bin/bash

# ETCD_VERSION is only used to choose the path to the etcd/etcdctl binaries;
# etcd cannot be started while it is set, so drop it before anything else runs
unset ETCD_VERSION

# ETCD and ETCDCTL hold the commands used to run etcd and etcdctl respectively
ETCD='/usr/local/bin/etcd'
ETCDCTL='/usr/local/bin/etcdctl'

# DIR is the absolute path of the directory in which this script is located
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# the two possible clustering operations: bootstrap a new etcd cluster
# ("create") or become a member of an existing one ("join")
ETCD_CLUSTERING_OPERATION_CREATE='create'
ETCD_CLUSTERING_OPERATION_JOIN='join'
# ETCD_CLUSTERING_OPERATION is filled in with one of the two values above and
# is later consumed by the start function
ETCD_CLUSTERING_OPERATION=''

# create_new_cluster bootstraps a brand new etcd cluster whose only member is
# the current task, then hands control over to start_etcd.
# Globals read: TASK_NAME, POD_INSTANCE_INDEX, FRAMEWORK_HOST,
#               ETCD_LISTEN_PEER_PORT
# Exports:      ETCD_NAME, ETCD_INITIAL_CLUSTER, ETCD_INITIAL_CLUSTER_STATE
# NOTE: in practice this function is only ever called for etcd-0-peer, and this
# only happens when installing the framework.
function create_new_cluster() {
    # give some info about the current operation. the task name is passed as
    # an argument (not spliced into the format string) so that a "%" or a
    # backslash in it cannot corrupt the message
    printf 'creating new etcd cluster with %s as the single member...\n' "${TASK_NAME}"
    # the name of this peer, as it is known to the cluster
    export ETCD_NAME="infra${POD_INSTANCE_INDEX}"
    # the initial cluster is comprised only of this peer
    export ETCD_INITIAL_CLUSTER="infra${POD_INSTANCE_INDEX}=https://${TASK_NAME}.${FRAMEWORK_HOST}:${ETCD_LISTEN_PEER_PORT}"
    # this is a brand new cluster
    export ETCD_INITIAL_CLUSTER_STATE="new"
    # start the etcd process
    start_etcd
}

# get_clustering_operation detects whether we should create a new cluster or
# join an existing one, and stores the decision in the global
# ETCD_CLUSTERING_OPERATION.
# Globals read:    POD_INSTANCE_INDEX, ETCDCTL, FRAMEWORK_VIP_HOST,
#                  ETCD_LISTEN_CLIENT_PORT, ETCD_CLUSTERING_OPERATION_CREATE,
#                  ETCD_CLUSTERING_OPERATION_JOIN
# Globals written: ETCD_CLUSTERING_OPERATION
# Returns:         0 in every case
function get_clustering_operation() {
    # give some info about the current operation
    printf "detecting whether to create a new etcd cluster or join an existing one...\n"

    # instances of etcd having indexes greater than 0 (etcd-1-peer,
    # etcd-2-peer, ...) are not allowed to create a cluster anew - they
    # should always join an existing one
    if [[ "${POD_INSTANCE_INDEX}" -gt "0" ]]; then
        printf "joining an existing etcd cluster...\n"
        ETCD_CLUSTERING_OPERATION=${ETCD_CLUSTERING_OPERATION_JOIN}
        return 0
    fi

    # the current pod instance is etcd-0-peer, and so we need to understand
    # whether we should create a new cluster (such as when the framework is
    # being installed for the first time) or join an existing one - such as when
    # the etcd-0-peer task has failed or has been replaced. in order to do that
    # we try to get the list of peers from etcd.${FRAMEWORK_VIP_HOST}. if the
    # call succeeds, we join the cluster. otherwise, we try again up to a
    # maximum of 5 attempts and after that we create a new cluster.

    # ETCD_MEMBER_COUNT_RETRIES counts the failed "etcdctl member list" calls
    local ETCD_MEMBER_COUNT_RETRIES=0
    # ETCD_MEMBER_COUNT_MAX_RETRIES is the number of failed calls after which
    # we give up and create a new cluster
    local ETCD_MEMBER_COUNT_MAX_RETRIES=5
    # ETCD_MEMBER_LIST holds the output of "etcdctl member list". it is only
    # used for logging, so it is kept local to this function
    local ETCD_MEMBER_LIST

    # "etcdctl member list" may fail for a myriad of reasons (e.g., networking)
    # even though a cluster is already present, hence the retries
    until (( ETCD_MEMBER_COUNT_RETRIES >= ETCD_MEMBER_COUNT_MAX_RETRIES )); do
        # the status of the assignment is the status of etcdctl itself
        if ETCD_MEMBER_LIST=$(
            ${ETCDCTL} \
                --cert=etcd-crt.pem \
                --key=etcd-key.pem \
                --cacert=ca-crt.pem \
                "--endpoints=https://etcd.${FRAMEWORK_VIP_HOST}:${ETCD_LISTEN_CLIENT_PORT}" \
                member list
        ); then
            # "etcdctl member list" succeeded, which means that at least one
            # etcd peer is active and that we should join an existing cluster.
            # the list is printed through %s so it is never used as a format
            printf 'joining an existing etcd cluster...\n%s\n' "${ETCD_MEMBER_LIST}"
            ETCD_CLUSTERING_OPERATION=${ETCD_CLUSTERING_OPERATION_JOIN}
            return 0
        fi

        # give some info about the current operation
        printf "failed to list members...\n"
        ETCD_MEMBER_COUNT_RETRIES=$((ETCD_MEMBER_COUNT_RETRIES + 1))

        # if ETCD_MEMBER_COUNT_MAX_RETRIES hasn't been reached yet, we should
        # sleep until the next attempt
        if (( ETCD_MEMBER_COUNT_RETRIES < ETCD_MEMBER_COUNT_MAX_RETRIES )); then
            printf "sleeping...\n"
            # the counter has already been incremented at this point, so the
            # waits are 2s -> 4s -> 8s -> 16s
            sleep $((2 ** ETCD_MEMBER_COUNT_RETRIES))
            continue
        fi

        # at this point "etcdctl member list" has failed every time, which
        # means (most probably) that there are no active etcd peers and that
        # we should create a new cluster
        printf "failed to list members. creating a new etcd cluster...\n"
        ETCD_CLUSTERING_OPERATION=${ETCD_CLUSTERING_OPERATION_CREATE}
    done
}

# health_check terminates the script with the health status of the current
# peer: exit code 0 when exactly one line of "etcdctl endpoint health" output
# contains "is healthy", exit code 1 in every other case.
# Globals read: ETCDCTL, POD_INSTANCE_INDEX, FRAMEWORK_HOST,
#               ETCD_LISTEN_CLIENT_PORT
function health_check()
{
    # number of output lines reporting the endpoint as healthy
    local healthy_line_count
    healthy_line_count=$(${ETCDCTL} \
        --cert=etcd-crt.pem \
        --key=etcd-key.pem \
        --cacert=ca-crt.pem \
        --endpoints=https://etcd-${POD_INSTANCE_INDEX}-peer.${FRAMEWORK_HOST}:${ETCD_LISTEN_CLIENT_PORT} \
        endpoint health | grep -c "is healthy")

    # only a single matching line counts as healthy
    case "${healthy_line_count}" in
        1) exit 0 ;;
        *) exit 1 ;;
    esac
}

# join_existing_cluster makes the current peer a member of an already running
# cluster and then starts etcd. when a previous data-dir exists the peer is
# assumed to be a member already and etcd is started right away; otherwise
# "etcdctl member add" is run first and the ETCD_* settings it prints are
# exported for etcd to pick up.
# Globals read: DIR, ETCD_DATA_DIR, ETCDCTL, POD_INSTANCE_INDEX,
#               FRAMEWORK_HOST, FRAMEWORK_VIP_HOST, ETCD_LISTEN_CLIENT_PORT,
#               ETCD_LISTEN_PEER_PORT
# Exits with 1 when "etcdctl member add" fails.
function join_existing_cluster()
{
    # check if a previous data-dir exists, in which case we don't need to run
    # "etcdctl member add" and can just start etcd
    if [[ -d "${DIR}/${ETCD_DATA_DIR}/member" ]]; then
        printf "data-dir is not empty. starting etcd...\n"
        start_etcd
        return 0
    fi

    # at this point no data-dir exists, so we must run "etcdctl member add"
    # before starting etcd
    printf "data-dir is empty. running \"etcdctl member add\"...\n"

    # wait for the vip to resolve
    wait_for_dns_to_resolve "etcd.${FRAMEWORK_VIP_HOST}"

    # add the new member, storing the output
    local ETCD_MEMBER_ADD_OUTPUT
    if ! ETCD_MEMBER_ADD_OUTPUT=$(${ETCDCTL} \
        --cert=etcd-crt.pem \
        --key=etcd-key.pem \
        --cacert=ca-crt.pem \
        "--endpoints=https://etcd.${FRAMEWORK_VIP_HOST}:${ETCD_LISTEN_CLIENT_PORT}" \
        member add \
        "--peer-urls=https://etcd-${POD_INSTANCE_INDEX}-peer.${FRAMEWORK_HOST}:${ETCD_LISTEN_PEER_PORT}" \
        "infra${POD_INSTANCE_INDEX}"); then
        # the call to "etcdctl member add" failed, and so we should fail the
        # task and wait for it to be relaunched
        printf 'failed to add member etcd-%s-peer to cluster...\n%s\n' \
            "${POD_INSTANCE_INDEX}" "${ETCD_MEMBER_ADD_OUTPUT}"
        exit 1
    fi

    # export every ETCD_NAME="value" style line of the output (this covers
    # ETCD_NAME, ETCD_INITIAL_CLUSTER and ETCD_INITIAL_CLUSTER_STATE). the
    # lines are parsed instead of being passed to eval, so that nothing in the
    # output is ever executed as shell code
    local -r assignment_re='^[[:space:]]*(ETCD_[A-Za-z0-9_]+)=(.*)$'
    local line var_name var_value
    while IFS= read -r line; do
        # tolerate CRLF line endings
        line="${line%$'\r'}"
        [[ "${line}" =~ ${assignment_re} ]] || continue
        var_name="${BASH_REMATCH[1]}"
        var_value="${BASH_REMATCH[2]}"
        # drop one pair of surrounding double quotes, if present
        if [[ "${var_value}" == \"*\" ]]; then
            var_value="${var_value:1:${#var_value}-2}"
        fi
        export "${var_name}=${var_value}"
    done <<< "${ETCD_MEMBER_ADD_OUTPUT}"

    # start etcd
    start_etcd
}

# readiness_check performs a readiness check on the current peer (i.e. checks
# whether the framework scheduler can proceed with the next task). the peer is
# ready when "etcdctl member list" succeeds against it, lists at least one
# member, and "etcdctl endpoint health --cluster" reports as many healthy
# endpoints as there are members.
# Globals read: ETCDCTL, POD_INSTANCE_INDEX, FRAMEWORK_HOST,
#               ETCD_LISTEN_CLIENT_PORT
# Exits with 0 when the peer is ready and with 1 otherwise.
function readiness_check()
{
    # the client endpoint of the current peer
    local ETCD_ENDPOINT="https://etcd-${POD_INSTANCE_INDEX}-peer.${FRAMEWORK_HOST}:${ETCD_LISTEN_CLIENT_PORT}"

    # grab a list of current cluster members
    local ETCD_MEMBER_LIST
    if ! ETCD_MEMBER_LIST=$(${ETCDCTL} \
        --cert=etcd-crt.pem \
        --key=etcd-key.pem \
        --cacert=ca-crt.pem \
        "--endpoints=${ETCD_ENDPOINT}" member list); then
        # at this point the current peer is not ready
        printf "readiness_check: failed to list the members of the cluster...\n"
        exit 1
    fi

    # count the number of current cluster members. only non-empty lines are
    # counted, so an empty member list yields 0 rather than 1
    local ETCD_CLUSTER_MEMBER_COUNT
    ETCD_CLUSTER_MEMBER_COUNT=$(grep -c . <<< "${ETCD_MEMBER_LIST}")

    # count the number of current cluster members that are healthy
    local ETCD_HEALTHY_MEMBER_COUNT
    ETCD_HEALTHY_MEMBER_COUNT=$(${ETCDCTL} \
        --cert=etcd-crt.pem \
        --key=etcd-key.pem \
        --cacert=ca-crt.pem \
        "--endpoints=${ETCD_ENDPOINT}" \
        endpoint health --cluster | grep -c " is healthy")

    # if there are members and all of them are healthy, the check passes
    if (( ETCD_CLUSTER_MEMBER_COUNT > 0 && ETCD_HEALTHY_MEMBER_COUNT == ETCD_CLUSTER_MEMBER_COUNT )); then
        printf "readiness_check: ok...\n"
        exit 0
    fi

    # otherwise it fails, reporting both counts
    printf "readiness_check: failed...\nETCD_CLUSTER_MEMBER_COUNT: %s\nETCD_HEALTHY_MEMBER_COUNT: %s\n" \
        "${ETCD_CLUSTER_MEMBER_COUNT}" "${ETCD_HEALTHY_MEMBER_COUNT}"
    exit 1
}

# recover prepares the current peer for being re-added to the cluster: it
# wipes data-dir/member (relative to the working directory) and, when the
# cluster's member list still contains an entry matching
# etcd-${POD_INSTANCE_INDEX}-peer, removes that member.
# Globals read: ETCDCTL, POD_INSTANCE_INDEX, FRAMEWORK_VIP_HOST,
#               ETCD_LISTEN_CLIENT_PORT
# Exits with 1 when listing the members or removing the member fails.
function recover()
{
    # give some info about the current operation
    printf "recovering etcd-${POD_INSTANCE_INDEX}-peer...\n"

    # the vip must resolve before the cluster can be queried
    wait_for_dns_to_resolve "etcd.${FRAMEWORK_VIP_HOST}"

    # fetch the list of members in the cluster. if that fails, fail the task
    # and wait for it to be relaunched
    local members
    if ! members=$(
        ${ETCDCTL} \
        --cert=etcd-crt.pem \
        --key=etcd-key.pem \
        --cacert=ca-crt.pem \
        --endpoints=https://etcd.${FRAMEWORK_VIP_HOST}:${ETCD_LISTEN_CLIENT_PORT} member list); then
        printf "failed to list the members of the cluster\n"
        exit 1
    fi

    # wipe any pre-existing data
    rm -rf data-dir/member

    # the first comma-separated field of the matching line is the id of the
    # previous instance of this peer
    local ETCD_DEAD_PEER_ID
    ETCD_DEAD_PEER_ID=$(echo "${members}" | awk -F "," "/etcd-${POD_INSTANCE_INDEX}-peer/ { print \$1 }")

    # the current peer is not known by the cluster, so we're good
    if [[ -z "${ETCD_DEAD_PEER_ID}" ]]; then
        return 0
    fi

    # the current peer is already known by the cluster, so we must remove it.
    # a failed removal fails the task, which is then relaunched
    printf "removing etcd-${POD_INSTANCE_INDEX}-peer (id ${ETCD_DEAD_PEER_ID})...\n"
    ${ETCDCTL} \
        --cert=etcd-crt.pem \
        --key=etcd-key.pem \
        --cacert=ca-crt.pem \
        --endpoints https://etcd.${FRAMEWORK_VIP_HOST}:${ETCD_LISTEN_CLIENT_PORT} member remove ${ETCD_DEAD_PEER_ID} \
        || exit 1
}

# start brings the current peer up: it waits for the peer's own name to
# resolve, asks get_clustering_operation what to do, and then either creates a
# new cluster or joins an existing one. any other value of
# ETCD_CLUSTERING_OPERATION results in nothing being started.
function start()
{
    # etcd crashing before it reaches the "started" state later originates
    # cluster id mismatch errors, so make sure the name of the current task
    # resolves before going any further
    wait_for_dns_to_resolve "etcd-${POD_INSTANCE_INDEX}-peer.${FRAMEWORK_HOST}"

    # populate ETCD_CLUSTERING_OPERATION
    get_clustering_operation

    # act accordingly. the right-hand sides are deliberately left unquoted so
    # that they keep being treated as patterns
    if [[ "${ETCD_CLUSTERING_OPERATION}" == ${ETCD_CLUSTERING_OPERATION_CREATE} ]]; then
        create_new_cluster
    elif [[ "${ETCD_CLUSTERING_OPERATION}" == ${ETCD_CLUSTERING_OPERATION_JOIN} ]]; then
        join_existing_cluster
    fi
}

# start_etcd replaces the current script with the etcd process (via exec).
# Globals read: ETCD, POD_INSTANCE_INDEX, FRAMEWORK_HOST, MESOS_CONTAINER_IP,
#               ETCD_LISTEN_PEER_PORT, ETCD_LISTEN_CLIENT_PORT,
#               ETCD_QUOTA_BACKEND_BYTES, ETCD_ELECTION_TIMEOUT,
#               ETCD_HEARTBEAT_INTERVAL
function start_etcd()
{
    # give some info about the current operation
    printf "starting etcd...\n"

    # the command line is collected in an array first; the peer and client
    # certificates/keys are the same files, and the listen urls use the
    # container ip while the advertised urls use the peer's name
    local -a etcd_args=(
        --name=infra${POD_INSTANCE_INDEX}
        --cert-file=etcd-crt.pem
        --key-file=etcd-key.pem
        --client-cert-auth
        --trusted-ca-file=ca-crt.pem
        --peer-cert-file=etcd-crt.pem
        --peer-key-file=etcd-key.pem
        --peer-trusted-ca-file=ca-crt.pem
        --listen-peer-urls=https://${MESOS_CONTAINER_IP}:${ETCD_LISTEN_PEER_PORT}
        --initial-advertise-peer-urls=https://etcd-${POD_INSTANCE_INDEX}-peer.${FRAMEWORK_HOST}:${ETCD_LISTEN_PEER_PORT}
        --listen-client-urls=https://${MESOS_CONTAINER_IP}:${ETCD_LISTEN_CLIENT_PORT}
        --advertise-client-urls=https://etcd-${POD_INSTANCE_INDEX}-peer.${FRAMEWORK_HOST}:${ETCD_LISTEN_CLIENT_PORT}
        --log-output=stdout
        --quota-backend-bytes=${ETCD_QUOTA_BACKEND_BYTES}
        --election-timeout=${ETCD_ELECTION_TIMEOUT}
        --heartbeat-interval=${ETCD_HEARTBEAT_INTERVAL}
        --enable-v2=false
    )

    # hand the process over to etcd
    exec $ETCD "${etcd_args[@]}"
}

# wait_for_dns_to_resolve blocks until the name passed as the first parameter
# resolves (as reported by "host"). the lookup is attempted up to 5 times,
# sleeping 2s, 4s, 8s and 16s between consecutive attempts. when the last
# attempt fails too, the whole script exits with 1.
# Arguments: $1 - the name to resolve
# Returns:   0 as soon as the name resolves
function wait_for_dns_to_resolve()
{
    # give some info about the current operation
    printf "waiting for \"${1}\" to resolve...\n"

    # DNS_NAME is the name we are waiting for
    local DNS_NAME="${1}"
    # DNS_RESOLVE_MAX_RETRIES is the number of failed lookups after which we
    # give up
    local DNS_RESOLVE_MAX_RETRIES=5
    # failed_lookups counts the lookups that have failed so far
    local failed_lookups=0

    until (( failed_lookups >= DNS_RESOLVE_MAX_RETRIES )); do
        # a successful lookup ends the wait
        if host "${DNS_NAME}"; then
            printf "${DNS_NAME} successfully resolved...\n"
            return 0
        fi

        # the lookup failed: account for it...
        printf "${DNS_NAME} doesn't resolve yet...\n"
        failed_lookups=$((failed_lookups + 1))

        # ... and back off before the next one, unless this was the last
        if (( failed_lookups < DNS_RESOLVE_MAX_RETRIES )); then
            printf "sleeping...\n"
            sleep $((2**failed_lookups))
        fi
    done

    # dns resolution has failed every time, so the task must exit
    echo "${DNS_NAME} didn't resolve after $((DNS_RESOLVE_MAX_RETRIES)) retries..."
    exit 1
}

# entry point: the first argument selects the command to run. anything other
# than the four known commands makes the script exit with 1
case "$1" in
    "start") start ;;
    "recover") recover ;;
    "health-check") health_check ;;
    "readiness-check") readiness_check ;;
    *) exit 1 ;;
esac
	#!/bin/bash

	# ETCD_VERSION is only used to choose the path to the etcd/etcdctl binaries;
	# etcd cannot be started while it is set, so drop it before anything else runs
	unset ETCD_VERSION

	# ETCD and ETCDCTL hold the commands used to run etcd and etcdctl respectively
	ETCD='/usr/local/bin/etcd'
	ETCDCTL='/usr/local/bin/etcdctl'

	# DIR is the absolute path of the directory in which this script is located
	DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

	# the two possible clustering operations: bootstrap a new etcd cluster
	# ("create") or become a member of an existing one ("join")
	ETCD_CLUSTERING_OPERATION_CREATE='create'
	ETCD_CLUSTERING_OPERATION_JOIN='join'
	# ETCD_CLUSTERING_OPERATION is filled in with one of the two values above and
	# is later consumed by the start function
	ETCD_CLUSTERING_OPERATION=''

	# create_new_cluster bootstraps a brand new etcd cluster whose only member is
	# the current task, then hands control over to start_etcd.
	# Globals read: TASK_NAME, POD_INSTANCE_INDEX, FRAMEWORK_HOST,
	#               ETCD_LISTEN_PEER_PORT
	# Exports:      ETCD_NAME, ETCD_INITIAL_CLUSTER, ETCD_INITIAL_CLUSTER_STATE
	# NOTE: in practice this function is only ever called for etcd-0-peer, and this
	# only happens when installing the framework.
	function create_new_cluster() {
	    # give some info about the current operation. the task name is passed as
	    # an argument (not spliced into the format string) so that a "%" or a
	    # backslash in it cannot corrupt the message
	    printf 'creating new etcd cluster with %s as the single member...\n' "${TASK_NAME}"
	    # the name of this peer, as it is known to the cluster
	    export ETCD_NAME="infra${POD_INSTANCE_INDEX}"
	    # the initial cluster is comprised only of this peer
	    export ETCD_INITIAL_CLUSTER="infra${POD_INSTANCE_INDEX}=https://${TASK_NAME}.${FRAMEWORK_HOST}:${ETCD_LISTEN_PEER_PORT}"
	    # this is a brand new cluster
	    export ETCD_INITIAL_CLUSTER_STATE="new"
	    # start the etcd process
	    start_etcd
	}

	# get_clustering_operation detects whether we should create a new cluster or
	# join an existing one, and stores the decision in the global
	# ETCD_CLUSTERING_OPERATION.
	# Globals read:    POD_INSTANCE_INDEX, ETCDCTL, FRAMEWORK_VIP_HOST,
	#                  ETCD_LISTEN_CLIENT_PORT, ETCD_CLUSTERING_OPERATION_CREATE,
	#                  ETCD_CLUSTERING_OPERATION_JOIN
	# Globals written: ETCD_CLUSTERING_OPERATION
	# Returns:         0 in every case
	function get_clustering_operation() {
	    # give some info about the current operation
	    printf "detecting whether to create a new etcd cluster or join an existing one...\n"

	    # instances of etcd having indexes greater than 0 (etcd-1-peer,
	    # etcd-2-peer, ...) are not allowed to create a cluster anew - they
	    # should always join an existing one
	    if [[ "${POD_INSTANCE_INDEX}" -gt "0" ]]; then
	        printf "joining an existing etcd cluster...\n"
	        ETCD_CLUSTERING_OPERATION=${ETCD_CLUSTERING_OPERATION_JOIN}
	        return 0
	    fi

	    # the current pod instance is etcd-0-peer, and so we need to understand
	    # whether we should create a new cluster (such as when the framework is
	    # being installed for the first time) or join an existing one - such as when
	    # the etcd-0-peer task has failed or has been replaced. in order to do that
	    # we try to get the list of peers from etcd.${FRAMEWORK_VIP_HOST}. if the
	    # call succeeds, we join the cluster. otherwise, we try again up to a
	    # maximum of 5 attempts and after that we create a new cluster.

	    # ETCD_MEMBER_COUNT_RETRIES counts the failed "etcdctl member list" calls
	    local ETCD_MEMBER_COUNT_RETRIES=0
	    # ETCD_MEMBER_COUNT_MAX_RETRIES is the number of failed calls after which
	    # we give up and create a new cluster
	    local ETCD_MEMBER_COUNT_MAX_RETRIES=5
	    # ETCD_MEMBER_LIST holds the output of "etcdctl member list". it is only
	    # used for logging, so it is kept local to this function
	    local ETCD_MEMBER_LIST

	    # "etcdctl member list" may fail for a myriad of reasons (e.g., networking)
	    # even though a cluster is already present, hence the retries
	    until (( ETCD_MEMBER_COUNT_RETRIES >= ETCD_MEMBER_COUNT_MAX_RETRIES )); do
	        # the status of the assignment is the status of etcdctl itself
	        if ETCD_MEMBER_LIST=$(
	            ${ETCDCTL} \
	                --cert=etcd-crt.pem \
	                --key=etcd-key.pem \
	                --cacert=ca-crt.pem \
	                "--endpoints=https://etcd.${FRAMEWORK_VIP_HOST}:${ETCD_LISTEN_CLIENT_PORT}" \
	                member list
	        ); then
	            # "etcdctl member list" succeeded, which means that at least one
	            # etcd peer is active and that we should join an existing cluster.
	            # the list is printed through %s so it is never used as a format
	            printf 'joining an existing etcd cluster...\n%s\n' "${ETCD_MEMBER_LIST}"
	            ETCD_CLUSTERING_OPERATION=${ETCD_CLUSTERING_OPERATION_JOIN}
	            return 0
	        fi

	        # give some info about the current operation
	        printf "failed to list members...\n"
	        ETCD_MEMBER_COUNT_RETRIES=$((ETCD_MEMBER_COUNT_RETRIES + 1))

	        # if ETCD_MEMBER_COUNT_MAX_RETRIES hasn't been reached yet, we should
	        # sleep until the next attempt
	        if (( ETCD_MEMBER_COUNT_RETRIES < ETCD_MEMBER_COUNT_MAX_RETRIES )); then
	            printf "sleeping...\n"
	            # the counter has already been incremented at this point, so the
	            # waits are 2s -> 4s -> 8s -> 16s
	            sleep $((2 ** ETCD_MEMBER_COUNT_RETRIES))
	            continue
	        fi

	        # at this point "etcdctl member list" has failed every time, which
	        # means (most probably) that there are no active etcd peers and that
	        # we should create a new cluster
	        printf "failed to list members. creating a new etcd cluster...\n"
	        ETCD_CLUSTERING_OPERATION=${ETCD_CLUSTERING_OPERATION_CREATE}
	    done
	}

	# health_check terminates the script with the health status of the current
	# peer: exit code 0 when exactly one line of "etcdctl endpoint health" output
	# contains "is healthy", exit code 1 in every other case.
	# Globals read: ETCDCTL, POD_INSTANCE_INDEX, FRAMEWORK_HOST,
	#               ETCD_LISTEN_CLIENT_PORT
	function health_check()
	{
	    # number of output lines reporting the endpoint as healthy. the output of
	    # etcdctl must be piped into grep (an escaped "\|" would instead be handed
	    # to etcdctl as a literal argument and the count would never be produced)
	    local healthy_line_count
	    healthy_line_count=$(${ETCDCTL} \
	        --cert=etcd-crt.pem \
	        --key=etcd-key.pem \
	        --cacert=ca-crt.pem \
	        --endpoints=https://etcd-${POD_INSTANCE_INDEX}-peer.${FRAMEWORK_HOST}:${ETCD_LISTEN_CLIENT_PORT} \
	        endpoint health | grep -c "is healthy")

	    # only a single matching line counts as healthy
	    case "${healthy_line_count}" in
	        1) exit 0 ;;
	        *) exit 1 ;;
	    esac
	}

	# join_existing_cluster makes the current peer a member of an already running
	# cluster and then starts etcd. when a previous data-dir exists the peer is
	# assumed to be a member already and etcd is started right away; otherwise
	# "etcdctl member add" is run first and the ETCD_* settings it prints are
	# exported for etcd to pick up.
	# Globals read: DIR, ETCD_DATA_DIR, ETCDCTL, POD_INSTANCE_INDEX,
	#               FRAMEWORK_HOST, FRAMEWORK_VIP_HOST, ETCD_LISTEN_CLIENT_PORT,
	#               ETCD_LISTEN_PEER_PORT
	# Exits with 1 when "etcdctl member add" fails.
	function join_existing_cluster()
	{
	    # check if a previous data-dir exists, in which case we don't need to run
	    # "etcdctl member add" and can just start etcd
	    if [[ -d "${DIR}/${ETCD_DATA_DIR}/member" ]]; then
	        printf "data-dir is not empty. starting etcd...\n"
	        start_etcd
	        return 0
	    fi

	    # at this point no data-dir exists, so we must run "etcdctl member add"
	    # before starting etcd
	    printf "data-dir is empty. running \"etcdctl member add\"...\n"

	    # wait for the vip to resolve
	    wait_for_dns_to_resolve "etcd.${FRAMEWORK_VIP_HOST}"

	    # add the new member, storing the output
	    local ETCD_MEMBER_ADD_OUTPUT
	    if ! ETCD_MEMBER_ADD_OUTPUT=$(${ETCDCTL} \
	        --cert=etcd-crt.pem \
	        --key=etcd-key.pem \
	        --cacert=ca-crt.pem \
	        "--endpoints=https://etcd.${FRAMEWORK_VIP_HOST}:${ETCD_LISTEN_CLIENT_PORT}" \
	        member add \
	        "--peer-urls=https://etcd-${POD_INSTANCE_INDEX}-peer.${FRAMEWORK_HOST}:${ETCD_LISTEN_PEER_PORT}" \
	        "infra${POD_INSTANCE_INDEX}"); then
	        # the call to "etcdctl member add" failed, and so we should fail the
	        # task and wait for it to be relaunched
	        printf 'failed to add member etcd-%s-peer to cluster...\n%s\n' \
	            "${POD_INSTANCE_INDEX}" "${ETCD_MEMBER_ADD_OUTPUT}"
	        exit 1
	    fi

	    # export every ETCD_NAME="value" style line of the output (this covers
	    # ETCD_NAME, ETCD_INITIAL_CLUSTER and ETCD_INITIAL_CLUSTER_STATE). the
	    # lines are parsed in the current shell instead of being passed to eval:
	    # nothing in the output is ever executed as shell code, and the exports
	    # cannot end up in a pipeline subshell where they would be lost
	    local -r assignment_re='^[[:space:]]*(ETCD_[A-Za-z0-9_]+)=(.*)$'
	    local line var_name var_value
	    while IFS= read -r line; do
	        # tolerate CRLF line endings
	        line="${line%$'\r'}"
	        [[ "${line}" =~ ${assignment_re} ]] || continue
	        var_name="${BASH_REMATCH[1]}"
	        var_value="${BASH_REMATCH[2]}"
	        # drop one pair of surrounding double quotes, if present
	        if [[ "${var_value}" == \"*\" ]]; then
	            var_value="${var_value:1:${#var_value}-2}"
	        fi
	        export "${var_name}=${var_value}"
	    done <<< "${ETCD_MEMBER_ADD_OUTPUT}"

	    # start etcd
	    start_etcd
	}

	# readiness_check performs a readiness check on the current peer (i.e. checks
	# whether the framework scheduler can proceed with the next task). the peer is
	# ready when "etcdctl member list" succeeds against it, lists at least one
	# member, and "etcdctl endpoint health --cluster" reports as many healthy
	# endpoints as there are members.
	# Globals read: ETCDCTL, POD_INSTANCE_INDEX, FRAMEWORK_HOST,
	#               ETCD_LISTEN_CLIENT_PORT
	# Exits with 0 when the peer is ready and with 1 otherwise.
	function readiness_check()
	{
	    # the client endpoint of the current peer
	    local ETCD_ENDPOINT="https://etcd-${POD_INSTANCE_INDEX}-peer.${FRAMEWORK_HOST}:${ETCD_LISTEN_CLIENT_PORT}"

	    # grab a list of current cluster members
	    local ETCD_MEMBER_LIST
	    if ! ETCD_MEMBER_LIST=$(${ETCDCTL} \
	        --cert=etcd-crt.pem \
	        --key=etcd-key.pem \
	        --cacert=ca-crt.pem \
	        "--endpoints=${ETCD_ENDPOINT}" member list); then
	        # at this point the current peer is not ready
	        printf "readiness_check: failed to list the members of the cluster...\n"
	        exit 1
	    fi

	    # count the number of current cluster members. only non-empty lines are
	    # counted, so an empty member list yields 0 rather than 1
	    local ETCD_CLUSTER_MEMBER_COUNT
	    ETCD_CLUSTER_MEMBER_COUNT=$(grep -c . <<< "${ETCD_MEMBER_LIST}")

	    # count the number of current cluster members that are healthy. the
	    # output of etcdctl must really be piped into grep for the count to be
	    # a number
	    local ETCD_HEALTHY_MEMBER_COUNT
	    ETCD_HEALTHY_MEMBER_COUNT=$(${ETCDCTL} \
	        --cert=etcd-crt.pem \
	        --key=etcd-key.pem \
	        --cacert=ca-crt.pem \
	        "--endpoints=${ETCD_ENDPOINT}" \
	        endpoint health --cluster | grep -c " is healthy")

	    # if there are members and all of them are healthy, the check passes
	    if (( ETCD_CLUSTER_MEMBER_COUNT > 0 && ETCD_HEALTHY_MEMBER_COUNT == ETCD_CLUSTER_MEMBER_COUNT )); then
	        printf "readiness_check: ok...\n"
	        exit 0
	    fi

	    # otherwise it fails, reporting both counts
	    printf "readiness_check: failed...\nETCD_CLUSTER_MEMBER_COUNT: %s\nETCD_HEALTHY_MEMBER_COUNT: %s\n" \
	        "${ETCD_CLUSTER_MEMBER_COUNT}" "${ETCD_HEALTHY_MEMBER_COUNT}"
	    exit 1
	}

	# recover prepares the current peer for being re-added to the cluster: it
	# wipes data-dir/member (relative to the working directory) and, when the
	# cluster's member list still contains an entry matching
	# etcd-${POD_INSTANCE_INDEX}-peer, removes that member.
	# Globals read: ETCDCTL, POD_INSTANCE_INDEX, FRAMEWORK_VIP_HOST,
	#               ETCD_LISTEN_CLIENT_PORT
	# Exits with 1 when listing the members or removing the member fails.
	function recover()
	{
	    # give some info about the current operation
	    printf "recovering etcd-${POD_INSTANCE_INDEX}-peer...\n"

	    # the vip must resolve before the cluster can be queried
	    wait_for_dns_to_resolve "etcd.${FRAMEWORK_VIP_HOST}"

	    # fetch the list of members in the cluster. if that fails, fail the task
	    # and wait for it to be relaunched
	    local members
	    if ! members=$(
	        ${ETCDCTL} \
	        --cert=etcd-crt.pem \
	        --key=etcd-key.pem \
	        --cacert=ca-crt.pem \
	        --endpoints=https://etcd.${FRAMEWORK_VIP_HOST}:${ETCD_LISTEN_CLIENT_PORT} member list); then
	        printf "failed to list the members of the cluster\n"
	        exit 1
	    fi

	    # wipe any pre-existing data
	    rm -rf data-dir/member

	    # the first comma-separated field of the matching line is the id of the
	    # previous instance of this peer. the member list must really be piped
	    # into awk: with an escaped "\|" echo would print the awk program as
	    # text and the "id" would be garbage
	    local ETCD_DEAD_PEER_ID
	    ETCD_DEAD_PEER_ID=$(echo "${members}" | awk -F "," "/etcd-${POD_INSTANCE_INDEX}-peer/ { print \$1 }")

	    # the current peer is not known by the cluster, so we're good
	    if [[ -z "${ETCD_DEAD_PEER_ID}" ]]; then
	        return 0
	    fi

	    # the current peer is already known by the cluster, so we must remove it.
	    # a failed removal fails the task, which is then relaunched
	    printf "removing etcd-${POD_INSTANCE_INDEX}-peer (id ${ETCD_DEAD_PEER_ID})...\n"
	    ${ETCDCTL} \
	        --cert=etcd-crt.pem \
	        --key=etcd-key.pem \
	        --cacert=ca-crt.pem \
	        --endpoints https://etcd.${FRAMEWORK_VIP_HOST}:${ETCD_LISTEN_CLIENT_PORT} member remove ${ETCD_DEAD_PEER_ID} \
	        || exit 1
	}

	# start brings the current peer up: it waits for the peer's own name to
	# resolve, asks get_clustering_operation what to do, and then either creates a
	# new cluster or joins an existing one. any other value of
	# ETCD_CLUSTERING_OPERATION results in nothing being started.
	function start()
	{
	    # etcd crashing before it reaches the "started" state later originates
	    # cluster id mismatch errors, so make sure the name of the current task
	    # resolves before going any further
	    wait_for_dns_to_resolve "etcd-${POD_INSTANCE_INDEX}-peer.${FRAMEWORK_HOST}"

	    # populate ETCD_CLUSTERING_OPERATION
	    get_clustering_operation

	    # act accordingly. the right-hand sides are deliberately left unquoted so
	    # that they keep being treated as patterns
	    if [[ "${ETCD_CLUSTERING_OPERATION}" == ${ETCD_CLUSTERING_OPERATION_CREATE} ]]; then
	        create_new_cluster
	    elif [[ "${ETCD_CLUSTERING_OPERATION}" == ${ETCD_CLUSTERING_OPERATION_JOIN} ]]; then
	        join_existing_cluster
	    fi
	}

	# start_etcd replaces the current script with the etcd process (via exec).
	# Globals read: ETCD, POD_INSTANCE_INDEX, FRAMEWORK_HOST, MESOS_CONTAINER_IP,
	#               ETCD_LISTEN_PEER_PORT, ETCD_LISTEN_CLIENT_PORT,
	#               ETCD_QUOTA_BACKEND_BYTES, ETCD_ELECTION_TIMEOUT,
	#               ETCD_HEARTBEAT_INTERVAL
	function start_etcd()
	{
	    # give some info about the current operation
	    printf "starting etcd...\n"

	    # the command line is collected in an array first; the peer and client
	    # certificates/keys are the same files, and the listen urls use the
	    # container ip while the advertised urls use the peer's name
	    local -a etcd_args=(
	        --name=infra${POD_INSTANCE_INDEX}
	        --cert-file=etcd-crt.pem
	        --key-file=etcd-key.pem
	        --client-cert-auth
	        --trusted-ca-file=ca-crt.pem
	        --peer-cert-file=etcd-crt.pem
	        --peer-key-file=etcd-key.pem
	        --peer-trusted-ca-file=ca-crt.pem
	        --listen-peer-urls=https://${MESOS_CONTAINER_IP}:${ETCD_LISTEN_PEER_PORT}
	        --initial-advertise-peer-urls=https://etcd-${POD_INSTANCE_INDEX}-peer.${FRAMEWORK_HOST}:${ETCD_LISTEN_PEER_PORT}
	        --listen-client-urls=https://${MESOS_CONTAINER_IP}:${ETCD_LISTEN_CLIENT_PORT}
	        --advertise-client-urls=https://etcd-${POD_INSTANCE_INDEX}-peer.${FRAMEWORK_HOST}:${ETCD_LISTEN_CLIENT_PORT}
	        --log-output=stdout
	        --quota-backend-bytes=${ETCD_QUOTA_BACKEND_BYTES}
	        --election-timeout=${ETCD_ELECTION_TIMEOUT}
	        --heartbeat-interval=${ETCD_HEARTBEAT_INTERVAL}
	        --enable-v2=false
	    )

	    # hand the process over to etcd
	    exec $ETCD "${etcd_args[@]}"
	}

	# wait_for_dns_to_resolve blocks until the name passed as the first parameter
	# resolves (as reported by "host"). the lookup is attempted up to 5 times,
	# sleeping 2s, 4s, 8s and 16s between consecutive attempts. when the last
	# attempt fails too, the whole script exits with 1.
	# Arguments: $1 - the name to resolve
	# Returns:   0 as soon as the name resolves
	function wait_for_dns_to_resolve()
	{
	    # give some info about the current operation
	    printf "waiting for \"${1}\" to resolve...\n"

	    # DNS_NAME is the name we are waiting for
	    local DNS_NAME="${1}"
	    # DNS_RESOLVE_MAX_RETRIES is the number of failed lookups after which we
	    # give up
	    local DNS_RESOLVE_MAX_RETRIES=5
	    # failed_lookups counts the lookups that have failed so far
	    local failed_lookups=0

	    until (( failed_lookups >= DNS_RESOLVE_MAX_RETRIES )); do
	        # a successful lookup ends the wait
	        if host "${DNS_NAME}"; then
	            printf "${DNS_NAME} successfully resolved...\n"
	            return 0
	        fi

	        # the lookup failed: account for it...
	        printf "${DNS_NAME} doesn't resolve yet...\n"
	        failed_lookups=$((failed_lookups + 1))

	        # ... and back off before the next one, unless this was the last
	        if (( failed_lookups < DNS_RESOLVE_MAX_RETRIES )); then
	            printf "sleeping...\n"
	            sleep $((2**failed_lookups))
	        fi
	    done

	    # dns resolution has failed every time, so the task must exit
	    echo "${DNS_NAME} didn't resolve after $((DNS_RESOLVE_MAX_RETRIES)) retries..."
	    exit 1
	}

	# entry point: the first argument selects the command to run. anything other
	# than the four known commands makes the script exit with 1
	case "$1" in
	    "start") start ;;
	    "recover") recover ;;
	    "health-check") health_check ;;
	    "readiness-check") readiness_check ;;
	    *) exit 1 ;;
	esac