NOTICE: A successor of this script now exists as `etcdadm` in kube-aws (https://github.com/kubernetes-incubator/kube-aws/).
#!/bin/bash | |
ETCD_WORK_DIR=${ETCD_WORK_DIR:-$(pwd)/work} | |
# Usage: | |
# AWS_DEFAULT_REGION=ap-northeast-1 \ | |
# AWS_ACCESS_KEY_ID=... \ | |
# AWS_SECRET_ACCESS_KEY=... \ | |
# ETCD_VERSION=3.1.2 \ | |
# ETCD_SYSTEMD_UNIT_NAME=etcd-member \ | |
# SNAPSHOTS_S3_URI=s3://myetcdsnapshots/snapshots \
# ETCD_COUNT=3 \ | |
# ETCD_INDEX=0 \ | |
# ETCD_INITIAL_CLUSTER=etcd0=http://127.0.0.1:3080,etcd1=http://127.0.0.1:3180,etcd2=http://127.0.0.1:3280 \ | |
# ETCD_DATA_DIR=/var/lib/etcd \ | |
# ETCDCTL_ENDPOINTS=http://127.0.0.1:3079,http://127.0.0.1:3179,http://127.0.0.1:3279 \
# ETCDCTL_CONTAINER_RUNTIME=rkt \ | |
# ETCD_MEMBER_FAILURE_PERIOD_LIMIT=10 \ | |
# ETCD_CLUSTER_FAILURE_PERIOD_LIMIT=30 \ | |
# ETCDADM_STATE_FILES_DIR=/var/run/coreos/etcdadm \ | |
# ./etcdadm [save|check|reconfigure] | |
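#
# Any function defined in this file can also be invoked directly as a subcommand
# (see the fallthrough case in main), e.g. `./etcdadm tester_run_all_tests`.
# Set DEBUG=yes to trace execution with `set -vx`.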
if [ "${DEBUG:-}" == "yes" ]; then | |
set -vx | |
fi | |
_array_join() { | |
local r="" | |
for t in "${@}"; do | |
if [ "$r" != "" ]; then | |
r="$r \"$t\"" | |
else | |
r="\"$t\"" | |
fi | |
done | |
echo "$r" | |
} | |
_current_time() { | |
date +%s | |
} | |
_run_as_root() { | |
local status | |
if [ "${USER:-na}" != root ]; then | |
echo running command as root: "$(_array_join "$@")" | |
_array_join "${@}" | sudo bash | |
else | |
_array_join "${@}" | bash | |
fi | |
status=$? | |
return $status | |
} | |
config_awscli_docker_image() { | |
echo "quay.io/coreos/awscli" | |
} | |
awscli_rkt_image() { | |
echo "docker://$(config_awscli_docker_image)" | |
} | |
config_aws_region() { | |
echo "${AWS_DEFAULT_REGION:-ap-northeast-1}" | |
} | |
config_aws_access_key_id() { | |
echo "${AWS_ACCESS_KEY_ID}" | |
} | |
config_aws_secret_access_key() { | |
echo "${AWS_SECRET_ACCESS_KEY}" | |
} | |
config_snapshots_s3_uri() { | |
echo "${SNAPSHOTS_S3_URI}" | |
} | |
config_etcd_initial_cluster() { | |
echo "${ETCD_INITIAL_CLUSTER}" | |
} | |
config_etcdctl_endpoints() { | |
echo "${ETCDCTL_ENDPOINTS}" | |
} | |
config_etcd_version() { | |
echo "${ETCD_VERSION:-3.1.2}" | |
} | |
config_etcd_count() { | |
echo "${ETCD_COUNT:-3}" | |
} | |
config_state_dir() { | |
echo "${ETCDADM_STATE_FILES_DIR:-/var/run/coreos/$(member_name)-state}" | |
} | |
config_etcd_index() { | |
echo "${ETCD_INDEX}" | |
} | |
cluster_etcd_aci_url() { | |
local v | |
v=v$(config_etcd_version) | |
echo "https://github.com/coreos/etcd/releases/download/$v/etcd-$v-linux-amd64.aci" | |
} | |
cluster_member_indices() { | |
i=0 | |
until [ "$i" == "$(config_etcd_count)" ]; do | |
echo $i | |
i=$((i + 1)) | |
done | |
} | |
cluster_is_healthy() { | |
! cluster_is_unhealthy | |
} | |
# i.e. cluster_quorum_may_have_been_lost. The loss may or may not be permanent.
# We don't have a way to determine whether it is permanent or transient.
cluster_is_unhealthy() { | |
local healthy | |
local quorum | |
healthy=$(cluster_num_healthy_members) | |
quorum=$(cluster_majority) | |
echo "quorum=$quorum healthy=$healthy" 1>&2 | |
if (( healthy < quorum )); then | |
echo cluster is unhealthy 1>&2; | |
return 0 | |
fi | |
echo cluster is healthy 1>&2 | |
return 1 | |
} | |
cluster_majority() { | |
local m | |
m=$(( $(config_etcd_count) / 2 + 1 )) | |
echo ${m} | |
} | |
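# e.g. ETCD_COUNT=3 gives a majority of 2 (tolerates 1 failed member),
# ETCD_COUNT=5 gives 3 (tolerates 2 failed members).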
cluster_num_running_nodes() { | |
# TODO aws autoscaling describe-auto-scaling-group | |
local f | |
local n | |
f=$(tester_num_running_nodes_file) | |
if [ -f "${f}" ]; then | |
n=$(cat "${f}") | |
fi | |
echo "${n:-0}" | |
} | |
cluster_num_healthy_members() { | |
local i | |
local n | |
n=$(config_etcd_count) | |
for i in $(cluster_member_indices); do | |
if ! ETCD_INDEX=$i member_is_healthy; then | |
n=$((n - 1)) | |
fi | |
done | |
echo ${n} | |
} | |
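# Failure tracking: the periodic `check` command records the time at which a member/cluster
# failure was first observed (the *-failure-beginning-time files below) and clears it again once
# healthy. `reconfigure` compares that timestamp against the configured limit to decide whether
# a failure should be treated as permanent.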
cluster_is_failing_longer_than_limit() { | |
cluster_failure_beginning_time_is_set && | |
(( $(_current_time) > $(cluster_failure_beginning_time) + $(cluster_failure_period_limit) ))
} | |
cluster_failure_beginning_time_is_set() { | |
test -f "$(cluster_failure_beginning_time_file)" | |
} | |
cluster_failure_period_limit() { | |
echo "${ETCD_CLUSTER_FAILURE_PERIOD_LIMIT:-10}" | |
} | |
cluster_failure_beginning_time() { | |
cat "$(cluster_failure_beginning_time_file)" | |
} | |
cluster_failure_beginning_time_file() { | |
echo "$(config_state_dir)/cluster-failure-beginning-time" | |
} | |
cluster_failure_beginning_time_clear() { | |
_run_as_root rm -f "$(cluster_failure_beginning_time_file)" | |
} | |
cluster_failure_beginning_time_set() { | |
local file | |
file=$(cluster_failure_beginning_time_file) | |
_run_as_root bash -c "echo '$1' > $file" | |
} | |
cluster_failure_beginning_time_record() { | |
if ! cluster_failure_beginning_time_is_set; then | |
cluster_failure_beginning_time_set "$(_current_time)"
fi | |
} | |
cluster_check() { | |
if member_is_healthy; then | |
member_failure_beginning_time_clear | |
else | |
member_failure_beginning_time_record | |
fi | |
if cluster_is_healthy; then | |
cluster_failure_beginning_time_clear | |
else | |
cluster_failure_beginning_time_record | |
fi | |
} | |
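# Index of "the next" member, used to pick a peer other than this one when querying the
# cluster, e.g. ETCD_INDEX=0 with ETCD_COUNT=3 yields 1.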
member_next_index() { | |
echo $(( ($(config_etcd_index) + 1) % $(config_etcd_count) )) | |
} | |
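# With the defaults, member_systemd_unit_name resolves to e.g. etcd-member-0.service for ETCD_INDEX=0.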
member_systemd_unit_name() { | |
echo "${ETCD_SYSTEMD_UNIT_NAME:-$(member_systemd_service_name).service}" | |
} | |
member_systemd_service_name() { | |
echo "${ETCD_SYSTEMD_SERVICE_NAME:-etcd-member-$(config_etcd_index)}" | |
} | |
member_snapshots_dir_name() { | |
echo snapshots | |
} | |
member_host_snapshots_dir_path() { | |
echo "$(config_state_dir)/$(member_snapshots_dir_name)" | |
} | |
member_snapshot_name() { | |
echo "$(member_name).db" | |
} | |
member_snapshot_host_path() { | |
echo "$(member_host_snapshots_dir_path)/$(member_snapshot_name)" | |
} | |
member_snapshot_relative_path() { | |
echo "$(member_snapshots_dir_name)/$(member_snapshot_name)" | |
} | |
member_save_snapshot() { | |
local snapshot_name | |
snapshot_name=$(member_snapshot_relative_path) | |
if cluster_is_healthy; then | |
member_etcdctl snapshot save "$snapshot_name" | |
member_etcdctl snapshot status "$snapshot_name" | |
member_upload_snapshot | |
member_remove_snapshot | |
else | |
echo "cluster is not healthy. skipping snapshotting because the cluster may be unhealthy due to corrupted etcd data on one or more members, including this one" 1>&2
fi | |
} | |
member_remove_snapshot() { | |
local file | |
file=$(member_snapshot_host_path) | |
echo "removing write protected local snapshot file: ${file}" 1>&2 | |
_run_as_root rm -f "${file}" | |
} | |
member_upload_snapshot() { | |
local cmd | |
local src | |
local dst | |
src=$(member_snapshot_host_path) | |
dst=$(member_remote_snapshot_s3_uri) | |
cmd=$(_awscli_command s3 cp "${src}" "${dst}") | |
echo "uploading ${src} to ${dst}" | |
_run_as_root ${cmd[*]} | |
echo verifying the upload... | |
member_remote_snapshot_exists | |
} | |
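# All members share a single S3 object key (<SNAPSHOTS_S3_URI>/snapshot.db), so the last
# successful `save` wins and every member restores from that same snapshot. This is what keeps
# the restored data consistent across members during disaster recovery.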
member_remote_snapshot_s3_uri() { | |
echo "$(config_snapshots_s3_uri)/snapshot.db" | |
} | |
member_remote_snapshot_exists() { | |
local cmd | |
local uri | |
uri=$(member_remote_snapshot_s3_uri) | |
cmd=$(_awscli_command s3 ls "${uri}") | |
echo checking existence of "${uri}" 1>&2 | |
if _run_as_root $cmd; then | |
echo "${uri}" exists | |
else | |
echo "${uri}" does not exist | |
return 1 | |
fi | |
} | |
member_download_snapshot() { | |
local cmd | |
local dir | |
local dst | |
local src | |
dst=$(member_snapshot_host_path) | |
src=$(member_remote_snapshot_s3_uri) | |
cmd=$(_awscli_command s3 cp "${src}" "${dst}") | |
dir=$(dirname "$(member_snapshot_host_path)") | |
if ! [ -d "${dir}" ]; then | |
echo directory "${dir}" not found. creating... | |
_run_as_root mkdir -p "${dir}" | |
fi | |
echo downloading "${dst}" from "${src}" 1>&2 | |
_run_as_root $cmd | |
member_local_snapshot_exists | |
} | |
_awscli_command() { | |
_docker_awscli_command "${@}" | |
} | |
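# Emits one word per line; _run_as_root later re-splits on the newline/tab IFS. With the defaults
# (and no explicit AWS credentials) the assembled command is roughly:
#   docker run -e AWS_DEFAULT_REGION=ap-northeast-1 --net=host \
#     --volume /var/run/coreos:/var/run/coreos quay.io/coreos/awscli aws s3 cp <src> <dst>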
_docker_awscli_command() { | |
local dir | |
local cmd | |
dir=$(dirname "$(config_state_dir)") | |
echo docker | |
echo run | |
printf "%s\n" "-e" | |
echo "AWS_DEFAULT_REGION=$(config_aws_region)" | |
echo "--net=host" | |
echo "--volume" | |
echo "${dir}:${dir}" | |
if [ "${AWS_ACCESS_KEY_ID:-}" != "" ]; then | |
printf "%s\n" "-e" | |
echo "AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:?}" | |
fi | |
if [ "${AWS_SECRET_ACCESS_KEY:-}" != "" ]; then | |
printf "%s\n" "-e" | |
echo "AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:?}" | |
fi | |
echo "$(config_awscli_docker_image)" | |
echo "aws" | |
for t in "${@}"; do | |
echo "$t" | |
done | |
} | |
_rkt_aws() { | |
local dir | |
local uuid_file | |
local cmd | |
dir=$(dirname "$(config_state_dir)") | |
uuid_file=$(config_state_dir)/awscli.uuid | |
cmd="rkt run \ | |
--set-env AWS_DEFAULT_REGION=$(config_aws_region) \ | |
--set-env AWS_ACCESS_KEY_ID=$(config_aws_access_key_id) \ | |
--set-env AWS_SECRET_ACCESS_KEY=$(config_aws_secret_access_key) \ | |
--volume=dns,kind=host,source=/etc/resolv.conf,readOnly=true \ | |
--mount volume=dns,target=/etc/resolv.conf \
--volume=state,kind=host,source=${dir} \ | |
--mount volume=state,target=${dir} \
--insecure-options=image \ | |
--net=host \ | |
--uuid-file-save=$uuid_file \ | |
$(awscli_rkt_image) \
--exec aws -- $(_array_join "$@")" | |
echo "running awscli: $cmd" 1>&2 | |
_run_as_root $cmd | |
_run_as_root rkt rm --uuid-file "$uuid_file" | |
} | |
member_local_snapshot_exists() { | |
local file | |
file=$(member_snapshot_host_path) | |
echo "checking existence of file $file" 1>&2 | |
[ -f "$file" ] | |
} | |
member_clean_data_dir() { | |
local data_dir | |
data_dir=$(member_data_dir) | |
echo "cleaning data dir of $(member_name)" | |
if [ -d "${data_dir}" ]; then | |
echo "data dir ${data_dir} exists. finding files to remove" 1>&2 | |
sudo find "$data_dir" -type f | while read -r file; do | |
echo "removing file $file" | |
sudo rm "$file" | |
done | |
echo "removing directory $data_dir" 1>&2 | |
sudo rm -rf "$data_dir" | |
else | |
echo "data dir ${data_dir} does not exist. nothing to remove" 1>&2 | |
fi | |
} | |
member_replace_failed() { | |
local name | |
local peer_url | |
local next_index | |
local client_url | |
local id | |
name=$(member_name) | |
peer_url=$(member_peer_url) | |
next_index=$(member_next_index) | |
client_url=$(ETCD_INDEX=${next_index} member_client_url) | |
member_clean_data_dir | |
echo "connecting to ${client_url}" | |
etcdctl --peers "${client_url}" member list | |
id=$(etcdctl --peers "${client_url}" member list | grep "name=${name}" | cut -d ':' -f 1) | |
echo "removing member ${id}" 1>&2 | |
etcdctl --peers "${client_url}" member remove "${id}" | |
# Wait a bit so that the cluster can become healthy again in case the removed member was the leader
sleep 1 | |
echo "adding member ${id}" 1>&2 | |
etcdctl --peers "${client_url}" member add "${name}" "${peer_url}" | |
member_set_initial_cluster_state existing | |
} | |
member_bootstrap() { | |
if member_remote_snapshot_exists; then | |
member_download_snapshot | |
else | |
echo "remote snapshot for $(member_name) does not exist. skipped downloading" | |
fi | |
if member_local_snapshot_exists; then | |
echo "backup found. restoring $(member_name)..." | |
member_restore_from_local_snapshot | |
else | |
echo "backup not found. starting brand new $(member_name)..." | |
fi | |
member_set_initial_cluster_state new | |
} | |
member_restore_from_local_snapshot() { | |
local uuid_file | |
local cmd | |
uuid_file=$(config_state_dir)/etcdctl-snapshot-restore.uuid | |
snapshot_name=$(member_snapshot_relative_path) | |
member_clean_data_dir | |
echo "restoring $(member_name)" | |
# * Don't try to mount the data-dir directly or etcdctl ends up with "Error: data-dir "/etcd-data" exists" | |
# * `--volume data-dir,kind=empty` is required to suppress the warning: "stage1: warning: no volume specified for mount point "data-dir", implicitly creating an "empty" volume. This volume will be removed when the pod is garbage-collected." | |
_run_as_root rkt run \ | |
--insecure-options=image \ | |
--set-env ETCDCTL_API=3 \ | |
--net=host \ | |
--volume $(member_snapshots_dir_name),kind=host,source="$(member_host_snapshots_dir_path)" \ | |
--mount volume="$(member_snapshots_dir_name)",target=/"$(member_snapshots_dir_name)" \ | |
--volume data-dir-root,kind=host,source="$(dirname "$(member_data_dir)")" \ | |
--mount volume=data-dir-root,target="$(dirname "$(member_data_dir)")" \ | |
--volume data-dir,kind=empty \ | |
--mount volume=data-dir,target=/var/lib/etcd \ | |
--uuid-file-save="$uuid_file" \ | |
"$(cluster_etcd_aci_url)" \ | |
--exec etcdctl -- \ | |
--write-out simple \ | |
--endpoints "$(member_client_url)" snapshot restore \ | |
--data-dir "$(member_data_dir)" \ | |
--initial-cluster "$(config_etcd_initial_cluster)" \ | |
--initial-advertise-peer-urls "$(member_peer_url)" \ | |
--name "$(member_name)" \ | |
"$snapshot_name" | |
_run_as_root rkt stop --force --uuid-file "$uuid_file" || echo pod is already stopped | |
_run_as_root rkt rm --uuid-file "$uuid_file" | |
# Do this or etcd ends up with "error listing data dir /var/lib/etcd" | |
_run_as_root chown -R etcd:etcd "$(member_data_dir)" | |
member_remove_snapshot | |
echo "restored $(member_name)" | |
} | |
member_env_file() { | |
local name | |
local env_file | |
name=$(member_name) | |
env_file=$(config_state_dir)/${name}.env | |
echo "${env_file}" | |
} | |
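# The env file written below is loaded by the generated drop-in's
# `EnvironmentFile=-<state-dir>/<member>.env` line (see tester_append_member_to_cloud_config_file),
# so the desired ETCD_INITIAL_CLUSTER_STATE takes effect on the next start of the member unit.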
member_set_initial_cluster_state() { | |
local desired=$1 | |
echo "setting initial cluster state to: $desired" | |
local f | |
f=$(member_env_file) | |
_run_as_root bash -c "cat > ${f} << EOS | |
ETCD_INITIAL_CLUSTER_STATE=$desired | |
EOS | |
" | |
} | |
member_set_unit_type() { | |
local desired=$1 | |
echo setting etcd unit type to "$desired". \`systemctl daemon-reload\` required afterwards 1>&2 | |
local drop_in_file | |
drop_in_file=$(tester_member_systemd_drop_in_path 30-unit-type) | |
_run_as_root bash -c "cat > ${drop_in_file} << EOS | |
[Service] | |
Type=$desired | |
EOS | |
" | |
} | |
member_is_failing_longer_than_limit() { | |
member_failure_beginning_time_is_set && | |
(( $(_current_time) > $(member_failure_beginning_time) + $(member_failure_period_limit) )) | |
} | |
member_failure_beginning_time_is_set() { | |
test -f "$(member_failure_beginning_time_file)" | |
} | |
member_failure_period_limit() { | |
echo "${ETCD_MEMBER_FAILURE_PERIOD_LIMIT:-10}" | |
} | |
member_failure_beginning_time() { | |
cat "$(member_failure_beginning_time_file)" | |
} | |
member_failure_beginning_time_file() { | |
echo "$(config_state_dir)/member-failure-beginning-time" | |
} | |
member_failure_beginning_time_clear() { | |
_run_as_root rm -f "$(member_failure_beginning_time_file)" | |
} | |
member_failure_beginning_time_set() { | |
local file | |
file=$(member_failure_beginning_time_file) | |
_run_as_root bash -c "echo '$1' > $file" | |
} | |
member_failure_beginning_time_record() { | |
if ! member_failure_beginning_time_is_set; then | |
member_failure_beginning_time_set "$(_current_time)" | |
fi | |
} | |
member_reconfigure() { | |
member_validate | |
# Assuming this node has failed or has not yet started, which is why this sequence is being invoked...
local healthy | |
local quorum | |
healthy=$(cluster_num_healthy_members) | |
quorum=$(cluster_majority) | |
echo "observing cluster state: quorum=$quorum healthy=$healthy" 1>&2 | |
if (( healthy >= quorum )); then | |
# At least N/2+1 members are working | |
if member_is_unstarted; then | |
# This member appeared to be "unstarted" in outputs of `etcdctl member list` against other etcd members | |
# | |
# It happens only when a cluster has successfully recovered from a snapshot and
# the snapshot contained the information about this member, hence it is recognized as "unstarted" by the other members
# instead of just being invisible to them.
echo 'cluster is already healthy but still in the bootstrap process. searching for an etcd snapshot to recover this member' 1>&2
member_bootstrap | |
elif member_is_failing_longer_than_limit; then | |
# This member seems to be consistently failing | |
# | |
# As the cluster is still healthy, it can happen only when: | |
# * the etcd data of this member is broken somehow or | |
# * this member has a network connectivity issue with the other members in the cluster
# The latter should eventually be managed by operators or AWS.
# To deal with the former case, we just restart this member with fresh data. | |
# | |
# This process is documented in the section "Replace failed etcd member" in the etcd documentation. | |
# See https://coreos.com/etcd/docs/latest/etcd-live-cluster-reconfiguration.html#replace-a-failed-etcd-member-on-coreos-container-linux | |
echo 'this member is failing longer than limit' 1>&2 | |
member_replace_failed | |
else | |
# This member has just been restarted. | |
# | |
# The restart may have been caused by one of the following:
# * The EC2 instance which had been hosting this member terminated due to a failure, and then the ASG recreated it
# * The user initiated a reboot of the EC2 instance hosting this member
# Although there's no way to determine with certainty which one it is,
# we can safely retry until the failing period exceeds the threshold and hope the member eventually becomes healthy
# if the failure isn't permanent.
echo 'this member has just restarted' 1>&2 | |
fi | |
else | |
# At least N/2+1 members are NOT working | |
local running_num | |
local remaining_num | |
local total_num | |
running_num=$(cluster_num_running_nodes) | |
total_num=$(config_etcd_count) | |
if (( running_num < total_num )); then | |
remaining_num=$(( quorum - running_num + 1 )) | |
else | |
remaining_num=$(( quorum - healthy )) | |
fi | |
echo "${remaining_num} more members required until the quorum is met" 1>&2 | |
if (( remaining_num >= 2 )); then | |
member_set_unit_type simple | |
else | |
member_set_unit_type notify | |
fi | |
if (( running_num < total_num )); then | |
echo "only ${running_num} of ${total_num} nodes for etcd members are running, which means cluster is still in bootstrap process. searching for a etcd snapshot to recover this member" | |
member_bootstrap | |
elif cluster_is_failing_longer_than_limit; then | |
echo "all the nodes for etcd members are running but cluster has been unhealthy for a while, which means cluster is now in disaster recovery process. searching for a etcd snapshot to recover this member" | |
member_bootstrap | |
else | |
echo "all the nodes are present but cluster is still unhealthy, which means cluster failed to bootstrap. keep retrying a while" | |
fi | |
fi | |
echo "running \`systemctl daemon-reload\` to reload $(member_systemd_unit_name)" | |
_run_as_root systemctl daemon-reload | |
} | |
member_is_unstarted() { | |
local name | |
local peer_url | |
local next_index | |
local client_url | |
name=$(member_name) | |
peer_url=$(member_peer_url) | |
next_index=$(member_next_index) | |
client_url=$(ETCD_INDEX=${next_index} member_client_url) | |
echo "connecting to ${client_url}" 1>&2 | |
etcdctl --peers "${client_url}" member list | |
local unstarted_peer | |
unstarted_peer=$(etcdctl --peers "${client_url}" member list | grep unstarted | grep "${peer_url}") | |
if [ "${unstarted_peer}" != "" ]; then | |
echo "unstarted peer for this member($(member_name))" is found 1>&2 | |
return 0 | |
fi | |
return 1 | |
} | |
member_name() { | |
_nth_peer_name "$(config_etcd_index)" | |
} | |
member_peer_url() { | |
_nth_peer_url "$(config_etcd_index)" | |
} | |
member_client_url() { | |
_nth_client_url "$(config_etcd_index)" | |
} | |
_nth_client_url() { | |
local peers | |
local url | |
peers=($(config_etcdctl_endpoints | tr "," "\n")) | |
url="${peers[$1]}" | |
echo "${url}" | |
} | |
_nth_peer_name() { | |
local peers | |
local url | |
peers=($(config_etcd_initial_cluster | tr "," "\n")) | |
url=$(echo "${peers[$1]}" | cut -d '=' -f 1) | |
echo "${url}" | |
} | |
_nth_peer_url() { | |
local peers | |
local url | |
peers=($(config_etcd_initial_cluster | tr "," "\n")) | |
url=$(echo "${peers[$1]}" | cut -d '=' -f 2) | |
echo "${url}" | |
} | |
member_data_dir() { | |
echo "${ETCD_DATA_DIR:-${ETCD_WORK_DIR:?}/$(member_name)}" | |
} | |
member_etcdctl() { | |
local uuid_file | |
uuid_file="$(config_state_dir)/etcdctl-$BASHPID.uuid" | |
_run_as_root rkt run \ | |
--insecure-options=image \ | |
--set-env ETCDCTL_API=3 \ | |
--net=host \ | |
--volume "$(member_snapshots_dir_name)",kind=host,source="$(member_host_snapshots_dir_path)" \ | |
--mount volume="$(member_snapshots_dir_name)",target=/"$(member_snapshots_dir_name)" \ | |
--volume data-dir,kind=host,source="$(member_data_dir)" \ | |
--mount volume=data-dir,target=/var/lib/etcd \ | |
--uuid-file-save="$uuid_file" \ | |
"$(cluster_etcd_aci_url)" \ | |
--exec etcdctl -- --endpoints "$(member_client_url)" ${*} | |
_run_as_root rkt rm --uuid-file "$uuid_file" | |
rm "$uuid_file" | |
} | |
member_is_healthy() { | |
member_etcdctl endpoint health | grep "is healthy" 1>&2 | |
} | |
member_etcdctl_v2() { | |
ETCDCTL_API=2 etcdctl --endpoints "$(member_client_url)" "${@:1}" | |
} | |
tester_member_pid() { | |
local name | |
local pid | |
name=$(member_name) | |
pid=$(ps auxww | grep "\(--name $name\)" | grep -v grep | awk '{ print $2 }') | |
if [ "$pid" == "" ]; then | |
return 1 | |
fi | |
echo "$pid" | |
} | |
tester_cluster_tail() { | |
journalctl -f -u etcd-node-0 -u etcd-node-1 -u etcd-node-2 -o json | jq -r '._SYSTEMD_UNIT + ": " + .MESSAGE' | |
} | |
tester_clean_local_snapshots() { | |
local dir | |
dir=$(member_host_snapshots_dir_path) | |
tester_log "removing local snapshots in ${dir}" | |
sudo rm -Rf "${dir}" | |
} | |
tester_clean_remote_snapshots() { | |
local cmd | |
local s3uri | |
s3uri=$(member_remote_snapshot_s3_uri) | |
tester_log "removing remote snapshot ${s3uri}" | |
cmd=$(_awscli_command s3 rm "${s3uri}") | |
_run_as_root $cmd || echo "${s3uri} not found" 1>&2 | |
} | |
tester_member_systemd_unit_path() { | |
echo "/etc/systemd/system/$(member_systemd_unit_name)" | |
} | |
tester_member_systemd_drop_in_path() { | |
local drop_in_name | |
drop_in_name=$1 | |
if [ "$1" == "" ]; then | |
echo "member_systemd_drop_in_path: missing argument drop_in_name=$1" 1>&2 | |
exit 1 | |
fi | |
echo "$(tester_member_systemd_unit_path).d/${drop_in_name}.conf" | |
} | |
tester_show_member_log() { | |
journalctl -u "$(member_systemd_unit_name)" -o json | jq -r '._SYSTEMD_UNIT + ": " + .MESSAGE' | |
} | |
tester_tail_member_log() { | |
journalctl -f -u "$(member_systemd_unit_name)" -o json | jq -r '._SYSTEMD_UNIT + ": " + .MESSAGE' | |
} | |
tester_remove_data_dir() { | |
local data_dir | |
data_dir=$(member_data_dir) | |
tester_log removing "${data_dir}" | |
_run_as_root rm -rf "${data_dir}" | |
} | |
tester_remove_state_dir() { | |
local d | |
d=$(config_state_dir) | |
tester_log removing state dir "${d}" | |
_run_as_root rm -rf "${d}" | |
} | |
tester_create_data_dir() { | |
local data_dir | |
data_dir=$(member_data_dir) | |
tester_log creating "${data_dir}" | |
_run_as_root mkdir -p "${data_dir}" | |
_run_as_root chown -R etcd:etcd "${data_dir}" | |
} | |
tester_create_state_dir() { | |
local d | |
d=$(config_state_dir) | |
tester_log creating state dir "${d}" | |
_run_as_root mkdir -p "${d}" | |
} | |
tester_append_member_to_cloud_config_file() { | |
local name | |
local f | |
name=$(member_name) | |
f=$(tester_work_dir)/$(member_systemd_unit_name) | |
cat > "$f" << EOS | |
[Unit] | |
Description=$(member_systemd_unit_name) | |
Documentation=https://github.com/coreos/etcd | |
Wants=network.target | |
Conflicts=etcd.service | |
Conflicts=etcd2.service | |
[Service] | |
Type=notify | |
Restart=on-failure | |
RestartSec=10s | |
TimeoutStartSec=0 | |
LimitNOFILE=40000 | |
Environment="ETCD_IMAGE_TAG=v3.0.10" | |
Environment="ETCD_NAME=%m" | |
Environment="ETCD_USER=etcd" | |
Environment="ETCD_DATA_DIR=/var/lib/etcd" | |
Environment="RKT_RUN_ARGS=--uuid-file-save=$(config_state_dir)/$(member_systemd_unit_name).uuid" | |
ExecStartPre=/usr/bin/mkdir --parents $(config_state_dir) | |
ExecStartPre=-/usr/bin/rkt rm --uuid-file=$(config_state_dir)/$(member_systemd_unit_name).uuid | |
ExecStart=/usr/lib/coreos/etcd-wrapper \$ETCD_OPTS | |
ExecStop=-/usr/bin/rkt stop --uuid-file=$(config_state_dir)/$(member_systemd_unit_name).uuid | |
[Install] | |
WantedBy=multi-user.target | |
EOS | |
sudo cp "$f" "$(tester_member_systemd_unit_path)" | |
local cloud_config_file | |
cloud_config_file=$(tester_cloud_config_file) | |
client_url=$(member_client_url) | |
peer_url=$(member_peer_url) | |
initial_cluster=$(config_etcd_initial_cluster) | |
data_dir=$(member_data_dir) | |
cat >> ${cloud_config_file} << EOS | |
- name: $(member_systemd_unit_name) | |
enable: true | |
drop-ins: | |
- name: 40-etcd3-cluster.conf | |
content: | | |
[Service] | |
EnvironmentFile=-$(member_env_file) | |
[Service] | |
Environment="ETCD_IMAGE_TAG=v$(config_etcd_version)" | |
Environment="ETCD_NAME=${name}" | |
Environment="ETCD_ADVERTISE_CLIENT_URLS=${client_url}" | |
Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=${peer_url}" | |
Environment="ETCD_LISTEN_CLIENT_URLS=${client_url}" | |
Environment="ETCD_LISTEN_PEER_URLS=${peer_url}" | |
Environment="ETCD_INITIAL_CLUSTER=${initial_cluster}" | |
Environment="ETCD_STRICT_RECONFIG_CHECK=true" | |
Environment="ETCD_DATA_DIR=${data_dir}" | |
EOS | |
} | |
tester_bind_configurator_systemd_unit() { | |
tester_log binding the reconfiguration service to the etcd service | |
local drop_in_file=$(tester_member_systemd_drop_in_path 50-reconfiguration) | |
_run_as_root bash -c "cat > ${drop_in_file} << EOS | |
[Unit] | |
Requires=$(tester_member_configurator_systemd_unit_name) | |
After=$(tester_member_configurator_systemd_unit_name) | |
EOS | |
" | |
} | |
tester_member_configurator_systemd_unit_name() { | |
echo $(member_systemd_service_name)-configurator.service | |
} | |
tester_member_configurator_systemd_unit_path() { | |
echo /etc/systemd/system/$(tester_member_configurator_systemd_unit_name) | |
} | |
tester_log() { | |
echo tester: "${@}" 1>&2 | |
} | |
tester_num_running_nodes_file() { | |
local d | |
d=$(tester_work_dir) | |
echo "${d}/num-running-nodes" | |
} | |
tester_set_num_running_nodes() { | |
local f | |
f=$(tester_num_running_nodes_file) | |
_run_as_root bash -c "echo $1 > $f" | |
tester_log number of nodes set to $1 | |
} | |
tester_work_dir() { | |
echo $(pwd)/tester | |
} | |
tester_cloud_config_file() { | |
echo $(tester_work_dir)/cloud-config | |
} | |
tester_generate_cloud_config_file() { | |
mkdir -p $(member_snapshots_dir_name) | |
cat > $(tester_cloud_config_file) << EOS | |
#cloud-config | |
coreos: | |
units: | |
EOS | |
tester_each_member tester_append_member_to_cloud_config_file | |
tester_log starting coreos-cloudinit... | |
sudo /usr/bin/coreos-cloudinit --from-file $(tester_cloud_config_file) | |
tester_log coreos-cloudinit finished | |
tester_log reloading systemd | |
sudo systemctl daemon-reload | |
tester_log reloaded systemd | |
} | |
tester_each_member() { | |
for i in $(cluster_member_indices); do | |
tester_log $(ETCD_INDEX=$i member_name): running "$@" | |
ETCD_INDEX=$i $1 "${@:2}" | |
done | |
} | |
tester_start_member() { | |
tester_log starting $(member_name) | |
sudo systemctl start $(member_systemd_unit_name) | |
tester_log started $(member_name) | |
} | |
tester_create_configurator_systemd_unit() { | |
local unit_file | |
unit_file=$(tester_member_configurator_systemd_unit_path) | |
tester_log creating the reconfiguration service for the etcd service | |
_run_as_root bash -c "cat > ${unit_file} << EOS | |
[Unit] | |
Description=$(tester_member_configurator_systemd_unit_name) | |
BindsTo=$(member_systemd_unit_name) | |
Before=$(member_systemd_unit_name) | |
Wants=network.target | |
[Service] | |
Type=oneshot | |
RemainAfterExit=yes | |
#Setting this to simple causes etcd-member.service to hang while starting
#Type=simple | |
#Restart=on-failure | |
RestartSec=5 | |
Environment=ETCD_INDEX=$(config_etcd_index) | |
Environment=ETCDADM_STATE_FILES_DIR=$(config_state_dir)
Environment=SNAPSHOTS_S3_URI=$(config_snapshots_s3_uri) | |
Environment=AWS_ACCESS_KEY_ID=$(config_aws_access_key_id) | |
Environment=AWS_SECRET_ACCESS_KEY=$(config_aws_secret_access_key) | |
Environment=ETCD_COUNT=$(config_etcd_count) | |
Environment=ETCD_INITIAL_CLUSTER=$(config_etcd_initial_cluster) | |
Environment=ETCDCTL_ENDPOINTS=$(config_etcdctl_endpoints) | |
Environment=ETCD_DATA_DIR=$(member_data_dir) | |
ExecStart=$(pwd)/etcdadm reconfigure | |
EOS | |
" | |
} | |
tester_create_checker_systemd_unit() { | |
local timer | |
local name | |
name="$(member_systemd_service_name)-check" | |
timer="/etc/systemd/system/${name}.timer" | |
tester_log creating check timer for etcd | |
_run_as_root bash -c "cat > ${timer} << EOS | |
[Unit] | |
Description=periodic etcd health check | |
[Timer] | |
OnBootSec=60sec | |
# Actual interval would be 10+0~5 sec | |
OnUnitInactiveSec=10sec | |
AccuracySec=5sec | |
[Install] | |
WantedBy=timers.target | |
EOS | |
" | |
service="/etc/systemd/system/${name}.service" | |
tester_log creating check service for etcd | |
_run_as_root bash -c "cat > ${service} << EOS | |
[Unit] | |
Description=etcd health check | |
Wants=network.target | |
[Service] | |
Type=simple | |
Environment=ETCD_INDEX=$(config_etcd_index) | |
Environment=ETCDADM_STATE_FILES_DIR=$(config_state_dir)
Environment=SNAPSHOTS_S3_URI=$(config_snapshots_s3_uri) | |
Environment=AWS_ACCESS_KEY_ID=$(config_aws_access_key_id) | |
Environment=AWS_SECRET_ACCESS_KEY=$(config_aws_secret_access_key) | |
Environment=ETCD_COUNT=$(config_etcd_count) | |
Environment=ETCD_INITIAL_CLUSTER=$(config_etcd_initial_cluster) | |
Environment=ETCDCTL_ENDPOINTS=$(config_etcdctl_endpoints) | |
Environment=ETCD_DATA_DIR=$(member_data_dir) | |
ExecStart=$(pwd)/etcdadm check | |
EOS | |
" | |
_run_as_root systemctl daemon-reload | |
_run_as_root systemctl restart ${name}.service | |
_run_as_root systemctl restart ${name}.timer | |
} | |
tester_create_snapshot_systemd_unit() { | |
local timer | |
local name | |
name="$(member_systemd_service_name)-snapshot" | |
timer="/etc/systemd/system/${name}.timer" | |
tester_log creating snapshot timer for etcd
_run_as_root bash -c "cat > ${timer} << EOS | |
[Unit] | |
Description=periodic etcd snapshot | |
[Timer] | |
OnBootSec=120sec | |
# Actual interval would be 60+0~5 sec
OnUnitInactiveSec=60sec | |
AccuracySec=5sec | |
[Install] | |
WantedBy=timers.target | |
EOS | |
" | |
service="/etc/systemd/system/${name}.service" | |
tester_log creating snapshot service for etcd
_run_as_root bash -c "cat > ${service} << EOS | |
[Unit] | |
Description=etcd snapshot | |
Wants=network.target | |
[Service] | |
Type=simple | |
Environment=ETCD_INDEX=$(config_etcd_index) | |
Environment=ETCDADM_STATE_FILES_DIR=$(config_state_dir)
Environment=SNAPSHOTS_S3_URI=$(config_snapshots_s3_uri) | |
Environment=AWS_ACCESS_KEY_ID=$(config_aws_access_key_id) | |
Environment=AWS_SECRET_ACCESS_KEY=$(config_aws_secret_access_key) | |
Environment=ETCD_COUNT=$(config_etcd_count) | |
Environment=ETCD_INITIAL_CLUSTER=$(config_etcd_initial_cluster) | |
Environment=ETCDCTL_ENDPOINTS=$(config_etcdctl_endpoints) | |
Environment=ETCD_DATA_DIR=$(member_data_dir) | |
Restart=no | |
ExecStartPre=systemctl is-active $(member_systemd_unit_name) | |
ExecStart=$(pwd)/etcdadm save | |
EOS | |
" | |
_run_as_root systemctl daemon-reload | |
_run_as_root systemctl restart ${name}.service | |
_run_as_root systemctl restart ${name}.timer | |
} | |
tester_simulate_node_startup() { | |
tester_set_num_running_nodes $(( $(cluster_num_running_nodes) + 1 )) | |
tester_log num running nodes is now $(cluster_num_running_nodes) | |
tester_start_member | |
} | |
tester_stop_member() { | |
if systemctl status $(member_systemd_unit_name) >/dev/null 2>&1; then | |
tester_log stopping $(member_name) | |
sudo systemctl stop $(member_systemd_unit_name) | |
tester_log stopped $(member_name) | |
else | |
tester_log $(member_name) is already stopped | |
fi | |
} | |
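# The failure simulations below reuse the failure-tracking state: a "permanent" failure stops the
# member and backdates its failure-beginning time to 0 (epoch) so reconfigure sees it as failing
# longer than the limit, while a "temporary" failure stops the member with the timestamp cleared.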
tester_simulate_permanent_member_failure() { | |
tester_stop_member | |
member_failure_beginning_time_set 0 | |
} | |
tester_simulate_temporary_member_failure() { | |
tester_stop_member | |
member_failure_beginning_time_clear | |
} | |
tester_simulate_permanent_cluster_failure() { | |
for i in $(cluster_member_indices); do | |
ETCD_INDEX=$i tester_simulate_permanent_member_failure | |
ETCD_INDEX=$i cluster_failure_beginning_time_set 0 | |
done | |
} | |
tester_break_member() { | |
local d | |
d=$(member_data_dir) | |
svc=$(member_systemd_service_name) | |
_run_as_root bash -ec "echo breakit > ${d}/member/wal/0000000000000000-0000000000000000.wal | |
ls -lah ${d}/member/wal | |
systemctl stop ${svc} | |
" | |
} | |
tester_break_cluster() { | |
for i in 0 1 2; do | |
ETCD_INDEX=$i tester_break_member | |
done | |
} | |
tester_trigger_disaster_recovery() { | |
tester_break_cluster | |
sleep 80 | |
for i in 0 1 2; do | |
sudo systemctl start etcd-member-${i}.service | |
done | |
} | |
tester_put_v3() { | |
tester_log writing v3 key $1=$2 | |
member_etcdctl put "${@:1}" | |
} | |
tester_put_v2() { | |
tester_log writing v2 key $1=$2 | |
member_etcdctl_v2 set "${@:1}" | |
} | |
tester_get_v3() { | |
tester_log reading v3 key "${@:1}" | |
status=-1 | |
until [ "$status" == "0" ]; do | |
member_etcdctl --consistency l get "${@:1}" | |
status=$? | |
if [ "$status" != "0" ]; then | |
tester_log failed to read. retrying... | |
fi | |
done | |
} | |
tester_get_v2() { | |
tester_log reading v2 key "${@:1}" | |
status=-1 | |
until [ "$status" == "0" ]; do | |
member_etcdctl_v2 get "${@:1}" | |
status=$? | |
if [ "$status" != "0" ]; then | |
tester_log failed to read. retrying... | |
fi | |
done | |
} | |
tester_assert_v3_key_missing() { | |
tester_log checking v3 key "${@:1}" | |
(member_etcdctl --consistency l get "${@:1}" && echo "expected the key to be missing but it was not" 1>&2) || echo "key not found, as expected" 1>&2
} | |
tester_assert_v2_key_missing() { | |
tester_log checking v2 key "${@:1}" | |
(member_etcdctl_v2 get "${@:1}" && echo "expected the key to be missing but it was not" 1>&2) || echo "key not found, as expected" 1>&2
} | |
tester_cluster_health() { | |
member_etcdctl_v2 cluster-health | |
} | |
tester_member_is_ready() { | |
# If you mistakenly used ETCDCTL_API=2 here, a get request with ETCDCTL_API=3 may end up with: | |
# "[29444.043939] etcd[5]: Error: context deadline exceeded" | |
member_etcdctl endpoint health >/dev/null 2>&1 && echo yes || echo no | |
} | |
tester_wait_until_ready() { | |
tester_log waiting until $(member_name) is ready... | |
until [ "$(tester_member_is_ready)" == "yes" ]; do | |
sleep 1 | |
done | |
until tester_put_v2 /ready yes; do | |
sleep 1 | |
done | |
until tester_put_v3 /ready yes; do | |
sleep 1 | |
done | |
tester_log $(member_name) is now ready | |
} | |
tester_systemd_ensure_stopped() { | |
local unit=$1 | |
if systemctl is-enabled "$unit"; then | |
_run_as_root systemctl stop "$unit" | |
fi | |
} | |
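# tester_bootstrap_cluster wipes any previous state (units, data dirs, state dirs, local and
# remote snapshots), regenerates the member units via cloud-config, starts all members as if the
# nodes came up one by one, and smoke-tests reads and writes over both the v2 and v3 APIs.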
tester_bootstrap_cluster() { | |
mkdir -p $(tester_work_dir) | |
local num_nodes_file | |
num_nodes_file=$(tester_num_running_nodes_file) | |
if [ -f "${num_nodes_file}" ]; then | |
rm $(tester_num_running_nodes_file) | |
fi | |
for i in $(cluster_member_indices); do | |
setup() { | |
tester_stop_member | |
tester_systemd_ensure_stopped $(member_systemd_service_name)-check.timer | |
tester_systemd_ensure_stopped $(member_systemd_service_name)-snapshot.timer | |
tester_remove_data_dir | |
tester_remove_state_dir | |
tester_clean_local_snapshots | |
tester_clean_remote_snapshots | |
tester_log removing file $(cluster_failure_beginning_time_file) | |
cluster_failure_beginning_time_clear | |
tester_log creating directory $(member_host_snapshots_dir_path) | |
sudo mkdir -p $(member_host_snapshots_dir_path) | |
tester_create_data_dir | |
} | |
ETCD_INDEX=$i setup | |
done | |
tester_each_member member_failure_beginning_time_clear | |
tester_generate_cloud_config_file | |
tester_each_member tester_create_checker_systemd_unit | |
tester_each_member tester_create_configurator_systemd_unit | |
tester_each_member tester_create_snapshot_systemd_unit | |
tester_each_member tester_bind_configurator_systemd_unit | |
_run_as_root systemctl daemon-reload | |
tester_each_member tester_simulate_node_startup | |
tester_each_member tester_wait_until_ready | |
tester_log reading values from a brand-new etcd cluster... 1>&2 | |
tester_each_member tester_assert_v3_key_missing /foo | |
tester_each_member tester_assert_v2_key_missing /foo | |
ETCD_INDEX=0 tester_put_v3 /foo FOO_v3 | |
ETCD_INDEX=0 tester_put_v2 /foo FOO_v2 | |
tester_log reading values written just now... 1>&2 | |
tester_each_member tester_get_v3 /foo | |
tester_each_member tester_get_v2 /foo | |
tester_each_member tester_cluster_health | |
} | |
tester_run_all_tests() { | |
tester_bootstrap_cluster | |
for i in $(cluster_member_indices); do | |
setup() { | |
tester_systemd_ensure_stopped $(member_systemd_service_name)-check.timer | |
tester_systemd_ensure_stopped $(member_systemd_service_name)-snapshot.timer | |
} | |
ETCD_INDEX=$i setup | |
done | |
echo | |
echo started testing recovery from temporary cluster failure | |
echo | |
tester_each_member tester_simulate_temporary_member_failure | |
tester_each_member tester_start_member | |
tester_each_member tester_wait_until_ready | |
echo reading values after all the members are restarted... 1>&2 | |
tester_each_member tester_get_v3 /foo | |
tester_each_member tester_get_v2 /foo | |
tester_each_member tester_cluster_health | |
echo | |
echo finished testing recovery from temporary cluster failure | |
echo | |
# Disaster recovery: Static cluster bootstrap from snapshots (ETCD_INITIAL_CLUSTER_STATE=new) | |
echo | |
echo started testing recovery from permanent cluster failure | |
echo | |
ETCD_INDEX=0 tester_put_v3 /foo VALUE_NOT_READ_v3 | |
ETCD_INDEX=0 tester_put_v2 /foo VALUE_NOT_READ_v2 | |
tester_each_member member_save_snapshot | |
ETCD_INDEX=0 tester_put_v3 /foo VALUE_NOT_READ_v3 | |
ETCD_INDEX=0 tester_put_v2 /foo VALUE_NOT_READ_v2 | |
ETCD_INDEX=1 member_save_snapshot | |
ETCD_INDEX=0 tester_put_v3 /foo VALUE_TO_BE_RESTORED_v3 | |
ETCD_INDEX=0 tester_put_v2 /foo VALUE_TO_BE_RESTORED_v2 | |
ETCD_INDEX=2 member_save_snapshot | |
ETCD_INDEX=0 tester_put_v3 /foo VALUE_TO_BE_LOST_v3 | |
ETCD_INDEX=0 tester_put_v2 /foo VALUE_TO_BE_LOST_v2 | |
# CAUTION: If we used each member's own snapshot to recover the member, the data could be inconsistent
# So ensure that all the members read from the same snapshot taken from a single member
echo reading values written just now... 1>&2 | |
tester_each_member tester_get_v3 /foo | |
tester_each_member tester_get_v2 /foo | |
tester_simulate_permanent_cluster_failure | |
tester_each_member tester_start_member | |
tester_each_member tester_wait_until_ready | |
echo reading values restored just now... 1>&2 | |
tester_each_member tester_get_v3 /foo | |
tester_each_member tester_assert_v2_key_missing /foo | |
tester_each_member tester_cluster_health | |
echo | |
echo finished testing recovery from permanent cluster failure | |
echo | |
echo | |
echo started testing recovery from temporary member failures | |
echo | |
for i in $(cluster_member_indices); do | |
setup() { | |
echo started testing $(member_name) | |
tester_simulate_temporary_member_failure | |
sleep 3 | |
tester_start_member | |
tester_wait_until_ready | |
sleep 3 | |
tester_cluster_health | |
tester_get_v3 /foo | |
tester_assert_v2_key_missing /foo | |
echo finished testing $(member_name) | |
} | |
ETCD_INDEX=$i setup | |
done | |
echo | |
echo finished testing recovery from temporary member failures | |
echo | |
#member_stop 0 | |
# (1) this procedure ends up with the well-known error: | |
# "panic: tocommit(19) is out of range [lastIndex(0)]. Was the raft log corrupted, truncated, or lost?" | |
#member_clean 0 | |
#member_set_initial_cluster_state_to_existing 0 | |
#member_start 0 | |
# (2) this procedure ends up with the well-known error: | |
# "panic: tocommit(19) is out of range [lastIndex(<non zero value>)]. Was the raft log corrupted, truncated, or lost?" | |
#member_restore 0 | |
#member_set_initial_cluster_state_to_existing 0 | |
#member_start 0 | |
# Dynamic reconfiguration: replace etcd members with corrupted data one by one (ETCD_INITIAL_CLUSTER_STATE=existing) | |
echo | |
echo started testing recovery from permanent member failures | |
echo | |
for i in $(cluster_member_indices); do | |
{ | |
export ETCD_INDEX=$i | |
echo started testing $(member_name) | |
tester_simulate_permanent_member_failure | |
sleep 3 | |
tester_start_member | |
tester_wait_until_ready | |
sleep 3 | |
tester_cluster_health | |
tester_get_v3 /foo | |
tester_assert_v2_key_missing /foo | |
echo finished testing $(member_name) | |
unset ETCD_INDEX | |
} | |
done | |
echo | |
echo finished testing recovery from permanent member failures | |
echo | |
echo all the tests passed. tailing log... | |
tester_cluster_tail | |
} | |
member_validate() { | |
if ! [ -d $(config_state_dir) ]; then | |
echo "panic! directory $(config_state_dir) does not exist" 1>&2 | |
exit 1 | |
fi | |
if ! sudo [ -w $(config_state_dir) ]; then | |
echo "panic! directory $(config_state_dir) is not writable from $USER" 1>&2 | |
exit 1 | |
fi | |
if ! [ -d $(member_host_snapshots_dir_path) ]; then | |
echo "panic! directory $(member_host_snapshots_dir_path) does not exist" 1>&2 | |
exit 1 | |
fi | |
if ! sudo [ -w $(member_host_snapshots_dir_path) ]; then | |
echo "panic! directory $(member_host_snapshots_dir_path) is not writable from $USER" 1>&2 | |
exit 1 | |
fi | |
if ! [ -d $(member_data_dir) ]; then | |
echo "panic! etcd data dir \"$(member_data_dir)\" does not exist" 1>&2 | |
exit 1
fi | |
if ! sudo [ -w $(member_data_dir) ]; then | |
echo "panic! etcd data dir \"$(member_data_dir)\" is not writable from $USER" 1>&2 | |
exit 1
fi | |
} | |
main() { | |
local cmd=$1 | |
case "${cmd}" in | |
"save" ) | |
member_save_snapshot | |
;; | |
"reconfigure" ) | |
member_reconfigure | |
;; | |
"check" ) | |
cluster_check | |
;; | |
* ) | |
if [ "$(type -t "$cmd")" == "function" ]; then | |
"$cmd" "${@:2}" | |
else | |
echo "Unexpected command: $cmd" 1>&2 | |
exit 1 | |
fi | |
;; | |
esac | |
} | |
set -o nounset | |
set -o errexit | |
set -o pipefail | |
IFS=$'\n\t' | |
main "$@" | |
exit $? | |
# Notes: | |
# If you see "panic: runtime error: index out of range" after restoring a node while forcing a new cluster | |
# Perhaps you're using too old etcd2. For me, it was 2.3.1. | |
# https://github.com/coreos/etcd/issues/6322 |