Last active
January 25, 2022 09:53
-
-
Save devgrok/f2bf2696776bb3e8e4b720cb12985821 to your computer and use it in GitHub Desktop.
Add ephemeral storage to Amazon ECS-optimized AMIs (v1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Content-Type: multipart/mixed; boundary="==BOUNDARY=="
MIME-Version: 1.0

--==BOUNDARY==
MIME-Version: 1.0
Content-Type: text/cloud-config

# this part ensures that ephemeral storage doesn't get mounted by cloud-init
mounts:
- [ ephemeral0 ]

--==BOUNDARY==
MIME-Version: 1.0
Content-Type: text/cloud-boothook; charset="us-ascii"

#!/bin/bash +x
# an early running script to setup scripts that get triggered after the ECS init service is started
#
# Write the bootstrap script to /opt/devgrok and run it - so output definitely logged
mkdir -p /opt/devgrok
# The heredoc delimiter is quoted, so the script body is written out verbatim
# (no variable or command expansion happens here).
cat > /opt/devgrok/bootstrap-devgrok-init.sh <<- 'EOF'
#!/bin/bash -x
mkdir -p /opt/devgrok/ecs-init-scripts/
yum install -y aws-cli
aws s3 sync s3://devgrok-blog-files/amazon-ecs/ecs-init-scripts/ /opt/devgrok/ecs-init-scripts/
chmod +x /opt/devgrok/ecs-init-scripts/*.sh

# setup aws cloudwatch
/opt/devgrok/ecs-init-scripts/install-cw-agent.sh
# only install the upstart job if it is not already present
[[ -e "/etc/init/devgrok-start-cloudwatch.conf" ]] || \
  cp /opt/devgrok/ecs-init-scripts/upstart-start-cloudwatch.conf /etc/init/devgrok-start-cloudwatch.conf

# setup storage
/opt/devgrok/ecs-init-scripts/docker-add-ephemeral.sh
initctl reload-configuration
EOF
chmod +x /opt/devgrok/bootstrap-devgrok-init.sh
# tee keeps the output visible to cloud-init and appends it to a dedicated log
/opt/devgrok/bootstrap-devgrok-init.sh 2>&1 | tee -a /var/log/bootstrap-devgrok-init.log

# this sets the maximum docker container size to 100GB
cloud-init-per once docker_options echo 'OPTIONS="${OPTIONS} --storage-opt dm.basesize=100G"' >> /etc/sysconfig/docker
--==BOUNDARY==--
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Content-Type: multipart/mixed; boundary="==BOUNDARY=="
MIME-Version: 1.0

--==BOUNDARY==
MIME-Version: 1.0
Content-Type: text/cloud-config

# this part ensures that ephemeral storage doesn't get mounted by cloud-init
mounts:
- [ ephemeral0 ]

--==BOUNDARY==
MIME-Version: 1.0
Content-Type: text/cloud-boothook; charset="us-ascii"

#!/bin/bash +x
# an early running script to setup scripts that get triggered after the ECS init service is started
#
# Write the bootstrap script to /opt/devgrok and run it - so output definitely logged
mkdir -p /opt/devgrok
# The heredoc delimiter is quoted, so the script body is written out verbatim
# (no variable or command expansion happens here).
cat > /opt/devgrok/bootstrap-devgrok-init.sh <<- 'EOF'
#!/bin/bash -x
mkdir -p /opt/devgrok/ecs-init-scripts/
yum install -y aws-cli
aws s3 sync s3://devgrok-blog-files/amazon-ecs/ecs-init-scripts/ /opt/devgrok/ecs-init-scripts/
chmod +x /opt/devgrok/ecs-init-scripts/*.sh

# setup aws cloudwatch
/opt/devgrok/ecs-init-scripts/install-cw-agent.sh
# only install the systemd unit if it is not already present
[[ -e "/etc/systemd/system/bootstrap-awslogs.service" ]] || \
  cp /opt/devgrok/ecs-init-scripts/systemd-bootstrap-awslogs.service /etc/systemd/system/bootstrap-awslogs.service

# setup storage
/opt/devgrok/ecs-init-scripts/docker-amzn-2-add-ephemeral.sh
EOF
chmod +x /opt/devgrok/bootstrap-devgrok-init.sh
# tee keeps the output visible to cloud-init and appends it to a dedicated log
/opt/devgrok/bootstrap-devgrok-init.sh 2>&1 | tee -a /var/log/bootstrap-devgrok-init.log

--==BOUNDARY==
Content-Type: text/x-shellscript; charset="us-ascii"

#!/bin/sh
# this runs later in the boot process
systemctl daemon-reload
systemctl enable bootstrap-awslogs.service
# the no-block is important as the ecs-init service may be dependent on the cloud-init boot process finishing first
systemctl start bootstrap-awslogs.service --no-block
--==BOUNDARY==--
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash -exu

#######################################
# Find ephemeral devices using the EC2 instance metadata service.
#
# Lists the block-device-mapping entries whose name contains "ephemeral",
# resolves each one to /dev/<name> and prints the path on stdout when it is
# a block device. Progress and diagnostics go to stderr so that stdout only
# carries device paths.
#
# NOTE(review): the original note says this blocks until the metadata service
# is ready; curl is invoked without retry options here, so confirm.
#
# Outputs: one device path per line on stdout.
# Returns: 0 if the last mapping inspected was a block device (or there were
#          no ephemeral mappings), 1 if the last mapping inspected was not.
#######################################
find_ephemeral_devices() {
  local metadata_url="http://169.254.169.254/latest/meta-data/block-device-mapping"
  # Loop state is local so it does not leak into the calling script.
  local mapping name device_name
  local DEVICE_NOT_FOUND=0

  echo "Querying metadata instance store for ephemeral volumes" >&2
  for mapping in $(curl -s "${metadata_url}/" | grep ephemeral); do
    name=$(curl -s "${metadata_url}/${mapping}")
    device_name="/dev/${name}"
    echo "Detected ephemeral device ${mapping} corresponding to ${device_name}" >&2
    # if block device (-b)
    if [[ -b "${device_name}" ]]; then
      echo "${device_name}"
      DEVICE_NOT_FOUND=0
    else
      echo "Couldn't find device ${device_name}" >&2
      DEVICE_NOT_FOUND=1
    fi
  done
  return "${DEVICE_NOT_FOUND}"
}
# On nvme instances the device names don't match what AWS reports, so just
# filter lsblk output by model name.
# Outputs: "/dev/<kernel name>" for every device whose model string contains
#          "Amazon EC2 NVMe Instance Storage", one per line.
find_nvme_ephemeral() {
  lsblk -o KNAME,MODEL \
    | awk '/Amazon EC2 NVMe Instance Storage/ { print "/dev/" $1 }'
}
# Collect candidate devices. find_ephemeral_devices returns non-zero when a
# reported mapping is not a block device; tolerate that here, otherwise the
# script (run with -e) would abort before the NVMe fallback is even tried.
METADATA_DEVICES=$(find_ephemeral_devices) || true
NVME_DEVICES=$(find_nvme_ephemeral)

# /dev/xvdcz is always kept as the last entry; discovered devices go in front.
DEVS=/dev/xvdcz
if [[ -n "$NVME_DEVICES" ]]; then
  DEVS="$NVME_DEVICES $DEVS"
fi
if [[ -n "$METADATA_DEVICES" ]]; then
  DEVS="$METADATA_DEVICES $DEVS"
fi

# The helper functions print one device per line. Collapse all whitespace to
# single spaces: a raw newline inside the sed replacement below would make the
# s command invalid.
# shellcheck disable=SC2086,SC2116 -- word splitting is intentional here
DEVS=$(echo $DEVS)

echo "Updating docker storage devices to '$DEVS'"
sed -i -e "s;DEVS=.*;DEVS=\"${DEVS}\";g" /etc/sysconfig/docker-storage-setup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
#-- | |
# Copyright 2014-2017 Red Hat, Inc. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
#++ | |
# Purpose: This script sets up the storage for container runtimes. | |
# Author: Andy Grimm <agrimm@redhat.com> | |
# | |
# | |
# Trimmed-down version that sets up storage based on ext4+overlay2 on the current
# AMI provided by AWS (Amazon ECS-optimized Amazon Linux 2 AMI).
# This replaces the LVM/thin-pool+devicemapper setup configured in the previous AMI
# (Amazon ECS-optimized Amazon Linux AMI).
# This is a heavily cut-down version of:
# https://github.com/projectatomic/container-storage-setup/blob/master/container-storage-setup.sh
# | |
# Author: Chris Watts <watts.chris@gmail.com> | |
set -ex

# Partition type related: disks larger than this many bytes get a GPT label.
_MAX_MBR_SIZE_BYTES="2199023255040"

# Scratch directory; removed again when the script exits so it does not leak.
_TEMPDIR=$(mktemp --tmpdir -d)
trap 'rm -rf -- "${_TEMPDIR:?}"' EXIT

# Keeps track of resolved device paths
_DEVS_RESOLVED=""

# Keeps track of if we created a volume group or not.
_VG_CREATED=

# State read by the functions below; initialised explicitly so that their
# starting values are visible in one place.
_VG_EXISTS=
_PVS=""
# Seconds to wait for a partition device node; empty or 0 disables waiting.
DEVICE_WAIT_TIMEOUT="${DEVICE_WAIT_TIMEOUT:-}"

VG=docker
LV_NAME=docker-volumes
LV_MOUNT_PATH=/var/lib/docker/volumes
WIPE_SIGNATURES=true
CREATE_VG=true
#######################################
# Find devices which aren't the root disk. Doesn't use the metadata service to
# find ephemeral storage, to speed up start-up time.
#
# Outputs: paths of all lsblk entries of TYPE "disk" other than the disk that
#          holds the "/" mount point, one per line, on stdout.
# Exits via Fatal when the root disk cannot be determined.
#######################################
function find_non_root_devices() {
  local root_part root_dev other_devices
  local name_filter='s/NAME="([^"]+)".*/\1/p'

  # partition mounted on "/" and the disk it belongs to
  root_part=$(lsblk -Pp | grep 'MOUNTPOINT="/"' | sed -nr "$name_filter")
  root_dev=$(lsblk -pno pkname "${root_part}")
  if [[ -z "$root_dev" ]]; then
    Fatal "Couldn't find root device, bailing"
  fi

  # filter out devices which aren't disks or are the root volume; the root
  # device is matched as a fixed string, not as a regular expression
  other_devices=$(lsblk -Pp \
    | grep 'TYPE="disk"' \
    | grep -vF "NAME=\"${root_dev}\"" \
    | sed -nr "$name_filter")

  Info "Found storage devices $other_devices to use"
  echo "$other_devices"
}
#######################################
# Wait for a device node for a certain time interval.
# Globals:   DEVICE_WAIT_TIMEOUT (read) - seconds to wait; empty or 0 means
#            "do not wait", in which case 0 is returned immediately.
# Arguments: $1 - device path
# Returns:   0 if the device exists (or waiting is disabled), 1 on timeout.
#######################################
wait_for_dev() {
  local devpath=$1
  local timeout=$DEVICE_WAIT_TIMEOUT
  local nap

  if [[ -b "$devpath" ]]; then
    Info "Device node $devpath exists."
    return 0
  fi

  if [[ -z "$DEVICE_WAIT_TIMEOUT" || "$DEVICE_WAIT_TIMEOUT" == "0" ]]; then
    Info "Not waiting for device $devpath as DEVICE_WAIT_TIMEOUT=${DEVICE_WAIT_TIMEOUT}."
    return 0
  fi

  # poll every 5 seconds; the final nap is shortened to the time remaining
  while (( timeout > 0 )); do
    Info "Waiting for device $devpath to be available. Wait time remaining is $timeout seconds"
    nap=$(( timeout <= 5 ? timeout : 5 ))
    sleep "$nap"
    timeout=$(( timeout - 5 ))
    if [[ -b "$devpath" ]]; then
      Info "Device node $devpath exists."
      return 0
    fi
  done

  Info "Timed out waiting for device $devpath"
  return 1
}
#######################################
# Derive a systemd mount unit file name from a mount path.
# Every "-" becomes the literal text \x2d, every "/" becomes "-", ".mount" is
# appended and the first character of the result is dropped.
# Arguments: $1 - mount path
# Outputs:   the unit file name on stdout
#######################################
systemd_escaped_filename () {
  local path=$1
  local dash_escape='\x2d'
  local escaped_path filename

  # Pure parameter expansion: the path is never word-split or globbed.
  escaped_path=${path//-/"$dash_escape"}
  filename="${escaped_path//\//-}.mount"
  printf '%s\n' "${filename:1}"
}
#######################################
# In the past we created a systemd mount target file, we no longer
# use it, but if one pre-existed we still need to handle it.
#
# If /etc/systemd/system/<unit> exists for the given mount point, the unit is
# disabled and stopped (when /usr/bin/systemctl is available) and the file is
# removed.
# Arguments: $1 - mount point
#######################################
remove_systemd_mount_target () {
  local mp=$1
  local filename unit_file

  # declared separately so a failure of the helper is not masked by 'local'
  filename=$(systemd_escaped_filename "$mp")
  unit_file="/etc/systemd/system/${filename}"

  if [[ -f "$unit_file" ]]; then
    if [[ -x /usr/bin/systemctl ]]; then
      # Best-effort clean-up: a unit that is already disabled or stopped must
      # not abort the caller.
      systemctl disable "$filename" >/dev/null 2>&1 || true
      systemctl stop "$filename" >/dev/null 2>&1 || true
      systemctl daemon-reload
    fi
    rm -f "$unit_file" >/dev/null 2>&1
  fi
}
#######################################
# Check a block device for existing signatures and wipe them if allowed.
# Globals:   WIPE_SIGNATURES (read) - "true" allows wiping.
# Arguments: $1 - block device
# Returns:   0 when the device has no signatures or they were wiped.
# Calls Fatal when signatures cannot be read or wiped, or when a signature is
# present and wiping is not allowed.
#######################################
check_wipe_block_dev_sig() {
  local bdev=$1
  # fields of the parsable wipefs output; local so they don't leak
  local sig offset uuid label type

  if ! sig=$(wipefs -p "$bdev"); then
    Fatal "Failed to check signatures on device $bdev"
  fi

  if [[ -z "$sig" ]]; then
    return 0
  fi

  if [[ "$WIPE_SIGNATURES" == "true" ]]; then
    Info "Wipe Signatures is set to true. Any signatures on $bdev will be wiped."
    if ! wipefs -a "$bdev"; then
      Fatal "Failed to wipe signatures on device $bdev"
    fi
    return 0
  fi

  # Wiping not allowed: report the first real signature (skip the header row).
  while IFS=, read -r offset uuid label type; do
    [[ "$offset" == "# offset" ]] && continue
    Fatal "Found $type signature on device ${bdev} at offset ${offset}. Wipe signatures using wipefs or use WIPE_SIGNATURES=true and retry."
  done <<< "$sig"
}
#######################################
# Decide which partition table type to use for a disk.
# Globals:   _MAX_MBR_SIZE_BYTES (read) - largest size that still gets "dos";
#            falls back to 2199023255040 when unset.
# Arguments: $1 - disk device
# Outputs:   "gpt" when the disk is larger than the limit, otherwise "dos".
# Calls Fatal when the disk size cannot be determined.
#######################################
determine_partition_type() {
  local dev="$1" size_bytes part_type
  local max_mbr_bytes="${_MAX_MBR_SIZE_BYTES:-2199023255040}"

  if ! size_bytes=$(blockdev --getsize64 "$dev"); then
    Fatal "Failed to determine size of disk $dev"
  fi

  if (( size_bytes > max_mbr_bytes )); then
    part_type="gpt"
  else
    part_type="dos"
  fi
  echo "$part_type"
}
#######################################
# Create a single partition spanning the whole device using sfdisk.
# Arguments: $1 - disk device
#            $2 - partition table type ("gpt" or anything else for dos)
# Returns:   sfdisk's exit status, or 1 when the disk size cannot be read
#            from /proc/partitions (dos only).
#######################################
create_partition_sfdisk(){
  local dev="$1" part_type="$2" size part_label
  local dev_name blocks

  # Use a single partition of a whole device
  # TODO:
  #   * Consider gpt, or unpartitioned volumes
  #   * Error handling when partition(s) already exist
  #   * Deal with loop/nbd device names. See growpart code
  if [[ "$part_type" == "gpt" ]]; then
    # Linux LVM GUID for GPT. Taken from Wiki.
    part_label="E6D6D379-F507-44C2-A23C-238F2A3DF928"
    # Create as big a partition as possible.
    size=""
  else
    part_label="8e"
    dev_name=$(basename "$dev")
    # Exact name comparison: a pattern match on e.g. "sda" would also select
    # "sda1" or "sdaa" and yield several values.
    blocks=$(awk -v name="$dev_name" '$4 == name { print $3 }' /proc/partitions)
    if [[ -z "$blocks" ]]; then
      Error "Could not determine size of $dev from /proc/partitions"
      return 1
    fi
    # size is the third /proc/partitions column doubled, minus the 2048-sector
    # start offset used below
    size=$(( blocks * 2 - 2048 ))
  fi

  printf 'unit: sectors\nlabel: %s\n2048,%s,%s\n' \
    "$part_type" "$size" "$part_label" | sfdisk "$dev"
}
#######################################
# Create a single LVM-flagged partition spanning the whole device with parted.
# Arguments: $1 - disk device
#            $2 - partition table type ("gpt" or anything else for msdos)
# Returns:   parted's exit status.
#######################################
create_partition_parted(){
  local dev="$1"
  local part_type="$2"
  local -a label_args

  # Only the label and the partition name/type differ between the two layouts.
  if [[ "$part_type" == "gpt" ]]; then
    label_args=(mklabel gpt mkpart "container-partition")
  else
    label_args=(mklabel msdos mkpart primary)
  fi

  parted "$dev" --script "${label_args[@]}" 0% 100% set 1 lvm on
}
#######################################
# Partition a disk with one partition and wait for its device node.
# Uses parted when /usr/sbin/parted is executable, sfdisk otherwise.
# Arguments: $1 - disk device
# Calls Fatal when the partition type cannot be determined, udev does not
# settle, no partition shows up, or the partition node does not appear.
#######################################
create_partition() {
  local dev="$1" part part_type

  # A Fatal inside the command substitution only ends the subshell, so the
  # status has to be checked here.
  if ! part_type=$(determine_partition_type "$dev"); then
    Fatal "Failed to determine partition type for device $dev"
  fi

  if [[ -x "/usr/sbin/parted" ]]; then
    create_partition_parted "$dev" "$part_type"
  else
    create_partition_sfdisk "$dev" "$part_type"
  fi

  # Sometimes on slow storage it takes a while for partition device to
  # become available. Wait for device node to show up.
  if ! udevadm settle; then
    Fatal "udevadm settle after partition creation failed. Exiting."
  fi

  part=$(dev_query_first_child "$dev")
  if [[ -z "$part" ]]; then
    Fatal "No partition found on device $dev after partitioning"
  fi

  if ! wait_for_dev "${part}"; then
    Fatal "Partition device ${part} is not available"
  fi
}
# Print the second line of "lsblk -npl -o NAME <device>", i.e. the entry that
# follows the device itself (its first child); prints nothing when the listing
# has fewer than two lines.
# Arguments: $1 - device
dev_query_first_child() {
  lsblk -npl -o NAME "$1" | awk 'NR == 2 { print; exit }'
}
#######################################
# Partition every given disk and turn the new partition into an LVM physical
# volume.
# Globals:   _PVS (appended) - whitespace-separated list of created PVs.
# Arguments: $1 - whitespace-separated list of disk devices
# Calls Fatal when zeroing, wiping or pvcreate fails.
#######################################
create_disk_partitions() {
  local devs="$1" dev part

  # $devs is deliberately unquoted: it is a whitespace-separated list.
  for dev in $devs; do
    # wipefs /dev/disk does not wipe any lvm signatures which might be
    # present on /dev/diskpart1. This signature will become visible to
    # lvm udev rules and will kickstart volume creation as soon as partition
    # is created and race with further partition commands like wipefs,
    # pvcreate etc. So zero out first few MB of disk in an attempt to
    # wipe any lvm signatures on first partition.
    #
    # By now we have ownership of disk and we have checked there are no
    # signatures on disk or signatures have been wiped. Don't care about
    # any signatures now on in the middle of disk.
    Info "Writing zeros to first 4MB of device $dev"
    if ! dd if=/dev/zero of="$dev" bs=1M count=4; then
      Fatal "Failed to zero first 4MB of device $dev"
    fi

    create_partition "$dev"
    part=$(dev_query_first_child "$dev")

    # It now seems unnecessary to do wipefs on partition given we already
    # zeroed out first 4MB. Only time it will be required if partition
    # starts beyond 4MB. Keep it for now.
    if ! wipefs -f -a "${part}"; then
      Fatal "Failed to wipe signatures on device ${part}"
    fi

    if ! pvcreate "${part}"; then
      Fatal "Failed to create physical volume on device ${part}"
    fi
    _PVS="$_PVS ${part}"
  done
}
#######################################
# Create the volume group from the collected physical volumes, or extend it
# when it is already known to exist.
# Globals:   VG (read), _PVS (read), _VG_EXISTS (read/written),
#            _VG_CREATED (written)
# Returns:   1 when there are no physical volumes to add; otherwise the status
#            of vgcreate/vgextend.
#######################################
create_extend_volume_group() {
  # Nothing to do without PVs; vgcreate/vgextend would only fail with a
  # less helpful message.
  if [[ -z "${_PVS//[[:space:]]/}" ]]; then
    Error "No physical volumes available for volume group $VG"
    return 1
  fi

  # $_PVS is deliberately unquoted below: it is a whitespace-separated list.
  if [[ -z "$_VG_EXISTS" ]]; then
    # shellcheck disable=SC2086
    vgcreate "$VG" $_PVS
    _VG_CREATED=1
    _VG_EXISTS=1
  else
    # TODO:
    #   * Error handling when PV is already part of a VG
    # shellcheck disable=SC2086
    vgextend "$VG" $_PVS
  fi
}
#######################################
# Print the mount points of a logical volume that start with a given path.
# Arguments: $1 - volume group
#            $2 - logical volume name
#            $3 - mount directory (compared as a literal prefix)
# Outputs:   matching mount points on one line, separated by spaces; an empty
#            line when there are none.
#######################################
extra_lv_mountpoint() {
  local vg=$1
  local lv_name=$2
  local mount_dir=$3
  local target
  local -a mounts=()

  # Literal prefix comparison: characters such as "." in the path are not
  # treated as regular-expression metacharacters.
  while IFS= read -r target; do
    if [[ "$target" == "$mount_dir"* ]]; then
      mounts+=("$target")
    fi
  done < <(findmnt -n -o TARGET --source "/dev/$vg/$lv_name")

  echo "${mounts[*]}"
}
#######################################
# Mount a logical volume as ext4 on a directory unless it is already mounted
# there. Any old systemd mount unit for the directory is removed first.
# Arguments: $1 - volume group
#            $2 - logical volume name
#            $3 - mount directory
# Returns:   the status of mount when a mount is attempted, 0 otherwise.
#######################################
mount_extra_volume() {
  local vg=$1
  local lv_name=$2
  local mount_dir=$3
  local mounts

  remove_systemd_mount_target "$mount_dir"

  mounts=$(extra_lv_mountpoint "$vg" "$lv_name" "$mount_dir")
  if [[ -z "$mounts" ]]; then
    mount -t ext4 "/dev/$vg/$lv_name" "$mount_dir"
  fi
}
#######################################
# Create a striped logical volume in $VG.
# Globals:   DEVS (read) - whitespace-separated device list; its length is
#            used as the stripe count. VG (read).
# Arguments: $1 - size; a value containing "%" is passed as extents (-l),
#                 anything else as an absolute size (-L)
#            $2 - logical volume name
# Returns:   0 on success, 1 when lvcreate fails.
#######################################
create_lv() {
  local data_size=$1
  local data_lv_name=$2
  local -a stripe_devs size_opt
  local stripes

  # Count devices by word so that both newline- and space-separated lists give
  # the right stripe count.
  # shellcheck disable=SC2206 -- word splitting is intentional
  stripe_devs=( $DEVS )
  stripes=${#stripe_devs[@]}
  if (( stripes < 1 )); then
    stripes=1
  fi

  # striping like this means the max volume group size = min(of each device) * count(devices)
  if [[ $data_size == *%* ]]; then
    size_opt=(-l "$data_size")
  else
    size_opt=(-L "$data_size")
  fi

  lvcreate -y "${size_opt[@]}" -n "$data_lv_name" --stripes "$stripes" "$VG" || return 1
  return 0
}
#######################################
# Create a logical volume, format it as ext4, mount it and restore the SELinux
# context of the mount directory.
# Globals:   VG (read)
# Arguments: $1 - logical volume name
#            $2 - mount directory
#            $3 - logical volume size
# Returns:   1 when restoring the SELinux context fails.
# Calls Fatal when creating, formatting or mounting the volume fails.
#######################################
setup_extra_volume() {
  local lv_name=$1
  local mount_dir=$2
  local lv_size=$3
  local lv_device="/dev/$VG/$lv_name"

  if ! create_lv "$lv_size" "$lv_name"; then
    Fatal "Failed to create volume $lv_name of size ${lv_size}."
  fi

  if ! mkfs -t ext4 "$lv_device" > /dev/null; then
    Fatal "Failed to create filesystem on /dev/$VG/${lv_name}."
  fi

  if ! mount_extra_volume "$VG" "$lv_name" "$mount_dir"; then
    Fatal "Failed to mount volume ${lv_name} on ${mount_dir}"
  fi

  # setup right selinux label first time fs is created. Mount operation
  # changes the label of directory to reflect the label on root inode
  # of mounted fs.
  if ! restore_selinux_context "$mount_dir"; then
    return 1
  fi
}
# Recursively restore the SELinux context of a directory with restorecon.
# Arguments: $1 - directory
# Returns:   1 (after logging an error) when restorecon fails, 0 otherwise.
restore_selinux_context() {
  local dir=$1

  if ! restorecon -R "$dir"; then
    Error "restorecon -R $dir failed."
    return 1
  fi
}
#######################################
# Check/wipe signatures on every device in DEVS, partition the devices and
# create or extend the volume group.
# Globals:   DEVS (read), VG (read), _DEVS_RESOLVED (written)
#######################################
partition_disks_create_vg() {
  local dev_list dev

  # If there is no volume group specified or no root volume group, there is
  # nothing to do in terms of dealing with disks.
  if [[ -n "$DEVS" && -n "$VG" ]]; then
    _DEVS_RESOLVED="${DEVS}"   # already resolved
    dev_list="$_DEVS_RESOLVED" # wipe all disks

    # $dev_list is deliberately unquoted: it is a whitespace-separated list.
    for dev in $dev_list; do
      check_wipe_block_dev_sig "$dev"
    done

    create_disk_partitions "$dev_list"
    create_extend_volume_group
  fi
}
#######################################
# Entry point: build the volume group and the logical volume on all non-root
# disks, unless the volume group already exists.
# Globals:   DEVS (written), VG, LV_NAME, LV_MOUNT_PATH (read)
# Exits with status 0 when there are no extra disks to use.
#
# NOTE(review): when the volume group already exists nothing is created or
# mounted here - confirm that the logical volume gets mounted some other way
# in that case.
#######################################
function run_command_create() {
  Info "Doing new script mountain volume group creation"

  DEVS="$(find_non_root_devices)"
  if [[ -z "$DEVS" ]]; then
    # allow it to be run on instance without extra storage
    Info "no devices to use so skipping volume storage creation"
    exit 0
  fi

  if vg_exists "$VG"; then
    Info "Volume group '$VG' already exists, skipping an actions"
  else
    Info "Checking/creating volume group"
    partition_disks_create_vg
    vgdisplay
    echo ""

    Info "Creating logical volume"
    setup_extra_volume "$LV_NAME" "$LV_MOUNT_PATH" '100%VG'
  fi

  lvdisplay
  Info "Done"
}
# source library (borrowed from container-storage-setup); it is expected to
# live next to this script. Quoted so that a path containing spaces works.
# shellcheck source=/dev/null
source "$(dirname "$0")/libcss.sh"

# make sure lvm is installed
yum install -y lvm2

if [[ "$CREATE_VG" == "true" ]]; then
  # ensure mount point exists
  mkdir -p "${LV_MOUNT_PATH}"
  run_command_create
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# Library for common functions (copied from https://github.com/projectatomic/container-storage-setup/blob/master/libcss.sh) | |
# Logging helpers. All of them write to stderr, because stdout is used to
# pass back output from bash functions.

# echo info messages on stderr
# Arguments: $1 - message
Info() {
  printf '%s\n' "INFO: ${1}" >&2
}

# echo warn messages on stderr
# Arguments: $1 - message
Warn() {
  printf '%s\n' "WARN: ${1}" >&2
}

# echo error messages on stderr
# Arguments: $1 - message
Error() {
  printf '%s\n' "ERROR: ${1}" >&2
}

# echo error on stderr and exit with error code 1
# Arguments: $1 - message
Fatal() {
  Error "${1}"
  exit 1
}
# Check if passed in vg exists.
# Arguments: $1 - volume group name
# Returns:   0 if "vgs" lists a volume group with exactly that name, else 1.
vg_exists() {
  local vg=$1
  local vg_name

  # vgs pads its output with whitespace; word splitting strips the padding.
  for vg_name in $(vgs --noheadings -o vg_name); do
    if [[ "$vg_name" == "$vg" ]]; then
      return 0
    fi
  done
  return 1
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment