Skip to content

Instantly share code, notes, and snippets.

@devgrok
Last active January 25, 2022 09:53
Show Gist options
  • Save devgrok/f2bf2696776bb3e8e4b720cb12985821 to your computer and use it in GitHub Desktop.
Save devgrok/f2bf2696776bb3e8e4b720cb12985821 to your computer and use it in GitHub Desktop.
Add ephemeral storage to Amazon ECS-optimized AMIs (v1)
Content-Type: multipart/mixed; boundary="==BOUNDARY=="
MIME-Version: 1.0
--==BOUNDARY==
MIME-Version: 1.0
Content-Type: text/cloud-config
# this config ensures that ephemeral storage doesn't get mounted by cloud-init
mounts:
- [ ephemeral0 ]
--==BOUNDARY==
MIME-Version: 1.0
Content-Type: text/cloud-boothook; charset="us-ascii"
#!/bin/bash +x
# An early-running cloud-boothook part: it prepares the helper scripts that get
# triggered after the ECS init service is started.
#
# Write the bootstrap script to /opt/devgrok and run it, so that its output is
# definitely logged.
mkdir -p /opt/devgrok
# The quoted 'EOF' delimiter stops this shell from expanding anything inside the
# here-document, so the lines up to EOF are written to the file verbatim.
cat > /opt/devgrok/bootstrap-devgrok-init.sh <<- 'EOF'
#!/bin/bash -x
mkdir -p /opt/devgrok/ecs-init-scripts/
yum install -y aws-cli
aws s3 sync s3://devgrok-blog-files/amazon-ecs/ecs-init-scripts/ /opt/devgrok/ecs-init-scripts/
chmod +x /opt/devgrok/ecs-init-scripts/*.sh
# setup aws cloudwatch
/opt/devgrok/ecs-init-scripts/install-cw-agent.sh
[[ -e "/etc/init/devgrok-start-cloudwatch.conf" ]] || \
cp /opt/devgrok/ecs-init-scripts/upstart-start-cloudwatch.conf /etc/init/devgrok-start-cloudwatch.conf
# setup storage
/opt/devgrok/ecs-init-scripts/docker-add-ephemeral.sh
initctl reload-configuration
EOF
chmod +x /opt/devgrok/bootstrap-devgrok-init.sh
# Run the generated script; stdout and stderr are merged, shown, and also
# appended to the log file by tee.
/opt/devgrok/bootstrap-devgrok-init.sh 2>&1 | tee -a /var/log/bootstrap-devgrok-init.log
# This sets the maximum docker container size to 100GB by appending a
# dm.basesize storage option to the docker OPTIONS line.
# NOTE(review): the `>>` redirection is performed by this shell, not by
# cloud-init-per; only the echo is gated by the `once docker_options` arguments
# (presumably "run once per instance" - confirm against cloud-init-per docs).
cloud-init-per once docker_options echo 'OPTIONS="${OPTIONS} --storage-opt dm.basesize=100G"' >> /etc/sysconfig/docker
--==BOUNDARY==--
Content-Type: multipart/mixed; boundary="==BOUNDARY=="
MIME-Version: 1.0
--==BOUNDARY==
MIME-Version: 1.0
Content-Type: text/cloud-config
# this config ensures that ephemeral storage doesn't get mounted by cloud-init
mounts:
- [ ephemeral0 ]
--==BOUNDARY==
MIME-Version: 1.0
Content-Type: text/cloud-boothook; charset="us-ascii"
#!/bin/bash +x
# An early-running cloud-boothook part: it prepares the helper scripts that get
# triggered after the ECS init service is started.
#
# Write the bootstrap script to /opt/devgrok and run it, so that its output is
# definitely logged.
mkdir -p /opt/devgrok
# The quoted 'EOF' delimiter stops this shell from expanding anything inside the
# here-document, so the lines up to EOF are written to the file verbatim.
cat > /opt/devgrok/bootstrap-devgrok-init.sh <<- 'EOF'
#!/bin/bash -x
mkdir -p /opt/devgrok/ecs-init-scripts/
yum install -y aws-cli
aws s3 sync s3://devgrok-blog-files/amazon-ecs/ecs-init-scripts/ /opt/devgrok/ecs-init-scripts/
chmod +x /opt/devgrok/ecs-init-scripts/*.sh
# setup aws cloudwatch
/opt/devgrok/ecs-init-scripts/install-cw-agent.sh
[[ -e "/etc/systemd/system/bootstrap-awslogs.service" ]] || \
cp /opt/devgrok/ecs-init-scripts/systemd-bootstrap-awslogs.service /etc/systemd/system/bootstrap-awslogs.service
# setup storage
/opt/devgrok/ecs-init-scripts/docker-amzn-2-add-ephemeral.sh
EOF
chmod +x /opt/devgrok/bootstrap-devgrok-init.sh
# Run the generated script; stdout and stderr are merged, shown, and also
# appended to the log file by tee.
/opt/devgrok/bootstrap-devgrok-init.sh 2>&1 | tee -a /var/log/bootstrap-devgrok-init.log
--==BOUNDARY==
Content-Type: text/x-shellscript; charset="us-ascii"
#!/bin/sh
# This part runs later in the boot process than the boothook part.
# Reload unit files so systemd sees bootstrap-awslogs.service (the boothook
# part copies that unit file into /etc/systemd/system).
systemctl daemon-reload
systemctl enable bootstrap-awslogs.service
# The --no-block is important, as the ecs-init service may be dependent on the
# cloud-init boot process finishing first; without it this start call would
# wait for the unit to finish starting.
systemctl start bootstrap-awslogs.service --no-block
--==BOUNDARY==--
#!/bin/bash -exu
#######################################
# Find ephemeral (instance-store) devices using the EC2 metadata service.
# Every lookup is a blocking curl call against 169.254.169.254.
# Globals:   none (all working variables are local)
# Arguments: none
# Outputs:   stdout - one "/dev/<name>" line per ephemeral mapping that exists
#                     as a block device
#            stderr - progress and "not found" diagnostics
# Returns:   0 if every reported ephemeral mapping exists as a block device
#            (or none were reported), 1 if at least one of them is missing.
#            The status is sticky: a later hit does not hide an earlier miss.
#######################################
find_ephemeral_devices() {
  local metadata_url="http://169.254.169.254/latest/meta-data/block-device-mapping"
  local mapping name device_name
  local device_not_found=0

  echo "Querying metadata instance store for ephemeral volumes" >&2
  # Word splitting of the listing is intentional: one mapping name per word.
  for mapping in $(curl -s "${metadata_url}/" | grep ephemeral); do
    name=$(curl -s "${metadata_url}/${mapping}")
    device_name="/dev/${name}"
    echo "Detected ephemeral device ${mapping} corresponding to ${device_name}" >&2
    # Only report names that really exist as block devices (-b).
    if [[ -b "${device_name}" ]]; then
      echo "${device_name}"
    else
      echo "Couldn't find device ${device_name}" >&2
      device_not_found=1
    fi
  done
  return "${device_not_found}"
}
# On NVMe instances the device names don't match what AWS reports, so pick the
# devices by their model string instead.
# Outputs: stdout - "/dev/<kname>" for every lsblk row whose MODEL column
#          contains "Amazon EC2 NVMe Instance Storage".
find_nvme_ephemeral() {
  lsblk -o KNAME,MODEL \
    | awk '/Amazon EC2 NVMe Instance Storage/ { print "/dev/" $1 }'
}
# Collect candidate devices from both discovery methods.
# find_ephemeral_devices returns non-zero when a reported device node is
# missing (the normal case on NVMe instances, where the names do not match).
# The script runs with -e, so that status must be handled here or the script
# would stop before the NVMe devices are applied.
METADATA_DEVICES=$(find_ephemeral_devices) \
  || echo "Not every metadata-reported ephemeral device was found, continuing" >&2
NVME_DEVICES=$(find_nvme_ephemeral)

# /dev/xvdcz is always kept as the last entry of the list.
DEVS=/dev/xvdcz
if [[ -n "$NVME_DEVICES" ]]; then
  DEVS="$NVME_DEVICES $DEVS"
fi
if [[ -n "$METADATA_DEVICES" ]]; then
  DEVS="$METADATA_DEVICES $DEVS"
fi

# Both helpers print one device per line; a newline inside the sed expression
# below would terminate the s command early, so flatten the list to single
# spaces first.
DEVS=$(printf '%s' "$DEVS" | tr -s '[:space:]' ' ')

echo "Updating docker storage devices to '$DEVS'"
sed -i -e "s;DEVS=.*;DEVS=\"${DEVS}\";g" /etc/sysconfig/docker-storage-setup
#!/bin/bash
#--
# Copyright 2014-2017 Red Hat, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#++
# Purpose: This script sets up the storage for container runtimes.
# Author: Andy Grimm <agrimm@redhat.com>
#
#
# Trimmed-down version that sets up storage based on ext4+overlay2 on the current
# AMI provided by AWS (Amazon ECS-optimized Amazon Linux 2 AMI).
# This is instead of the LVM/thin-pool+devicemapper setup configured in the previous AMI
# (Amazon ECS-optimized Amazon Linux AMI).
# This is a heavily cut-down version of:
# https://github.com/projectatomic/container-storage-setup/blob/master/container-storage-setup.sh
#
# Author: Chris Watts <watts.chris@gmail.com>
# Abort on the first failing command and trace every command that is run.
set -ex

# Partition type related.
# Disks larger than this many bytes (2 TiB minus 512 bytes) get a "gpt" label
# in determine_partition_type; everything else gets a "dos" label.
_MAX_MBR_SIZE_BYTES="2199023255040"

# Scratch directory. It is created on every run, so remove it again on exit
# rather than leaving one behind in the temp dir each time the script runs.
_TEMPDIR=$(mktemp --tmpdir -d)
_remove_tempdir() {
  if [[ -n "${_TEMPDIR:-}" && -d "${_TEMPDIR}" ]]; then
    rm -rf -- "${_TEMPDIR}"
  fi
}
trap _remove_tempdir EXIT

# Keeps track of resolved device paths
_DEVS_RESOLVED=""
# Keeps track of if we created a volume group or not.
_VG_CREATED=

# Name of the LVM volume group and of the logical volume created inside it.
VG=docker
LV_NAME=docker-volumes
# Where the logical volume gets mounted.
LV_MOUNT_PATH=/var/lib/docker/volumes
# "true": existing signatures on the disks are wiped instead of aborting.
WIPE_SIGNATURES=true
# "true": the main section of this script runs the volume group creation.
CREATE_VG=true
#######################################
# Find disks which do not hold the root partition. The metadata service is not
# used to find ephemeral devices, to speed up start-up time.
# Globals:   none read; enables the nullglob shell option as a side effect
# Arguments: none
# Outputs:   stdout - one device path per line (may be empty)
#            stderr - Info/Fatal messages
# Returns:   0 on success. Fatal is called when the root partition or its
#            parent disk cannot be determined; inside a command substitution
#            that only ends the subshell, so the caller then sees empty output
#            and a non-zero status.
#######################################
function find_non_root_devices() {
  local root_part root_dev other_devices
  shopt -s nullglob

  root_part=$(lsblk -Pp | grep 'MOUNTPOINT="/"' | sed -nr 's/NAME="([^"]+)".*/\1/p')
  # Without this check an empty root_part would make the lsblk call below list
  # the parents of *every* device, and the "no root device" test would pass.
  if [[ -z "$root_part" ]]; then
    Fatal "Couldn't find root device, bailing"
  fi

  root_dev=$(lsblk -pno pkname "$root_part")
  if [[ -z "$root_dev" ]]; then
    Fatal "Couldn't find root device, bailing"
  fi

  # filter out devices which aren't disks or are the root volume
  other_devices=$(lsblk -Pp \
    | grep 'TYPE="disk"' \
    | grep -vF "NAME=\"${root_dev}\"" \
    | sed -nr 's/NAME="([^"]+)".*/\1/p')
  Info "Found storage devices $other_devices to use"
  echo "$other_devices"
}
# Poll for a block device node, sleeping at most 5 seconds between checks.
# Globals:   DEVICE_WAIT_TIMEOUT (read) - total seconds to wait; when it is
#            empty or "0" no waiting is done at all and 0 is returned
# Arguments: $1 - path of the device node
# Returns:   0 if the node exists (or waiting is disabled), 1 on timeout
wait_for_dev() {
  local devpath=$1
  local remaining=$DEVICE_WAIT_TIMEOUT
  local nap

  if [ -b "$devpath" ]; then
    Info "Device node $devpath exists."
    return 0
  fi

  case "$DEVICE_WAIT_TIMEOUT" in
    "" | 0)
      Info "Not waiting for device $devpath as DEVICE_WAIT_TIMEOUT=${DEVICE_WAIT_TIMEOUT}."
      return 0
      ;;
  esac

  while [ $remaining -gt 0 ]; do
    Info "Waiting for device $devpath to be available. Wait time remaining is $remaining seconds"
    # Sleep for whatever is left of the budget, capped at 5 seconds.
    nap=5
    if [ $remaining -le 5 ]; then
      nap=$remaining
    fi
    sleep $nap
    remaining=$((remaining-5))
    if [ -b "$devpath" ]; then
      Info "Device node $devpath exists."
      return 0
    fi
  done

  Info "Timed out waiting for device $devpath"
  return 1
}
#######################################
# Turn a mount path into the matching systemd mount unit file name:
# "-" becomes "\x2d", "/" becomes "-", the first character (the leading "/")
# is dropped and ".mount" is appended.
# Arguments: $1 - absolute mount path
# Outputs:   stdout - the unit file name
#######################################
systemd_escaped_filename () {
  local path=$1
  # printf with a quoted argument keeps the path intact; an unquoted echo would
  # collapse runs of whitespace and expand glob characters in the path.
  printf '%s.mount\n' "$path" \
    | sed -e 's|-|\\x2d|g' -e 's|/|-|g' \
    | cut -b 2-
}
#
# Earlier versions created a systemd mount unit file. That is no longer done,
# but a unit file left over from such a version still has to be cleaned up.
# Arguments: $1 - mount path whose unit file should be removed
#
remove_systemd_mount_target () {
  local mp=$1
  local unit_name=$(systemd_escaped_filename $mp)

  # Nothing to clean up when no unit file exists for this mount path.
  [ -f /etc/systemd/system/$unit_name ] || return 0

  if [ -x /usr/bin/systemctl ]; then
    systemctl disable $unit_name >/dev/null 2>&1
    systemctl stop $unit_name >/dev/null 2>&1
    systemctl daemon-reload
  fi
  rm -f /etc/systemd/system/$unit_name >/dev/null 2>&1
}
#######################################
# Check a block device for existing signatures and wipe them if allowed.
# Globals:   WIPE_SIGNATURES (read) - "true" permits wiping
# Arguments: $1 - block device path
# Returns:   0 when the device has no signatures or they were wiped.
#            Calls Fatal when wipefs fails, or when signatures are found and
#            WIPE_SIGNATURES is not "true".
#######################################
check_wipe_block_dev_sig() {
  local bdev=$1
  # The read loop below runs in the current shell (here-string), so its
  # variables must be local or they would leak into the caller.
  local sig offset uuid label type

  if ! sig=$(wipefs -p "$bdev"); then
    Fatal "Failed to check signatures on device $bdev"
  fi

  if [[ -z "$sig" ]]; then
    return 0
  fi

  if [[ "$WIPE_SIGNATURES" == "true" ]]; then
    Info "Wipe Signatures is set to true. Any signatures on $bdev will be wiped."
    if ! wipefs -a "$bdev"; then
      Fatal "Failed to wipe signatures on device $bdev"
    fi
    return 0
  fi

  # wipefs -p prints comma-separated rows; skip the "# offset,..." header row
  # and report the first real signature.
  while IFS=, read -r offset uuid label type; do
    [[ "$offset" == "# offset" ]] && continue
    Fatal "Found $type signature on device ${bdev} at offset ${offset}. Wipe signatures using wipefs or use WIPE_SIGNATURES=true and retry."
  done <<< "$sig"
}
# Pick the partition table type for a disk from its size.
# Globals:   _MAX_MBR_SIZE_BYTES (read)
# Arguments: $1 - disk device path
# Outputs:   stdout - "gpt" when the disk is larger than _MAX_MBR_SIZE_BYTES,
#            otherwise "dos"
# Calls Fatal when the disk size cannot be read.
determine_partition_type() {
  local dev="$1" size_bytes
  local part_type="dos"

  size_bytes=$(blockdev --getsize64 "$dev") \
    || Fatal "Failed to determine size of disk $dev"

  if [ $size_bytes -gt $_MAX_MBR_SIZE_BYTES ]; then
    part_type="gpt"
  fi
  echo $part_type
}
#######################################
# Create one LVM partition spanning a whole disk by feeding a script to sfdisk.
# Arguments: $1 - disk device path
#            $2 - partition table type, "gpt" or anything else for MBR
# Outputs:   whatever sfdisk prints
# Returns:   exit status of sfdisk. Calls Fatal when the disk size cannot be
#            read from /proc/partitions (MBR case only).
#######################################
create_partition_sfdisk(){
  local dev="$1" part_type="$2" size part_label dev_name disk_blocks
  # Use a single partition of a whole device
  # TODO:
  # * Consider gpt, or unpartitioned volumes
  # * Error handling when partition(s) already exist
  # * Deal with loop/nbd device names. See growpart code
  if [[ "$part_type" == "gpt" ]]; then
    # Linux LVM GUID for GPT. Taken from Wiki.
    part_label="E6D6D379-F507-44C2-A23C-238F2A3DF928"
    # Create as big a partition as possible.
    size=""
  else
    part_label="8e"
    dev_name=$(basename "$dev")
    # Compare the name column for equality. A regex match on the name would
    # also pick up siblings such as "sdba" or "sdb1" when looking for "sdb",
    # yield several numbers and break the arithmetic below.
    disk_blocks=$(awk -v dev_name="$dev_name" '$4 == dev_name { print $3 }' /proc/partitions)
    if [[ ! "$disk_blocks" =~ ^[0-9]+$ ]]; then
      Fatal "Failed to determine size of disk $dev"
    fi
    # #blocks * 2 gives 512-byte sectors (assuming 1 KiB blocks as documented
    # in proc(5) - confirm); 2048 sectors are left free before the partition.
    size=$(( disk_blocks * 2 - 2048 ))
  fi

  printf 'unit: sectors\nlabel: %s\n2048,%s,%s\n' \
    "$part_type" "$size" "$part_label" \
    | sfdisk "$dev"
}
# Create one LVM-flagged partition spanning a whole disk with parted.
# Arguments: $1 - disk device path
#            $2 - "gpt" for a GPT label; any other value gives an msdos label
create_partition_parted(){
  local dev="$1" part_type="$2"

  case "$part_type" in
    gpt)
      parted $dev --script mklabel gpt mkpart "container-partition" 0% 100% set 1 lvm on
      ;;
    *)
      parted $dev --script mklabel msdos mkpart primary 0% 100% set 1 lvm on
      ;;
  esac
}
# Partition a disk and wait until the new partition's device node shows up.
# parted is used when /usr/sbin/parted is executable, sfdisk otherwise.
# Arguments: $1 - disk device path
# Calls Fatal when udevadm settle fails or the partition node never appears.
create_partition() {
  local dev="$1" part part_type
  local partitioner=create_partition_sfdisk

  part_type=$(determine_partition_type "$dev")
  if [ -x "/usr/sbin/parted" ]; then
    partitioner=create_partition_parted
  fi
  $partitioner $dev "$part_type"

  # On slow storage the partition device can take a while to become
  # available, so let udev finish before looking for the device node.
  udevadm settle \
    || Fatal "udevadm settle after partition creation failed. Exiting."

  part=$(dev_query_first_child $dev)
  wait_for_dev ${part} \
    || Fatal "Partition device ${part} is not available"
}
# Print the second line of lsblk's device list for $1, i.e. the entry that
# follows the device itself (its first child). Prints nothing when lsblk
# lists only one line.
dev_query_first_child() {
  lsblk -npl -o NAME "$1" | sed -n '2{p;q;}'
}
# Prepare every listed disk as an LVM physical volume: zero its start,
# partition it, wipe the new partition and run pvcreate on it.
# Globals:   _PVS (appended to) - space-separated list of created PVs
# Arguments: $1 - whitespace-separated list of disk device paths
create_disk_partitions() {
  local devs="$1" part

  for dev in $devs; do
    # wipefs on the whole disk does not remove LVM signatures that may sit on
    # its first partition. Such a signature would become visible to the LVM
    # udev rules as soon as the partition is created, start volume activation
    # and race with the wipefs/pvcreate calls below. Zeroing the first few MB
    # of the disk is an attempt to get rid of it up front.
    #
    # At this point the disk is ours and its own signatures have been checked
    # or wiped, so signatures further into the disk are not a concern.
    Info "Writing zeros to first 4MB of device $dev"
    dd if=/dev/zero of=$dev bs=1M count=4 \
      || Fatal "Failed to zero first 4MB of device $dev"

    create_partition $dev
    part=$(dev_query_first_child $dev)

    # Probably redundant after zeroing the first 4MB; it only matters if the
    # partition starts beyond 4MB. Kept for now.
    wipefs -f -a ${part} \
      || Fatal "Failed to wipe signatures on device ${part}"

    pvcreate ${part}
    _PVS="$_PVS ${part}"
  done
}
# Put the physical volumes listed in _PVS into volume group $VG: extend the
# group when _VG_EXISTS is set, create it otherwise.
# Globals:   VG, _PVS (read); _VG_EXISTS (read/written); _VG_CREATED (written)
create_extend_volume_group() {
  if [ -n "$_VG_EXISTS" ]; then
    # TODO:
    # * Error handling when PV is already part of a VG
    vgextend $VG $_PVS
    return
  fi

  vgcreate $VG $_PVS
  _VG_CREATED=1
  _VG_EXISTS=1
}
# Print where a logical volume is currently mounted, limited to mount targets
# that start with the given directory.
# Arguments: $1 - volume group name
#            $2 - logical volume name
#            $3 - directory prefix the mount target has to start with
# Outputs:   stdout - matching mount targets on one line (empty if none)
extra_lv_mountpoint() {
  local vg=$1 lv_name=$2 mount_dir=$3
  local mounts

  mounts=$(findmnt -n -o TARGET --source /dev/$vg/$lv_name | grep "^$mount_dir")
  echo $mounts
}
#######################################
# Mount a logical volume as ext4 unless it is already mounted below the
# target directory. A systemd mount unit left over for the directory is
# removed first.
# Arguments: $1 - volume group name
#            $2 - logical volume name
#            $3 - mount directory
# Returns:   exit status of mount, or 0 when the volume was already mounted
#######################################
mount_extra_volume() {
  local vg=$1
  local lv_name=$2
  local mount_dir=$3
  # Declared local so the lookup result does not leak into the caller's scope.
  local mounts

  remove_systemd_mount_target "$mount_dir"
  mounts=$(extra_lv_mountpoint "$vg" "$lv_name" "$mount_dir")
  if [[ -z "$mounts" ]]; then
    mount -t ext4 "/dev/$vg/$lv_name" "$mount_dir"
  fi
}
#######################################
# Create a striped logical volume in volume group $VG.
# Striping like this means the maximum logical volume size is
# min(size of each device) * count(devices).
# Globals:   VG (read), DEVS (read) - list of devices; its word count is used
#            as the number of stripes
# Arguments: $1 - size; a value containing "%" is passed as extents (-l),
#                 anything else as an absolute size (-L)
#            $2 - name of the logical volume
# Returns:   0 on success, 1 if lvcreate fails
#######################################
create_lv() {
  local data_size=$1
  local data_lv_name=$2
  local size_flag stripe_count

  # Count words rather than lines so the stripe count is right whether DEVS is
  # newline- or space-separated.
  stripe_count=$(( $(wc -w <<< "$DEVS") ))
  if (( stripe_count < 1 )); then
    stripe_count=1
  fi

  if [[ $data_size == *%* ]]; then
    size_flag="-l"
  else
    size_flag="-L"
  fi

  lvcreate -y "$size_flag" "$data_size" -n "$data_lv_name" \
    --stripes "$stripe_count" "$VG" || return 1
  return 0
}
# Create a logical volume in $VG, format it as ext4, mount it and restore the
# SELinux context of the mount directory.
# Globals:   VG (read)
# Arguments: $1 - logical volume name
#            $2 - mount directory
#            $3 - volume size as understood by create_lv
# Returns:   1 if restoring the SELinux context fails. Calls Fatal when
#            creating, formatting or mounting the volume fails.
setup_extra_volume() {
  local lv_name=$1 mount_dir=$2 lv_size=$3

  create_lv $lv_size $lv_name \
    || Fatal "Failed to create volume $lv_name of size ${lv_size}."

  mkfs -t ext4 /dev/$VG/$lv_name > /dev/null \
    || Fatal "Failed to create filesystem on /dev/$VG/${lv_name}."

  mount_extra_volume $VG $lv_name $mount_dir \
    || Fatal "Failed to mount volume ${lv_name} on ${mount_dir}"

  # Set the right SELinux label the first time the filesystem is created:
  # mounting changes the label of the directory to the one stored on the root
  # inode of the mounted filesystem.
  restore_selinux_context $mount_dir || return 1
}
# Recursively restore the SELinux context of a directory.
# Arguments: $1 - directory
# Returns:   0 on success; 1 after logging an error if restorecon fails
restore_selinux_context() {
  local dir=$1

  restorecon -R $dir && return 0

  Error "restorecon -R $dir failed."
  return 1
}
# Check/wipe every disk in DEVS, partition them and put the resulting physical
# volumes into the volume group.
# Globals:   DEVS, VG (read); _DEVS_RESOLVED (written)
# Does nothing when either DEVS or VG is empty.
partition_disks_create_vg() {
  local dev_list

  # Without a device list or a volume group name there is nothing to do in
  # terms of dealing with disks.
  if [[ -z "$DEVS" || -z "$VG" ]]; then
    return 0
  fi

  _DEVS_RESOLVED="${DEVS}" # already resolved
  dev_list="$_DEVS_RESOLVED" # wipe all disks

  for dev in $dev_list; do
    check_wipe_block_dev_sig $dev
  done

  create_disk_partitions "$dev_list"
  create_extend_volume_group
}
# Entry point for storage creation: find the usable disks, then build the
# volume group and the mounted logical volume unless the group already exists.
# Globals:   DEVS (written); VG, LV_NAME, LV_MOUNT_PATH (read)
# Exits the script with status 0 when there are no disks to use.
function run_command_create() {
  Info "Doing new script mountain volume group creation"

  DEVS="$(find_non_root_devices)"
  if [[ -z "$DEVS" ]]; then
    # allow it to be run on instance without extra storage
    Info "no devices to use so skipping volume storage creation"
    exit 0
  fi

  if vg_exists "$VG"; then
    Info "Volume group '$VG' already exists, skipping an actions"
  else
    Info "Checking/creating volume group"
    partition_disks_create_vg
    vgdisplay
    echo
    Info "Creating logical volume"
    setup_extra_volume "$LV_NAME" "$LV_MOUNT_PATH" '100%VG'
  fi

  lvdisplay
  Info "Done"
}
# Source the helper library (borrowed from container-storage-setup) that sits
# next to this script. The path is quoted so a script directory containing
# whitespace or glob characters still resolves to a single file.
source "$(dirname "$0")/libcss.sh"

# make sure lvm is installed
yum install -y lvm2

if [[ "$CREATE_VG" == "true" ]]; then
  # ensure mount point exists
  mkdir -p "${LV_MOUNT_PATH}"
  run_command_create
fi
#!/bin/bash
# Library for common functions (copied from https://github.com/projectatomic/container-storage-setup/blob/master/libcss.sh)
# Print an informational message, prefixed with "INFO: ", on stderr.
# stdout is used to pass values back from bash functions, so log output must
# stay off it.
# Arguments: $1 - message
Info() {
  printf 'INFO: %s\n' "${1}" >&2
}
# Print a warning message, prefixed with "WARN: ", on stderr.
# Arguments: $1 - message
Warn() {
  printf 'WARN: %s\n' "${1}" >&2
}
# Print an error message, prefixed with "ERROR: ", on stderr.
# Arguments: $1 - message
Error() {
  printf 'ERROR: %s\n' "${1}" >&2
}
# Report an error through Error and terminate the current shell with exit
# code 1.
# Arguments: $1 - message
Fatal() {
  Error "$1"
  exit 1
}
#######################################
# Check whether a volume group exists.
# Arguments: $1 - volume group name
# Returns:   0 if `vgs` lists a volume group with exactly that name,
#            1 otherwise
#######################################
vg_exists() {
  local vg=$1
  # The loop variable is local so it does not leak into the caller's scope.
  local vg_name

  # Word splitting is intentional here: it strips the padding vgs puts
  # around each name.
  for vg_name in $(vgs --noheadings -o vg_name); do
    if [[ "$vg_name" == "$vg" ]]; then
      return 0
    fi
  done
  return 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment