|
#!/bin/bash |
|
#-- |
|
# Copyright 2014-2017 Red Hat, Inc. |
|
# |
|
# Licensed under the Apache License, Version 2.0 (the "License"); |
|
# you may not use this file except in compliance with the License. |
|
# You may obtain a copy of the License at |
|
# |
|
# http://www.apache.org/licenses/LICENSE-2.0 |
|
# |
|
# Unless required by applicable law or agreed to in writing, software |
|
# distributed under the License is distributed on an "AS IS" BASIS, |
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
# See the License for the specific language governing permissions and |
|
# limitations under the License. |
|
#++ |
|
|
|
# Purpose: This script sets up the storage for container runtimes. |
|
# Author: Andy Grimm <agrimm@redhat.com> |
|
# |
|
# |
|
# Trimmed down version to set up storage based on ext4+overlay2 on the current
# AMI provided by AWS (Amazon ECS-optimized Amazon Linux 2 AMI).
# This is instead of LVM/thin-pool+devicemapper configured in previous AMI
# (Amazon ECS-optimized Amazon Linux AMI)
# This is a heavily cut-down version of:
# https://github.com/projectatomic/container-storage-setup/blob/master/container-storage-setup.sh
|
# |
|
# Author: Chris Watts <watts.chris@gmail.com> |
|
# Abort on the first unhandled failing command (-e) and trace commands (-x).
set -ex

# Partition type related: disks larger than this many bytes get a GPT label,
# everything else gets an MBR ("dos") label.
_MAX_MBR_SIZE_BYTES="2199023255040"

# Scratch directory for this run. It used to be created on every invocation
# and never removed; the EXIT trap makes sure it does not pile up.
_TEMPDIR=$(mktemp --tmpdir -d)
trap 'rm -rf -- "$_TEMPDIR"' EXIT

# Keeps track of resolved device paths
_DEVS_RESOLVED=""

# Keeps track of if we created a volume group or not.
_VG_CREATED=

# Physical volumes created during this run and the "volume group already
# exists" flag. Start both empty so stale values cannot be inherited from the
# calling environment.
_PVS=
_VG_EXISTS=

# Seconds to wait for a freshly created partition device node to appear.
# Without a value the wait loop never runs. An exported value is honoured.
# NOTE(review): 60 is believed to mirror the upstream container-storage-setup
# default - confirm.
: "${DEVICE_WAIT_TIMEOUT:=60}"

VG=docker
LV_NAME=docker-volumes
LV_MOUNT_PATH=/var/lib/docker/volumes
WIPE_SIGNATURES=true
CREATE_VG=true
|
|
|
# Print (one per line, on stdout) every whole disk that is not the disk
# holding the root filesystem. Uses lsblk only - the metadata service is not
# queried for ephemeral devices, to keep start-up fast.
#
# Globals:   none
# Outputs:   device paths on stdout; progress via Info
# Exits:     through Fatal when the root partition or its parent disk cannot
#            be determined. This matters: with an empty root partition,
#            "lsblk -pno pkname" lists the parents of *all* devices, which
#            would let the root disk slip into the result.
# NOTE(review): stdout is the return channel, so this assumes Info does not
# write to stdout - confirm in libcss.sh.
find_non_root_devices() {
  local root_part root_dev other_devices

  # Partition (or device) currently mounted on "/".
  root_part=$(lsblk -Pp | grep 'MOUNTPOINT="/"' | sed -nr 's/NAME="([^"]+)".*/\1/p')
  if [[ -z "$root_part" ]]; then
    Fatal "Couldn't find root device, bailing"
  fi

  # Parent disk of the root partition.
  root_dev=$(lsblk -pno pkname "$root_part")
  if [[ -z "$root_dev" ]]; then
    Fatal "Couldn't find root device, bailing"
  fi

  # filter out devices which aren't disks or are the root volume; the root
  # disk name is matched as a fixed string, not as a regular expression
  other_devices=$(lsblk -Pp \
    | grep 'TYPE="disk"' \
    | grep -vF "NAME=\"${root_dev}\"" \
    | sed -nr 's/NAME="([^"]+)".*/\1/p')
  Info "Found storage devices $other_devices to use"
  echo "$other_devices"
}
|
|
|
# Wait for a block device node to appear.
#
# Globals:   DEVICE_WAIT_TIMEOUT (read) - maximum number of seconds to wait.
#            Empty or "0" disables waiting.
# Arguments: $1 - path of the device node
# Returns:   0 if the node exists, or if waiting is disabled (the caller is
#            then NOT guaranteed that the node exists); 1 on timeout or when
#            DEVICE_WAIT_TIMEOUT is not a whole number.
wait_for_dev() {
  local devpath=$1
  local timeout=$DEVICE_WAIT_TIMEOUT
  local nap

  if [[ -b "$devpath" ]]; then
    Info "Device node $devpath exists."
    return 0
  fi

  if [[ -z "$timeout" || "$timeout" == "0" ]]; then
    Info "Not waiting for device $devpath as DEVICE_WAIT_TIMEOUT=${DEVICE_WAIT_TIMEOUT}."
    return 0
  fi

  # Reject garbage up front instead of letting the numeric comparison fail
  # with "integer expression expected" and reporting a bogus timeout.
  if [[ ! "$timeout" =~ ^[0-9]+$ ]]; then
    Error "DEVICE_WAIT_TIMEOUT=${DEVICE_WAIT_TIMEOUT} is not a whole number of seconds."
    return 1
  fi
  # Force base 10 so a value such as "08" is not parsed as octal.
  timeout=$(( 10#$timeout ))

  # Poll in steps of at most 5 seconds; the last step only sleeps for
  # whatever time is left.
  while (( timeout > 0 )); do
    Info "Waiting for device $devpath to be available. Wait time remaining is $timeout seconds"
    nap=$(( timeout <= 5 ? timeout : 5 ))
    sleep "$nap"
    timeout=$(( timeout - 5 ))
    if [[ -b "$devpath" ]]; then
      Info "Device node $devpath exists."
      return 0
    fi
  done

  Info "Timed out waiting for device $devpath"
  return 1
}
|
|
|
# Print the systemd mount unit file name that corresponds to a mount point
# path, e.g. /var/lib/docker-volumes -> var-lib-docker\x2dvolumes.mount
#
# Arguments: $1 - mount point path
# Outputs:   unit file name on stdout
#
# Only "-" is escaped (as \x2d) and "/" is turned into "-"; other characters
# are passed through untouched. Trailing slashes are dropped and "/" itself
# maps to "-.mount". Pure parameter expansion is used so the path is never
# word-split or glob-expanded.
systemd_escaped_filename() {
  local path=$1 escaped
  local dash_escape='\x2d'

  # Ignore trailing slashes, but leave a lone "/" alone.
  while [[ "$path" == */ && "$path" != "/" ]]; do
    path=${path%/}
  done

  if [[ "$path" == "/" ]]; then
    printf '%s\n' "-.mount"
    return 0
  fi

  # Literal dashes first, so the dashes introduced for "/" stay unescaped.
  escaped=${path//-/$dash_escape}
  escaped=${escaped//\//-}

  # The leading "/" became a leading "-"; it is not part of the unit name.
  printf '%s\n' "${escaped#-}.mount"
}
|
|
|
#
# In the past we created a systemd mount target file, we no longer
# use it, but if one pre-existed we still need to handle it.
#
# Arguments: $1 - mount point whose unit file should be removed
# Returns:   0 when no unit file exists; otherwise the status of the final rm
#
# Disabling and stopping the unit are best-effort (their output was already
# discarded), so their failures are explicitly ignored.
remove_systemd_mount_target() {
  local mp=$1
  local filename unit_file

  filename=$(systemd_escaped_filename "$mp")
  unit_file="/etc/systemd/system/$filename"

  # Nothing to clean up.
  [ -f "$unit_file" ] || return 0

  if [ -x /usr/bin/systemctl ]; then
    systemctl disable "$filename" >/dev/null 2>&1 || true
    systemctl stop "$filename" >/dev/null 2>&1 || true
    systemctl daemon-reload
  fi
  rm -f "$unit_file" >/dev/null 2>&1
}
|
|
|
# Make sure a block device carries no filesystem/RAID/LVM signatures.
#
# Globals:   WIPE_SIGNATURES (read) - "true" allows wiping whatever is found
# Arguments: $1 - block device to inspect
# Returns:   0 when the device is clean or was wiped
# Exits:     through Fatal when wipefs fails, or when a signature is found
#            and WIPE_SIGNATURES is not "true"
check_wipe_block_dev_sig() {
  local bdev=$1
  local sig offset uuid label type

  # "wipefs -p" only reports signatures, in parsable comma-separated form.
  if ! sig=$(wipefs -p "$bdev"); then
    Fatal "Failed to check signatures on device $bdev"
  fi

  if [ -z "$sig" ]; then
    return 0
  fi

  if [ "$WIPE_SIGNATURES" == "true" ]; then
    Info "Wipe Signatures is set to true. Any signatures on $bdev will be wiped."
    if ! wipefs -a "$bdev"; then
      Fatal "Failed to wipe signatures on device $bdev"
    fi
    return 0
  fi

  # Not allowed to wipe: report the first real signature line. The line whose
  # first field is "# offset" is the column header and is skipped.
  while IFS=, read -r offset uuid label type; do
    [ "$offset" == "# offset" ] && continue
    Fatal "Found $type signature on device ${bdev} at offset ${offset}. Wipe signatures using wipefs or use WIPE_SIGNATURES=true and retry."
  done <<< "$sig"
  return 0
}
|
|
|
# Print the partition table type to use for a disk: "gpt" when the disk is
# larger than _MAX_MBR_SIZE_BYTES, "dos" otherwise.
#
# Globals:   _MAX_MBR_SIZE_BYTES (read)
# Arguments: $1 - disk device
# Outputs:   "gpt" or "dos" on stdout
# Exits:     through Fatal when blockdev cannot report the disk size
determine_partition_type() {
  local disk="$1" bytes label="dos"

  bytes=$(blockdev --getsize64 "$disk") \
    || Fatal "Failed to determine size of disk $disk"

  # MBR is the default; only switch when the disk does not fit.
  if [ "$bytes" -gt "$_MAX_MBR_SIZE_BYTES" ]; then
    label="gpt"
  fi

  echo $label
}
|
|
|
# Create a single LVM partition spanning a whole disk using sfdisk.
#
# Arguments: $1 - disk device
#            $2 - partition table type, "gpt" or anything else for MBR
# Returns:   exit status of sfdisk
# Exits:     through Fatal when the disk size cannot be read (MBR only)
#
# TODO:
#   * Consider gpt, or unpartitioned volumes
#   * Error handling when partition(s) already exist
#   * Deal with loop/nbd device names. See growpart code
create_partition_sfdisk() {
  local dev="$1" part_type="$2" size part_label dev_name size_kib

  if [ "$part_type" == "gpt" ]; then
    # Linux LVM GUID for GPT. Taken from Wiki.
    part_label="E6D6D379-F507-44C2-A23C-238F2A3DF928"
    # Create as big a partition as possible.
    size=""
  else
    part_label="8e"
    dev_name=$(basename "$dev")
    # Column 3 of /proc/partitions is the size in 1 KiB blocks. The name is
    # compared exactly: a regex match on "sdb" would also pick up "sdb1" or
    # "sdba" and feed several numbers into the arithmetic below.
    size_kib=$(awk -v name="$dev_name" '$4 == name { print $3 }' /proc/partitions)
    if [[ ! "$size_kib" =~ ^[0-9]+$ ]]; then
      Fatal "Failed to determine size of disk $dev"
    fi
    # KiB -> 512-byte sectors, minus the 2048 sectors before the partition.
    size=$(( size_kib * 2 - 2048 ))
  fi

  sfdisk "$dev" <<EOF
unit: sectors
label: $part_type
2048,${size},$part_label
EOF
}
|
|
|
# Create a single partition covering the whole disk with parted and flag it
# for LVM use.
#
# Arguments: $1 - disk device
#            $2 - partition table type; "gpt" selects a GPT label, any other
#                 value an msdos (MBR) label
# Returns:   exit status of parted
create_partition_parted() {
  local dev="$1"
  local part_type="$2"
  local -a label_args

  # Only the label type and the mkpart name/type argument differ between the
  # two layouts; the geometry and the lvm flag are shared.
  if [ "$part_type" == "gpt" ]; then
    label_args=(mklabel gpt mkpart "container-partition")
  else
    label_args=(mklabel msdos mkpart primary)
  fi

  parted "$dev" --script "${label_args[@]}" 0% 100% set 1 lvm on
}
|
|
|
# Partition a disk with a single partition and wait for its device node.
#
# Arguments: $1 - disk device
# Exits:     through Fatal when udev does not settle or the partition device
#            does not become available
#
# parted is used when /usr/sbin/parted is executable, sfdisk otherwise.
create_partition() {
  local dev="$1" part part_type

  part_type=$(determine_partition_type "$dev")

  if [ -x "/usr/sbin/parted" ]; then
    create_partition_parted "$dev" "$part_type"
  else
    create_partition_sfdisk "$dev" "$part_type"
  fi

  # Sometimes on slow storage it takes a while for partition device to
  # become available. Wait for device node to show up.
  if ! udevadm settle; then
    Fatal "udevadm settle after partition creation failed. Exiting."
  fi

  part=$(dev_query_first_child "$dev")

  if ! wait_for_dev "$part"; then
    Fatal "Partition device ${part} is not available"
  fi
}
|
|
|
# Print the second line of lsblk's flat name listing for a device: the first
# line is the device itself, so this is its first child (e.g. the first
# partition). Prints nothing when the listing has fewer than two lines.
#
# Arguments: $1 - parent device
dev_query_first_child() {
  local parent="$1"

  lsblk -npl -o NAME "$parent" | sed -n '2p'
}
|
|
|
# Partition every given disk and turn its first partition into an LVM
# physical volume.
#
# Globals:   _PVS (appended) - space-separated list of created PVs
# Arguments: $1 - whitespace-separated list of disk devices
# Exits:     through Fatal when zeroing, wiping or pvcreate fails
create_disk_partitions() {
  local devs="$1" dev part

  # $devs is split on whitespace on purpose: it is a list of device paths.
  for dev in $devs; do
    # wipefs /dev/disk does not wipe any lvm signatures which might be
    # present on /dev/diskpart1. Such a signature would become visible to
    # lvm udev rules and kickstart volume creation as soon as the partition
    # is created, racing with further commands like wipefs, pvcreate etc.
    # So zero out the first few MB of the disk in an attempt to wipe any lvm
    # signatures on the first partition.
    #
    # By now we have ownership of the disk and we have checked there are no
    # signatures on it or they have been wiped. We don't care about any
    # signatures in the middle of the disk.
    Info "Writing zeros to first 4MB of device $dev"
    if ! dd if=/dev/zero of="$dev" bs=1M count=4; then
      Fatal "Failed to zero first 4MB of device $dev"
    fi

    create_partition "$dev"
    part=$(dev_query_first_child "$dev")

    # It now seems unnecessary to do wipefs on the partition given we already
    # zeroed out the first 4MB. It would only be required if the partition
    # started beyond 4MB. Keep it for now.
    if ! wipefs -f -a "$part"; then
      Fatal "Failed to wipe signatures on device ${part}"
    fi

    if ! pvcreate "$part"; then
      Fatal "Failed to create physical volume on device ${part}"
    fi
    _PVS="$_PVS ${part}"
  done
}
|
|
|
# Put the physical volumes collected in _PVS into volume group $VG: extend
# the group when _VG_EXISTS is non-empty, create it otherwise.
#
# Globals:   VG, _PVS, _VG_EXISTS (read); _VG_CREATED, _VG_EXISTS (set to 1
#            on the create path)
# Note:      $VG and $_PVS are deliberately left unquoted - _PVS is a
#            space-separated list that must be split into separate arguments.
create_extend_volume_group() {
  if [ -n "$_VG_EXISTS" ]; then
    # TODO:
    #   * Error handling when PV is already part of a VG
    vgextend $VG $_PVS
    return
  fi

  vgcreate $VG $_PVS
  _VG_CREATED=1
  _VG_EXISTS=1
}
|
|
|
# Print the mount points of logical volume /dev/$1/$2 that start with the
# directory $3.
#
# Arguments: $1 - volume group name
#            $2 - logical volume name
#            $3 - mount directory prefix to look for
# Outputs:   matching mount targets on one line, separated by single spaces;
#            an empty line when there is none
#
# The directory is compared as a literal prefix, so characters such as "."
# in the path are not treated as regular-expression wildcards.
extra_lv_mountpoint() {
  local vg=$1
  local lv_name=$2
  local mount_dir=$3
  local target mounts=""

  while IFS= read -r target; do
    if [[ "$target" == "$mount_dir"* ]]; then
      mounts="${mounts:+$mounts }$target"
    fi
  done < <(findmnt -n -o TARGET --source "/dev/$vg/$lv_name")

  printf '%s\n' "$mounts"
}
|
|
|
# Mount logical volume /dev/$1/$2 as ext4 on directory $3 unless it is
# already mounted there. Any leftover systemd mount unit for the directory is
# removed first.
#
# Arguments: $1 - volume group name
#            $2 - logical volume name
#            $3 - mount directory
# Returns:   exit status of mount, or 0 when the volume was already mounted
mount_extra_volume() {
  local vg=$1
  local lv_name=$2
  local mount_dir=$3
  local mounts

  remove_systemd_mount_target "$mount_dir"

  mounts=$(extra_lv_mountpoint "$vg" "$lv_name" "$mount_dir")
  if [ -z "$mounts" ]; then
    mount -t ext4 "/dev/$vg/$lv_name" "$mount_dir"
  fi
}
|
|
|
# Create a striped logical volume in volume group $VG.
#
# Globals:   VG (read), DEVS (read) - whitespace-separated device list
# Arguments: $1 - size; a value containing "%" is passed as extents (-l),
#                 anything else as an absolute size (-L)
#            $2 - name of the logical volume
# Returns:   0 on success, 1 when lvcreate fails
#
# One stripe is used per entry in DEVS. Entries are counted as words, the
# same way the device list is split elsewhere in this script, so a
# space-separated list no longer collapses to a single stripe.
# Striping like this means the max volume size =
# min(of each device) * count(devices).
create_lv() {
  local data_size=$1
  local data_lv_name=$2
  local size_flag stripes

  stripes=$(( $(wc -w <<< "$DEVS") ))
  # lvcreate needs at least one stripe, even with an empty device list.
  (( stripes > 0 )) || stripes=1

  if [[ $data_size == *%* ]]; then
    size_flag="-l"
  else
    size_flag="-L"
  fi

  lvcreate -y "$size_flag" "$data_size" -n "$data_lv_name" \
    --stripes "$stripes" "$VG" || return 1
  return 0
}
|
|
|
# Create a logical volume in $VG, format it as ext4, mount it and restore
# the SELinux context of the mount point.
#
# Globals:   VG (read)
# Arguments: $1 - logical volume name
#            $2 - mount directory
#            $3 - size, in any form create_lv accepts
# Returns:   1 when restoring the SELinux context fails
# Exits:     through Fatal when creating, formatting or mounting fails
setup_extra_volume() {
  local lv_name=$1
  local mount_dir=$2
  local lv_size=$3
  local lv_dev="/dev/$VG/$lv_name"

  if ! create_lv "$lv_size" "$lv_name"; then
    Fatal "Failed to create volume $lv_name of size ${lv_size}."
  fi

  if ! mkfs -t ext4 "$lv_dev" > /dev/null; then
    Fatal "Failed to create filesystem on /dev/$VG/${lv_name}."
  fi

  if ! mount_extra_volume "$VG" "$lv_name" "$mount_dir"; then
    Fatal "Failed to mount volume ${lv_name} on ${mount_dir}"
  fi

  # setup right selinux label first time fs is created. Mount operation
  # changes the label of directory to reflect the label on root inode
  # of mounted fs.
  if ! restore_selinux_context "$mount_dir"; then
    return 1
  fi
}
|
|
|
# Recursively restore the SELinux context of a directory.
#
# Arguments: $1 - directory
# Returns:   0 on success; 1 (after logging through Error) when restorecon
#            fails
restore_selinux_context() {
  local dir=$1

  if restorecon -R "$dir"; then
    return 0
  fi

  Error "restorecon -R $dir failed."
  return 1
}
|
|
|
# Check/wipe signatures on every device in DEVS, partition the devices and
# create (or extend) the volume group.
#
# Globals:   DEVS, VG (read); _DEVS_RESOLVED (written)
# Returns:   0 without doing anything when DEVS or VG is empty
partition_disks_create_vg() {
  local dev_list dev

  # If there is no volume group specified or there are no devices, there is
  # nothing to do in terms of dealing with disks.
  if [[ -z "$DEVS" || -z "$VG" ]]; then
    return 0
  fi

  _DEVS_RESOLVED="${DEVS}" # already resolved
  dev_list="$_DEVS_RESOLVED" # wipe all disks

  # $dev_list is split on whitespace on purpose: it is a list of devices.
  for dev in $dev_list; do
    check_wipe_block_dev_sig "$dev"
  done

  create_disk_partitions "$dev_list"
  create_extend_volume_group
}
|
|
|
# Entry point for storage creation: discover the non-root disks and, unless
# volume group $VG already exists, build the volume group and the logical
# volume mounted on $LV_MOUNT_PATH.
#
# Globals:   VG, LV_NAME, LV_MOUNT_PATH (read); DEVS (written)
# Exits:     with status 0 when there are no extra disks, so the script can
#            run on instances without additional storage
run_command_create() {
  Info "Doing new script mountain volume group creation"

  DEVS="$(find_non_root_devices)"
  [[ -n "$DEVS" ]] || {
    # allow it to be run on instance without extra storage
    Info "no devices to use so skipping volume storage creation"
    exit 0
  }

  if vg_exists "$VG"; then
    Info "Volume group '$VG' already exists, skipping an actions"
  else
    Info "Checking/creating volume group"
    partition_disks_create_vg
    vgdisplay
    echo ""

    Info "Creating logical volume"
    setup_extra_volume "$LV_NAME" "$LV_MOUNT_PATH" '100%VG'
  fi

  lvdisplay
  Info "Done"
}
|
|
|
# source library (borrowed from container-storage-setup), located next to
# this script.
# NOTE(review): Info, Error, Fatal and vg_exists are not defined in this
# file, so they presumably come from libcss.sh - confirm.
source "$(dirname "$0")/libcss.sh"

# make sure lvm is installed. Only call yum when the package is missing, so
# a normal start-up neither waits on nor depends on the package repositories.
if ! rpm -q lvm2 >/dev/null 2>&1; then
  yum install -y lvm2
fi

if [[ "$CREATE_VG" == "true" ]]; then
  # ensure mount point exists
  mkdir -p "${LV_MOUNT_PATH}"
  run_command_create
fi