@j0ju
Last active June 19, 2021 21:18
a simple CNI-like companion for Docker that applies more specific network configuration to containers; it can also act on unhealthy containers
[Unit]
Description=Mesh Docker Service Companion
ReloadPropagatedFrom=openvpn.service
After=network-online.target
Requires=docker.service
[Service]
Type=simple
PrivateTmp=true
WorkingDirectory=/run
ExecStart=/etc/init.d/mesh-docker-service-companion daemon
#CapabilityBoundingSet=CAP_NET_ADMIN CAP_SYS_ADMIN
LimitNPROC=24
DeviceAllow=/dev/null rw
ProtectSystem=true
ProtectHome=true
RestartSec=5s
Restart=on-failure
[Install]
WantedBy=multi-user.target
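# A minimal install sketch for this unit (the install path and unit name are assumptions, not part of the gist):
#   cp mesh-docker-service-companion.service /etc/systemd/system/
#   systemctl daemon-reload
#   systemctl enable --now mesh-docker-service-companion.service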
#!/bin/sh
### BEGIN INIT INFO
# Provides: mesh-docker-service-companion
# Required-Start: $local_fs $network $remote_fs
# Required-Stop: $local_fs $network $remote_fs
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Short-Description: mesh-docker-service-companion docker network attach sidecar helper
# Description: mesh-docker-service-companion docker network attach sidecar helper
### END INIT INFO
set -eu
# Labels that this daemon acts on for containers in network mode 'none'
# service_ips= # comma separated list of IPs for a dedicated peer-to-peer interface handed into the container
# bridge_member= # a host bridge the container's interface should be attached to
# bridge_ips= # an IP for the bridged interface
# service_interface= # an interface available on the host where we add service IPs via proxy ARP
# Labels that this daemon acts on for containers in network mode 'host'
# service_interface= # an interface available in network mode 'host' where service IPs can be bound
# service_ips= # a comma separated list of IPs added to the service_interface and cleaned up on exit
# Examples
# docker run --rm -dti --network=none -l service_ips=1.1.1.1 debian:buster bash -
# docker run --rm -dti --network=host -l service_interface=foo -l service_ips=8.8.8.8,heise.de debian:buster bash -
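# Example (sketch) for the bridge labels; the bridge name br0 and the address are placeholders, a matching host bridge must already exist
# docker run --rm -dti --network=none -l bridge_member=br0 -l bridge_ips=192.0.2.10 debian:buster bash -
# docker run --rm -dti --network=none -l bridge_member=br0 -l bridge_ips=dhcp debian:buster bash -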
# Labels for the "cat" runner functionality mimicking LXC
# cat= # if this label is set and the entrypoint is /lib/catrunner.sh, the container
# # is pivot-rooted to /rootfs (works best if /rootfs is a volume or bind mount)
# # for this you need this script bind mounted to /lib/catrunner.sh
# cat_cgroups=* # cgroups to keep
# # e.g.
# # * = all cgroups in RO (default)
# # pids = pids cgroup in RO
# # pids:rw = pids cgroup in RW
# cat_sysfs=no # whether sysfs is present in the pet container (default: yes)
# Example
# docker run --rm --log-driver none -v /etc/init.d/mesh-docker-service-companion:/lib/catrunner.sh:ro --entrypoint /lib/catrunner.sh -l cat=yes hub.i.qx.ai/debian-buster:runit.amd64
# (hub.i.qx.ai/debian-buster:runit.amd64 is an image where runit is the local init system)
# * This example does not work properly, as buster uses systemd as its init system; with jessie (sysvinit) it works better
# docker run --rm --log-driver none -v /etc/init.d/mesh-docker-service-companion:/lib/catrunner.sh:ro --entrypoint /lib/catrunner.sh -l cat=yes debian:buster
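# Example (sketch) combining the cat runner with a cgroup selection; the cat_cgroups values here are placeholders following the syntax documented above
# docker run --rm --log-driver none -v /etc/init.d/mesh-docker-service-companion:/lib/catrunner.sh:ro --entrypoint /lib/catrunner.sh -l cat=yes -l cat_cgroups=memory:ro,cpu:rw debian:buster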
PIDFILE=/var/run/docker-events.pid
PIPE=/var/run/docker-events.pipe
PIPEPIDFILE=/var/run/docker-events.pipe.pid
STATEDIR=/run/mesh-docker-service-companion
HOST_SVC_IP=169.254.123.123/32
HOST_SVC_IPv6=fe80::1/64
CONTAINER_IPv6=fe80::dead/64
HOST_SVC_PFX=V
CONTAINER_SVC_PFX=v
HOST_BRIDGE_PFX=B
CONTAINER_BRIDGE_PFX=b
container_add_p2p_interface() { #
case "$network_mode" in
none | host ) ;; # we handle these network modes
* ) return 0 ;; # we do NOT
esac
local oIFS="$IFS"
# check if p2p interface already in place
if nsenter -t $pid -n -- ip link show dev p2p > /dev/null 2>&1; then
echo "I: container $cid: p2p interface already present, skipping"
return 0
fi
if [ -n "$label_service_interface" ] && [ -d "/sys/class/net/$label_service_interface" ]; then
HOST_SVC_PFX="S"
fi
# set interface names
# set host side interface name
for hostIf in "${HOST_SVC_PFX}$cname" "${HOST_SVC_PFX}$cid" $cid ${$}${cid}; do
hostIf="$(echo "${hostIf}" | head -c 15)"
if ! [ -d "/sys/class/net/$hostIf" ]; then
break
fi
echo "W: $hostIf is already used, skipping" >&2
hostIf=
done
if [ -z "$hostIf" ]; then
echo "E: no host side service interface name found, skipping" >&2
return 0
fi
# set container side interface name
for containerIf in "${CONTAINER_SVC_PFX}$cname" "${CONTAINER_SVC_PFX}$cid" $cid ${$}${cid}; do
containerIf="$(echo "${containerIf}" | head -c 15)"
if ! [ -d "/sys/class/net/$containerIf" ]; then
break
fi
echo "W: $containerIf is already used, skipping" >&2
containerIf=
done
if [ -z "$containerIf" ]; then
echo "E: no container side service interface name found, skipping" >&2
return 0
fi
# create interfaces
ip link add "$hostIf" type veth peer name "$containerIf"
sysctl > /dev/null -w net.ipv6.conf.$containerIf.disable_ipv6=1
sysctl > /dev/null -w net.ipv6.conf.$hostIf.disable_ipv6=1
ip link set netns "$pid" dev "$containerIf"
# setup container side
nsenter -t $pid -n -- ip link set up name p2p dev $containerIf
echo "$cname:p2p <-> $hostIf"
# setup host side
ip link set up dev $hostIf
# ToDo
V6_ready=no
V4_ready=no
# collect the list of IPs and hostnames and resolve them, afterwards pipe them to the address setting loop
IFS=,
for item in $label_service_ips; do # collect and resolve - output per line: $ip $item
IFS="$oIFS"
case "$item" in
"" )
# we have nothing
continue
;;
*:* | [1-9]*.*.* )
# assume we have an IPv6 or IPv4 address
echo "$item $item"
;;
* )
# assume we have a hostname, try to resolve it
getent ahosts "$item" | grep -Eo "^[^ ]+" | sort -u | sed -e "s/$/ $item/"
;;
esac
done | while IFS="$oIFS" read -r ip label; do # set the addresses
IFS="$oIFS"
case "$ip" in
*:* ) # assume we have an IPv6 address
ip="$ip/128"
if [ $V6_ready = no ]; then # enable v6, add IP to host and container
sysctl > /dev/null -w net.ipv6.conf.$hostIf.disable_ipv6=0
nsenter -t $pid -n -- sysctl > /dev/null -w net.ipv6.conf.p2p.disable_ipv6=0
ip addr add $HOST_SVC_IPv6 dev $hostIf
nsenter -t $pid -n -- ip addr add $CONTAINER_IPv6 dev p2p
nsenter -t $pid -n -- ip route add default via ${HOST_SVC_IPv6%/*} dev p2p metric 100
V6_ready=yes
fi
nsenter -t $pid -n -- ip addr add $ip dev p2p
# prevent asymmetric routing
nsenter -t $pid -n -- ip route add default from "$ip" via ${HOST_SVC_IPv6%/*} dev p2p metric 100
# add route from host to container
ip route add $ip dev $hostIf
;;
[1-9]*.*.* ) # assume we have an IPv4 address
ip="$ip/32"
if [ $V4_ready = no ]; then
ip addr add $HOST_SVC_IP dev $hostIf
fi
nsenter -t $pid -n -- ip addr add "$ip" dev p2p
ip route add $ip dev $hostIf
if [ $V4_ready = no ]; then
nsenter -t $pid -n -- ip route add $HOST_SVC_IP dev p2p
# add a default route
nsenter -t $pid -n -- ip route add default via ${HOST_SVC_IP%/*} metric 100
# prevent asymmetric routing via policy routing, as the "from" source-routing extension is not valid for IPv4
nsenter -t $pid -n -- ip route add default via ${HOST_SVC_IP%/*} metric 100 table 23
V4_ready=yes
fi
# prevent asymmetric routing via policy routing, as the "from" extension is not valid for IPv4
nsenter -t $pid -n -- ip rule add from "$ip" table 23 pref 23
;;
* ) # this should not happen
echo "E: '$ip' is not a IPv4/6 address, set via label '$label', abort" >&2
break
;;
esac
if [ -n "$label_service_interface" ] && [ -d "/sys/class/net/$label_service_interface" ]; then
ip neigh add proxy ${ip%/*} dev $label_service_interface
echo "$hostIf $ip" >> "$statefile"
fi
echo "$cname:p2p added $ip"
done
echo "$cname:p2p $hostIf host service interface up"
# write out state for later usage on stop or destroy
echo "$hostIf" >> "$statefile"
IFS="$oIFS"
} #
container_add_eth0_interface() { #
local oIFS="$IFS"
case "$network_mode" in
none | host ) ;; # we handle these network modes
* ) return 0 ;; # we do NOT
esac
# check if bridge interface already in place
if nsenter -t $pid -n -- ip link show dev eth0 > /dev/null 2>&1; then
echo "I: container $cid: eth0 bridge interface already present, skipping"
return 0
fi
# TODO: trap on failure and delete interface
# set interface names
# set host side interface name
for hostIf in "${HOST_BRIDGE_PFX}$cname" "${HOST_BRIDGE_PFX}$cid" $cid ${$}${cid}; do
hostIf="$(echo "${hostIf}" | head -c 15)"
if ! [ -d "/sys/class/net/$hostIf" ]; then
break
fi
echo "W: $hostIf is already used, skipping" >&2
hostIf=
done
if [ -z "$hostIf" ]; then
echo "E: no host side service interface name found, skipping" >&2
return 0
fi
# set container side interface name
for containerIf in "${CONTAINER_BRIDGE_PFX}$cname" "${CONTAINER_BRIDGE_PFX}$cid" $cid ${$}${cid}; do
containerIf="$(echo "${containerIf}" | head -c 15)"
if ! [ -d "/sys/class/net/$containerIf" ]; then
break
fi
echo "W: $containerIf is already used, skipping" >&2
containerIf=
done
if [ -z "$containerIf" ]; then
echo "E: no container side service interface name found, skipping" >&2
return 0
fi
# create interfaces
h_mac="fe:$(echo $cname | md5sum | sed -re 's/^(..)(..)(..)(..)(..).*/\1:\2:\3:\4:\5/')"
c_mac="02:${h_mac##??:}"
ip link add "$hostIf" type veth peer name "$containerIf"
ip link set address "$h_mac" dev "$hostIf"
ip link set address "$c_mac" dev "$containerIf"
sysctl > /dev/null -w net.ipv6.conf.$hostIf.disable_ipv6=1
ip link set netns "$pid" dev "$containerIf"
# setup host side
ip link set up master "$label_bridge_member" dev $hostIf
echo "$cname:eth0 (mac: $c_mac) <-> $hostIf (mac: $h_mac, bridge: $label_bridge_member)"
# setup container side
nsenter -t $pid -n -- sysctl > /dev/null -w net.ipv6.conf.lo.disable_ipv6=0
nsenter -t $pid -n -- sysctl > /dev/null -w net.ipv6.conf.$containerIf.disable_ipv6=0
nsenter -t $pid -n -- ip link set up name eth0 dev $containerIf
if [ "$label_bridge_checksum_offload" = off ]; then
nsenter -t $pid -n -- ethtool -K eth0 rx off tx off
fi
if [ "$label_bridge_ips" = dhcp ]; then
# this needs to set up addresses/prefixes and routes
nsenter -t $pid -n -- udhcpc -i eth0 -q -x "hostname:$cname"
echo "$cname:eth0 got DHCP"
nsenter -t $pid -n -- ip addr show dev eth0 | sed -rne "/inet[6]?/ s/^[ ]+/$cname:eth0: / p"
else
# add addresses
( IFS=,
for ip in $label_bridge_ips; do
[ "$ip" = dhcp ] && \
continue || \
nsenter -t $pid -n -- ip addr add "$ip"/32 dev eth0
echo "$cname:eth0 added $ip"
done
)
# set device routes
ip r list dev $label_bridge_member | awk '$1 ~ "^default|/[0-9]+$" {print}' | while read -r pfx via gw _; do
if [ "$via" != via ]; then
nsenter -t $pid -n -- ip route add $pfx dev eth0 metric 10
echo "$cname:eth0 added device route $pfx"
fi
done
# set routes with gateways
ip r list dev $label_bridge_member | awk '$1 ~ "^default|/[0-9]+$" {print}' | while read -r pfx via gw _; do
if [ "$via" = via ]; then
nsenter -t $pid -n -- ip route add $pfx via "$gw" dev eth0 metric 10
echo "$cname:eth0 added route $pfx via $gw"
fi
done
fi
echo "$cname:eth0 $hostIf host bridge interface up"
# write out state for later usage on stop or destroy
echo "$hostIf" >> "$statefile"
IFS="$oIFS"
} #
container_add_service_ips() { #
case "$network_mode" in
none | host ) ;; # we handle these network modes
* ) return 0 ;; # we do NOT
esac
if ! ip link show dev "$label_service_interface" > /dev/null 2>&1; then
echo "E: interface from service_interface label '$label_service_interface' not found" >&2
echo "W: not adding any IPs" >&2
return 0
fi
local ip=
local label=
local oIFS="$IFS"
# get CIDR of primary addresses on interface
# use them to add svc address to interface to avoid more specific imports
local cidr4="$( ip -4 a s dev "$label_service_interface" | awk -F"[ /]+" '$2 == "inet" {print $4; exit}' )"
local cidr6="$( ip -6 a s dev "$label_service_interface" | awk -F"[ /]+" '$2 == "inet6" {print $4; exit}' )"
# collect the list of IPs and hostnames and resolve them, afterwards pipe them to the address setting loop
IFS=,
for ip in $label_service_ips; do # collect and resolve
IFS="$oIFS"
case "$ip" in
"" )
# we have nothing
continue
;;
*:* | [1-9]*.*.* )
# assume we have an IPv6 or IPv4 address
echo "$ip $ip"
;;
* )
# assume we have a hostname, try to resolve it
getent ahosts "$ip" | grep -Eo "^[^ ]+" | sort -u | sed -e "s/$/ $ip/"
;;
esac
done | while IFS="$oIFS" read -r ip label; do # set the addresses
IFS="$oIFS"
case "$ip" in
*:* ) # assume we have an IPv6 address
cidr=128
cidr_if="$cidr6"
;;
[1-9]*.*.* ) # assume we have an IPv4 address
cidr=32
cidr_if="$cidr4"
;;
* ) # this should not happen
echo "E: '$ip' is not a IPv4/6 address, set via label '$label', abort" >&2
break
;;
esac
# we now have either an IPv4 or an IPv6 address, determine the correct prefix length:
# if it is routed directly via the specified $label_service_interface (not via a gateway), use the
# interface's prefix length, otherwise use /32 or /128 depending on the protocol
if route="$(ip route get "$ip" | grep " dev $label_service_interface ")"; then
case "$route" in
*" via "* ) ip="$ip/$cidr" ;;
* ) ip="$ip/$cidr_if" ;;
esac
else
ip="$ip/$cidr"
fi
if ip route get ${ip%/*} | grep ^local > /dev/null || ip addr add "$ip" dev "$label_service_interface"; then
echo "I: $cname: added $ip to $label_service_interface"
echo "$label_service_interface $ip" >> "$statefile"
else
echo "E: could not add '$ip' ($label) to '$label_service_interface'" >&2
fi
done
IFS="$oIFS"
} #
host_container_add_netns() { #
mkdir -p /var/run/netns
ln -sf "/proc/$pid/ns/net" "/var/run/netns/$cname"
ln -sf "/var/run/netns/$cname" "/var/run/netns/$cid"
} #
container_entrypoint_catrunner() { #
# this runs as PID 1 in the container via
# ... -v $0:/lib/catrunner.sh:ro --entrypoint /lib/catrunner.sh ...
if ! [ -r /proc/mounts ]; then
# if we have no /proc/mounts the container is very likely broken
echo "$0: ERROR: this should not happen, /proc/mounts does not exist, ABORT"
exit 1
fi
# wait for /mnt, as the cat_preparer will prepare rootfs from the running image if possible
echo "${0}: wait for rootfs"
while ! grep " /mnt " /proc/mounts > /dev/null 2> /dev/null; do
sleep 1
done
# the cat_preparer has done the pivot root
cd /
echo "${0}: rootfs ready, execing init"
# start the init in the "cat" container
for init in /lib/preinit /sbin/init; do
[ -x "${init%% *}" ] || continue
echo "${0}: exec $init as init"
exec "$init" "$@"
done
echo "${0}: ERROR: exec of INIT failed $init, ABORT"
exit 1
} #
container_cat_preparer() { #
# init { #
local oIFS="$IFS"
# this is started by container_start, if label_cat is not empty
# 1st: check if the entrypoint is /lib/catrunner.sh
entrypoint="$(docker inspect --format='{{ index .Config.Entrypoint 0 }}' "$cid")"
if ! [ "$entrypoint" = /lib/catrunner.sh ]; then
echo "I: label cat is set but, entrypoint is not /lib/catrunner.sh, IGNORE"
return 0
fi
# 2nd: check if /lib/catrunner.sh is still a bind mount; if not, do not act on this container
if ! awk 'BEGIN { rs=1 } $2 == "'"$(nsenter -t $pid -m -p -- readlink -f /lib/catrunner.sh)"'" { rs=0; print $0 }; END { exit rs }' /proc/$pid/mounts; then
echo "I: /lib/catrunner.sh rootfs preparation has been run already, IGNORE"
return 0
fi
echo "I: new pet started with /lib/catrunner.sh helper"
# check if /rootfs is a mountpoint
# if not, do a bind mount so that the later pivot_root call works
if ! grep " /rootfs " /proc/$pid/mounts > /dev/null; then
echo "I: no /rootfs mounted, creating bind mount / -> /rootfs"
nsenter -t $pid -m -p -- sh -c '
mkdir -p /rootfs
mount --bind / /rootfs
' # eo nsenter sh
fi
# } #
# do bootstrap of rootfs, if no init is found # { #
# if we find no init process to start, check if any file in /rootfs,
# if not bootstrap: copy the current image rootfs to /rootfs
nsenter -t $pid -m -p -- sh -c '
if [ ! -x /rootfs/sbin/init -a ! -x /rootfs/lib/preinit ]; then
if [ -z "$( ls -d /rootfs/* 2> /dev/null )" ]; then
echo "I: /rootfs empty, cloning rootfs from docker image"
for f in /*; do
case "$f" in
/rootfs )
continue
;;
/sys | /proc )
mkdir -p "/rootfs/$f"
continue
;;
/tmp )
mkdir -p "/rootfs/$f"
chmod 1777 "/rootfs/$f"
continue
;;
* )
if grep " $f " /proc/mounts > /dev/null; then
mkdir "/rootfs/$f" -p
continue
fi
;;
esac
cp -a "$f" /rootfs
done
rm -f /rootfs/lib/catrunner.sh
fi
fi
[ -d /rootfs/mnt ] || \
mkdir -p /rootfs/mnt
[ -d /rootfs/run ] || \
mkdir -p /rootfs/run
' # eo nsenter sh # } #
# move system mounts, mount tmpfs for /tmp and /run, and pivot_root { #
# * move system mounts from docker before pivot_root
# because it is a "cat" (pet) container, it could be modified by its user afterwards, e.g. removal of
# directories the user thinks are not needed
# we only move known system mounts for which a directory exists
# the rest under /mnt/ is cleared after execing init
# * if we have /tmp, mount a tmpfs there if there is not already a mountpoint
# * if we have /run, mount a tmpfs there if there is not already a mountpoint
nsenter -t $pid -m -p -- sh -c '
for m in /dev /sys /etc/resolv.conf /tmp /proc; do
if mount --move "$m" "/rootfs$m" 2> /dev/null; then
echo "I: moved system mount $m"
else
echo "E: error moving mount $m -> /rootfs"
fi
done
if [ -d /rootfs/tmp ]; then
if ! mountpoint /rootfs/tmp > /dev/null; then
echo "I: creating empty tmpfs in /tmp"
mount -t tmpfs tmpfs /rootfs/tmp -o '"size=$label_cat_tmp_size"'
chown root.root /rootfs/tmp
chmod 1777 /rootfs/tmp
fi
fi
if [ -d /rootfs/run ]; then
if ! mountpoint /rootfs/run > /dev/null; then
echo "I: creating empty tmpfs in /run"
mount -t tmpfs tmpfs /rootfs/run -o '"size=$label_cat_run_size"'
mkdir -p /rootfs/run/lock
chown root.root /rootfs/run /rootfs/run/lock
chmod 0755 /rootfs/run /rootfs/run/lock
fi
fi
'
# pivot_root
nsenter -t $pid -m -p -- pivot_root /rootfs /rootfs/mnt
echo "I: pivot root done"
# the container will not be touched again by container_cat_preparer once
# /lib/catrunner.sh -> /mnt/lib/catrunner.sh is no longer a mountpoint (this acts as the flag)
# } #
# unmount or modify filesystem mounts in pet container { #
# after this, init is executed by container_entrypoint_catrunner
# - cleanup system mounts
# - try to clean up leftovers
# try to clean up leftovers in /mnt up to 7 times, retrying with a 1 second delay
_mountpoint_considered=
for i in 1 2 3 4 5 6 7; do
# cleanup system mounts
cat /proc/$pid/mounts > /tmp/.$$.mounts
while read what where type opts _ _; do # read from mounts in namespace
# see at the end of loop
case "$_mountpoint_considered" in
"$where" | *" $where "* | *" $where" )
#echo " $where already reconsiled"
continue
;;
esac
#echo " ($i) consiling $type:$where"
case "$type:$where" in
*:/proc/* ) # { #
nsenter -t $pid -m -p -- mount -t tmpfs -o size=16k,noexec,nosuid,noatime proc /mnt
nsenter -t $pid -m -p -- chmod 0755 /mnt
if [ -d "$where" ]; then
if nsenter -t $pid -m -p -- umount "$where"; then
nsenter -t $pid -m -p -- mount --move /mnt "$where" || \
nsenter -t $pid -m -p -- mount -o move /mnt "$where"
nsenter -t $pid -m -p -- mount -o nodev,remount -r "$where"
else
nsenter -t $pid -m -p -- umount /mnt
fi
else
nsenter -t $pid -m -p -- cp -a /dev/null "/mnt/inode"
if nsenter -t $pid -m -p -- umount "$where"; then
nsenter -t $pid -m -p -- mount -o bind /mnt/inode "$where"
nsenter -t $pid -m -p -- mount -o remount -r "$where"
fi
nsenter -t $pid -m -p -- umount /mnt
fi
;; # } #
cgroup:/sys/fs/cgroup/* ) # { #
# determine cgroup mode
cgroup="${where##*/}"
cgroup_mode=
IFS=,
for _cg in $label_cat_cgroups_default,$label_cat_cgroups; do
IFS="$oIFS"
_cgroup_mode="${_cg##*[=:]}"
_cgroup_mode="${_cgroup_mode%,}"
_cgroup="${_cg%%[=:]*}"
if [ "$_cgroup" = "*" -o "$_cgroup" = "$cgroup" ]; then
cgroup_mode="$_cgroup_mode"
fi
IFS=,
done
IFS="$oIFS"
# act
case "$cgroup_mode" in
rw | ro )
if ! nsenter -t $pid -m -p -C -- mount -o "remount,$cgroup_mode" "$where" 2> /dev/null ; then
opts="${opts#r[ow]}"
opts="${opts#,}"
nsenter -t $pid -m -p -C -- umount "$where"
nsenter -t $pid -m -p -C -- mount -t "$type" -o "$cgroup_mode,$opts" "$what" "$where"
fi
;;
umount | * )
if nsenter -t $pid -m -p -C -- umount "$where"; then
nsenter -t $pid -m -p -C -- rmdir "$where"
else
echo "E: could not unmount '$where'" >&2
fi
;;
esac
;; # } #
*:/sys ) # { #
nsenter -t $pid -m -p -C -- sh -e -c '
mount -t tmpfs -o size=16k,noexec,nosuid,noatime sysfs /mnt
chmod 0755 /mnt
mkdir -p /mnt/fs/cgroup/cgroup
mount --move /sys /tmp || \
mount -o move /sys /tmp
mount --move /mnt /sys || \
mount -o move /mnt /sys
mount --move /tmp /mnt || \
mount -o move /tmp /mnt
mount --bind /sys/fs/cgroup/cgroup /sys/fs/cgroup || \
mount -o bind /sys/fs/cgroup/cgroup /sys/fs/cgroup
for d in /mnt/fs/cgroup/*; do
[ -d "$d" ] || \
continue
[ ! -L "$d" ] || \
continue
mkdir -p "/sys/fs/cgroup/${d##*/}"
mount --move $d "/sys/fs/cgroup/${d##*/}" || \
mount -o move $d "/sys/fs/cgroup/${d##*/}"
done
' # eo nsenter
;; # } #
mqueue:* ) # { #
nsenter -t $pid -m -p -C -- umount "$where"
nsenter -t $pid -m -p -C -- rmdir "$where" 2> /dev/null || :
;; # } #
esac
# add this to reconciled paths
#echo " ($i) reconciled $type:$where"
_mountpoint_considered="$_mountpoint_considered $where"
done < /tmp/.$$.mounts
rm -f /tmp/.$$.mounts
nsenter -t $pid -m -p -C -- mount -o remount -r /sys || :
if mounts="$( grep -E -o " /mnt(|[^ ]+) " /proc/$pid/mounts )"; then
nsenter -t $pid -m -p -- sh -c '
mounts="'"$mounts"'"
for m in $mounts; do
if umount "$m" 2> /dev/null; then
echo "I: unmounted $m"
else
echo "W: $m still mounted, retrying ..."
fi
done
rmdir /rootfs 2> /dev/null || :;
'
sleep 1
else
break
fi
done
# } #
} #
# TODO: -?!?- start <-> cat-entrypoint, lock via /etc/resolv.conf, if preparation is still running
container_start() { #
local cid="$1"
# ensure clean variables, set defaults
local label_service_ips= # comma separated list of IPs for a dedicated peer-to-peer interface handed into the container
local label_bridge_member= # a host bridge the container's interface should be attached to
local label_bridge_ips= # an IP for the bridge interface, dhcp is an option
#
local label_bridge_checksum_offload= # when using a bridge with local VMs not running under KVM (e.g. VirtualBox, VMware), this is needed to fix checksum errors
#
local label_service_interface= # an interface available in network mode 'host' where service IPs can be bound
local label_service_ips= # a comma separated list of IPs added to the service_interface and cleaned up on exit
#
local label_cat=
local label_cat_tmp_size=262144k
local label_cat_run_size=131072k
local label_cat_cgroups=""
local label_cat_cgroups_default="*:umount,pids:rw"
local label_cat_sysfs="yes" # mounted
# determine the container's network mode; only 'none' and 'host' are handled
local network_mode="$( docker inspect --format='{{ range $key, $value := .NetworkSettings.Networks}}{{ $key }}{{ end}}' "$cid" )"
# fetch label from container
eval "$(docker inspect -f '{{ range $k, $v := .Config.Labels -}}
local label_{{ $k }}='"'"'{{ $v }}'"'"'
{{ end -}}' "$cid" | \
grep -E "^[ ]*local label_(service_interface|service_ips|bridge_ips|bridge_member|bridge_checksum_offload|cat|cat_cgroups|cat_sysfs|cat_tmp_size|cat_run_size)="
)
" # eo EVAL
# fetch container name
local cname="$(docker inspect --format='{{.Name}}' "$cid")"
local cname="${cname#/}"
# get pid for nsenter
local pid="$(docker inspect --format='{{.State.Pid}}' $cid)"
if [ "$pid" = 0 ]; then
# container died before we could do something with it
return 0
fi
# create/cleanup statefile
statefile="$STATEDIR/$cid.interfaces"
if [ -L "$statefile" ]; then
rm -f "$statefile"
fi
echo "$cname" > "$statefile"
echo "$cname: container started (pid $pid, id $cid)"
# add netns name/alias for container based on name
host_container_add_netns
case "$network_mode" in
none )
if [ -n "$label_bridge_ips" -a -n "$label_bridge_member" ]; then
container_add_eth0_interface
fi
# setup direct link p2p service interface
if [ -n "$label_service_ips" ]; then
container_add_p2p_interface
fi
;;
host )
if [ -n "$label_service_interface" -a -n "$label_service_ips" ]; then
container_add_service_ips
fi
;;
esac
if [ -n "$label_cat" ]; then
container_cat_preparer
fi
} #
container_stop() { #
container_die "$1"
} #
container_die() { #
local cid="$1"
local cname="$cid"
local ip=
local statefile="$STATEDIR/$cid.interfaces"
if [ -f "$statefile" ]; then
exec 3< $statefile
read -r cname <&3
echo "$cname: container died (id $cid)"
while read -r hostIf ip; do
if [ -n "$ip" ]; then
while ip neigh del "${ip%/*}" 2> /dev/null; do :; done
ip addr del "$ip" dev "$hostIf" 2> /dev/null || :
echo "$cname: removed neighbor entries for $ip"
fi
ip link del "$hostIf" 2> /dev/null || :
echo "$cname: cleaned up host interface $hostIf"
ip=
done <&3
rm -f "$statefile"
exec 3>&-
fi
if [ -L "/var/run/netns/$cid" ]; then
( cd /var/run/netns
if link="$(readlink "$cid")"; then
rm -f "$link"
echo "$cname: cleaned up netns link ${link##*/}"
fi
rm -f "$cid"
echo "$cname: cleaned up netns link $cid"
)
fi
} #
container_health_status() { #
local status="$1"
local cid="$2"
case "$status" in
healthy | starting | none )
echo "I: container_health_status: cid $cid is $status, ignoring"
return 0
;;
unhealthy )
echo "E: container_health_status: cid $cid is $status, restarting" >&2
;;
* )
echo "E: container_health_status: cid $cid has unhandled $status, IGNORING" >&2
return 1
;;
esac
docker restart "$cid"
} #
daemon() { #
mkdir -p "$STATEDIR" || if [ ! -d "$STATEDIR" ]; then
echo "E: '$STATEDIR' is not a directory, EXITING" >&2
exit 1
fi
# proper cleanup of pipe attached child processes
trap "cleanup; exit" EXIT HUP INT QUIT TERM
# setup pipe to receive events from "docker events"
# do this early so we do not miss events
# and the state after cold plugging is tracked correctly
rm -f "$PIPE" "$PIPEPIDFILE"
mknod "$PIPE" p
"$(which docker)" events > "$PIPE" < /dev/null 2> /dev/null & PIPEPID=$!
echo "$PIPEPID" > "$PIPEPIDFILE"
exec < "$PIPE"
# container:start coldplug
docker ps --filter status=running --format "{{ .ID }}" | while read -r cid; do
echo "I: found running container $cid"
# we have the short Id: fetch long Id
cid="$( docker inspect "$cid" --format='{{ .Id }}' )"
container_start "$cid" &
done
# container:health_status coldplug - check for unhealthy containers
docker ps --filter health=unhealthy --format "{{ .ID }}" | while read -r cid; do
echo "I: found running container $cid"
# we have the short Id: fetch long Id
cid="$( docker inspect "$cid" --format='{{ .Id }}' )"
container_health_status unhealthy "$cid" &
done
while read -r date object event meta; do
# strip trailing ":"
event="${event%:}"
#echo "daemon: event_dispatch: $object:$event $meta"
case "$object:$event" in
# this is dispatching handled events
# as a detached background jobs
#
# container:start|stop|die
# - we hand over the container id, $cid = ${meta%% *} (first token in $meta)
container:start | container:stop | container:die )
${object}_${event} $meta &
;;
# container:health_status
# - we need the first arguments from $meta
# eg.
# healthy 2e2a2acc2502eb460c044be6853ecbf6408f9c36e6389943dde2c4690958db57 ...
# unhealthy 2e2a2acc2502eb460c044be6853ecbf6408f9c36e6389943dde2c4690958db57 ...
container:health_status )
# TODO: maybe set a backoff here, i.e. add sleeps
# if too many unhealthy containers get restarted in a short period of time,
# so we can prevent cascaded restarts (that is why k8s distinguishes between health and liveness)
${object}_${event} $meta &
;;
# debug: for unknown events
# * ) echo "W: unhandled $object:$event $meta" ;;
esac < /dev/null
done
} #
start() { #
stop
if [ -L "$0" ]; then
case "$0" in
/etc/init.d/* ) ;;
* ) exec "$(readlink "$0")" start
esac
fi
/sbin/start-stop-daemon --start --exec "$0" \
--quiet --background \
--chdir /tmp \
--pidfile "$PIDFILE" --make-pidfile \
-- \
daemon \
# eo start-stop-daemon
} #
cleanup() { #
[ ! -r "$PIPEPIDFILE" ] || \
/sbin/start-stop-daemon --stop \
--pidfile "$PIPEPIDFILE" \
--quiet \
--retry=TERM/5/KILL/1 2> /dev/null || : \
# eo start-stop-daemon
rm -f "$PIPEPIDFILE"
} #
stop() { #
/sbin/start-stop-daemon --stop \
--pidfile "$PIDFILE" \
--quiet \
--retry=TERM/5/KILL/1 \
# eo start-stop-daemon
cleanup
} #
restart() { #
status && start
} #
reload() { #
restart
} #
status() { #
/sbin/start-stop-daemon --stop \
--pidfile "$PIDFILE" \
--signal 0 \
# eo start-stop-daemon
} #
usage() { #
exec 1>&2
echo
echo "usage: $0 [start|stop|restart|reload|status|daemon]"
echo
exit 1
} #
if [ "$$" = 1 ]; then
# we are running as PID 1
container_entrypoint_catrunner
else
# we are running as initscript or daemon
case "$1" in
start | stop | restart | reload | status | daemon ) "$1" ;;
* ) usage ;;
esac
fi
# vim: ft=sh sw=2 ts=2 et foldmethod=marker foldmarker={\ #,}\ #
# vim: ts=2 sw=2 et ft=Dockerfile
FROM alpine
COPY files/lib/entrypoint.sh /lib/entrypoint.sh
RUN set -ex; \
apk add --update ipset docker-cli iproute2 openrc ethtool util-linux netcat-openbsd; \
apk upgrade; \
sed -i -r -e '2 i mv() { rm -f "$1"; }' /usr/share/udhcpc/default.script; \
sed -i -r -e '1 s@^#!.*$@#!/bin/sh@' /lib/entrypoint.sh; \
:
ENTRYPOINT [ "/lib/entrypoint.sh", "daemon" ]
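# example build (a sketch; it assumes a build context containing this Dockerfile plus the init script copied to files/lib/entrypoint.sh):
# docker build -t j0ju/mesh-service-companion:alpine .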
# example start:
# docker run \
# -ti --rm --log-driver none \
# --pid host --network host --privileged \
# -v "/etc/init.d/mesh-docker-service-companion:/lub/entrypoint.sh:ro" \
# -v "/var/run/docker.sock:/var/run/docker.sock:rw"
# --name mesh-docker-service-companion \
# j0ju/mesh-service-companion:alpine