Last active
June 19, 2021 21:18
-
-
Save j0ju/61eb648932e74eb81a0bb6c77bfb0816 to your computer and use it in GitHub Desktop.
a simple CNI-alike for docker for a more specific network config in containers, also can act on unhealthy containers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# systemd unit running the companion script in the foreground ("daemon" mode).
[Unit]
Description=Mesh Docker Service Companion
ReloadPropagatedFrom=openvpn.service
# Requires= only pulls docker.service in, it does not order the units.
# After= makes sure dockerd is started before the companion subscribes
# to "docker events".
After=network-online.target docker.service
Requires=docker.service

[Service]
Type=simple
PrivateTmp=true
WorkingDirectory=/run
ExecStart=/etc/init.d/mesh-docker-service-companion daemon
#CapabilityBoundingSet=CAP_NET_ADMIN CAP_SYS_ADMIN
LimitNPROC=24
DeviceAllow=/dev/null rw
ProtectSystem=true
ProtectHome=true
RestartSec=5s
Restart=on-failure

[Install]
WantedBy=multi-user.target
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh | |
### BEGIN INIT INFO | |
# Provides: mesh-docker-service-companion | |
# Required-Start: $local_fs $network $remote_fs | |
# Required-Stop: $local_fs $network $remote_fs | |
# Default-Start: 2 3 4 5 | |
# Default-Stop: 0 1 6 | |
# Short-Description: mesh-docker-service-companion docker network attach sidecar helper | |
# Description: mesh-docker-service-companion docker network attach sidecar helper | |
### END INIT INFO | |
set -eu | |
# Labels this daemon acts on for containers in network mode 'none'
#   service_ips=        # comma separated list of IPs on a dedicated peer to peer interface handed into the container
#   bridge_member=      # a bridge the container interface should be added to
#   bridge_ips=         # an IP for the bridge interface
#   service_interface=  # an interface available on the host where we add the service IPs via proxy ARP
# Labels this daemon acts on for containers in network mode 'host'
#   service_interface=  # an interface available in network mode == host, where service IPs can be bound to
#   service_ips=        # a comma separated list of IPs added to the service_interface, and later cleaned up on exit
# Examples | |
# docker run --rm -dti --network=none -l service_ips=1.1.1.1 debian:buster bash - | |
# docker run --rm -dti --network=host -l service_interface=foo -l service_ips=8.8.8.8,heise.de debian:buster bash - | |
# Label for the "cat" runner functionality mimicing lxc | |
# cat= # if that label is set and the entrypoint is /lib/catrunner.sh, the container | |
# # is pivot rooted to /rootfs, (works best if /rootfs is a volume or bind mount) | |
# # for this you need this script bind mounted to /lib/catrunner.sh | |
# cat_cgroups=* # cgroups to keep | |
# # e.g. | |
# # * = all cgroups in RO (default) | |
# # pids = pids cgroup in RO | |
# # pids:rw = pids cgroup in RW | |
# cat_sysfs=no # if sysfs is present in pet (default: yes) | |
# Example | |
# docker run --rm --log-driver none -v /etc/init.d/mesh-docker-service-companion:/lib/catrunner.sh:ro --entrypoint /lib/catrunner.sh -l cat=yes hub.i.qx.ai/debian-buster:runit.amd64 | |
# (hub.i.qx.ai/debian-buster:runit.amd64 is an image where runit is the local init system) | |
# * The following example does not work properly: buster uses systemd as its init system; with jessie (sysvinit) it works better
# docker run --rm --log-driver none -v /etc/init.d/mesh-docker-service-companion:/lib/catrunner.sh:ro --entrypoint /lib/catrunner.sh -l cat=yes debian:buster | |
# runtime files of the daemon and of the "docker events" feeder process
PIDFILE="/var/run/docker-events.pid"
PIPE="/var/run/docker-events.pipe"
PIPEPIDFILE="/var/run/docker-events.pipe.pid"

# per container state files (<cid>.interfaces) are kept here
STATEDIR="/run/mesh-docker-service-companion"

# addresses used on the host <-> container p2p link
HOST_SVC_IP="169.254.123.123/32"
HOST_SVC_IPv6="fe80::1/64"
CONTAINER_IPv6="fe80::dead/64"

# name prefixes for the generated veth interfaces
# (HOST_SVC_PFX is changed at runtime by container_add_p2p_interface)
HOST_SVC_PFX="V"
CONTAINER_SVC_PFX="v"
HOST_BRIDGE_PFX="B"
CONTAINER_BRIDGE_PFX="b"
# Set up the dedicated point-to-point "p2p" service link of a container.
#
# A veth pair is created on the host, one end is moved into the network
# namespace of $pid and renamed to "p2p". Every entry of $label_service_ips
# (address, or hostname resolved via getent) is added to the container side
# as /32 resp. /128 and routed from the host via the host side interface.
# If $label_service_interface exists on the host, a proxy neighbour entry is
# added there for every address.
#
# Reads:   network_mode pid cid cname statefile label_service_ips
#          label_service_interface HOST_SVC_IP HOST_SVC_IPv6 CONTAINER_IPv6
#          HOST_SVC_PFX CONTAINER_SVC_PFX
# Writes:  HOST_SVC_PFX (set to "S" if the service interface exists),
#          appends the host side interface (and proxied IPs) to $statefile
# Returns: 0 also when nothing was done (unsupported network mode, p2p
#          already present, no free interface name)
container_add_p2p_interface() { #
  case "$network_mode" in
    none | host ) ;; # we handle this types of network modes
    * ) return 0 ;;  # we do NOT
  esac

  local oIFS="$IFS"

  # check if p2p interface already in place
  if nsenter -t "$pid" -n -- ip link show dev p2p > /dev/null 2>&1; then
    echo "I: container $cid: p2p interface already present, skipping"
    return 0
  fi

  if [ -n "$label_service_interface" ] && [ -d "/sys/class/net/$label_service_interface" ]; then
    HOST_SVC_PFX="S"
  fi

  # set interface names
  # set host side interface name (interface names are limited to 15 chars)
  for hostIf in "${HOST_SVC_PFX}$cname" "${HOST_SVC_PFX}$cid" "$cid" "${$}${cid}"; do
    hostIf="$(echo "${hostIf}" | head -c 15)"
    if ! [ -d "/sys/class/net/$hostIf" ]; then
      break
    fi
    echo "W: $hostIf is already used, skipping" >&2
    hostIf=
  done
  if [ -z "$hostIf" ]; then
    echo "E: no host side service interface name found, skipping" >&2
    # we are not inside a loop here: leave the function instead of "break",
    # otherwise the veth would be created with an empty name
    return 0
  fi

  # set container side interface name
  # (checked on the host, as the veth peer is created there before it is moved)
  for containerIf in "${CONTAINER_SVC_PFX}$cname" "${CONTAINER_SVC_PFX}$cid" "$cid" "${$}${cid}"; do
    containerIf="$(echo "${containerIf}" | head -c 15)"
    if ! [ -d "/sys/class/net/$containerIf" ]; then
      break
    fi
    echo "W: $containerIf is already used, skipping" >&2
    containerIf=
  done
  if [ -z "$containerIf" ]; then
    echo "E: no container side service interface name found, skipping" >&2
    return 0
  fi

  # create interfaces
  ip link add "$hostIf" type veth peer name "$containerIf"
  sysctl -w "net.ipv6.conf.$containerIf.disable_ipv6=1" > /dev/null
  sysctl -w "net.ipv6.conf.$hostIf.disable_ipv6=1" > /dev/null
  ip link set netns "$pid" dev "$containerIf"

  # setup container side
  nsenter -t "$pid" -n -- ip link set up name p2p dev "$containerIf"
  echo "$cname:p2p <-> $hostIf"

  # setup host side
  ip link set up dev "$hostIf"

  # ToDo
  V6_ready=no
  V4_ready=no

  # collect the list of IPs and hostnames and resolve them, afterwards pipe them to the address setting loop
  # (V4_ready/V6_ready are only changed and read inside the consumer loop)
  IFS=,
  for item in $label_service_ips; do # collect and resolve - output per line: $ip $item
    IFS="$oIFS"
    case "$item" in
      "" )
        # we have nothing
        continue
        ;;
      *:* | [1-9]*.*.* )
        # assume we have an IPv6 or IPV4 address
        echo "$item $item"
        ;;
      * )
        # assume we have an hostname, try to resolve it
        getent ahosts "$item" | grep -Eo "^[^ ]+" | sort -u | sed -e "s/$/ $item/"
        ;;
    esac
  done | while IFS="$oIFS" read -r ip label; do # set the addresses
    IFS="$oIFS"
    case "$ip" in
      *:* ) # assume we have an IPv6 address
        ip="$ip/128"
        if [ "$V6_ready" = no ]; then # enable v6, add IP to host and container
          sysctl -w "net.ipv6.conf.$hostIf.disable_ipv6=0" > /dev/null
          nsenter -t "$pid" -n -- sysctl -w net.ipv6.conf.p2p.disable_ipv6=0 > /dev/null
          ip addr add "$HOST_SVC_IPv6" dev "$hostIf"
          nsenter -t "$pid" -n -- ip addr add "$CONTAINER_IPv6" dev p2p
          nsenter -t "$pid" -n -- ip route add default via "${HOST_SVC_IPv6%/*}" dev p2p metric 100
          V6_ready=yes
        fi
        nsenter -t "$pid" -n -- ip addr add "$ip" dev p2p
        # prevent asymetric routing
        nsenter -t "$pid" -n -- ip route add default from "$ip" via "${HOST_SVC_IPv6%/*}" dev p2p metric 100
        # add route from host to container
        ip route add "$ip" dev "$hostIf"
        ;;
      [1-9]*.*.* ) # assume we have an IPv4 address
        ip="$ip/32"
        if [ "$V4_ready" = no ]; then
          ip addr add "$HOST_SVC_IP" dev "$hostIf"
        fi
        nsenter -t "$pid" -n -- ip addr add "$ip" dev p2p
        ip route add "$ip" dev "$hostIf"
        if [ "$V4_ready" = no ]; then
          nsenter -t "$pid" -n -- ip route add "$HOST_SVC_IP" dev p2p
          # add a default route
          nsenter -t "$pid" -n -- ip route add default via "${HOST_SVC_IP%/*}" metric 100
          # prevent asymetric routing via policy routing, as the FROM extension/source routing is not valid for ipv4
          nsenter -t "$pid" -n -- ip route add default via "${HOST_SVC_IP%/*}" metric 100 table 23
          V4_ready=yes
        fi
        # prevent asymetric routing via policy routing, as the FROM extension is not valid for ipv4
        nsenter -t "$pid" -n -- ip rule add from "$ip" table 23 pref 23
        ;;
      * ) # this should not happen
        echo "E: '$ip' is not a IPv4/6 address, set via label '$label', abort" >&2
        break
        ;;
    esac
    if [ -n "$label_service_interface" ] && [ -d "/sys/class/net/$label_service_interface" ]; then
      ip neigh add proxy "${ip%/*}" dev "$label_service_interface"
      echo "$hostIf $ip" >> "$statefile"
    fi
    echo "$cname:p2p added $ip"
  done
  echo "$cname:p2p $hostIf host service interface up"

  # write out state for later usage on stop or destroy
  echo "$hostIf" >> "$statefile"
  IFS="$oIFS"
} #
# Attach a container to a host bridge via a veth pair named "eth0" inside.
#
# The host side of the pair becomes a member of $label_bridge_member, the
# container side is moved into the network namespace of $pid and renamed to
# "eth0". MAC addresses are derived from an md5 of the container name.
# Addressing is either done by udhcpc ($label_bridge_ips = dhcp) or by adding
# every entry of $label_bridge_ips as /32 and copying the routes of the
# bridge into the container.
#
# Reads:   network_mode pid cid cname statefile label_bridge_member
#          label_bridge_ips label_bridge_checksum_offload
#          HOST_BRIDGE_PFX CONTAINER_BRIDGE_PFX
# Writes:  appends the host side interface name to $statefile
# Returns: 0 also when nothing was done (unsupported network mode, eth0
#          already present, no free interface name)
container_add_eth0_interface() { #
  local oIFS="$IFS"

  case "$network_mode" in
    none | host ) ;; # we handle this types of network modes
    * ) return 0 ;;  # we do NOT
  esac

  # check if bridge interface already in place
  if nsenter -t "$pid" -n -- ip link show dev eth0 > /dev/null 2>&1; then
    echo "I: container $cid: eth0 bridge interface already present, skipping"
    return 0
  fi

  # TODO: trap on failure and delete interface

  # set interface names
  # set host side interface name (interface names are limited to 15 chars)
  for hostIf in "${HOST_BRIDGE_PFX}$cname" "${HOST_BRIDGE_PFX}$cid" "$cid" "${$}${cid}"; do
    hostIf="$(echo "${hostIf}" | head -c 15)"
    if ! [ -d "/sys/class/net/$hostIf" ]; then
      break
    fi
    echo "W: $hostIf is already used, skipping" >&2
    hostIf=
  done
  if [ -z "$hostIf" ]; then
    echo "E: no host side service interface name found, skipping" >&2
    # we are not inside a loop here: leave the function instead of "break",
    # otherwise the veth would be created with an empty name
    return 0
  fi

  # set container side interface name
  for containerIf in "${CONTAINER_BRIDGE_PFX}$cname" "${CONTAINER_BRIDGE_PFX}$cid" "$cid" "${$}${cid}"; do
    containerIf="$(echo "${containerIf}" | head -c 15)"
    if ! [ -d "/sys/class/net/$containerIf" ]; then
      break
    fi
    echo "W: $containerIf is already used, skipping" >&2
    containerIf=
  done
  if [ -z "$containerIf" ]; then
    echo "E: no container side service interface name found, skipping" >&2
    return 0
  fi

  # create interfaces
  # host MAC is fe:<first 5 bytes of md5(cname)>, container MAC the same with 02: prefix
  h_mac="fe:$(echo "$cname" | md5sum | sed -re 's/^(..)(..)(..)(..)(..).*/\1:\2:\3:\4:\5/')"
  c_mac="02:${h_mac##??:}"
  ip link add "$hostIf" type veth peer name "$containerIf"
  ip link set address "$h_mac" dev "$hostIf"
  ip link set address "$c_mac" dev "$containerIf"
  sysctl -w "net.ipv6.conf.$hostIf.disable_ipv6=1" > /dev/null
  ip link set netns "$pid" dev "$containerIf"

  # setup host side
  ip link set up master "$label_bridge_member" dev "$hostIf"
  echo "$cname:eth0 (mac: $c_mac) <-> $hostIf (mac: $h_mac, bridge: $label_bridge_member)"

  # setup container side
  nsenter -t "$pid" -n -- sysctl -w net.ipv6.conf.lo.disable_ipv6=0 > /dev/null
  nsenter -t "$pid" -n -- sysctl -w "net.ipv6.conf.$containerIf.disable_ipv6=0" > /dev/null
  nsenter -t "$pid" -n -- ip link set up name eth0 dev "$containerIf"
  if [ "$label_bridge_checksum_offload" = off ]; then
    nsenter -t "$pid" -n -- ethtool -K eth0 rx off tx off
  fi

  if [ "$label_bridge_ips" = dhcp ]; then
    # this needs to set up addesses/prefixes and routes
    nsenter -t "$pid" -n -- udhcpc -i eth0 -q -x "hostname:$cname"
    echo "$cname:eth0 got DHCP"
    nsenter -t "$pid" -n -- ip addr show dev eth0 | sed -rne "/inet[6]?/ s/^[ ]+/$cname:eth0: / p"
  else
    # add addresses (subshell keeps the IFS change local)
    ( IFS=,
      for ip in $label_bridge_ips; do
        if [ "$ip" = dhcp ]; then
          continue
        fi
        nsenter -t "$pid" -n -- ip addr add "$ip"/32 dev eth0
        echo "$cname:eth0 added $ip"
      done
    )

    # set device routes
    ip r list dev "$label_bridge_member" | awk '$1 ~ "^default|/[0-9]+$" {print}' | while read -r pfx via gw _; do
      if [ "$via" != via ]; then
        nsenter -t "$pid" -n -- ip route add "$pfx" dev eth0 metric 10
        echo "$cname:eth0 added device route $pfx"
      fi
    done

    # set routes with gateways (after the device routes, so the gateways are reachable)
    ip r list dev "$label_bridge_member" | awk '$1 ~ "^default|/[0-9]+$" {print}' | while read -r pfx via gw _; do
      if [ "$via" = via ]; then
        nsenter -t "$pid" -n -- ip route add "$pfx" via "$gw" dev eth0 metric 10
        echo "$cname:eth0 added route $pfx via $gw"
      fi
    done
  fi
  echo "$cname:eth0 $hostIf host bridge interface up"

  # write out state for later usage on stop or destroy
  echo "$hostIf" >> "$statefile"
  IFS="$oIFS"
} #
# Add the service IPs of a container to an existing host interface.
#
# Every entry of $label_service_ips (address, or hostname resolved via
# getent) is added to $label_service_interface. If the address is directly
# reachable via that interface, the prefix length of the interface's first
# address of the same family is used, otherwise /32 resp. /128.
# Each added (or already local) address is recorded in $statefile as
# "<interface> <ip>/<prefix>" for the cleanup on container exit.
#
# Reads:   network_mode cname statefile label_service_interface
#          label_service_ips
# Returns: 0, also when the interface does not exist (nothing is added then)
container_add_service_ips() { #
  case "$network_mode" in
    none | host ) ;; # we handle this types of network modes
    * ) return 0 ;;  # we do NOT
  esac

  if ! ip link show dev "$label_service_interface" > /dev/null 2>&1; then
    echo "E: interface from service_interface label '$label_service_interface' not found" >&2
    echo "W: not adding any IPs" >&2
    return 0
  fi

  local ip=
  local label=
  local cidr=
  local cidr_if=
  local route=
  local oIFS="$IFS"

  # get CIDR of primary addresses on interface
  # use them to add svc address to interface to avoid more specific imports
  local cidr4="$( ip -4 a s dev "$label_service_interface" | awk -F"[ /]+" '$2 == "inet" {print $4; exit}' )"
  local cidr6="$( ip -6 a s dev "$label_service_interface" | awk -F"[ /]+" '$2 == "inet6" {print $4; exit}' )"

  # collect the list of IPs and hostnames and resolve them, afterwards pipe them to the address setting loop
  IFS=,
  for ip in $label_service_ips; do # collect and resolve
    IFS="$oIFS"
    case "$ip" in
      "" )
        # we have nothing
        continue
        ;;
      *:* | [1-9]*.*.* )
        # assume we have an IPv6 or IPV4 address
        echo "$ip $ip"
        ;;
      * )
        # assume we have an hostname, try to resolve it
        getent ahosts "$ip" | grep -Eo "^[^ ]+" | sort -u | sed -e "s/$/ $ip/"
        ;;
    esac
  done | while IFS="$oIFS" read -r ip label; do # set the addresses
    IFS="$oIFS"
    case "$ip" in
      *:* ) # assume we have an IPv6 address
        cidr=128
        cidr_if="$cidr6"
        ;;
      [1-9]*.*.* ) # assume we have an IPv4 address
        cidr=32
        cidr_if="$cidr4"
        ;;
      * ) # this should not happen
        echo "E: '$ip' is not a IPv4/6 address, set via label '$label', abort" >&2
        break
        ;;
    esac

    # we now have either an IPv4 or an IPv6 address, determine the correct cidr
    # if it is not routed via a gateway, but via the specified $label_service_interface", then use the interfaces
    # CIDR, otherwise use /32 or /128 depending on the protocol
    if route="$(ip route get "$ip" | grep " dev $label_service_interface ")"; then
      case "$route" in
        *" via "* ) ip="$ip/$cidr" ;;
        # the interface may carry no address of this family, then there is
        # no interface prefix length: fall back to the host prefix
        * ) ip="$ip/${cidr_if:-$cidr}" ;;
      esac
    else
      ip="$ip/$cidr"
    fi

    # an address that is already local is only recorded, not added again
    if ip route get "${ip%/*}" | grep ^local > /dev/null || ip addr add "$ip" dev "$label_service_interface"; then
      echo "I: $cname: added $ip to $label_service_interface"
      echo "$label_service_interface $ip" >> "$statefile"
    else
      echo "E: could not add '$ip' ($label) to '$label_service_interface'" >&2
    fi
  done
  IFS="$oIFS"
} #
# Publish the container's network namespace under /var/run/netns:
# one link named after the container ($cname) pointing to /proc/$pid/ns/net
# and one named after the container id ($cid) pointing to the first link.
host_container_add_netns() { #
  local netns_dir=/var/run/netns

  mkdir -p "$netns_dir"
  ln -sf "/proc/$pid/ns/net" "$netns_dir/$cname"
  ln -sf "$netns_dir/$cname" "$netns_dir/$cid"
} #
# Entrypoint used inside a "cat" container, where this script is PID 1, via
#   ... -v $0:/lib/catrunner.sh:ro --entrypoint /lib/catrunner.sh ...
# It blocks until a mount on /mnt shows up in /proc/mounts and then replaces
# itself with the first executable of /lib/preinit and /sbin/init.
# Exits with 1 if /proc/mounts is unreadable or no init could be executed.
container_entrypoint_catrunner() { #
  # if we have no /proc/mounts the container is high likely broken
  if [ ! -r /proc/mounts ]; then
    echo "$0: ERROR: /this should no happend, /proc/mounts does not exists, ABORT"
    exit 1
  fi

  # wait for /mnt, as the cat_preparer will prepare rootfs from the running image if possible
  echo "${0}: wait for rootfs"
  until grep " /mnt " /proc/mounts > /dev/null 2> /dev/null; do
    sleep 1
  done

  # the cat_preparer has done the pivot root
  cd /
  echo "${0}: rootfs ready, execing init"

  # start the init in the "cat" container, first executable candidate wins
  for init in /lib/preinit /sbin/init; do
    if [ -x "${init%% *}" ]; then
      echo "${0}: exec $init as init"
      exec "$init" "$@"
    fi
  done

  # only reached if none of the candidates was executable
  echo "${0}: ERROR: exec of INIT failed $init, ABORT"
  exit 1
} #
# Prepare the root filesystem of a "cat" container from the host side.
#
# Called by container_start when the label "cat" is set. Acts only if the
# container's entrypoint is /lib/catrunner.sh and that file is still a mount
# inside the container. Steps, all run via nsenter in the namespaces of $pid:
#   1. make sure /rootfs is a mount point (bind mount of / otherwise)
#   2. if /rootfs has no init and is empty, copy the image's files into it
#   3. move the system mounts into /rootfs, add tmpfs for /tmp and /run,
#      then pivot_root to /rootfs (old root ends up on /mnt)
#   4. up to 7 passes: hide /proc/* mounts, apply the cgroup policy from
#      $label_cat_cgroups_default/$label_cat_cgroups, shadow /sys, drop
#      mqueue and unmount everything left below /mnt
#
# Reads: cid pid label_cat_tmp_size label_cat_run_size
#        label_cat_cgroups_default label_cat_cgroups
container_cat_preparer() { #
  # init { #
  local oIFS="$IFS"
  local mounts_snapshot=

  # this is started by container_start, if label_cat is not empty
  # 1st: check if the entrypoint is /lib/catrunner.sh
  entrypoint="$(docker inspect --format='{{ index .Config.Entrypoint 0 }}' "$cid")"
  if ! [ "$entrypoint" = /lib/catrunner.sh ]; then
    echo "I: label cat is set but, entrypoint is not /lib/catrunner.sh, IGNORE"
    return 0
  fi

  # 2nd check if /lib/catrunner.sh is still a bind mount, if not do not act on this container
  if ! awk 'BEGIN { rs=1 } $2 == "'"$(nsenter -t $pid -m -p -- readlink -f /lib/catrunner.sh)"'" { rs=0; print $0 }; END { exit rs }' /proc/$pid/mounts; then
    echo "I: /lib/catrunner.sh rootfs preparation has been run already, IGNORE"
    return 0
  fi
  echo "I: new pet started with /lib/catrunner.sh helper"

  # check if we /rootfs is a mountpoint
  # if not do a bind so that the later pivot_root call is working
  if ! grep " /rootfs " /proc/$pid/mounts > /dev/null; then
    echo "I: no /rootfs mounted, creating bind mount / -> /rootfs"
    nsenter -t $pid -m -p -- sh -c '
      mkdir -p /rootfs
      mount --bind / /rootfs
    ' # eo nsenter sh
  fi
  # } #

  # do bootstrap of rootfs, if no init is found # { #
  # if we find no init process to start, check if any file in /rootfs,
  # if not bootstrap: copy the current image rootfs to /rootfs
  nsenter -t $pid -m -p -- sh -c '
    if [ ! -x /rootfs/sbin/init -a ! -x /rootfs/lib/preinit ]; then
      if [ -z "$( ls -d /rootfs/* 2> /dev/null )" ]; then
        echo "I: /rootfs empty, cloning rootfs from docker image"
        for f in /*; do
          case "$f" in
            /rootfs )
              continue
              ;;
            /sys | /proc )
              mkdir -p "/rootfs/$f"
              continue
              ;;
            /tmp )
              mkdir -p "/rootfs/$f"
              chmod 1777 "/rootfs/$f"
              continue
              ;;
            * )
              if grep " $f " /proc/mounts > /dev/null; then
                mkdir "/rootfs/$f" -p
                continue
              fi
              ;;
          esac
          cp -a "$f" /rootfs
        done
        rm -f /rootfs/lib/catrunner.sh
      fi
    fi
    [ -d /rootfs/mnt ] || \
      mkdir -p /rootfs/mnt
    [ -d /rootfs/run ] || \
      mkdir -p /rootfs/run
  ' # eo nsenter sh # } #

  # mount mounts and pivot_root { #
  # * move system mounts from docker, before pivot_root
  #   because it is a "cat", it could be modified by its user afterwards, eg. removal of directories
  #   the user thinks, that are not needed
  #   we only move, known system mounts for which a directory exists
  #   the rest under /mnt/ is cleared after execing init
  # * if we have /tmp, mount a tmpfs there if there is not a mountpoint
  # * if we have /run, mount a tmpfs there if there is not a mountpoint
  # (the size labels are expanded by the outer shell and spliced into the script)
  nsenter -t $pid -m -p -- sh -c '
    for m in /dev /sys /etc/resolv.conf /tmp /proc; do
      if mount --move "$m" "/rootfs$m" 2> /dev/null; then
        echo "I: moved system mount $m"
      else
        echo "E: error moving mount $m -> /rootfs"
      fi
    done
    if [ -d /rootfs/tmp ]; then
      if ! mountpoint /rootfs/tmp > /dev/null; then
        echo "I: creating empty tmpfs in /tmp"
        mount -t tmpfs tmpfs /rootfs/tmp -o '"size=$label_cat_tmp_size"'
        chown root.root /rootfs/tmp
        chmod 1777 /rootfs/tmp
      fi
    fi
    if [ -d /rootfs/run ]; then
      if ! mountpoint /rootfs/run > /dev/null; then
        echo "I: creating empty tmpfs in /run"
        mount -t tmpfs tmpfs /rootfs/run -o '"size=$label_cat_run_size"'
        mkdir -p /rootfs/run/lock
        chown root.root /rootfs/run /rootfs/run/lock
        chmod 0755 /rootfs/run /rootfs/run/lock
      fi
    fi
  '

  # pivot_root
  nsenter -t $pid -m -p -- pivot_root /rootfs /rootfs/mnt
  echo "I: pivot root done"
  # it will not be touched anymore by container_cat_preparer
  # if /lib/catrunner.sh -> /mnt/lib/catrunner.sh is no mountpoint anymore (flag)
  # } #

  # unmount or modify filesystem mounts in pet container { #
  # after this init is executed by container_entrypoint_catrunner
  # - cleanup system mounts
  # - try to clean up leftovers

  # The mount table is iterated from a private snapshot file. A name built
  # from $$ is NOT unique here: this function runs in background subshells
  # of the daemon, which all share the daemon's $$, so concurrently started
  # containers would overwrite and delete each other's snapshot.
  mounts_snapshot="$(mktemp /tmp/.mounts.XXXXXX)" || return 1

  # try to clean up leftovers in /mnt for 7 times with 1 second retry
  _mountpoint_considered=
  for i in 1 2 3 4 5 6 7; do
    # cleanup system mounts
    cat /proc/$pid/mounts > "$mounts_snapshot"
    while read what where type opts _ _; do # read from mounts in namepace
      # see at the end of loop
      case "$_mountpoint_considered" in
        "$where" | *" $where "* | *" $where" )
          #echo " $where already reconsiled"
          continue
          ;;
      esac
      #echo " ($i) consiling $type:$where"
      case "$type:$where" in
        *:/proc/* ) # { #
          # replace the mount by an empty read-only tmpfs (directories)
          # or by a bind mount of an empty inode (files)
          nsenter -t $pid -m -p -- mount -t tmpfs -o size=16k,noexec,nosuid,noatime proc /mnt
          nsenter -t $pid -m -p -- chmod 0755 /mnt
          if [ -d "$where" ]; then
            if nsenter -t $pid -m -p -- umount "$where"; then
              nsenter -t $pid -m -p -- mount --move /mnt "$where" || \
                nsenter -t $pid -m -p -- mount -o move /mnt "$where"
              nsenter -t $pid -m -p -- mount -o nodev,remount -r "$where"
            else
              nsenter -t $pid -m -p -- umount /mnt
            fi
          else
            nsenter -t $pid -m -p -- cp -a /dev/null "/mnt/inode"
            if nsenter -t $pid -m -p -- umount "$where"; then
              nsenter -t $pid -m -p -- mount -o bind /mnt/inode "$where"
              nsenter -t $pid -m -p -- mount -o remount -r "$where"
            fi
            nsenter -t $pid -m -p -- umount /mnt
          fi
          ;; # } #
        cgroup:/sys/fs/cgroup/* ) # { #
          # determine cgroup mode, the last matching entry wins
          cgroup="${where##*/}"
          cgroup_mode=
          IFS=,
          for _cg in $label_cat_cgroups_default,$label_cat_cgroups; do
            IFS="$oIFS"
            _cgroup_mode="${_cg##*[=:]}"
            _cgroup_mode="${_cgroup_mode%,}"
            _cgroup="${_cg%%[=:]*}"
            if [ "$_cgroup" = "*" -o "$_cgroup" = "$cgroup" ]; then
              cgroup_mode="$_cgroup_mode"
            fi
            IFS=,
          done
          IFS="$oIFS"
          # act
          case "$cgroup_mode" in
            rw | ro )
              if ! nsenter -t $pid -m -p -C -- mount -o "remount,$cgroup_mode" "$where" 2> /dev/null ; then
                opts="${opts#r[ow]}"
                opts="${opts#,}"
                nsenter -t $pid -m -p -C -- umount "$where"
                nsenter -t $pid -m -p -C -- mount -t "$type" -o "$cgroup_mode,$opts" "$what" "$where"
              fi
              ;;
            umount | * )
              if nsenter -t $pid -m -p -C -- umount "$where"; then
                nsenter -t $pid -m -p -C -- rmdir "$where"
              else
                echo "E: could not unmount '$where'" >&2
              fi
              ;;
          esac
          ;; # } #
        *:/sys ) # { #
          nsenter -t $pid -m -p -C -- sh -e -c '
            mount -t tmpfs -o size=16k,noexec,nosuid,noatime sysfs /mnt
            chmod 0755 /mnt
            mkdir -p /mnt/fs/cgroup/cgroup
            mount --move /sys /tmp || \
              mount -o move /sys /tmp
            mount --move /mnt /sys || \
              mount -o move /mnt /sys
            mount --move /tmp /mnt || \
              mount -o move /tmp /mnt
            mount --bind /sys/fs/cgroup/cgroup /sys/fs/cgroup || \
              mount -o bind /sys/fs/cgroup/cgroup /sys/fs/cgroup
            for d in /mnt/fs/cgroup/*; do
              [ -d "$d" ] || \
                continue
              [ ! -L "$d" ] || \
                continue
              mkdir -p "/sys/fs/cgroup/${d##*/}"
              mount --move $d "/sys/fs/cgroup/${d##*/}" || \
                mount -o move $d "/sys/fs/cgroup/${d##*/}"
            done
          ' # eo nsenter
          ;; # } #
        mqueue:* ) # { #
          nsenter -t $pid -m -p -C -- umount "$where"
          nsenter -t $pid -m -p -C -- rmdir "$where" 2> /dev/null || :
          ;; # } #
      esac
      # add this to conciled paths
      #echo " ($i) conciled $type:$where"
      _mountpoint_considered="$_mountpoint_considered $where"
    done < "$mounts_snapshot"

    nsenter -t $pid -m -p -C -- mount -o remount -r /sys || :

    # anything still mounted on or below /mnt? try to unmount and retry
    if mounts="$( grep -E -o " /mnt(|[^ ]+) " /proc/$pid/mounts )"; then
      nsenter -t $pid -m -p -- sh -c '
        mounts="'"$mounts"'"
        for m in $mounts; do
          if umount "$m" 2> /dev/null; then
            echo "I: unmounted $m"
          else
            echo "W: $m still mounted, retrying ..."
          fi
        done
        rmdir /rootfs 2> /dev/null || :;
      '
      sleep 1
    else
      break
    fi
  done
  rm -f "$mounts_snapshot"
  # } #
} #
# TODO: -?!?- start <-> cat-entrypoint, lock via /etc/resolv.conf, if preparetion is still running | |
# Handle a started (or already running) container.
#
# Reads the supported labels, name, network mode and PID of container $1 via
# "docker inspect", creates the state file $STATEDIR/<cid>.interfaces, links
# the network namespace and then, depending on network mode and labels, sets
# up the bridge interface, the p2p service interface, the service IPs on a
# host interface and/or the "cat" rootfs.
#
# Arguments: $1 - container id (further arguments are ignored)
# Returns:   0 without doing anything if the container has no PID (anymore)
container_start() { #
  local cid="$1"

  # ensure clean variables, set defaults
  local label_service_ips=              # comma seperated list of IPs: on the p2p interface (network mode none)
                                        # or added to the service_interface (network mode host)
  local label_bridge_member=            # a bridge a interface should be added to
  local label_bridge_ips=               # an IP for the bridge interface, dhcp is an option
  #
  local label_bridge_checksum_offload=  # when using bridge and local VMs not from KVM (eg VirtualBox, VMWare), we need this to fix checksum errors
  #
  local label_service_interface=        # a interface available in network mode == host, where service IPs can bound to
  #
  local label_cat=
  local label_cat_tmp_size=262144k
  local label_cat_run_size=131072k
  local label_cat_cgroups=""
  local label_cat_cgroups_default="*:umount,pids:rw"
  local label_cat_sysfs="yes" # mounted

  # name(s) of the attached networks, only "none" and "host" are acted on
  local network_mode="$( docker inspect --format='{{ range $key, $value := .NetworkSettings.Networks}}{{ $key }}{{ end}}' "$cid" )"

  # fetch label from container
  # NOTE(review): the label VALUES are spliced unescaped into single quotes
  # and then eval'ed; a value containing a single quote can execute arbitrary
  # code in this (root) process. Only the label names are filtered by grep.
  eval "$(docker inspect -f '{{ range $k, $v := .Config.Labels -}}
    local label_{{ $k }}='"'"'{{ $v }}'"'"'
    {{ end -}}' "$cid" | \
    grep -E "^[ ]*local label_(service_interface|service_ips|bridge_ips|bridge_member|bridge_checksum_offload|cat|cat_cgroups|cat_sysfs|cat_tmp_size|cat_run_size)="
    )
  " # eo EVAL

  # fetch container name
  local cname="$(docker inspect --format='{{.Name}}' "$cid")"
  cname="${cname#/}"

  # get pid for nsenter
  local pid=
  pid="$(docker inspect --format='{{.State.Pid}}' "$cid")" || pid=
  if [ -z "$pid" ] || [ "$pid" = 0 ]; then
    # container died (pid 0) or vanished (inspect failed, no pid)
    # before we could do something with it
    return 0
  fi

  # create/cleanup statefile
  statefile="$STATEDIR/$cid.interfaces"
  if [ -L "$statefile" ]; then
    rm -f "$statefile"
  fi
  echo "$cname" > "$statefile"
  echo "$cname: container started (pid $pid, id $cid)"

  # add netns name/alias for container based on name
  host_container_add_netns

  case "$network_mode" in
    none )
      if [ -n "$label_bridge_ips" ] && [ -n "$label_bridge_member" ]; then
        container_add_eth0_interface
      fi
      # setup direct link p2p service interface
      if [ -n "$label_service_ips" ]; then
        container_add_p2p_interface
      fi
      ;;
    host )
      if [ -n "$label_service_interface" ] && [ -n "$label_service_ips" ]; then
        container_add_service_ips
      fi
      ;;
  esac

  if [ -n "$label_cat" ]; then
    container_cat_preparer
  fi
} #
# A "stop" event is cleaned up exactly like a "die" event.
# Only the first argument (the container id) is handed on.
container_stop() { #
  local stopped_cid="$1"

  container_die "$stopped_cid"
} #
# Clean up everything container_start created for a container.
#
# The state file $STATEDIR/<cid>.interfaces holds the container name in its
# first line, followed by lines of the form
#   "<interface> <ip>/<prefix>"  an address (and neighbour entries) to remove
#   "<interface>"                a host side veth created by us, to delete
# Afterwards the netns links below /var/run/netns are removed.
#
# Arguments: $1 - container id (further arguments are ignored)
container_die() { #
  local cid="$1"
  local cname="$cid"
  local hostIf=
  local ip=
  local link=
  local statefile="$STATEDIR/$cid.interfaces"

  if [ -f "$statefile" ]; then
    exec 3< "$statefile"
    # first line is the container name, fall back to the id if it is missing
    read -r cname <&3 || cname="$cid"
    echo "$cname: container died (id $cid)"
    while read -r hostIf ip; do
      if [ -n "$ip" ]; then
        # address line: only the address is ours. The interface may be the
        # host's own service interface (network mode host) and must survive.
        while ip neigh del "${ip%/*}" 2> /dev/null; do :; done
        ip addr del "$ip" dev "$hostIf" 2> /dev/null || :
        echo "$cname: removed neighbor entries for $ip"
      else
        # bare interface line: a veth created by us
        ip link del "$hostIf" 2> /dev/null || :
        echo "$cname: cleaned up host interface $hostIf"
      fi
    done <&3
    rm -f "$statefile"
    exec 3>&-
  fi

  if [ -L "/var/run/netns/$cid" ]; then
    ( cd /var/run/netns
      # the id link points to the name link, remove both
      if link="$(readlink "$cid")"; then
        rm -f "$link"
        echo "$cname: cleaned up netns link ${link##*/}"
      fi
      rm -f "$cid"
      echo "$cname: cleaned up netns link $cid"
    )
  fi
} #
# React on a health status change of a container.
#
# Arguments: $1 - health status as reported by docker
#            $2 - container id
# healthy, starting and none are ignored (returns 0), unhealthy restarts the
# container via "docker restart" (returns its status), any other status is
# reported and returns 1.
container_health_status() { #
  local status="$1"
  local cid="$2"

  if [ "$status" = healthy ] || [ "$status" = starting ] || [ "$status" = none ]; then
    echo "I: container_health_status: cid $cid is $status, ignoring"
    return 0
  fi

  if [ "$status" != unhealthy ]; then
    echo "E: container_health_status: cid $cid has unhandled $status, IGNORING" >&2
    return 1
  fi

  echo "E: container_health_status: cid $cid is $status, restarting" >&2
  docker restart "$cid"
} #
# Main loop: follow "docker events" and dispatch the handled events.
#
# A "docker events" process is started in the background writing into the
# FIFO $PIPE (its PID is stored in $PIPEPIDFILE), the FIFO becomes stdin of
# this shell. Already running and already unhealthy containers are handled
# first (coldplug), then every event line is dispatched to the function
# <object>_<event> as a background job. The function returns when the event
# stream ends; cleanup runs via trap on exit and on HUP/INT/QUIT/TERM.
daemon() { #
  if ! mkdir -p "$STATEDIR" && [ ! -d "$STATEDIR" ]; then
    echo "E: '$STATEDIR' is not a directory, EXITING" >&2
    exit 1
  fi

  # locate the docker CLI with the POSIX builtin instead of "which"
  # and fail loudly if it is missing, instead of running an empty command
  local docker_bin=
  docker_bin="$(command -v docker)" || {
    echo "E: docker CLI not found in PATH, EXITING" >&2
    exit 1
  }

  # proper cleanup of pipe attached child processes
  trap "cleanup; exit" EXIT HUP INT QUIT TERM

  # setup pipe to receive events from "docker events"
  # do this early so we do not miss events,
  # so state after cold plugging is followed, correctly
  rm -f "$PIPE" "$PIPEPIDFILE"
  mknod "$PIPE" p
  "$docker_bin" events > "$PIPE" < /dev/null 2> /dev/null & PIPEPID=$!
  echo "$PIPEPID" > "$PIPEPIDFILE"
  exec < "$PIPE"

  # container:start coldplug
  docker ps --filter status=running --format "{{ .ID }}" | while read -r cid; do
    echo "I: found running container $cid"
    # we have the short Id: fetch long Id
    cid="$( docker inspect "$cid" --format='{{ .Id }}' )"
    container_start "$cid" &
  done

  # container:health_status coldplug - check for unhealthy containers
  docker ps --filter health=unhealthy --format "{{ .ID }}" | while read -r cid; do
    echo "I: found running container $cid"
    # we have the short Id: fetch long Id
    cid="$( docker inspect "$cid" --format='{{ .Id }}' )"
    container_health_status unhealthy "$cid" &
  done

  while read -r date object event meta; do
    # strip trailing ":"
    event="${event%:}"
    #echo "daemon: event_dispatch: $object:$event $meta"
    case "$object:$event" in
      # this is dispatching handled events
      # as a detached background jobs
      #
      # container:start|stop|die
      #   - we hand over the container id, $cid = ${meta%% *} (first token in $meta)
      # container:health_status
      #   - we need the first arguments from $meta
      #     eg.
      #       healthy 2e2a2acc2502eb460c044be6853ecbf6408f9c36e6389943dde2c4690958db57 ...
      #       unhealthy 2e2a2acc2502eb460c044be6853ecbf6408f9c36e6389943dde2c4690958db57 ...
      #   TODO: may be set a backoff here, so we do add sleeps
      #         if we have to many unhealthy containers restarted in a short period of time
      #         so we can prevent cascaded restarts (that why k8s distinguisches between health and liveness)
      container:start | container:stop | container:die | container:health_status )
        # $meta is intentionally unquoted: the handler gets its tokens as arguments
        ${object}_${event} $meta &
        ;;
      # debug: for unknown events
      # * ) echo "W: unhandled $object:$event $meta" ;;
    esac < /dev/null
  done
} #
# Start the daemon in the background (SysV init "start").
#
# A possibly running instance is stopped first. If the script was called via
# a symlink outside of /etc/init.d, it re-executes its link target, so that
# start-stop-daemon is run with the real path.
start() { #
  # "start-stop-daemon --stop" fails when nothing is running; with "set -e"
  # that would abort the script here and a stopped service could never be
  # started. The result of the preparatory stop is irrelevant.
  stop || :

  if [ -L "$0" ]; then
    case "$0" in
      /etc/init.d/* ) ;;
      * ) exec "$(readlink "$0")" start ;;
    esac
  fi

  /sbin/start-stop-daemon --start --exec "$0" \
    --quiet --background \
    --chdir /tmp \
    --pidfile "$PIDFILE" --make-pidfile \
    -- \
    daemon
} #
# Stop the background "docker events" process recorded in $PIPEPIDFILE
# (best effort, errors are ignored) and remove the PID file.
cleanup() { #
  if [ -r "$PIPEPIDFILE" ]; then
    /sbin/start-stop-daemon --stop \
      --pidfile "$PIPEPIDFILE" \
      --quiet \
      --retry=TERM/5/KILL/1 2> /dev/null || :
  fi
  rm -f "$PIPEPIDFILE"
} #
# Stop the daemon recorded in $PIDFILE (TERM, after 5s KILL) and clean up
# the "docker events" feeder process.
stop() { #
  # --oknodo: stopping an already stopped daemon is not an error. Without it
  # start-stop-daemon fails, "set -e" aborts the script and neither the
  # cleanup below nor a following start would run.
  /sbin/start-stop-daemon --stop \
    --oknodo \
    --pidfile "$PIDFILE" \
    --quiet \
    --retry=TERM/5/KILL/1
  cleanup
} #
# Restart the daemon, but only if it is currently running;
# otherwise the exit status of the status check is returned.
restart() { #
  status || return
  start
} #
# There is no configuration to re-read: a reload is the same as a restart.
reload() { #
  restart
} #
# Probe the daemon recorded in $PIDFILE: signal 0 is sent instead of a real
# stop signal; the exit status is the one of start-stop-daemon.
status() { #
  /sbin/start-stop-daemon --stop --signal 0 --pidfile "$PIDFILE"
} #
# Print the usage message, framed by empty lines, to stderr and exit with 1.
usage() { #
  {
    echo
    echo "usage: $0 [start|stop|restart|reload|status|daemon]"
    echo
  } >&2
  exit 1
} #
# Entry point: the same file is used as container entrypoint (PID 1) and as
# init script / daemon on the host.
if [ "$$" = 1 ]; then
  # we are running as PID 1
  container_entrypoint_catrunner
else
  # we are running as initscript or daemon
  # ${1:-}: called without arguments, a plain $1 aborts under "set -u"
  # with "parameter not set" instead of printing the usage
  case "${1:-}" in
    start | stop | restart | reload | status | daemon ) "$1" ;;
    * ) usage ;;
  esac
fi
# vim: ft=sh sw=2 ts=2 et foldmethod=marker foldmarker={\ #,}\ # |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# vim ts=2 sw=2 et ft=Dockerfile
# Image running the companion script as "daemon".
FROM alpine
COPY files/lib/entrypoint.sh /lib/entrypoint.sh
# --no-cache: fetch the package index without leaving it in the image layer
# 1st sed: insert a shell function "mv" that only removes its first argument
#          as line 2 of the udhcpc default script
#          (presumably to keep udhcpc from replacing resolv.conf - TODO confirm)
# 2nd sed: force /bin/sh as interpreter of the entrypoint
RUN set -ex; \
  apk add --no-cache ipset docker-cli iproute2 openrc ethtool util-linux netcat-openbsd; \
  apk upgrade --no-cache; \
  sed -i -r -e '2 i mv() { rm -f "$1"; }' /usr/share/udhcpc/default.script; \
  sed -i -r -e '1 s@^#!.*$@#!/bin/sh@' /lib/entrypoint.sh; \
  :
ENTRYPOINT [ "/lib/entrypoint.sh", "daemon" ]
# example start:
# docker run \
#   -ti --rm --log-driver none \
#   --pid host --network host --privileged \
#   -v "/etc/init.d/mesh-docker-service-companion:/lib/entrypoint.sh:ro" \
#   -v "/var/run/docker.sock:/var/run/docker.sock:rw" \
#   --name mesh-docker-service-companion \
#   j0ju/mesh-service-companion:alpine
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment