joemiller/_ERROR.txt Secret

## _ERROR.txt
# more /var/log/user-data.log
...
Configuring kubelet snap
2023-11-30 18:21:26,956:__main__:INFO:No more changes in progress ...
2023-11-30 18:21:27,871:__main__:INFO:received '202/Accepted' from snapd for POST on /v2/snaps (change-id: 7)
2023-11-30 18:21:27,879:__main__:INFO:No more changes in progress ...
2023-11-30 18:21:27,880:__main__:INFO:result for change: {'id': '7', 'kind': 'refresh-snap', 'summary': 'Refresh all snaps: no updates', 'status': 'Done', 'ready': True, 'spawn-time': '2023-11-30T18:21:27.863112183Z', 'ready-time': '2023-11-30T18:21:27.863124863Z', 'data': {'
snap-names': []}}
2023-11-30 18:21:27,932:__main__:INFO:Setting kubelet-eks config to: {'cluster-dns': '172.20.0.10', 'container-runtime': 'remote', 'container-runtime-endpoint': 'unix:///run/containerd/containerd.sock', 'address': '0.0.0.0', 'anonymous-auth': 'false', 'authentication-token-we
bhook': 'true', 'authorization-mode': 'Webhook', 'cgroup-driver': 'cgroupfs', 'client-ca-file': '/etc/kubernetes/pki/ca.crt', 'cloud-provider': 'aws', 'cluster-domain': 'cluster.local', 'cni-bin-dir': '/opt/cni/bin', 'cni-conf-dir': '/etc/cni/net.d', 'config': '/etc/kubernete
s/kubelet/kubelet-config.json', 'kubeconfig': '/var/lib/kubelet/kubeconfig', 'node-ip': '10.8.56.177', 'network-plugin': 'cni', 'register-node': 'true', 'resolv-conf': '/run/systemd/resolve/resolv.conf', 'pod-infra-container-image': 'dkr.ecr.us-east-1.amazonaws.c
om/eks/pause:3.5', '--max-pods': '110'}
2023-11-30 18:21:27,970:__main__:INFO:received '202/Accepted' from snapd for PUT on /v2/snaps/kubelet-eks/conf (change-id: 8)
2023-11-30 18:21:27,979:__main__:INFO:Still 1 changes in progress ...
2023-11-30 18:21:32,984:__main__:INFO:No more changes in progress ...
2023-11-30 18:21:32,986:__main__:INFO:result for change: {'id': '8', 'kind': 'configure-snap', 'summary': 'Change configuration of "kubelet-eks" snap', 'status': 'Error', 'tasks': [{'id': '113', 'kind': 'run-hook', 'summary': 'Run configure hook of "kubelet-eks" snap', 'statu
s': 'Error', 'log': ['2023-11-30T18:21:27Z ERROR invalid option name: "--max-pods"'], 'progress': {'label': '', 'done': 1, 'total': 1}, 'spawn-time': '2023-11-30T18:21:27.933832076Z', 'ready-time': '2023-11-30T18:21:27.979626575Z'}], 'ready': True, 'err': 'cannot perform the
following tasks:\n- Run configure hook of "kubelet-eks" snap (invalid option name: "--max-pods")', 'spawn-time': '2023-11-30T18:21:27.933868886Z', 'ready-time': '2023-11-30T18:21:27.979627625Z'}

## gistfile1.patch
%  diff -u good-bootstrap.sh bad-bootstrap.sh
--- good-bootstrap.sh   2023-11-30 02:35:22.723407109 +0000
+++ bad-bootstrap.sh    2023-11-30 02:35:57.743372957 +0000
@@ -222,52 +222,14 @@
   echo $cpu_to_reserve
 }

+# variable to collect the kubelet-eks snap config options
 _kubelet_snap_options=()

-# Call this function instead of "snap set" to configure the kubelet-eks snap.
-# This functions mitigates a race condition where there's a snap refresh in the
-# background and the set command fails with
-#   error: snap "kubelet-eks" has "auto-refresh" change in progress
-# resulting in startup of misconfigured kubelet.
+# function to add another option for kubelet-eks
 kubelet_snap_add_options() {
     _kubelet_snap_options+=("$@")
 }

-# Call this function before starting the kubelet-eks snap. This function first
-# tries to make sure that no refresh is running and then runs "snap set" with
-# all options added with the kubelet_snap_add_options function and retries
-# several times in case there's a change in progress.
-kubelet_snap_configure() {
-    local retry ret
-
-    # Wait for a possible running refresh to finish.
-    snap watch --last=refresh\?
-    snap watch --last=auto-refresh\?
-
-    # If another refresh is running at this point, the snap commands below may
-    # fail. But the likelihood of this happening is small.
-
-    # If this succeeds the likelihood of another refresh is even smaller.
-    # Ignore the "... no updates available" message.
-    snap refresh kubelet-eks 2> >(grep -v ' has no updates available$' >&2) || true
-
-    # Try to set the options every 2 seconds until we succeed or give up after
-    # 16 seconds. A possible running refresh shouldn't take longer.
-    for (( retry=0; retry < 8; retry++ )); do
-        if (( retry )); then
-            sleep 2
-        fi
-        ret=0
-        if snap set kubelet-eks "${_kubelet_snap_options[@]}"; then
-            break
-        else
-            ret=$?
-        fi
-    done
-
-    return $ret
-}
-
 if [ -z "$CLUSTER_NAME" ]; then
     echo "CLUSTER_NAME is not defined"
     exit  1
@@ -556,7 +518,9 @@
     args="$KUBELET_EXTRA_ARGS"

 echo "Configuring kubelet snap"
-kubelet_snap_configure
+python3 /usr/local/share/eks/snapd-helper.py wait
+python3 /usr/local/share/eks/snapd-helper.py refresh
+python3 /usr/local/share/eks/snapd-helper.py configure kubelet-eks "${_kubelet_snap_options[*]}"

 echo "Starting k8s kubelet daemon"
 snap start --enable kubelet-eks

## snapd-helper.py
#!/usr/bin/python3

import pathlib
import sys
import argparse
import http.client
import socket
import json
import logging
import time
from typing import Dict, Any


logger = logging.getLogger(__name__)


class SnapdConnection(http.client.HTTPConnection):
    def __init__(self, path):
        super().__init__("localhost")
        self._path = path

    def connect(self):
        self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        self.sock.connect(self._path)


def _connection(snapd_socket: pathlib.Path):
    """
    Get a connection to snapd via the snapd socket
    """
    conn = SnapdConnection(snapd_socket.as_posix())
    conn.set_debuglevel(0)
    return conn


def _do_request(args, method: str, url: str, body, headers: Dict[str, str],
                expected_status_code: int, timeout: int = 120) -> Any:
    """
    Do a request against snapd and retry for some time until the required status code is received
    Returns json data or None in case of an error
    """
    conn = _connection(args.snapd_socket)
    kwargs: Dict[str, Any] = dict()
    if body:
        kwargs['body'] = json.dumps(body)
    if headers:
        kwargs['headers'] = headers

    start = time.time()
    end = start + timeout
    while time.time() < end:
        conn.request(method, url, **kwargs)
        r = conn.getresponse()
        # always read() the response!!! see docs
        resp = json.loads(r.read().decode())
        if r.status != expected_status_code:
            logger.warning(f"received '{r.status}/{r.reason}' from snapd for "
                           f"{method} on {url}: '{resp['result']['message']}'. retry ...")
            time.sleep(2)
            continue

        if resp['type'] == 'async':
            logger.info(f"received '{r.status}/{r.reason}' from snapd for {method} "
                        f"on {url} (change-id: {resp['change']})")
        return resp

    logger.error(f"Unable to make {method} request via {url} against snapd. tried {timeout} seconds")
    return None


def _change(args, change_id):
    """
    Get the current status of a change
    """
    resp = _do_request(args, "GET", f"/v2/changes/{change_id}", None, None, 200)
    logger.info(f"result for change: {resp['result']}")


def _refresh(args):
    """
    Trigger refresh all snaps and wait for in-progress changes
    Note: this might not refresh all snaps if eg. a snap can't be refreshed
    due to running apps. snapd logs contain then something like:

    cannot refresh snap "aws-cli": snap "aws-cli" has running apps (aws), pids: 162638

    Not error out on this is by intention because the bootstrap.sh script (which
    executes this snap-helper.py script) should not fail if a refresh fails
    """
    headers = {"Content-type": "application/json"}
    body = {"action": "refresh"}
    resp = _do_request(args, "POST", "/v2/snaps", body, headers, 202)
    _wait(args)
    if resp:
        _change(args, resp['change'])


def _configure(args):
    """
    Configure a given snap and wait for in-progress changes
    """
    conf = dict()
    for pair in args.config.split():
        res = pair.split('=')
        if len(res) == 2:
            conf[res[0].strip()] = res[1].strip()

    logger.info(f"Setting {args.snapname} config to: {conf}")
    headers = {"Content-type": "application/json"}
    resp = _do_request(args, "PUT", f"/v2/snaps/{args.snapname}/conf", conf, headers, 202)
    _wait(args)
    if resp:
        _change(args, resp['change'])


def _wait(args):
    """
    Wait for any in-progress change to finish
    """
    start = time.time()
    end = start + 240
    while time.time() < end:
        resp = _do_request(args, "GET", "/v2/changes?select=in-progress", None, None, 200)

        # wait for availalbe changes
        if len(resp["result"]) > 0:
            logger.info(f"Still {len(resp['result'])} changes in progress ...")
            time.sleep(5)
            continue
        else:
            # waited enough
            logger.info("No more changes in progress ...")
            return

    # we reached the timeout here
    logger.error("timeout while waiting for in-progress changes")


def _parser():
    parser = argparse.ArgumentParser(description="AWS EKS snapd interactions")
    parser.add_argument("--log-level", choices=["info", "debug"], default="info")
    parser.add_argument("--snapd-socket", type=pathlib.Path, default="/run/snapd.socket",
                        help="path to the snapd socket. default: %(default)s")
    p_sub = parser.add_subparsers(help="sub-command help")

    # wait
    p_wait = p_sub.add_parser("wait", help="Wait for any snap changes")
    p_wait.set_defaults(func=_wait)

    # refresh
    p_refresh = p_sub.add_parser("refresh", help="Refresh all snaps")
    p_refresh.set_defaults(func=_refresh)

    # configure
    p_configure = p_sub.add_parser("configure", help="Configure a given snap")
    p_configure.add_argument("snapname", help="the snap to configure")
    p_configure.add_argument("config", help="the configuration options. "
                             "space separated list of key=value pairs. Eg. 'key1=val1 key2=val2'")
    p_configure.set_defaults(func=_configure)

    return parser


def main():
    parser = _parser()
    args = parser.parse_args()
    logformat = "%(asctime)s:%(name)s:%(levelname)s:%(message)s"
    # log level
    loglevel = logging.INFO
    if args.log_level == "debug":
        loglevel = logging.DEBUG
    logging.basicConfig(format=logformat, level=loglevel)
    if "func" not in args:
        sys.exit(parser.print_help())
    args.func(args)
    sys.exit(0)


if __name__ == "__main__":
    main()
	# more /var/log/user-data.log
	...
	Configuring kubelet snap
	2023-11-30 18:21:26,956:__main__:INFO:No more changes in progress ...
	2023-11-30 18:21:27,871:__main__:INFO:received '202/Accepted' from snapd for POST on /v2/snaps (change-id: 7)
	2023-11-30 18:21:27,879:__main__:INFO:No more changes in progress ...
	2023-11-30 18:21:27,880:__main__:INFO:result for change: {'id': '7', 'kind': 'refresh-snap', 'summary': 'Refresh all snaps: no updates', 'status': 'Done', 'ready': True, 'spawn-time': '2023-11-30T18:21:27.863112183Z', 'ready-time': '2023-11-30T18:21:27.863124863Z', 'data': {'
	snap-names': []}}
	2023-11-30 18:21:27,932:__main__:INFO:Setting kubelet-eks config to: {'cluster-dns': '172.20.0.10', 'container-runtime': 'remote', 'container-runtime-endpoint': 'unix:///run/containerd/containerd.sock', 'address': '0.0.0.0', 'anonymous-auth': 'false', 'authentication-token-we
	bhook': 'true', 'authorization-mode': 'Webhook', 'cgroup-driver': 'cgroupfs', 'client-ca-file': '/etc/kubernetes/pki/ca.crt', 'cloud-provider': 'aws', 'cluster-domain': 'cluster.local', 'cni-bin-dir': '/opt/cni/bin', 'cni-conf-dir': '/etc/cni/net.d', 'config': '/etc/kubernete
	s/kubelet/kubelet-config.json', 'kubeconfig': '/var/lib/kubelet/kubeconfig', 'node-ip': '10.8.56.177', 'network-plugin': 'cni', 'register-node': 'true', 'resolv-conf': '/run/systemd/resolve/resolv.conf', 'pod-infra-container-image': 'dkr.ecr.us-east-1.amazonaws.c
	om/eks/pause:3.5', '--max-pods': '110'}
	2023-11-30 18:21:27,970:__main__:INFO:received '202/Accepted' from snapd for PUT on /v2/snaps/kubelet-eks/conf (change-id: 8)
	2023-11-30 18:21:27,979:__main__:INFO:Still 1 changes in progress ...
	2023-11-30 18:21:32,984:__main__:INFO:No more changes in progress ...
	2023-11-30 18:21:32,986:__main__:INFO:result for change: {'id': '8', 'kind': 'configure-snap', 'summary': 'Change configuration of "kubelet-eks" snap', 'status': 'Error', 'tasks': [{'id': '113', 'kind': 'run-hook', 'summary': 'Run configure hook of "kubelet-eks" snap', 'statu
	s': 'Error', 'log': ['2023-11-30T18:21:27Z ERROR invalid option name: "--max-pods"'], 'progress': {'label': '', 'done': 1, 'total': 1}, 'spawn-time': '2023-11-30T18:21:27.933832076Z', 'ready-time': '2023-11-30T18:21:27.979626575Z'}], 'ready': True, 'err': 'cannot perform the
	following tasks:\n- Run configure hook of "kubelet-eks" snap (invalid option name: "--max-pods")', 'spawn-time': '2023-11-30T18:21:27.933868886Z', 'ready-time': '2023-11-30T18:21:27.979627625Z'}
	%  diff -u good-bootstrap.sh bad-bootstrap.sh
	--- good-bootstrap.sh 2023-11-30 02:35:22.723407109 +0000
	+++ bad-bootstrap.sh 2023-11-30 02:35:57.743372957 +0000
	@@ -222,52 +222,14 @@
	echo $cpu_to_reserve
	}

	+# variable to collect the kubelet-eks snap config options
	_kubelet_snap_options=()

	-# Call this function instead of "snap set" to configure the kubelet-eks snap.
	-# This functions mitigates a race condition where there's a snap refresh in the
	-# background and the set command fails with
	-# error: snap "kubelet-eks" has "auto-refresh" change in progress
	-# resulting in startup of misconfigured kubelet.
	+# function to add another option for kubelet-eks
	kubelet_snap_add_options() {
	_kubelet_snap_options+=("$@")
	}

	-# Call this function before starting the kubelet-eks snap. This function first
	-# tries to make sure that no refresh is running and then runs "snap set" with
	-# all options added with the kubelet_snap_add_options function and retries
	-# several times in case there's a change in progress.
	-kubelet_snap_configure() {
	- local retry ret
	-
	- # Wait for a possible running refresh to finish.
	- snap watch --last=refresh\?
	- snap watch --last=auto-refresh\?
	-
	- # If another refresh is running at this point, the snap commands below may
	- # fail. But the likelihood of this happening is small.
	-
	- # If this succeeds the likelihood of another refresh is even smaller.
	- # Ignore the "... no updates available" message.
	- snap refresh kubelet-eks 2> >(grep -v ' has no updates available$' >&2) \|\| true
	-
	- # Try to set the options every 2 seconds until we succeed or give up after
	- # 16 seconds. A possible running refresh shouldn't take longer.
	- for (( retry=0; retry < 8; retry++ )); do
	- if (( retry )); then
	- sleep 2
	- fi
	- ret=0
	- if snap set kubelet-eks "${_kubelet_snap_options[@]}"; then
	- break
	- else
	- ret=$?
	- fi
	- done
	-
	- return $ret
	-}
	-
	if [ -z "$CLUSTER_NAME" ]; then
	echo "CLUSTER_NAME is not defined"
	exit 1
	@@ -556,7 +518,9 @@
	args="$KUBELET_EXTRA_ARGS"

	echo "Configuring kubelet snap"
	-kubelet_snap_configure
	+python3 /usr/local/share/eks/snapd-helper.py wait
	+python3 /usr/local/share/eks/snapd-helper.py refresh
	+python3 /usr/local/share/eks/snapd-helper.py configure kubelet-eks "${_kubelet_snap_options[*]}"

	echo "Starting k8s kubelet daemon"
	snap start --enable kubelet-eks
	#!/usr/bin/python3

	import pathlib
	import sys
	import argparse
	import http.client
	import socket
	import json
	import logging
	import time
	from typing import Dict, Any


	logger = logging.getLogger(__name__)


	class SnapdConnection(http.client.HTTPConnection):
	def __init__(self, path):
	super().__init__("localhost")
	self._path = path

	def connect(self):
	self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
	self.sock.connect(self._path)


	def _connection(snapd_socket: pathlib.Path):
	"""
	Get a connection to snapd via the snapd socket
	"""
	conn = SnapdConnection(snapd_socket.as_posix())
	conn.set_debuglevel(0)
	return conn


	def _do_request(args, method: str, url: str, body, headers: Dict[str, str],
	expected_status_code: int, timeout: int = 120) -> Any:
	"""
	Do a request against snapd and retry for some time until the required status code is received
	Returns json data or None in case of an error
	"""
	conn = _connection(args.snapd_socket)
	kwargs: Dict[str, Any] = dict()
	if body:
	kwargs['body'] = json.dumps(body)
	if headers:
	kwargs['headers'] = headers

	start = time.time()
	end = start + timeout
	while time.time() < end:
	conn.request(method, url, **kwargs)
	r = conn.getresponse()
	# always read() the response!!! see docs
	resp = json.loads(r.read().decode())
	if r.status != expected_status_code:
	logger.warning(f"received '{r.status}/{r.reason}' from snapd for "
	f"{method} on {url}: '{resp['result']['message']}'. retry ...")
	time.sleep(2)
	continue

	if resp['type'] == 'async':
	logger.info(f"received '{r.status}/{r.reason}' from snapd for {method} "
	f"on {url} (change-id: {resp['change']})")
	return resp

	logger.error(f"Unable to make {method} request via {url} against snapd. tried {timeout} seconds")
	return None


	def _change(args, change_id):
	"""
	Get the current status of a change
	"""
	resp = _do_request(args, "GET", f"/v2/changes/{change_id}", None, None, 200)
	logger.info(f"result for change: {resp['result']}")


	def _refresh(args):
	"""
	Trigger refresh all snaps and wait for in-progress changes
	Note: this might not refresh all snaps if eg. a snap can't be refreshed
	due to running apps. snapd logs contain then something like:

	cannot refresh snap "aws-cli": snap "aws-cli" has running apps (aws), pids: 162638

	Not error out on this is by intention because the bootstrap.sh script (which
	executes this snap-helper.py script) should not fail if a refresh fails
	"""
	headers = {"Content-type": "application/json"}
	body = {"action": "refresh"}
	resp = _do_request(args, "POST", "/v2/snaps", body, headers, 202)
	_wait(args)
	if resp:
	_change(args, resp['change'])


	def _configure(args):
	"""
	Configure a given snap and wait for in-progress changes
	"""
	conf = dict()
	for pair in args.config.split():
	res = pair.split('=')
	if len(res) == 2:
	conf[res[0].strip()] = res[1].strip()

	logger.info(f"Setting {args.snapname} config to: {conf}")
	headers = {"Content-type": "application/json"}
	resp = _do_request(args, "PUT", f"/v2/snaps/{args.snapname}/conf", conf, headers, 202)
	_wait(args)
	if resp:
	_change(args, resp['change'])


	def _wait(args):
	"""
	Wait for any in-progress change to finish
	"""
	start = time.time()
	end = start + 240
	while time.time() < end:
	resp = _do_request(args, "GET", "/v2/changes?select=in-progress", None, None, 200)

	# wait for availalbe changes
	if len(resp["result"]) > 0:
	logger.info(f"Still {len(resp['result'])} changes in progress ...")
	time.sleep(5)
	continue
	else:
	# waited enough
	logger.info("No more changes in progress ...")
	return

	# we reached the timeout here
	logger.error("timeout while waiting for in-progress changes")


	def _parser():
	parser = argparse.ArgumentParser(description="AWS EKS snapd interactions")
	parser.add_argument("--log-level", choices=["info", "debug"], default="info")
	parser.add_argument("--snapd-socket", type=pathlib.Path, default="/run/snapd.socket",
	help="path to the snapd socket. default: %(default)s")
	p_sub = parser.add_subparsers(help="sub-command help")

	# wait
	p_wait = p_sub.add_parser("wait", help="Wait for any snap changes")
	p_wait.set_defaults(func=_wait)

	# refresh
	p_refresh = p_sub.add_parser("refresh", help="Refresh all snaps")
	p_refresh.set_defaults(func=_refresh)

	# configure
	p_configure = p_sub.add_parser("configure", help="Configure a given snap")
	p_configure.add_argument("snapname", help="the snap to configure")
	p_configure.add_argument("config", help="the configuration options. "
	"space separated list of key=value pairs. Eg. 'key1=val1 key2=val2'")
	p_configure.set_defaults(func=_configure)

	return parser


	def main():
	parser = _parser()
	args = parser.parse_args()
	logformat = "%(asctime)s:%(name)s:%(levelname)s:%(message)s"
	# log level
	loglevel = logging.INFO
	if args.log_level == "debug":
	loglevel = logging.DEBUG
	logging.basicConfig(format=logformat, level=loglevel)
	if "func" not in args:
	sys.exit(parser.print_help())
	args.func(args)
	sys.exit(0)


	if __name__ == "__main__":
	main()