Skip to content

Instantly share code, notes, and snippets.

@joemiller
Last active November 30, 2023 19:20
Show Gist options
  • Save joemiller/22b6ac4d60910e9c997957be09504c99 to your computer and use it in GitHub Desktop.
Save joemiller/22b6ac4d60910e9c997957be09504c99 to your computer and use it in GitHub Desktop.
# more /var/log/user-data.log
...
Configuring kubelet snap
2023-11-30 18:21:26,956:__main__:INFO:No more changes in progress ...
2023-11-30 18:21:27,871:__main__:INFO:received '202/Accepted' from snapd for POST on /v2/snaps (change-id: 7)
2023-11-30 18:21:27,879:__main__:INFO:No more changes in progress ...
2023-11-30 18:21:27,880:__main__:INFO:result for change: {'id': '7', 'kind': 'refresh-snap', 'summary': 'Refresh all snaps: no updates', 'status': 'Done', 'ready': True, 'spawn-time': '2023-11-30T18:21:27.863112183Z', 'ready-time': '2023-11-30T18:21:27.863124863Z', 'data': {'
snap-names': []}}
2023-11-30 18:21:27,932:__main__:INFO:Setting kubelet-eks config to: {'cluster-dns': '172.20.0.10', 'container-runtime': 'remote', 'container-runtime-endpoint': 'unix:///run/containerd/containerd.sock', 'address': '0.0.0.0', 'anonymous-auth': 'false', 'authentication-token-we
bhook': 'true', 'authorization-mode': 'Webhook', 'cgroup-driver': 'cgroupfs', 'client-ca-file': '/etc/kubernetes/pki/ca.crt', 'cloud-provider': 'aws', 'cluster-domain': 'cluster.local', 'cni-bin-dir': '/opt/cni/bin', 'cni-conf-dir': '/etc/cni/net.d', 'config': '/etc/kubernete
s/kubelet/kubelet-config.json', 'kubeconfig': '/var/lib/kubelet/kubeconfig', 'node-ip': '10.8.56.177', 'network-plugin': 'cni', 'register-node': 'true', 'resolv-conf': '/run/systemd/resolve/resolv.conf', 'pod-infra-container-image': 'dkr.ecr.us-east-1.amazonaws.c
om/eks/pause:3.5', '--max-pods': '110'}
2023-11-30 18:21:27,970:__main__:INFO:received '202/Accepted' from snapd for PUT on /v2/snaps/kubelet-eks/conf (change-id: 8)
2023-11-30 18:21:27,979:__main__:INFO:Still 1 changes in progress ...
2023-11-30 18:21:32,984:__main__:INFO:No more changes in progress ...
2023-11-30 18:21:32,986:__main__:INFO:result for change: {'id': '8', 'kind': 'configure-snap', 'summary': 'Change configuration of "kubelet-eks" snap', 'status': 'Error', 'tasks': [{'id': '113', 'kind': 'run-hook', 'summary': 'Run configure hook of "kubelet-eks" snap', 'statu
s': 'Error', 'log': ['2023-11-30T18:21:27Z ERROR invalid option name: "--max-pods"'], 'progress': {'label': '', 'done': 1, 'total': 1}, 'spawn-time': '2023-11-30T18:21:27.933832076Z', 'ready-time': '2023-11-30T18:21:27.979626575Z'}], 'ready': True, 'err': 'cannot perform the
following tasks:\n- Run configure hook of "kubelet-eks" snap (invalid option name: "--max-pods")', 'spawn-time': '2023-11-30T18:21:27.933868886Z', 'ready-time': '2023-11-30T18:21:27.979627625Z'}
%  diff -u good-bootstrap.sh bad-bootstrap.sh
--- good-bootstrap.sh 2023-11-30 02:35:22.723407109 +0000
+++ bad-bootstrap.sh 2023-11-30 02:35:57.743372957 +0000
@@ -222,52 +222,14 @@
echo $cpu_to_reserve
}
+# variable to collect the kubelet-eks snap config options
_kubelet_snap_options=()
-# Call this function instead of "snap set" to configure the kubelet-eks snap.
-# This functions mitigates a race condition where there's a snap refresh in the
-# background and the set command fails with
-# error: snap "kubelet-eks" has "auto-refresh" change in progress
-# resulting in startup of misconfigured kubelet.
+# function to add another option for kubelet-eks
kubelet_snap_add_options() {
_kubelet_snap_options+=("$@")
}
-# Call this function before starting the kubelet-eks snap. This function first
-# tries to make sure that no refresh is running and then runs "snap set" with
-# all options added with the kubelet_snap_add_options function and retries
-# several times in case there's a change in progress.
-kubelet_snap_configure() {
- local retry ret
-
- # Wait for a possible running refresh to finish.
- snap watch --last=refresh\?
- snap watch --last=auto-refresh\?
-
- # If another refresh is running at this point, the snap commands below may
- # fail. But the likelihood of this happening is small.
-
- # If this succeeds the likelihood of another refresh is even smaller.
- # Ignore the "... no updates available" message.
- snap refresh kubelet-eks 2> >(grep -v ' has no updates available$' >&2) || true
-
- # Try to set the options every 2 seconds until we succeed or give up after
- # 16 seconds. A possible running refresh shouldn't take longer.
- for (( retry=0; retry < 8; retry++ )); do
- if (( retry )); then
- sleep 2
- fi
- ret=0
- if snap set kubelet-eks "${_kubelet_snap_options[@]}"; then
- break
- else
- ret=$?
- fi
- done
-
- return $ret
-}
-
if [ -z "$CLUSTER_NAME" ]; then
echo "CLUSTER_NAME is not defined"
exit 1
@@ -556,7 +518,9 @@
args="$KUBELET_EXTRA_ARGS"
echo "Configuring kubelet snap"
-kubelet_snap_configure
+python3 /usr/local/share/eks/snapd-helper.py wait
+python3 /usr/local/share/eks/snapd-helper.py refresh
+python3 /usr/local/share/eks/snapd-helper.py configure kubelet-eks "${_kubelet_snap_options[*]}"
echo "Starting k8s kubelet daemon"
snap start --enable kubelet-eks
#!/usr/bin/python3
import pathlib
import sys
import argparse
import http.client
import socket
import json
import logging
import time
from typing import Dict, Any
# Module-level logger shared by all helpers below; handlers and level are
# installed by logging.basicConfig() in main().
logger = logging.getLogger(__name__)
class SnapdConnection(http.client.HTTPConnection):
    """
    HTTP connection that is transported over a UNIX domain socket

    http.client only knows how to open TCP sockets, so connect() is
    overridden to open the socket file given as *path* instead.
    """

    def __init__(self, path, timeout=None):
        """
        path: filesystem path of the UNIX socket to connect to
        timeout: optional socket timeout in seconds. None (the default)
                 leaves the socket's timeout setting untouched.
        """
        # "localhost" is only used by http.client for the Host header; the
        # actual transport is the UNIX socket opened in connect().
        super().__init__("localhost", timeout=timeout)
        self._path = path

    def connect(self):
        """
        Open the UNIX socket and install it as the connection's transport

        Raises OSError if the socket can not be connected. In that case the
        socket is closed and self.sock is left untouched, so a later
        request() transparently tries to connect again.
        """
        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        try:
            if self.timeout is not None:
                sock.settimeout(self.timeout)
            sock.connect(self._path)
        except OSError:
            # do not leak the file descriptor and do not publish a socket
            # that never got connected
            sock.close()
            raise
        self.sock = sock
def _connection(snapd_socket: pathlib.Path):
    """
    Build a connection object for the snapd socket at the given path
    """
    socket_path = snapd_socket.as_posix()
    snapd = SnapdConnection(socket_path)
    # keep http.client's debug output switched off
    snapd.set_debuglevel(0)
    return snapd
def _do_request(args, method: str, url: str, body, headers: Dict[str, str],
                expected_status_code: int, timeout: int = 120) -> Any:
    """
    Do a request against snapd and retry for some time until the required status code is received

    args: parsed command line arguments; args.snapd_socket is the socket path
    method: HTTP method, eg. "GET"
    url: request path, eg. "/v2/changes/7"
    body: JSON-serializable request body or None/empty for no body
    headers: request headers or None
    expected_status_code: HTTP status code that counts as success
    timeout: how long (in seconds) to keep retrying

    Returns json data or None in case of an error. Connection problems and
    unparsable responses are retried like unexpected status codes.
    """
    kwargs: Dict[str, Any] = {}
    if body:
        kwargs['body'] = json.dumps(body)
    if headers:
        kwargs['headers'] = headers

    conn = _connection(args.snapd_socket)
    # monotonic clock: the deadline must not move when the wall clock is adjusted
    deadline = time.monotonic() + timeout
    try:
        while time.monotonic() < deadline:
            try:
                conn.request(method, url, **kwargs)
                r = conn.getresponse()
                # always read() the response!!! see docs
                resp = json.loads(r.read().decode())
            except (OSError, http.client.HTTPException, ValueError) as err:
                logger.warning(f"{method} on {url} against snapd failed: {err}. retry ...")
                # drop the broken connection; the next request() reconnects
                conn.close()
                time.sleep(2)
                continue

            if r.status != expected_status_code:
                result = resp.get('result') if isinstance(resp, dict) else None
                # error responses normally carry a message; fall back to the raw result
                message = result.get('message') if isinstance(result, dict) else result
                logger.warning(f"received '{r.status}/{r.reason}' from snapd for "
                               f"{method} on {url}: '{message}'. retry ...")
                time.sleep(2)
                continue

            if isinstance(resp, dict) and resp.get('type') == 'async':
                logger.info(f"received '{r.status}/{r.reason}' from snapd for {method} "
                            f"on {url} (change-id: {resp.get('change')})")
            return resp
    finally:
        # never leak the socket, whatever way we leave the loop
        conn.close()

    logger.error(f"Unable to make {method} request via {url} against snapd. tried {timeout} seconds")
    return None
def _change(args, change_id):
    """
    Get the current status of a change and log it

    args: parsed command line arguments (passed on to _do_request)
    change_id: id of the snapd change to look up

    A change that ended in an error is additionally logged with level ERROR
    so that it stands out in the log.
    """
    resp = _do_request(args, "GET", f"/v2/changes/{change_id}", None, None, 200)
    if not resp:
        # _do_request gave up (it returns None in that case)
        logger.error(f"Unable to get the result for change {change_id}")
        return

    result = resp.get('result')
    logger.info(f"result for change: {result}")
    if isinstance(result, dict) and (result.get('status') == 'Error' or result.get('err')):
        logger.error(f"change {change_id} failed: {result.get('err', result.get('summary'))}")
def _refresh(args):
    """
    Ask snapd to refresh all snaps, then wait until no change is in progress

    Note: some snaps may stay unrefreshed, eg. when a snap has running apps.
    snapd then logs something like:
    cannot refresh snap "aws-cli": snap "aws-cli" has running apps (aws), pids: 162638
    This is deliberately not treated as an error: the bootstrap.sh script
    (which executes this helper) should not fail because a refresh failed.
    """
    response = _do_request(
        args,
        "POST",
        "/v2/snaps",
        {"action": "refresh"},
        {"Content-type": "application/json"},
        202,
    )
    _wait(args)
    if not response:
        return
    # report how the refresh change ended
    _change(args, response['change'])
def _configure(args):
    """
    Configure a given snap and wait for in-progress changes

    args.snapname: the snap to configure
    args.config: whitespace separated list of key=value pairs

    Only the first '=' of a pair separates key and value, so values may
    contain '=' themselves (eg. 'node-labels=a=b,c=d'). Pairs without '=' or
    with an empty key are skipped with a warning.
    NOTE: keys are passed to snapd unchanged; snapd validates the option
    names and fails the change for names it does not accept.
    """
    conf = {}
    for pair in args.config.split():
        key, sep, value = pair.partition('=')
        if not sep or not key:
            logger.warning(f"Ignoring malformed config option '{pair}' (expected key=value)")
            continue
        conf[key] = value

    if not conf:
        # without options there is no request body to send
        logger.warning(f"No config options given for {args.snapname}. nothing to do")
        return

    logger.info(f"Setting {args.snapname} config to: {conf}")
    headers = {"Content-type": "application/json"}
    resp = _do_request(args, "PUT", f"/v2/snaps/{args.snapname}/conf", conf, headers, 202)
    _wait(args)
    if resp:
        _change(args, resp['change'])
def _wait(args):
    """
    Wait for any in-progress change to finish

    Polls snapd every 5 seconds for up to 240 seconds. Returns as soon as no
    change is in progress; logs an error (and returns) when the time is up.
    """
    # monotonic clock: the deadline must not move when the wall clock is adjusted
    deadline = time.monotonic() + 240
    while time.monotonic() < deadline:
        resp = _do_request(args, "GET", "/v2/changes?select=in-progress", None, None, 200)
        if resp is None:
            # _do_request already retried for its own timeout before giving
            # up, so no extra sleep is needed before asking again
            logger.error("Unable to query in-progress changes from snapd")
            continue

        pending = resp.get("result") or []
        if pending:
            # wait for the pending changes
            logger.info(f"Still {len(pending)} changes in progress ...")
            time.sleep(5)
            continue

        # waited enough
        logger.info("No more changes in progress ...")
        return

    # we reached the timeout here
    logger.error("timeout while waiting for in-progress changes")
def _parser():
    """
    Build the command line parser with the wait, refresh and configure sub-commands

    Every sub-command stores its handler function as the "func" default.
    """
    root = argparse.ArgumentParser(description="AWS EKS snapd interactions")
    root.add_argument("--log-level", choices=["info", "debug"], default="info")
    root.add_argument(
        "--snapd-socket",
        type=pathlib.Path,
        default="/run/snapd.socket",
        help="path to the snapd socket. default: %(default)s",
    )
    commands = root.add_subparsers(help="sub-command help")

    # sub-commands that take no arguments of their own
    simple_commands = (
        ("wait", "Wait for any snap changes", _wait),
        ("refresh", "Refresh all snaps", _refresh),
    )
    for name, help_text, handler in simple_commands:
        commands.add_parser(name, help=help_text).set_defaults(func=handler)

    # configure needs the snap name and the options to set
    configure = commands.add_parser("configure", help="Configure a given snap")
    configure.add_argument("snapname", help="the snap to configure")
    configure.add_argument(
        "config",
        help="the configuration options. "
             "space separated list of key=value pairs. Eg. 'key1=val1 key2=val2'",
    )
    configure.set_defaults(func=_configure)

    return root
def main():
    """
    Parse the command line, set up logging and run the selected sub-command

    Without a sub-command the help text is printed. The process always ends
    through sys.exit().
    """
    parser = _parser()
    args = parser.parse_args()

    # log level
    level = logging.DEBUG if args.log_level == "debug" else logging.INFO
    logging.basicConfig(
        format="%(asctime)s:%(name)s:%(levelname)s:%(message)s",
        level=level,
    )

    if not hasattr(args, "func"):
        # no sub-command given: show the usage and stop
        parser.print_help()
        sys.exit()

    args.func(args)
    sys.exit(0)


# run main() only when executed as a script
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment