-
-
Save joemiller/22b6ac4d60910e9c997957be09504c99 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# more /var/log/user-data.log | |
... | |
Configuring kubelet snap | |
2023-11-30 18:21:26,956:__main__:INFO:No more changes in progress ... | |
2023-11-30 18:21:27,871:__main__:INFO:received '202/Accepted' from snapd for POST on /v2/snaps (change-id: 7) | |
2023-11-30 18:21:27,879:__main__:INFO:No more changes in progress ... | |
2023-11-30 18:21:27,880:__main__:INFO:result for change: {'id': '7', 'kind': 'refresh-snap', 'summary': 'Refresh all snaps: no updates', 'status': 'Done', 'ready': True, 'spawn-time': '2023-11-30T18:21:27.863112183Z', 'ready-time': '2023-11-30T18:21:27.863124863Z', 'data': {' | |
snap-names': []}} | |
2023-11-30 18:21:27,932:__main__:INFO:Setting kubelet-eks config to: {'cluster-dns': '172.20.0.10', 'container-runtime': 'remote', 'container-runtime-endpoint': 'unix:///run/containerd/containerd.sock', 'address': '0.0.0.0', 'anonymous-auth': 'false', 'authentication-token-we | |
bhook': 'true', 'authorization-mode': 'Webhook', 'cgroup-driver': 'cgroupfs', 'client-ca-file': '/etc/kubernetes/pki/ca.crt', 'cloud-provider': 'aws', 'cluster-domain': 'cluster.local', 'cni-bin-dir': '/opt/cni/bin', 'cni-conf-dir': '/etc/cni/net.d', 'config': '/etc/kubernete | |
s/kubelet/kubelet-config.json', 'kubeconfig': '/var/lib/kubelet/kubeconfig', 'node-ip': '10.8.56.177', 'network-plugin': 'cni', 'register-node': 'true', 'resolv-conf': '/run/systemd/resolve/resolv.conf', 'pod-infra-container-image': 'dkr.ecr.us-east-1.amazonaws.c | |
om/eks/pause:3.5', '--max-pods': '110'} | |
2023-11-30 18:21:27,970:__main__:INFO:received '202/Accepted' from snapd for PUT on /v2/snaps/kubelet-eks/conf (change-id: 8) | |
2023-11-30 18:21:27,979:__main__:INFO:Still 1 changes in progress ... | |
2023-11-30 18:21:32,984:__main__:INFO:No more changes in progress ... | |
2023-11-30 18:21:32,986:__main__:INFO:result for change: {'id': '8', 'kind': 'configure-snap', 'summary': 'Change configuration of "kubelet-eks" snap', 'status': 'Error', 'tasks': [{'id': '113', 'kind': 'run-hook', 'summary': 'Run configure hook of "kubelet-eks" snap', 'statu | |
s': 'Error', 'log': ['2023-11-30T18:21:27Z ERROR invalid option name: "--max-pods"'], 'progress': {'label': '', 'done': 1, 'total': 1}, 'spawn-time': '2023-11-30T18:21:27.933832076Z', 'ready-time': '2023-11-30T18:21:27.979626575Z'}], 'ready': True, 'err': 'cannot perform the | |
following tasks:\n- Run configure hook of "kubelet-eks" snap (invalid option name: "--max-pods")', 'spawn-time': '2023-11-30T18:21:27.933868886Z', 'ready-time': '2023-11-30T18:21:27.979627625Z'} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
% diff -u good-bootstrap.sh bad-bootstrap.sh | |
--- good-bootstrap.sh 2023-11-30 02:35:22.723407109 +0000 | |
+++ bad-bootstrap.sh 2023-11-30 02:35:57.743372957 +0000 | |
@@ -222,52 +222,14 @@ | |
echo $cpu_to_reserve | |
} | |
+# variable to collect the kubelet-eks snap config options | |
_kubelet_snap_options=() | |
-# Call this function instead of "snap set" to configure the kubelet-eks snap. | |
-# This functions mitigates a race condition where there's a snap refresh in the | |
-# background and the set command fails with | |
-# error: snap "kubelet-eks" has "auto-refresh" change in progress | |
-# resulting in startup of misconfigured kubelet. | |
+# function to add another option for kubelet-eks | |
kubelet_snap_add_options() { | |
_kubelet_snap_options+=("$@") | |
} | |
-# Call this function before starting the kubelet-eks snap. This function first | |
-# tries to make sure that no refresh is running and then runs "snap set" with | |
-# all options added with the kubelet_snap_add_options function and retries | |
-# several times in case there's a change in progress. | |
-kubelet_snap_configure() { | |
- local retry ret | |
- | |
- # Wait for a possible running refresh to finish. | |
- snap watch --last=refresh\? | |
- snap watch --last=auto-refresh\? | |
- | |
- # If another refresh is running at this point, the snap commands below may | |
- # fail. But the likelihood of this happening is small. | |
- | |
- # If this succeeds the likelihood of another refresh is even smaller. | |
- # Ignore the "... no updates available" message. | |
- snap refresh kubelet-eks 2> >(grep -v ' has no updates available$' >&2) || true | |
- | |
- # Try to set the options every 2 seconds until we succeed or give up after | |
- # 16 seconds. A possible running refresh shouldn't take longer. | |
- for (( retry=0; retry < 8; retry++ )); do | |
- if (( retry )); then | |
- sleep 2 | |
- fi | |
- ret=0 | |
- if snap set kubelet-eks "${_kubelet_snap_options[@]}"; then | |
- break | |
- else | |
- ret=$? | |
- fi | |
- done | |
- | |
- return $ret | |
-} | |
- | |
if [ -z "$CLUSTER_NAME" ]; then | |
echo "CLUSTER_NAME is not defined" | |
exit 1 | |
@@ -556,7 +518,9 @@ | |
args="$KUBELET_EXTRA_ARGS" | |
echo "Configuring kubelet snap" | |
-kubelet_snap_configure | |
+python3 /usr/local/share/eks/snapd-helper.py wait | |
+python3 /usr/local/share/eks/snapd-helper.py refresh | |
+python3 /usr/local/share/eks/snapd-helper.py configure kubelet-eks "${_kubelet_snap_options[*]}" | |
echo "Starting k8s kubelet daemon" | |
snap start --enable kubelet-eks |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
import pathlib | |
import sys | |
import argparse | |
import http.client | |
import socket | |
import json | |
import logging | |
import time | |
from typing import Dict, Any | |
# Module-level logger shared by all helpers below; it is named after the
# module, i.e. "__main__" when this file is executed as a script.
logger = logging.getLogger(__name__)
class SnapdConnection(http.client.HTTPConnection):
    """HTTP connection that talks to snapd over a Unix domain socket.

    The host name handed to the base class ("localhost") is only a
    placeholder; the transport is the socket file given as *path*.
    """

    def __init__(self, path):
        """Remember where the snapd socket lives.

        :param path: filesystem path of the Unix domain socket, as a string
        """
        super().__init__("localhost")
        self._path = path

    def connect(self):
        """Open a Unix stream socket and connect it to the stored path.

        :raises OSError: if the socket cannot be connected. In that case the
            half-open socket is closed and ``self.sock`` is left untouched,
            so no file descriptor leaks and a later request can attempt a
            fresh connect instead of writing to a dead socket.
        """
        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        try:
            sock.connect(self._path)
        except OSError:
            sock.close()
            raise
        # Only publish the socket once it is actually connected.
        self.sock = sock
def _connection(snapd_socket: pathlib.Path):
    """
    Build a snapd connection object for the given socket path.

    The path is handed over in POSIX string form and HTTP debug output
    is switched off (debug level 0).
    """
    socket_path = snapd_socket.as_posix()
    snapd = SnapdConnection(socket_path)
    snapd.set_debuglevel(0)
    return snapd
def _do_request(args, method: str, url: str, body, headers: Dict[str, str],
                expected_status_code: int, timeout: int = 120) -> Any:
    """
    Send a request to snapd and retry until the expected status code arrives.

    :param args: parsed command line arguments; only ``args.snapd_socket``
        is read here
    :param method: HTTP method, e.g. "GET", "POST" or "PUT"
    :param url: request path on the snapd API, e.g. "/v2/changes/7"
    :param body: JSON-serialisable payload; left out of the request when falsy
    :param headers: additional HTTP headers; left out of the request when falsy
    :param expected_status_code: the HTTP status that counts as success
    :param timeout: how long (in seconds) to keep retrying
    :return: the decoded JSON response, or None if the expected status code
        was not received before the timeout expired
    """
    conn = _connection(args.snapd_socket)
    kwargs: Dict[str, Any] = {}
    if body:
        kwargs['body'] = json.dumps(body)
    if headers:
        kwargs['headers'] = headers

    # Use the monotonic clock for the deadline: the wall clock may jump
    # (e.g. when time gets synchronised) and would distort the timeout.
    deadline = time.monotonic() + timeout
    try:
        while time.monotonic() < deadline:
            conn.request(method, url, **kwargs)
            r = conn.getresponse()
            # The response has to be read completely before the connection
            # can be used for another request (see the http.client docs).
            resp = json.loads(r.read().decode())
            if r.status != expected_status_code:
                # Error bodies are not guaranteed to carry result.message;
                # never let the log statement itself raise.
                result = resp.get('result') if isinstance(resp, dict) else None
                message = result.get('message') if isinstance(result, dict) else result
                logger.warning(f"received '{r.status}/{r.reason}' from snapd for "
                               f"{method} on {url}: '{message}'. retry ...")
                time.sleep(2)
                continue
            if resp['type'] == 'async':
                logger.info(f"received '{r.status}/{r.reason}' from snapd for {method} "
                            f"on {url} (change-id: {resp['change']})")
            return resp
    finally:
        # Always release the socket, whether we return, time out or raise.
        conn.close()

    logger.error(f"Unable to make {method} request via {url} against snapd. tried {timeout} seconds")
    return None
def _change(args, change_id):
    """
    Fetch the current status of a snapd change and log it.

    :param args: parsed command line arguments, passed on to ``_do_request``
    :param change_id: id of the change to look up
    """
    resp = _do_request(args, "GET", f"/v2/changes/{change_id}", None, None, 200)
    if resp is None:
        # _do_request gave up after its timeout; there is nothing to report.
        logger.error(f"Unable to get the result for change {change_id}")
        return
    logger.info(f"result for change: {resp['result']}")
def _refresh(args):
    """
    Ask snapd to refresh all snaps, then wait for in-progress changes.

    Note: not every snap is necessarily refreshed, e.g. when a snap cannot
    be refreshed because it has running apps. The snapd logs then contain
    something like:
    cannot refresh snap "aws-cli": snap "aws-cli" has running apps (aws), pids: 162638

    Not raising an error in that case is intentional: the bootstrap.sh
    script (which executes this helper) should not fail just because a
    refresh failed.
    """
    resp = _do_request(
        args,
        "POST",
        "/v2/snaps",
        {"action": "refresh"},
        {"Content-type": "application/json"},
        202,
    )
    _wait(args)
    if not resp:
        return
    # Log the final state of the refresh change that snapd created.
    _change(args, resp['change'])
def _configure(args):
    """
    Configure a given snap and wait for in-progress changes.

    ``args.config`` is a whitespace separated list of ``key=value`` pairs.
    Each pair is split at the FIRST "=" only, so values that contain "="
    themselves (e.g. "node-labels=a=b") are kept intact. Tokens without a
    "=" or with an empty key are skipped with a warning. Keys and values
    are sent to snapd unchanged.

    :param args: parsed command line arguments; reads ``args.snapname`` and
        ``args.config``
    """
    conf = {}
    for pair in args.config.split():
        key, sep, value = pair.partition('=')
        if not sep or not key:
            logger.warning(f"Ignoring malformed config option '{pair}' (expected key=value)")
            continue
        conf[key] = value
    logger.info(f"Setting {args.snapname} config to: {conf}")
    headers = {"Content-type": "application/json"}
    resp = _do_request(args, "PUT", f"/v2/snaps/{args.snapname}/conf", conf, headers, 202)
    _wait(args)
    if resp:
        # Log the final state of the configure change that snapd created.
        _change(args, resp['change'])
def _wait(args):
    """
    Wait until snapd reports no in-progress changes.

    Polls the list of in-progress changes every 5 seconds and gives up
    (logging an error, without raising) once 240 seconds have passed.

    :param args: parsed command line arguments, passed on to ``_do_request``
    """
    # Monotonic clock: unaffected by wall-clock adjustments while we wait.
    deadline = time.monotonic() + 240
    while time.monotonic() < deadline:
        resp = _do_request(args, "GET", "/v2/changes?select=in-progress", None, None, 200)
        if resp is None:
            # _do_request already logged the failure after its own retries;
            # keep trying until our deadline instead of crashing on None.
            continue
        pending = resp["result"]
        if not pending:
            # waited enough
            logger.info("No more changes in progress ...")
            return
        # there are still changes running, check again in a moment
        logger.info(f"Still {len(pending)} changes in progress ...")
        time.sleep(5)
    # we reached the timeout here
    logger.error("timeout while waiting for in-progress changes")
def _parser():
    """
    Create the command line parser.

    Global options are ``--log-level`` and ``--snapd-socket``. Each of the
    sub-commands ``wait``, ``refresh`` and ``configure`` stores its handler
    function in the ``func`` attribute of the parsed arguments.
    """
    root = argparse.ArgumentParser(description="AWS EKS snapd interactions")
    root.add_argument("--log-level", choices=["info", "debug"], default="info")
    root.add_argument(
        "--snapd-socket",
        type=pathlib.Path,
        default="/run/snapd.socket",
        help="path to the snapd socket. default: %(default)s",
    )
    commands = root.add_subparsers(help="sub-command help")

    # wait
    commands.add_parser("wait", help="Wait for any snap changes").set_defaults(func=_wait)

    # refresh
    commands.add_parser("refresh", help="Refresh all snaps").set_defaults(func=_refresh)

    # configure
    configure_cmd = commands.add_parser("configure", help="Configure a given snap")
    configure_cmd.add_argument("snapname", help="the snap to configure")
    configure_cmd.add_argument(
        "config",
        help="the configuration options. "
             "space separated list of key=value pairs. Eg. 'key1=val1 key2=val2'",
    )
    configure_cmd.set_defaults(func=_configure)

    return root
def main():
    """
    Script entry point.

    Parses the command line, sets up logging and runs the handler of the
    selected sub-command. Without a sub-command the help text is printed.
    The process exits via ``sys.exit`` in both cases.
    """
    parser = _parser()
    args = parser.parse_args()

    # log level
    level = logging.DEBUG if args.log_level == "debug" else logging.INFO
    logging.basicConfig(
        format="%(asctime)s:%(name)s:%(levelname)s:%(message)s",
        level=level,
    )

    # "func" is only set when one of the sub-commands was selected
    handler = getattr(args, "func", None)
    if handler is None:
        parser.print_help()
        sys.exit(None)

    handler(args)
    sys.exit(0)
# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment