-
-
Save acerbetti/fc7011f598a5c0099b47ddfbedf2c40c to your computer and use it in GitHub Desktop.
Jetson Nano Cluster Join Script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Jetson Nano Cluster Join Script
# Based on https://github.com/siderolabs/talos/issues/3990
#
# Interactive script that prepares a Jetson Nano over SSH and joins it as a
# node to an existing Talos-managed Kubernetes cluster.

# Abort as soon as any unguarded command exits non-zero.
set -e

# Colors for output (ANSI escape sequences, interpreted by `echo -e`).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
echo -e "${GREEN}π Starting Jetson Nano cluster join process...${NC}"

# Configuration - user input.
# `read -r` keeps backslashes in the typed value literal instead of treating
# them as escape characters.
echo -e "${YELLOW}π Please provide connection details:${NC}"
read -r -p "Jetson IP address [192.168.1.102]: " JETSON_IP
read -r -p "Jetson username [jetson]: " JETSON_USER
read -r -p "Talos endpoint [192.168.1.100]: " TALOS_ENDPOINT

# Fall back to the advertised defaults when the user just pressed Enter.
JETSON_IP=${JETSON_IP:-"192.168.1.102"}
JETSON_USER=${JETSON_USER:-"jetson"}
TALOS_ENDPOINT=${TALOS_ENDPOINT:-"192.168.1.100"}
echo -e "${YELLOW}π Using: ${JETSON_USER}@${JETSON_IP} β Talos ${TALOS_ENDPOINT}${NC}"

# Talos client configuration passed to every talosctl call below. Honor the
# TALOSCONFIG environment variable (the same one talosctl itself reads) and
# fall back to the default location when it is not set.
TALOS_CONFIG="${TALOSCONFIG:-${HOME}/.talos/config}"
# Check prerequisites: required CLIs are installed, the Jetson is reachable
# over SSH and the Talos API answers. Any failed check aborts with status 1.
echo -e "${YELLOW}π Checking prerequisites...${NC}"

if ! command -v talosctl &> /dev/null; then
  echo -e "${RED}β talosctl not found. Please install Talos CLI.${NC}"
  exit 1
fi

if ! command -v kubectl &> /dev/null; then
  echo -e "${RED}β kubectl not found. Please install kubectl.${NC}"
  exit 1
fi

# A no-op remote command is enough to prove that login works.
if ! ssh -o ConnectTimeout=5 "${JETSON_USER}@${JETSON_IP}" 'exit 0' 2>/dev/null; then
  echo -e "${RED}β Cannot SSH to Jetson Nano at ${JETSON_IP}${NC}"
  exit 1
fi

# The endpoint is quoted everywhere so user input can never be word-split
# or glob-expanded into extra talosctl arguments.
if ! talosctl --talosconfig "${TALOS_CONFIG}" --endpoints "${TALOS_ENDPOINT}" --nodes "${TALOS_ENDPOINT}" get machineconfig &>/dev/null; then
  echo -e "${RED}β Cannot connect to Talos cluster at ${TALOS_ENDPOINT}${NC}"
  echo "Trying to diagnose the issue..."
  echo "Talos config file: ${TALOS_CONFIG}"
  ls -la "${TALOS_CONFIG}" 2>/dev/null || echo "Config file not found!"
  echo "Testing connection:"
  # Re-run the probe with output visible; `|| true` keeps `set -e` from
  # exiting before the explicit `exit 1` below.
  talosctl --talosconfig "${TALOS_CONFIG}" --endpoints "${TALOS_ENDPOINT}" --nodes "${TALOS_ENDPOINT}" get machineconfig || true
  exit 1
fi

echo -e "${GREEN}β All prerequisites met${NC}"
# Create temporary directory.
# It will hold kubeconfigs and the cluster CA copied from Talos, so make sure
# it is removed on every exit path -- including aborts triggered by `set -e`.
# (The variable name TMPDIR is kept because the rest of the script uses it.)
TMPDIR=$(mktemp -d)
trap 'rm -rf -- "${TMPDIR}"' EXIT
echo -e "${YELLOW}π Using temporary directory: ${TMPDIR}${NC}"

# Get cluster configuration from Talos: the DNS settings the control plane's
# kubelet uses are read here and reused for the Jetson's kubelet config.
echo -e "${YELLOW}π Getting cluster configuration from Talos...${NC}"
CLUSTER_DOMAIN=$(talosctl --talosconfig "${TALOS_CONFIG}" --endpoints "${TALOS_ENDPOINT}" --nodes "${TALOS_ENDPOINT}" get kubeletconfig -o jsonpath="{.spec.clusterDomain}")
CLUSTER_DNS=$(talosctl --talosconfig "${TALOS_CONFIG}" --endpoints "${TALOS_ENDPOINT}" --nodes "${TALOS_ENDPOINT}" get kubeletconfig -o jsonpath="{.spec.clusterDNS}")
echo "Cluster Domain: ${CLUSTER_DOMAIN}"
echo "Cluster DNS: ${CLUSTER_DNS}"
# Prompt for confirmation before anything destructive happens.
echo -e "${YELLOW}β οΈ WARNING: This will wipe the existing Kubernetes installation on Jetson Nano and cause a reboot!${NC}"
# -n 1 returns after a single keypress; the answer lands in $REPLY.
read -p "Do you want to continue? [y/N] " -n 1 -r
echo
# Anything other than a single y/Y (including plain Enter) aborts.
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
  echo "Aborted."
  exit 1
fi
# Step 1: Clean existing installation on Jetson
echo -e "${YELLOW}π§Ή Cleaning existing Kubernetes installation on Jetson...${NC}"
# If kubelet and containerd are both active, disable them and reboot. The
# trailing `|| true` makes a non-zero result non-fatal (services not active,
# or the session ending during the reboot).
# NOTE(review): the remote `sleep 10` presumably keeps the session open until
# the reboot tears it down, so the wait loop below does not reconnect too early.
ssh -t "${JETSON_USER}@${JETSON_IP}" 'systemctl is-active kubelet.service containerd.service 1>/dev/null && sudo systemctl disable kubelet.service containerd.service && sudo reboot && sleep 10' || true

echo "Waiting for Jetson to reboot..."
# Poll every 5 seconds until a no-op SSH command succeeds again.
until ssh -o ConnectTimeout=1 "${JETSON_USER}@${JETSON_IP}" 'exit 0' 2>/dev/null; do
  sleep 5
  echo -n "."
done
echo -e "\n${GREEN}β Jetson is back online${NC}"

# Remove state and configuration left behind by a previous installation.
ssh -t "${JETSON_USER}@${JETSON_IP}" 'sudo rm -rf /etc/kubernetes /etc/cni /var/lib/cni /opt/cni /var/lib/containerd /var/lib/kubelet /var/lib/etcd'
# Step 2: Copy Kubernetes files from Talos cluster
# `talosctl cat` streams each file from the Talos node into the local temp dir.
echo -e "${YELLOW}π Copying Kubernetes configuration from Talos cluster...${NC}"
talosctl --talosconfig "${TALOS_CONFIG}" --endpoints "${TALOS_ENDPOINT}" --nodes "${TALOS_ENDPOINT}" cat /etc/kubernetes/kubeconfig-kubelet > "${TMPDIR}/kubelet.conf"
talosctl --talosconfig "${TALOS_CONFIG}" --endpoints "${TALOS_ENDPOINT}" --nodes "${TALOS_ENDPOINT}" cat /etc/kubernetes/bootstrap-kubeconfig > "${TMPDIR}/bootstrap-kubelet.conf"
talosctl --talosconfig "${TALOS_CONFIG}" --endpoints "${TALOS_ENDPOINT}" --nodes "${TALOS_ENDPOINT}" cat /etc/kubernetes/pki/ca.crt > "${TMPDIR}/ca.crt"

# Step 3: Update server endpoints
# Point both kubeconfigs at the Talos endpoint's API server on port 6443.
echo -e "${YELLOW}π§ Updating server endpoints...${NC}"
for kubeconfig in "${TMPDIR}/kubelet.conf" "${TMPDIR}/bootstrap-kubelet.conf"; do
  # `-i.bak` is the in-place form accepted by both GNU and BSD sed.
  sed -i.bak "s|server:.*|server: https://${TALOS_ENDPOINT}:6443|g" "${kubeconfig}"
  # Drop the backup right away: the whole directory is copied to the Jetson
  # later and a stray duplicate of a credential file should not travel along.
  rm -f -- "${kubeconfig}.bak"
done
# Step 4: Create kubelet configuration
# The here-doc is unquoted on purpose so ${CLUSTER_DOMAIN} and ${CLUSTER_DNS}
# are substituted. YAML nesting is significant: the `enabled` / `clientCAFile`
# / `mode` keys must sit under their parent sections, otherwise the document
# has duplicate top-level keys and is not a valid KubeletConfiguration.
echo -e "${YELLOW}βοΈ Creating kubelet configuration...${NC}"
cat > "${TMPDIR}/var-lib-kubelet-config.yaml" <<EOT
kind: KubeletConfiguration
apiVersion: kubelet.config.k8s.io/v1beta1
authentication:
  anonymous:
    enabled: false
  webhook:
    enabled: true
  x509:
    clientCAFile: /etc/kubernetes/pki/ca.crt
authorization:
  mode: Webhook
clusterDomain: "${CLUSTER_DOMAIN}"
clusterDNS: ${CLUSTER_DNS}
runtimeRequestTimeout: "0s"
cgroupDriver: systemd
containerRuntimeEndpoint: unix:///var/run/containerd/containerd.sock
failSwapOn: false
EOT
# Step 5: Create kubelet service override
# systemd drop-in that exports KUBELET_ARGS to the kubelet unit. The delimiter
# is quoted because the content contains nothing to expand.
# NOTE(review): this only has an effect if the Jetson's kubelet.service uses
# \$KUBELET_ARGS in ExecStart -- confirm. Also, --ignore-preflight-errors looks
# like a kubeadm option rather than a kubelet one -- verify kubelet accepts it.
cat > "${TMPDIR}/kubelet.service.override.conf" <<'EOT'
[Service]
Environment="KUBELET_ARGS=--bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.conf --kubeconfig=/etc/kubernetes/kubelet.conf --config=/var/lib/kubelet/config.yaml --fail-swap-on=false --ignore-preflight-errors=SystemVerification"
EOT
# Step 6: Create HAProxy configuration for KubePrism compatibility
# HAProxy listens on 127.0.0.1:7445 and forwards raw TCP to the API server on
# the Talos endpoint. The backup server used to be hard-coded; it can now be
# overridden through the TALOS_BACKUP_ENDPOINT environment variable and keeps
# the previous address as its default.
TALOS_BACKUP_ENDPOINT="${TALOS_BACKUP_ENDPOINT:-192.168.1.101}"
cat > "${TMPDIR}/haproxy.cfg" <<EOT
defaults
    timeout client 10s
    timeout connect 5s
    timeout server 10s

frontend kubeprism
    mode tcp
    bind 127.0.0.1:7445
    default_backend k8s_api

backend k8s_api
    mode tcp
    server talos-master ${TALOS_ENDPOINT}:6443 check
    server talos-worker ${TALOS_BACKUP_ENDPOINT}:6443 check backup
EOT
# Step 7: Copy files to Jetson Nano
echo -e "${YELLOW}π€ Copying configuration files to Jetson Nano...${NC}"
# Staging directory on the Jetson, made unique per run with a Unix timestamp.
REMOTE_TMPDIR="/tmp/k8s-join-$(date +%s)"
# Recursively copy the local temp dir; later steps read the generated files
# directly from ${REMOTE_TMPDIR}.
scp -r "${TMPDIR}" "${JETSON_USER}@${JETSON_IP}:${REMOTE_TMPDIR}"
# Step 8: Install HAProxy and configure services on Jetson Nano
echo -e "${YELLOW}π¦ Installing HAProxy on Jetson Nano...${NC}"
# -t allocates a terminal so sudo can prompt for a password if it needs to.
ssh -t "${JETSON_USER}@${JETSON_IP}" "sudo apt-get update && sudo apt-get install -y haproxy"
# Step 9: Install CNI plugins and configure networking
echo -e "${YELLOW}π Installing CNI plugins and configuring networking...${NC}"
# --fail (-f) makes curl exit non-zero on an HTTP error instead of saving the
# error page, so a bad download stops the script here rather than at `tar`.
ssh "${JETSON_USER}@${JETSON_IP}" 'curl -fL https://github.com/containernetworking/plugins/releases/download/v1.3.0/cni-plugins-linux-arm64-v1.3.0.tgz -o /tmp/cni-plugins.tgz'
ssh -t "${JETSON_USER}@${JETSON_IP}" 'sudo mkdir -p /opt/cni/bin && sudo tar -xzf /tmp/cni-plugins.tgz -C /opt/cni/bin && sudo chmod +x /opt/cni/bin/*'

echo -e "${YELLOW}π Installing Flannel CNI plugin...${NC}"
# Same reasoning for -f: never install an HTML error page as the flannel binary.
ssh "${JETSON_USER}@${JETSON_IP}" 'curl -fL https://github.com/flannel-io/cni-plugin/releases/download/v1.2.0/flannel-arm64 -o /tmp/flannel'
ssh -t "${JETSON_USER}@${JETSON_IP}" 'sudo mv /tmp/flannel /opt/cni/bin/flannel && sudo chmod +x /opt/cni/bin/flannel'

echo -e "${YELLOW}π Configuring bridge netfilter for Flannel...${NC}"
# Load the module now and register it in /etc/modules. The grep guard keeps
# the file free of duplicate entries when the script is run more than once.
ssh -t "${JETSON_USER}@${JETSON_IP}" 'sudo modprobe br_netfilter && { grep -qxF "br_netfilter" /etc/modules 2>/dev/null || echo "br_netfilter" | sudo tee -a /etc/modules; }'
# Apply the bridge sysctls to the running kernel...
ssh -t "${JETSON_USER}@${JETSON_IP}" 'sudo sysctl net.bridge.bridge-nf-call-iptables=1 && sudo sysctl net.bridge.bridge-nf-call-ip6tables=1'
# ...and persist them, appending each setting only if it is not already there.
ssh -t "${JETSON_USER}@${JETSON_IP}" 'for setting in "net.bridge.bridge-nf-call-iptables=1" "net.bridge.bridge-nf-call-ip6tables=1"; do grep -qxF "$setting" /etc/sysctl.conf 2>/dev/null || echo "$setting" | sudo tee -a /etc/sysctl.conf || exit 1; done'
# Step 10: Configure and start services
echo -e "${YELLOW}π§ Installing and configuring services on Jetson Nano...${NC}"
# Refuse to continue with an empty staging path: it is used in `rm -rf` below.
staging="${REMOTE_TMPDIR:?REMOTE_TMPDIR is not set}"
# The command string is double-quoted so ${staging} is expanded locally before
# it is sent to the Jetson.
#  * `install` (instead of `mv`) places every file as root with an explicit
#    mode; the kubeconfigs get 0600 because they carry cluster credentials.
#  * The staging directory is removed once everything has been installed.
#  * Services are enabled and then restarted: `enable --now` would leave an
#    already-running haproxy (started by its package install) on the old
#    configuration.
ssh -t "${JETSON_USER}@${JETSON_IP}" "sudo mkdir -p /etc/kubernetes/pki /var/lib/kubelet /etc/systemd/system/kubelet.service.d/ /etc/haproxy &&
  sudo install -o root -g root -m 0600 ${staging}/kubelet.conf /etc/kubernetes/kubelet.conf &&
  sudo install -o root -g root -m 0600 ${staging}/bootstrap-kubelet.conf /etc/kubernetes/bootstrap-kubelet.conf &&
  sudo install -o root -g root -m 0644 ${staging}/ca.crt /etc/kubernetes/pki/ca.crt &&
  sudo install -o root -g root -m 0644 ${staging}/var-lib-kubelet-config.yaml /var/lib/kubelet/config.yaml &&
  sudo install -o root -g root -m 0644 ${staging}/kubelet.service.override.conf /etc/systemd/system/kubelet.service.d/override.conf &&
  sudo install -o root -g root -m 0644 ${staging}/haproxy.cfg /etc/haproxy/haproxy.cfg &&
  rm -rf ${staging} &&
  sudo systemctl daemon-reload &&
  sudo systemctl enable haproxy.service containerd.service kubelet.service &&
  sudo systemctl restart haproxy.service containerd.service kubelet.service"
# Step 11: Verify node joined successfully
echo -e "${YELLOW}β³ Waiting for node to join cluster...${NC}"
# Give the kubelet a head start before polling.
sleep 30

echo -e "${YELLOW}π Verifying Jetson Nano joined the cluster...${NC}"
JETSON_HOSTNAME=$(ssh "${JETSON_USER}@${JETSON_IP}" 'hostname')
echo "Looking for node with hostname: ${JETSON_HOSTNAME}"

# Wait up to 5 minutes for the node to appear, polling every 10 seconds.
TIMEOUT=300
ELAPSED=0
NODE_FOUND=false
while (( ELAPSED < TIMEOUT )); do
  # Compare columns exactly: the NAME column must equal the hostname and the
  # STATUS column must start with "Ready". A plain `grep "host.*Ready"` would
  # also accept "NotReady" and any node whose name merely contains the
  # hostname. awk consumes all input, so kubectl never sees a closed pipe.
  if kubectl get nodes --no-headers \
    | awk -v host="${JETSON_HOSTNAME}" \
        '$1 == host && $2 ~ /^Ready(,|$)/ { found = 1 } END { exit !found }'; then
    NODE_FOUND=true
    break
  fi
  echo -n "."
  sleep 10
  ELAPSED=$((ELAPSED + 10))
done
# Final report: on success show the nodes and offer to apply GPU labels; on
# failure print diagnostics and exit with status 1. Relies on NODE_FOUND,
# JETSON_HOSTNAME and TIMEOUT being set by the preceding wait step.
echo ""
if [ "$NODE_FOUND" = true ]; then
  echo -e "${GREEN}β Jetson Nano successfully joined the cluster!${NC}"
  kubectl get nodes

  # Cleanup
  rm -rf -- "${TMPDIR:?}"

  echo -e "${GREEN}π Jetson Nano cluster join process completed successfully!${NC}"
  echo -e "${YELLOW}π‘ You can now label the node for GPU workloads:${NC}"
  echo "kubectl label node ${JETSON_HOSTNAME} node-type=gpu"
  echo "kubectl label node ${JETSON_HOSTNAME} gpu=tegra210"
  echo ""
  read -p "Would you like to apply these GPU labels now? [y/N] " -n 1 -r
  echo
  if [[ $REPLY =~ ^[Yy]$ ]]; then
    echo -e "${YELLOW}π·οΈ Applying GPU labels...${NC}"
    # --overwrite keeps a re-run from failing when a label already exists
    # with a different value.
    kubectl label node "${JETSON_HOSTNAME}" node-type=gpu --overwrite
    kubectl label node "${JETSON_HOSTNAME}" gpu=tegra210 --overwrite
    echo -e "${GREEN}β GPU labels applied successfully!${NC}"
  fi
else
  echo -e "${RED}β Jetson Nano failed to join the cluster within ${TIMEOUT} seconds${NC}"
  echo "Current cluster nodes:"
  # Diagnostics are best-effort: a failing command must not let `set -e`
  # abort before the remaining diagnostics and the cleanup have run.
  kubectl get nodes || true
  echo ""
  echo "Checking kubelet status on Jetson:"
  ssh "${JETSON_USER}@${JETSON_IP}" 'sudo systemctl status kubelet --no-pager' || true
  echo ""
  echo "Checking kubelet logs on Jetson:"
  ssh "${JETSON_USER}@${JETSON_IP}" 'sudo journalctl -u kubelet --no-pager -n 20' || true

  # Cleanup
  rm -rf -- "${TMPDIR:?}"

  echo -e "${RED}π₯ Jetson Nano cluster join process FAILED!${NC}"
  exit 1
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment