Last active
March 29, 2024 17:29
-
-
Save dims/6d3ffbdcdaf2b894e8eb8e7b5c5c3d14 to your computer and use it in GitHub Desktop.
Notes from running kubetest2 ec2 with nvidia
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# All the notes you need!!!
#
# Notes for bringing up a kubetest2-ec2 cluster with a GPU worker instance
# type and the NVIDIA device plugin enabled.
#
# References:
#   https://github.com/awslabs/amazon-eks-ami/releases/tag/v20240227
#   AMI Name: amazon-eks-gpu-node-1.29-v20240227
#   https://docs.aws.amazon.com/eks/latest/userguide/eks-optimized-ami.html
#   https://docs.aws.amazon.com/dlami/latest/devguide/gpu.html
#   https://hub.docker.com/r/nvidia/cuda/tags
#   https://github.com/NVIDIA/k8s-device-plugin/releases

# on the box where you can start stuff from

# Resolve the kubetest2-ec2 checkout once so the build step and the
# --worker-user-data-file path cannot end up pointing at different trees.
kubetest2_ec2_dir="$(go env GOPATH)/src/sigs.k8s.io/provider-aws-test-infra/kubetest2-ec2"

# Build and install the kubetest2-ec2 deployer; the subshell keeps the
# caller's working directory unchanged.
(cd "${kubetest2_ec2_dir}" && go install .)

# Latest CI build marker. -f makes curl fail on an HTTP error instead of
# storing the error page in VERSION; -S still reports the failure under -s.
VERSION=$(curl -fsSL https://dl.k8s.io/ci/latest.txt)

# ${VERSION:?...} aborts the command if the lookup above produced nothing,
# rather than passing an empty --version.
kubetest2 ec2 \
  --stage https://dl.k8s.io/ci/ \
  --version "${VERSION:?could not resolve the latest CI version}" \
  --instance-type=g4dn.xlarge \
  --worker-image=ami-0e58b1ac65cc02370 \
  --device-plugin-nvidia true \
  --worker-user-data-file "${kubetest2_ec2_dir}/config/al2.sh" \
  --region us-east-1 \
  --target-build-arch linux/amd64 \
  --up
# post startup once you have KUBECONFIG pointed to the cluster created above

# Basic cluster sanity checks.
kubectl get nodes
kubectl get nodes -o yaml
kubectl get pods --all-namespaces

# Install the NVIDIA device plugin and wait (up to 2 minutes) for its
# daemonset to finish rolling out.
kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.5/nvidia-device-plugin.yml
kubectl rollout status daemonset nvidia-device-plugin-daemonset -n kube-system --timeout=2m

# List allocatable GPUs per node. This runs after the rollout because the
# nvidia.com/gpu resource is advertised by the device plugin; querying right
# after `apply` can still show <none>.
# The backslash escapes the dot inside the resource name so it is not read as
# a field separator; single quotes pass it through to kubectl untouched.
kubectl get nodes '-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu'
# Test pod
# Creates a pod that runs `nvidia-smi` in a CUDA runtime image and requests
# one nvidia.com/gpu. The heredoc delimiter is quoted so the manifest is
# passed to kubectl literally, with no shell expansion.
kubectl apply -f - <<'EOF'
apiVersion: v1
kind: Pod
metadata:
  name: nvidia-smi
spec:
  restartPolicy: OnFailure
  containers:
    - name: nvidia-smi
      image: nvidia/cuda:12.3.2-runtime-ubuntu22.04
      args:
        - "nvidia-smi"
      resources:
        limits:
          nvidia.com/gpu: 1
EOF
# deploy kubeflow stuff
# Everything below uses paths relative to the kubeflow/manifests checkout, so
# it only runs if the cd succeeds (cd prints its own error otherwise).
if cd "${HOME}/go/src/github.com/kubeflow/manifests"; then
  cluster_role_binding=apps/training-operator/upstream/base/rbac/cluster-role-binding.yaml

  # Append a `namespace: default` line to the cluster role binding unless one
  # is already present, so re-running this does not add it twice.
  # NOTE(review): the leading whitespace of the appended line looks like it was
  # collapsed in this copy of the notes; it has to line up with the other keys
  # of the subject entry at the end of that file — confirm against the checkout.
  grep -qF -- "namespace: default" "${cluster_role_binding}" \
    || echo " namespace: default" >> "${cluster_role_binding}"

  # Render the training-operator base and apply it into the default namespace.
  kustomize build apps/training-operator/upstream/base \
    | kubectl -n default apply -f -
fi
# try an example
# Fetch the single-node PyTorch MPIJob manifest, rewrite v2beta1 to v1, drop
# the `mpiImplementation: OpenMPI` setting, and apply the result.
# -f stops curl from piping an HTTP error body into kubectl.
curl -fsSL https://raw.githubusercontent.com/aws/aws-k8s-tester/main/e2e2/test/cases/nvidia/manifests/mpi-job-pytorch-training-single-node.yaml \
  | sed -e 's/v2beta1/v1/' -e 's/mpiImplementation: OpenMPI//' \
  | kubectl apply -f -
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment