# dims/nvi.sh

## nvi.sh
#!/bin/bash

# All the notes you need!!!
# https://github.com/awslabs/amazon-eks-ami/releases/tag/v20240227
#   AMI Name: amazon-eks-gpu-node-1.29-v20240227
# https://docs.aws.amazon.com/eks/latest/userguide/eks-optimized-ami.html
# https://docs.aws.amazon.com/dlami/latest/devguide/gpu.html
# https://hub.docker.com/r/nvidia/cuda/tags
# https://github.com/NVIDIA/k8s-device-plugin/releases

# On the machine you launch clusters from: rebuild and install the kubetest2
# ec2 deployer from the local checkout, then bring up a GPU cluster
# (g4dn.xlarge workers) from the latest Kubernetes CI build.
(cd "$HOME/go/src/sigs.k8s.io/provider-aws-test-infra/kubetest2-ec2" && go install .)

# -f: treat an HTTP error as a failure instead of capturing the error page as
# the version string.
VERSION=$(curl -fLs https://dl.k8s.io/ci/latest.txt)
# Stop here if the lookup produced nothing; an empty value is never a usable
# argument for --version.
: "${VERSION:?could not resolve the latest CI version from https://dl.k8s.io/ci/latest.txt}"

kubetest2 ec2 \
  --stage https://dl.k8s.io/ci/ \
  --version "$VERSION" \
  --instance-type=g4dn.xlarge \
  --worker-image=ami-0e58b1ac65cc02370 \
  --device-plugin-nvidia true \
  --worker-user-data-file "$(go env GOPATH)/src/sigs.k8s.io/provider-aws-test-infra/kubetest2-ec2/config/al2.sh" \
  --region us-east-1 \
  --target-build-arch linux/amd64 \
  --up

# Post startup, once KUBECONFIG points at the cluster created above:
# take a quick look at the nodes and at every pod in the cluster.
kubectl get nodes
kubectl get nodes -o yaml
kubectl get pods --all-namespaces

# Install the NVIDIA device plugin (v0.14.5) and wait up to two minutes for
# its DaemonSet in kube-system to finish rolling out.
kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.5/nvidia-device-plugin.yml
kubectl rollout status daemonset nvidia-device-plugin-daemonset -n kube-system --timeout=2m

# List allocatable GPUs per node. This runs after the rollout wait so the
# plugin has had a chance to register the GPUs before the column is read.
kubectl get nodes '-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu'

# Test pod: a one-shot pod that requests a single GPU and runs nvidia-smi.
# The manifest is handed to kubectl on stdin; the quoted delimiter keeps the
# shell from expanding anything inside the YAML.
kubectl apply -f - <<'EOF'
apiVersion: v1
kind: Pod
metadata:
  name: nvidia-smi
spec:
  restartPolicy: OnFailure
  containers:
  - name: nvidia-smi
    image: nvidia/cuda:12.3.2-runtime-ubuntu22.04
    args:
    - "nvidia-smi"
    resources:
      limits:
        nvidia.com/gpu: 1
EOF

# Deploy the Kubeflow training operator from a local kubeflow/manifests checkout.
# Abort if the checkout is missing: every path below is relative and would
# otherwise be resolved against whatever directory the shell happens to be in.
cd "$HOME/go/src/github.com/kubeflow/manifests" || exit 1

crb_file=apps/training-operator/upstream/base/rbac/cluster-role-binding.yaml
# Append the namespace line only when the file does not already contain it,
# so running this step again does not add the line twice.
if ! grep -qF -- "namespace: default" "$crb_file"; then
  echo "  namespace: default" >> "$crb_file"
fi

kustomize build apps/training-operator/upstream/base | kubectl -n default apply -f -

# Try an example: fetch the single-node PyTorch MPI training manifest, replace
# v2beta1 with v1 (first occurrence on each line), blank out the
# "mpiImplementation: OpenMPI" text, and apply the result.
# -f makes curl fail on an HTTP error instead of piping the error page to kubectl.
curl -fsSL https://raw.githubusercontent.com/aws/aws-k8s-tester/main/e2e2/test/cases/nvidia/manifests/mpi-job-pytorch-training-single-node.yaml \
  | sed -e 's/v2beta1/v1/' -e 's/mpiImplementation: OpenMPI//' \
  | kubectl apply -f -
	#!/bin/bash

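	# All the notes you need!!!
	# NOTE(review): from here on this section repeats the script above, tab-indented.
	# Confirm whether the second copy is intentional: run top to bottom, it brings
	# up the cluster a second time.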
	# https://github.com/awslabs/amazon-eks-ami/releases/tag/v20240227
	# AMI Name: amazon-eks-gpu-node-1.29-v20240227
	# https://docs.aws.amazon.com/eks/latest/userguide/eks-optimized-ami.html
	# https://docs.aws.amazon.com/dlami/latest/devguide/gpu.html
	# https://hub.docker.com/r/nvidia/cuda/tags
	# https://github.com/NVIDIA/k8s-device-plugin/releases

	# On the machine you launch clusters from: rebuild and install the kubetest2
	# ec2 deployer from the local checkout, then bring up a GPU cluster
	# (g4dn.xlarge workers) from the latest Kubernetes CI build.
	(cd "$HOME/go/src/sigs.k8s.io/provider-aws-test-infra/kubetest2-ec2" && go install .)

	# -f: treat an HTTP error as a failure instead of capturing the error page as
	# the version string.
	VERSION=$(curl -fLs https://dl.k8s.io/ci/latest.txt)
	# Stop here if the lookup produced nothing; an empty value is never a usable
	# argument for --version.
	: "${VERSION:?could not resolve the latest CI version from https://dl.k8s.io/ci/latest.txt}"

	kubetest2 ec2 \
	  --stage https://dl.k8s.io/ci/ \
	  --version "$VERSION" \
	  --instance-type=g4dn.xlarge \
	  --worker-image=ami-0e58b1ac65cc02370 \
	  --device-plugin-nvidia true \
	  --worker-user-data-file "$(go env GOPATH)/src/sigs.k8s.io/provider-aws-test-infra/kubetest2-ec2/config/al2.sh" \
	  --region us-east-1 \
	  --target-build-arch linux/amd64 \
	  --up

	# Post startup, once KUBECONFIG points at the cluster created above:
	# take a quick look at the nodes and at every pod in the cluster.
	kubectl get nodes
	kubectl get nodes -o yaml
	kubectl get pods --all-namespaces

	# Install the NVIDIA device plugin (v0.14.5) and wait up to two minutes for
	# its DaemonSet in kube-system to finish rolling out.
	kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.5/nvidia-device-plugin.yml
	kubectl rollout status daemonset nvidia-device-plugin-daemonset -n kube-system --timeout=2m

	# List allocatable GPUs per node. This runs after the rollout wait so the
	# plugin has had a chance to register the GPUs before the column is read.
	kubectl get nodes '-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu'

	# Test pod: a one-shot pod that requests a single GPU and runs nvidia-smi.
	# "<<-" strips the leading tab from every here-doc line (including the
	# terminator), so the manifest can stay indented with the rest of this
	# section; the YAML nesting itself is written with spaces. The quoted
	# delimiter keeps the shell from expanding anything inside the YAML.
	kubectl apply -f - <<-'EOF'
	apiVersion: v1
	kind: Pod
	metadata:
	  name: nvidia-smi
	spec:
	  restartPolicy: OnFailure
	  containers:
	  - name: nvidia-smi
	    image: nvidia/cuda:12.3.2-runtime-ubuntu22.04
	    args:
	    - "nvidia-smi"
	    resources:
	      limits:
	        nvidia.com/gpu: 1
	EOF

	# Deploy the Kubeflow training operator from a local kubeflow/manifests checkout.
	# Abort if the checkout is missing: every path below is relative and would
	# otherwise be resolved against whatever directory the shell happens to be in.
	cd "$HOME/go/src/github.com/kubeflow/manifests" || exit 1

	crb_file=apps/training-operator/upstream/base/rbac/cluster-role-binding.yaml
	# Append the namespace line only when the file does not already contain it,
	# so running this step again does not add the line twice.
	if ! grep -qF -- "namespace: default" "$crb_file"; then
	  echo "  namespace: default" >> "$crb_file"
	fi

	kustomize build apps/training-operator/upstream/base | kubectl -n default apply -f -

	# Try an example: fetch the single-node PyTorch MPI training manifest, replace
	# v2beta1 with v1 (first occurrence on each line), blank out the
	# "mpiImplementation: OpenMPI" text, and apply the result.
	# -f makes curl fail on an HTTP error instead of piping the error page to kubectl.
	curl -fsSL https://raw.githubusercontent.com/aws/aws-k8s-tester/main/e2e2/test/cases/nvidia/manifests/mpi-job-pytorch-training-single-node.yaml \
	  | sed -e 's/v2beta1/v1/' -e 's/mpiImplementation: OpenMPI//' \
	  | kubectl apply -f -