Add an AWS GPU instance to a K8s cluster. Tested with a g3 instance type on Ubuntu 20.04.
# Build a container image with samtools and bonito for GPU basecalling
docker run --gpus all -it nvidia/cuda:10.2-devel-ubuntu18.04 /bin/bash

# Inside the container: install build dependencies
apt update && apt install -y python3 build-essential python3-pip libz-dev wget libncurses5 libncurses5-dev libbz2-dev liblzma-dev liblzma5

# Build and install samtools from source
wget https://github.com/samtools/samtools/releases/download/1.11/samtools-1.11.tar.bz2
tar -xf samtools-1.11.tar.bz2
cd samtools-1.11
./configure --prefix /usr/local/
make
make install

# Install bonito and the CUDA 10.2 build of CuPy
pip3 install ont-bonito cupy-cuda102

# From the host: commit the running container as a reusable image
docker commit <container id> galaxyworks/samtools-bonito:0.3.0
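# Optional sanity check (a sketch; assumes the image tag used in the commit above): samtools and the Python packages should be usable in the committed image
docker run --rm --gpus all galaxyworks/samtools-bonito:0.3.0 samtools --version
docker run --rm --gpus all galaxyworks/samtools-bonito:0.3.0 python3 -c "import bonito, cupy"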
<job_conf>
    <plugins>
        <plugin id="local" type="runner" load="galaxy.jobs.runners.local:LocalJobRunner" workers="4" />
        <plugin id="k8s" type="runner" load="galaxy.jobs.runners.kubernetes:KubernetesJobRunner">
            <param id="k8s_use_service_account">true</param>
            <param id="k8s_persistent_volume_claims">galaxy-1604089776-galaxy-pvc:/galaxy/server/database,galaxy-1604089776-cvmfs-gxy-data-pvc:/cvmfs/data.galaxyproject.org,galaxy-1604089776-cvmfs-gxy-main-pvc:/cvmfs/main.galaxyproject.org,initial-project-data-pvc:/gvl/projects/current,initial-public-data-pvc:/gvl/public</param>
            <param id="k8s_namespace">initial</param>
            <!-- Must be DNS friendly and less than 20 characters -->
            <param id="k8s_galaxy_instance_id">galaxy-1604089776</param>
            <param id="k8s_run_as_user_id">101</param>
            <param id="k8s_run_as_group_id">101</param>
            <param id="k8s_fs_group_id">101</param>
            <param id="k8s_supplemental_group_id">101</param>
            <param id="k8s_pull_policy">IfNotPresent</param>
            <param id="k8s_cleanup_job">onsuccess</param>
            <param id="k8s_pod_priority_class">galaxy-1604089776-job-priority</param>
        </plugin>
        <plugin id="gpu" type="runner" load="galaxy.jobs.runners.kubernetes:KubernetesJobRunner">
            <param id="k8s_use_service_account">true</param>
            <param id="k8s_persistent_volume_claims">galaxy-1604089776-galaxy-pvc:/galaxy/server/database,galaxy-1604089776-cvmfs-gxy-data-pvc:/cvmfs/data.galaxyproject.org,galaxy-1604089776-cvmfs-gxy-main-pvc:/cvmfs/main.galaxyproject.org</param>
            <param id="k8s_namespace">initial</param>
            <!-- Must be DNS friendly and less than 20 characters -->
            <param id="k8s_galaxy_instance_id">galaxy-1604089776</param>
            <param id="k8s_run_as_user_id">101</param>
            <param id="k8s_run_as_group_id">101</param>
            <param id="k8s_fs_group_id">101</param>
            <param id="k8s_supplemental_group_id">101</param>
            <param id="k8s_pull_policy">IfNotPresent</param>
            <param id="k8s_cleanup_job">onsuccess</param>
            <param id="k8s_pod_priority_class">galaxy-1604089776-job-priority</param>
            <param id="k8s_affinity">
                nodeAffinity:
                  requiredDuringSchedulingIgnoredDuringExecution:
                    nodeSelectorTerms:
                    - matchExpressions:
                      - key: nvidia.com/gpu
                        operator: In
                        values:
                        - "true"
            </param>
        </plugin>
    </plugins>
    <handlers assign_with="db-skip-locked" />
    <destinations default="dynamic-k8s-dispatcher">
        <destination id="local" runner="local"/>
        <destination id="dynamic-k8s-dispatcher" runner="dynamic">
            <param id="type">python</param>
            <param id="function">k8s_container_mapper</param>
            <param id="docker_default_container_id">galaxy/galaxy-k8s:20.05</param>
            <param id="docker_enabled">true</param>
        </destination>
        <destination id="dynamic-gpu-dispatcher" runner="dynamic">
            <param id="type">python</param>
            <param id="function">k8s_container_mapper</param>
            <param id="k8s_runner_id">gpu</param>
            <param id="docker_default_container_id">galaxyworks/bonito:0.3.0</param>
            <param id="docker_enabled">true</param>
        </destination>
    </destinations>
    <limits>
        <limit type="registered_user_concurrent_jobs">5</limit>
        <limit type="anonymous_user_concurrent_jobs">2</limit>
    </limits>
    <tools>
        <tool id="bonito_basecaller" destination="dynamic-gpu-dispatcher" />
    </tools>
</job_conf>
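# Optional check (a sketch; assumes the job conf above is saved as job_conf.xml): confirm the file is well-formed XML before restarting Galaxy
python3 -c "import xml.etree.ElementTree as ET; ET.parse('job_conf.xml')"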
# On the GPU node: install the NVIDIA driver and general prerequisites
apt update && apt install -y ubuntu-drivers-common apt-transport-https ca-certificates curl gnupg-agent software-properties-common nvidia-cuda-toolkit linux-headers-generic build-essential
ubuntu-drivers autoinstall

# Install Docker from the upstream repository
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
add-apt-repository \
  "deb [arch=amd64] https://download.docker.com/linux/ubuntu \
  $(lsb_release -cs) \
  stable"
apt update && apt install -y docker-ce docker-ce-cli containerd.io
reboot
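# After the reboot, confirm the NVIDIA driver loaded (assumes ubuntu-drivers installed an NVIDIA driver above)
nvidia-smi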
# Add the NVIDIA container runtime repository and install nvidia-docker
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
apt update && apt install -y nvidia-container-toolkit nvidia-container-runtime nvidia-docker2
# Make sure the `default-runtime` is set
vi /etc/docker/daemon.json
{
    "default-runtime": "nvidia",
    "runtimes": {
        "nvidia": {
            "path": "/usr/bin/nvidia-container-runtime",
            "runtimeArgs": []
        }
    }
}
sudo pkill -SIGHUP dockerd
sudo systemctl daemon-reload
systemctl restart docker
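# Optional check (a sketch): confirm Docker picked up nvidia as the default runtime
docker info | grep -i runtime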
# Test with a demo container to see if the GPU is discovered by a container
docker run --gpus all nvidia/cuda nvidia-smi
# <add node to the cluster using the command from Rancher>
# Run once per cluster; it installs a daemonset
kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.7.0/nvidia-device-plugin.yml
kubectl label node <ip> nvidia.com/gpu=true
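# Optional check (a sketch; <ip> is the same node name used above): confirm the label is set and the GPU shows up as an allocatable resource
kubectl get node <ip> --show-labels | grep nvidia.com/gpu
kubectl describe node <ip> | grep -i nvidia.com/gpu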
vi cuda.pod
apiVersion: v1
kind: Pod
metadata:
  name: cuda-vector-add
spec:
  restartPolicy: OnFailure
  containers:
    - name: cuda-vector-add
      # https://github.com/kubernetes/kubernetes/blob/v1.7.11/test/images/nvidia-cuda/Dockerfile
      image: "k8s.gcr.io/cuda-vector-add:v0.1"
      resources:
        limits:
          nvidia.com/gpu: 1
  nodeSelector:
    nvidia.com/gpu: 'true'
kubectl apply -f cuda.pod
# There are no logs available via `kubectl logs` but `docker logs` shows process output
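# Optional check (a sketch): the pod should reach the Completed status once the vector-add test finishes
kubectl get pod cuda-vector-add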