Environment
RHEL 8.4 4.18.0-305.19.1.el8_4.x86_64
Verify setup
lspci -nnv |grep -i nvidia
e.g.
# lspci -nnv |grep -i nvidia
00:1e.0 VGA compatible controller [0300]: NVIDIA Corporation GM204GL [Tesla M60] [10de:13f2] (rev a1) (prog-if 00 [VGA controller])
Subsystem: NVIDIA Corporation Device [10de:113a]
Install
echo 'blacklist nouveau' >> /etc/modprobe.d/blacklist.conf
dnf config-manager --add-repo=https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
dnf module install nvidia-driver:latest -y
Then reboot.
Note: if Secure Boot is enabled, the NVIDIA kernel modules may not be able to load; disable Secure Boot if necessary (a quick check is shown after the nvidia-smi output below). After the reboot, run
nvidia-smi
Here is the output
# nvidia-smi
Thu Sep 16 14:53:09 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  Tesla M60           Off  | 00000000:00:1E.0 Off |                    0 |
| N/A   35C    P0    38W / 150W |      0MiB /  7618MiB |    100%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+
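If nvidia-smi fails to find the device or the driver module does not load, check the Secure Boot state and make sure nouveau is no longer loaded. mokutil is assumed to be available here (it usually is on RHEL):
mokutil --sb-state    # expect "SecureBoot disabled" if you disabled it as noted above
lsmod | grep nouveau  # should print nothing after the blacklist + reboot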
podman
Install podman and crun, and make crun the default OCI runtime
dnf install -y crun
dnf install -y podman
cp /usr/share/containers/containers.conf /etc/containers/containers.conf
sed -i 's/^# runtime = "crun"/runtime = "crun"/;' /etc/containers/containers.conf
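To confirm crun is now the default runtime, you can query podman directly (the Go-template path below is an assumption based on podman's info output and may differ slightly by version):
podman info --format '{{.Host.OCIRuntime.Name}}'   # expect: crun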
Ensure the host is using cgroup v2
podman info |grep -i cgroup
If the cgroup version is not v2, do the following to switch to v2
grubby --update-kernel=ALL --args="systemd.unified_cgroup_hierarchy=1"
reboot
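After the reboot, a quick way to double-check is to look at the filesystem type mounted on /sys/fs/cgroup:
stat -fc %T /sys/fs/cgroup/   # cgroup2fs means v2, tmpfs means v1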
nvidia-docker
(Note: there is no RHEL 8.4 repository yet, so use the RHEL 8.3 repository for now.)
curl -s -L https://nvidia.github.io/nvidia-docker/rhel8.3/nvidia-docker.repo | tee /etc/yum.repos.d/nvidia-docker.repo
dnf install nvidia-container-toolkit -y
curl -LO https://raw.githubusercontent.com/NVIDIA/dgx-selinux/master/bin/RHEL7/nvidia-container.pp
semodule -i nvidia-container.pp
nvidia-container-cli -k list | restorecon -v -f -
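As an optional sanity check that the container CLI can see the driver and the GPU devices (output will vary with your driver version):
nvidia-container-cli -k -d /dev/tty info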
Use the following /etc/nvidia-container-runtime/config.toml for rootless access:
disable-require = false
#swarm-resource = "DOCKER_RESOURCE_GPU"
#accept-nvidia-visible-devices-envvar-when-unprivileged = true
#accept-nvidia-visible-devices-as-volume-mounts = false
[nvidia-container-cli]
#root = "/run/nvidia/driver"
#path = "/usr/bin/nvidia-container-cli"
environment = []
debug = "/var/log/nvidia-container-toolkit.log"
#ldcache = "/etc/ld.so.cache"
load-kmods = false
no-cgroups = false
#user = "root:video"
ldconfig = "@/sbin/ldconfig"
[nvidia-container-runtime]
debug = "/var/log/nvidia-container-runtime.log"
Test nvidia + podman
# podman run --rm --privileged nvidia/samples:vectoradd-cuda11.2.1-ubi8
[Vector addition of 50000 elements]
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 196 blocks of 256 threads
Copy output data from the CUDA device to the host memory
Test PASSED
Done
# podman run --privileged -ti nvidia/cuda:11.0-base nvidia-smi
Thu Sep 16 18:26:26 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  Tesla M60           Off  | 00000000:00:1E.0 Off |                    0 |
| N/A   38C    P0    38W / 150W |      0MiB /  7618MiB |     98%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+
Run microshift
Refer to the MicroShift documentation for the installation guide.
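A condensed sketch of the install at the time of writing (the COPR repository name is an assumption; follow the official docs if it has changed):
dnf install -y 'dnf-command(copr)'        # copr plugin for dnf
dnf copr enable -y @redhat-et/microshift  # community MicroShift builds (assumed repo)
dnf install -y microshift
systemctl enable --now microshift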
Install the NVIDIA device plugin
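These steps assume Helm 3 is already available on the host; if it is not, one common way to install it is the upstream install script (URL per the Helm project):
curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3
chmod 700 get_helm.sh
./get_helm.sh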
helm repo add nvdp https://nvidia.github.io/k8s-device-plugin && helm repo update
helm install --generate-name nvdp/nvidia-device-plugin --namespace kube-system
# kubectl logs -n kube-system nvidia-device-plugin-1632319880-nq9gs
2021/09/22 14:37:43 Loading NVML
2021/09/22 14:37:43 Starting FS watcher.
2021/09/22 14:37:43 Starting OS watcher.
2021/09/22 14:37:43 Retreiving plugins.
2021/09/22 14:37:43 Starting GRPC server for 'nvidia.com/gpu'
2021/09/22 14:37:43 Starting to serve 'nvidia.com/gpu' on /var/lib/kubelet/device-plugins/nvidia-gpu.sock
2021/09/22 14:37:43 Registered device plugin for 'nvidia.com/gpu' with Kubelet
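Once the plugin has registered, the GPU should appear as an allocatable resource on the node; one way to check:
kubectl describe nodes | grep -i nvidia.com/gpu   # expect nvidia.com/gpu: 1 under Capacity and Allocatable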
Run a sample GPU test
[root@rhel ~]# cat gpu.t
apiVersion: v1
kind: Pod
metadata:
  name: gpu-operator-test
spec:
  restartPolicy: OnFailure
  containers:
  - name: cuda-vector-add
    image: "nvidia/samples:vectoradd-cuda10.2"
    resources:
      limits:
        nvidia.com/gpu: 1
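Create the pod from the listing above:
kubectl apply -f gpu.t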
[root@rhel ~]# kubectl get pods -A
NAMESPACE                       NAME                                    READY   STATUS      RESTARTS   AGE
default                         gpu-operator-test                       0/1     Completed   0          40s
kube-system                     kube-flannel-ds-45g7v                   1/1     Running     0          4m
kube-system                     nvidia-device-plugin-1632322815-kg4jq   1/1     Running     0          2m47s
kubevirt-hostpath-provisioner   kubevirt-hostpath-provisioner-lbpsz     1/1     Running     0          4m1s
openshift-dns                   dns-default-76jzs                       3/3     Running     0          4m
openshift-ingress               router-default-6d8c9d8f57-qqbcv         1/1     Running     1          4m
openshift-service-ca            service-ca-64547678c6-wmgs4             1/1     Running     2          4m1s
[root@rhel ~]# kubectl logs -n default gpu-operator-test
[Vector addition of 50000 elements]
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 196 blocks of 256 threads
Copy output data from the CUDA device to the host memory
Test PASSED
Done
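When finished, the test pod can be removed with:
kubectl delete pod gpu-operator-test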