Disable Default Nvidia Device Plugin in Nvidia GPU Operator ClusterPolicy:
# Disable the operator-managed device plugin so the alternative vGPU plugin
# can own /var/lib/kubelet/device-plugins registration instead.
devicePlugin:
  enabled: false
  # Clear any custom plugin config (ConfigMap name / default key) — unused
  # while the plugin is disabled.
  config:
    name: ''
    default: ''
  imagePullPolicy: Always
Label nodes for alternate device plugin:
oc label node r730ocp3.localdomain k8s.kuartis.com/accelerator=vgpu
Download the alternative device plugin (https://github.com/kuartis/kuartis-virtual-gpu-device-plugin):
git clone https://github.com/kuartis/kuartis-virtual-gpu-device-plugin.git
Edit the device-plugin.yml manifest to use the latest (as of this writing) CUDA image; this version should match the CUDA version used by workload pods and the NVIDIA GPU Operator. Note:
image: nvidia/cuda:11.8.0-runtime-ubuntu22.04
--vgpu=<number_of_virtual_gpus_one_physical_gpu_can_have> # Default is 10, max 48
--allowmultigpu=<true|false> # Default is false. When false, the vGPU resources assigned to a single container cannot be spread across different physical GPUs.
# DaemonSet running the Kuartis virtual-GPU device plugin on every node
# labeled k8s.kuartis.com/accelerator=vgpu. An init container forces the GPU
# into EXCLUSIVE_PROCESS compute mode (required for MPS-based sharing), then
# the plugin advertises virtual GPU resources to the kubelet.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: kuartis-virtual-gpu-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: kuartis-virtual-gpu-device-plugin
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        app.kubernetes.io/name: kuartis-virtual-gpu-device-plugin
    spec:
      # hostIPC is needed so workload containers can reach the MPS daemon's
      # shared-memory segments on the node.
      hostIPC: true
      nodeSelector:
        k8s.kuartis.com/accelerator: vgpu
      tolerations:
        - key: k8s.kuartis.com/vgpu
          operator: Exists
          effect: NoSchedule
      priorityClassName: "system-node-critical"
      initContainers:
        # Set compute mode before the plugin starts; needs SYS_ADMIN to
        # change GPU settings via nvidia-smi. Image should match the CUDA
        # version used by workloads and the NVIDIA GPU Operator.
        - name: set-compute-mode
          image: nvidia/cuda:11.8.0-runtime-ubuntu22.04
          command: ["nvidia-smi", "-c", "EXCLUSIVE_PROCESS"]
          securityContext:
            capabilities:
              add: ["SYS_ADMIN"]
      containers:
        - name: kuartis-virtual-gpu-device-plugin-ctr
          image: ghcr.io/kuartis/kuartis-virtual-gpu-device-plugin:0.7.0
          command:
            - /usr/bin/virtual-gpu-device-plugin
          args:
            # Virtual GPUs per physical GPU (default 10, max 48).
            - --vgpu=1
            # When false, one container's vGPUs stay on a single physical GPU.
            - --allowmultigpu=false
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          ports:
            - containerPort: 8080
              name: metrics
          volumeMounts:
            # Kubelet device-plugin registration socket directory.
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
            - name: proc
              mountPath: /host/proc
              readOnly: true
            # NOTE(review): dockershim was removed in Kubernetes 1.24; this
            # mount only works on Docker/dockershim nodes — confirm the
            # cluster's container runtime before relying on it.
            - name: dockershimsock
              mountPath: /var/run/dockershim.sock
        # Sidecar running the NVIDIA MPS control daemon; shares the pipe
        # directory with workloads via the hostPath below.
        - name: mps
          image: nvidia/mps
          volumeMounts:
            - name: nvidia-mps
              mountPath: /tmp/nvidia-mps
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
        - name: nvidia-mps
          hostPath:
            path: /tmp/nvidia-mps
        - name: proc
          hostPath:
            path: /proc
        - name: dockershimsock
          hostPath:
            path: /var/run/dockershim.sock
---
# ClusterIP Service exposing the device plugin's metrics endpoint (port 8080)
# for scraping by the ServiceMonitor below.
apiVersion: v1
kind: Service
metadata:
  name: kuartis-virtual-gpu-device-plugin
  namespace: kube-system
  labels:
    app.kubernetes.io/name: kuartis-virtual-gpu-device-plugin
spec:
  type: ClusterIP
  sessionAffinity: None
  selector:
    app.kubernetes.io/name: kuartis-virtual-gpu-device-plugin
  ports:
    - name: metrics
      port: 8080
      targetPort: 8080
      protocol: TCP
---
# Prometheus Operator ServiceMonitor scraping /metrics every 15s from the
# Service above (matched by label in the kube-system namespace).
# NOTE(review): requires the Prometheus Operator CRDs to be installed, and the
# Prometheus instance must be configured to select monitors in kube-system.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kuartis-virtual-gpu-device-plugin
  namespace: kube-system
  labels:
    app.kubernetes.io/name: kuartis-virtual-gpu-device-plugin
spec:
  endpoints:
    - port: metrics
      path: /metrics
      interval: 15s
  namespaceSelector:
    matchNames:
      - kube-system
  selector:
    matchLabels:
      app.kubernetes.io/name: kuartis-virtual-gpu-device-plugin
Apply manifest to cluster: