Modified nvidia-device-plugin.yml that works for k3s (at least for me)
# tweaked from https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.12.2/nvidia-device-plugin.yml
# credit to https://github.com/hansaya for the comment at https://github.com/k3s-io/k3s/issues/4391#issuecomment-1194627754
# note: you need to label your GPU nodes with gpu=yes for this to work (see the example command below); I had some
# non-GPU nodes, and the discovery pods kept crash-looping on them because nothing told the system they had no GPUs
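#
# a quick sketch of the labelling step (gpu-node-1 is a placeholder; `kubectl get nodes` lists your actual node names):
#
#   kubectl label node gpu-node-1 gpu=yes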
#
# prerequisites
#
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia
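# for the "nvidia" handler to resolve, the runtime must be present in k3s's generated containerd
# config; if the NVIDIA Container Toolkit was installed before the k3s agent started, k3s should
# pick it up automatically -- one way to check on a GPU node (path assumes a default k3s install):
#
#   grep nvidia /var/lib/rancher/k3s/agent/etc/containerd/config.toml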
---
# From https://github.com/NVIDIA/gpu-feature-discovery/blob/main/deployments/static/nfd.yaml
# Modified to set `runtimeClassName: nvidia`, as k3s does not replace the default container runtime
apiVersion: v1
kind: Namespace
metadata:
  name: node-feature-discovery
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: nfd-master
  namespace: node-feature-discovery
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: nfd-master
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - get
  - patch
  - update
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: nfd-master
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: nfd-master
subjects:
- kind: ServiceAccount
  name: nfd-master
  namespace: node-feature-discovery
---
#
# feature discovery
#
apiVersion: apps/v1
kind: DaemonSet
metadata:
  labels:
    app: nfd
  name: nfd
  namespace: node-feature-discovery
spec:
  selector:
    matchLabels:
      app: nfd
  template:
    metadata:
      labels:
        app: nfd
    spec:
      serviceAccountName: nfd-master
      runtimeClassName: nvidia
      containers:
      - env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        image: quay.io/kubernetes_incubator/node-feature-discovery:v0.6.0
        name: nfd-master
        command:
        - "nfd-master"
        args:
        - "--extra-label-ns=nvidia.com"
      - env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        image: quay.io/kubernetes_incubator/node-feature-discovery:v0.6.0
        name: nfd-worker
        command:
        - "nfd-worker"
        args:
        - "--sleep-interval=60s"
        - "--options={\"sources\": {\"pci\": { \"deviceLabelFields\": [\"vendor\"] }}}"
        volumeMounts:
        - name: host-boot
          mountPath: "/host-boot"
          readOnly: true
        - name: host-os-release
          mountPath: "/host-etc/os-release"
          readOnly: true
        - name: host-sys
          mountPath: "/host-sys"
        - name: source-d
          mountPath: "/etc/kubernetes/node-feature-discovery/source.d/"
        - name: features-d
          mountPath: "/etc/kubernetes/node-feature-discovery/features.d/"
      nodeSelector:
        gpu: "yes"
      volumes:
      - name: host-boot
        hostPath:
          path: "/boot"
      - name: host-os-release
        hostPath:
          path: "/etc/os-release"
      - name: host-sys
        hostPath:
          path: "/sys"
      - name: source-d
        hostPath:
          path: "/etc/kubernetes/node-feature-discovery/source.d/"
      - name: features-d
        hostPath:
          path: "/etc/kubernetes/node-feature-discovery/features.d/"
---
# From https://github.com/NVIDIA/gpu-feature-discovery/blob/main/deployments/static/gpu-feature-discovery-daemonset.yaml
# Modified to set `runtimeClassName: nvidia`, as k3s does not replace the default container runtime
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: gpu-feature-discovery
  namespace: node-feature-discovery
  labels:
    app.kubernetes.io/name: gpu-feature-discovery
    app.kubernetes.io/version: 0.6.1
    app.kubernetes.io/part-of: nvidia-gpu
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: gpu-feature-discovery
      app.kubernetes.io/part-of: nvidia-gpu
  template:
    metadata:
      labels:
        app.kubernetes.io/name: gpu-feature-discovery
        app.kubernetes.io/version: 0.6.1
        app.kubernetes.io/part-of: nvidia-gpu
    spec:
      runtimeClassName: nvidia
      containers:
      - image: nvcr.io/nvidia/gpu-feature-discovery:v0.6.1
        name: gpu-feature-discovery
        volumeMounts:
        - name: output-dir
          mountPath: "/etc/kubernetes/node-feature-discovery/features.d"
        - name: dmi-product-name
          mountPath: "/sys/class/dmi/id/product_name"
        securityContext:
          privileged: true
        env:
        - name: MIG_STRATEGY
          value: none
      nodeSelector:
        gpu: "yes"
        feature.node.kubernetes.io/pci-10de.present: "true" # NVIDIA vendor ID
      volumes:
      - name: output-dir
        hostPath:
          path: "/etc/kubernetes/node-feature-discovery/features.d"
      - name: dmi-product-name
        hostPath:
          path: "/sys/class/dmi/id/product_name"
---
#
# plugin
#
# From https://github.com/NVIDIA/k8s-device-plugin/blob/master/nvidia-device-plugin.yml
# Modified to set `runtimeClassName: nvidia`, as k3s does not replace the default container runtime
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-ds
    spec:
      tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      runtimeClassName: "nvidia"
      containers:
      - image: nvcr.io/nvidia/k8s-device-plugin:v0.12.2
        name: nvidia-device-plugin-ctr
        env:
        - name: FAIL_ON_INIT_ERROR
          value: "false"
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop: [ "ALL" ]
        volumeMounts:
        - name: device-plugin
          mountPath: /var/lib/kubelet/device-plugins
      nodeSelector:
        gpu: "yes"
      volumes:
      - name: device-plugin
        hostPath:
          path: /var/lib/kubelet/device-plugins
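---
#
# not part of the upstream manifests: a minimal smoke-test pod you can uncomment and apply once
# everything is running (assumes the nvidia/cuda:11.7.1-base-ubuntu20.04 tag suits your driver)
#
# apiVersion: v1
# kind: Pod
# metadata:
#   name: gpu-smoke-test
# spec:
#   runtimeClassName: nvidia
#   restartPolicy: Never
#   containers:
#   - name: nvidia-smi
#     image: nvidia/cuda:11.7.1-base-ubuntu20.04
#     command: ["nvidia-smi"]
#     resources:
#       limits:
#         nvidia.com/gpu: 1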