GPU Sharing on GKE DaemonSet

NOTE: This is not a Google-supported product.

Example Usage

1. Create a GKE cluster with a GPU node pool:

   gcloud container clusters create gpu-sharing-demo --zone us-central1-c
   gcloud container node-pools create gpu --cluster gpu-sharing-demo --zone us-central1-c --num-nodes=1 --accelerator type=nvidia-tesla-p4,count=1

2. Apply the DaemonSet to enable GPU sharing:

   kubectl apply -f https://gist.githubusercontent.com/danisla/77afbb88f215d116f1905f723d3d879d/raw/472a7c3fbdc38d821b0f85b71f2abadf65e57606/gpu-sharing-daemonset.yaml

3. Apply the DaemonSet to install the NVIDIA GPU driver per the docs:

   kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded.yaml

4. Wait for the driver installer pod to become ready:

   kubectl -n kube-system wait pod -l k8s-app=nvidia-driver-installer --for=condition=Ready --timeout=600s

5. Wait for the gpu-sharing pod to become ready:

   kubectl -n kube-system wait pod -l app=gpu-sharing --for=condition=Ready --timeout=600s

6. Verify GPU sharing is working:

   kubectl describe node -l cloud.google.com/gke-accelerator | grep nvidia.com/gpu

Example output:

 nvidia.com/gpu:             16
 nvidia.com/gpu:             16
  nvidia.com/gpu             0               0
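
If you prefer a more targeted check than grepping kubectl describe, a custom-columns query along these lines (the label selector assumes the default GKE accelerator node label) prints each GPU node's allocatable GPU count; with the DaemonSet's default NVIDIA_0_SHARE of 16, each node should report 16:

   kubectl get nodes -l cloud.google.com/gke-accelerator "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu"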

NOTE: If it's not working, try restarting the nvidia-gpu-device-plugin pods in the kube-system namespace, for example as shown below.
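
One way to restart them is to delete the device-plugin pods and let their DaemonSet recreate them; this assumes GKE's standard k8s-app=nvidia-gpu-device-plugin pod label:

   kubectl -n kube-system delete pod -l k8s-app=nvidia-gpu-device-plugin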

7. Run several GPU pods on the same node:
cat - | kubectl apply -f - <<EOF
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gpu-sharing-test
spec:
  replicas: 6
  selector:
    matchLabels:
      app: gpu-sharing-test
  template:
    metadata:
      labels:
        app: gpu-sharing-test
    spec:
      containers:
        - name: my-gpu-container
          image: nvidia/cuda:10.0-runtime-ubuntu18.04
          command: ["/usr/local/nvidia/bin/nvidia-smi", "-l"]
          resources:
            limits:
              nvidia.com/gpu: 1
EOF
kubectl wait pod -l app=gpu-sharing-test --for=condition=Ready
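
To confirm that all six replicas landed on the same GPU node and can see the device, one option is to list the pods with their node assignments and spot-check the nvidia-smi output from one of them:

   kubectl get pods -l app=gpu-sharing-test -o wide
   kubectl logs deploy/gpu-sharing-test --tail=20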

gpu-sharing-daemonset.yaml

# Copyright 2019 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: gpu-sharing
  namespace: kube-system
spec:
  selector:
    matchLabels:
      app: gpu-sharing
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 100%
  template:
    metadata:
      labels:
        app: gpu-sharing
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: cloud.google.com/gke-accelerator
                    operator: Exists
      tolerations:
        - key: "nvidia.com/gpu"
          effect: "NoSchedule"
          operator: "Exists"
      volumes:
        ###
        # dev filesystem from host
        ###
        - name: dev
          hostPath:
            path: /dev
      initContainers:
        ###
        # Privileged init container that creates /dev/nvidia1..N-1 symlinks to
        # /dev/nvidia0 so the node advertises NVIDIA_0_SHARE GPUs backed by a
        # single physical device.
        ###
        - name: gpu-sharing
          securityContext:
            privileged: true
          image: alpine:latest
          resources:
            requests:
              cpu: 0.15
          command: ["/bin/sh"]
          args:
            - -ec
            - |
              # Wait for nvidia0 device.
              echo "Waiting for nvidia0"
              until [[ -e /dev/nvidia0 ]]; do sleep 1; done
              echo "Found nvidia0, creating symlinks"
              # Create symlinks to NVIDIA device to support GPU sharing.
              OLD_DEV=/dev/nvidia0
              for i in $(seq 1 $(($NVIDIA_0_SHARE - 1))); do
                NEW_DEV=/dev/nvidia${i}
                echo "Linking $OLD_DEV -> $NEW_DEV"
                ln -sf $OLD_DEV $NEW_DEV
              done
              echo "Done"
          env:
            - name: NVIDIA_0_SHARE
              value: "16"
          volumeMounts:
            - name: dev
              mountPath: /dev
      containers:
        ###
        # pause container
        ###
        - image: "gcr.io/google-containers/pause:2.0"
          name: pause
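
The number of virtual GPUs advertised per physical GPU is controlled by the NVIDIA_0_SHARE value above. As a rough sketch, raising it on a live cluster could look like the following JSON patch (the path assumes the manifest is applied unchanged), followed by restarting the device-plugin pods so they re-enumerate the devices:

   kubectl -n kube-system patch daemonset gpu-sharing --type=json \
     -p='[{"op": "replace", "path": "/spec/template/spec/initContainers/0/env/0/value", "value": "32"}]'
   kubectl -n kube-system delete pod -l k8s-app=nvidia-gpu-device-plugin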