Skip to content

Instantly share code, notes, and snippets.

@HenriTEL
Created October 17, 2019 10:09
Show Gist options
  • Save HenriTEL/f14a2108ce428543e322461d6427ef57 to your computer and use it in GitHub Desktop.
Save HenriTEL/f14a2108ce428543e322461d6427ef57 to your computer and use it in GitHub Desktop.
A custom driver installer for GKE Ubuntu-based images that also installs nvidia-docker.
# DaemonSet that runs the NVIDIA driver / nvidia-docker install script on
# every node carrying the cloud.google.com/gke-accelerator label.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-driver-installer
  namespace: kube-system
  labels:
    k8s-app: nvidia-driver-installer
spec:
  selector:
    matchLabels:
      k8s-app: nvidia-driver-installer
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-driver-installer
        k8s-app: nvidia-driver-installer
    spec:
      # Only schedule onto nodes that carry the GKE accelerator label.
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: cloud.google.com/gke-accelerator
                operator: Exists
      # Tolerate every taint so the installer is not kept off tainted nodes.
      tolerations:
      - operator: "Exists"
      volumes:
      - name: dev
        hostPath:
          path: /dev
      - name: boot
        hostPath:
          path: /boot
      # The host's root filesystem; mounted at /root inside the installer.
      - name: root-mount
        hostPath:
          path: /
      # The install script, generated into a ConfigMap by the Kustomization.
      - name: install-script
        configMap:
          name: ubuntu-nvidia-install-script
          items:
          - key: ubuntu-nvidia-install.sh
            path: ubuntu-nvidia-install
          # Owner rwx, group/other read-only, so the script is executable.
          defaultMode: 0744
      # Share the host PID namespace; the docker-reloader targets PID 1.
      hostPID: true
      initContainers:
      # NOTE(review): no command is set here, so the image's own entrypoint is
      # presumably what invokes /usr/local/bin/ubuntu-nvidia-install - confirm.
      - name: nvidia-driver-installer
        image: gke-nvidia-installer:fixed
        resources:
          requests:
            cpu: 0.15
        securityContext:
          privileged: true
        volumeMounts:
        - name: boot
          mountPath: /boot
        - name: dev
          mountPath: /dev
        - name: root-mount
          mountPath: /root
        - name: install-script
          mountPath: /usr/local/bin
      # Enter the namespaces of PID 1 and run `service docker reload` there.
      - name: docker-reloader
        image: debian:10.1-slim
        command: [nsenter, -t, '1', -m, -u, -n, -i, service, docker, reload]
        securityContext:
          privileged: true
      # Long-running placeholder container that follows the init containers.
      containers:
      - name: pause
        image: gcr.io/google-containers/pause:2.0
# Kustomization that packages the install script into a ConfigMap and deploys
# it together with the DaemonSet manifest.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: kube-system
generatorOptions:
  # Keep the ConfigMap name stable: the DaemonSet refers to it by the literal
  # name "ubuntu-nvidia-install-script".
  disableNameSuffixHash: true
configMapGenerator:
- name: ubuntu-nvidia-install-script
  files:
  - ubuntu-nvidia-install.sh
resources:
- daemonset.yml
#!/bin/sh
# CLOUD_IMG: This file was created/modified by the Cloud Image build process
#
# This script is run by the nvidia-container-first-boot script.
# Its purpose is to install nvidia drivers.
# Copyright 2018 Canonical Ltd.
#
# These programs are free software; you can redistribute and/or modify
# them under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
# These programs are distributed in the hope that they will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
# You should have received a copy of the GNU General Public License with
# your Ubuntu system, in /usr/share/common-licenses/GPL, or with the
# livecd-rootfs source package as the file COPYING. If not, write to
# the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
# Boston, MA 02110-1301 USA.
# Abort the script as soon as any unchecked command exits non-zero.
set -o errexit

# Directory prefix under which the NVIDIA bin/, lib64/ and sbin/ paths are looked up.
NVIDIA_INSTALL_DIR='/opt/nvidia'
# Prefix used by this script for paths on the root mount (e.g. etc/ld.so.conf).
ROOT_MOUNT_DIR='/root'
# Distribution component of the nvidia.github.io apt repository URLs.
LINUX_FLAVOR='ubuntu18.04'
#######################################
# Download and install the prebuilt nvidia-driver-gke package that matches the
# running kernel.
#
# The driver version is chosen in this order:
#   1. $NVIDIA_DRIVER_VERSION, when set and non-empty;
#   2. /var/lib/nvidia/shipped-nvidia-version, when the running kernel equals
#      /var/lib/nvidia/shipped-kernel-version (and nothing is done at all if an
#      nvidia.ko module is already present for that kernel);
#   3. the "current-driver" object downloaded for this kernel and GKE variant.
#
# Globals:
#   NVIDIA_DRIVER_VERSION (read, optional)
#   tmp_deb_file, tmp_driver_version_file, kvers, modules_root, nvidia_module,
#   nvidia_version, gke_variant, gs_file (written; plain sh globals)
# Arguments:
#   $1 - optional kernel modules root, defaults to /lib/modules
# Returns:
#   0 on success or when the installation is skipped.
#######################################
configure_nvidia_installation_dirs() {
  echo "Configuring installation directories..."
  modules_root="${1:-/lib/modules}"
  tmp_deb_file=/var/cache/nvidia-driver-gke.deb
  tmp_driver_version_file=/var/cache/nvidia-driver-version
  kvers="$(uname -r)"

  # Start from a clean slate in case a previous run left downloads behind.
  rm --force "$tmp_deb_file" "$tmp_driver_version_file"

  # Check to see if the nvidia version to install has been overridden
  if [ -n "$NVIDIA_DRIVER_VERSION" ]; then
    nvidia_version="$NVIDIA_DRIVER_VERSION"
  # Check to see if the nvidia version to install will change due to kernel
  # having been upgraded/downgraded
  elif [ "$(cat /var/lib/nvidia/shipped-kernel-version || true)" = "$kvers" ]; then
    # FIXME: nvidia module dir is versioned and we probably want it not to be
    # The glob must stay outside the quotes, otherwise the shell tests for a
    # directory literally named "nvidia-*" and this check can never succeed.
    # When nothing matches, the pattern is kept verbatim and -e is false.
    for nvidia_module in "$modules_root/$kvers"/nvidia-*/nvidia.ko; do
      if [ -e "$nvidia_module" ]; then
        echo "Configuring installation directories... SKIPPED."
        return 0
      fi
    done
    nvidia_version="$(cat /var/lib/nvidia/shipped-nvidia-version)"
  else
    # Download the current nvidia driver version from object storage.
    # The version will change based on the image variant
    # (currently d1703-0 or d1809-0) and we can read this variant from
    # /var/lib/nvidia/shipped-gke-variant
    gke_variant="$(cat /var/lib/nvidia/shipped-gke-variant)"
    wget "https://www.googleapis.com/storage/v1/b/ubuntu_nvidia_packages/o/current-driver-$kvers-$gke_variant-amd64?alt=media" -O "$tmp_driver_version_file"
    nvidia_version="$(cat "$tmp_driver_version_file")"
  fi

  gs_file="nvidia-driver-gke_$kvers-${nvidia_version}_amd64.deb"
  wget "https://www.googleapis.com/storage/v1/b/ubuntu_nvidia_packages/o/$gs_file?alt=media" -O "$tmp_deb_file"

  # Installing kernel modules triggers the kernel, but we don't want to
  # pointlessly rebuild initramfs (which we shouldn't actually have anyway,
  # that's a bug) or trigger a grub update. The latter fails due to
  # grub-probe not being able to identify the rootfs, and skipping these
  # hooks will speed up the driver package installation.
  dpkg-divert --rename --local \
    --divert /etc/kernel/postinst.d/initramfs-tools.nvidia-divert \
    --add /etc/kernel/postinst.d/initramfs-tools
  dpkg-divert --rename --local \
    --divert /etc/kernel/postinst.d/zz-update-grub.nvidia-divert \
    --add /etc/kernel/postinst.d/zz-update-grub

  if ! DEBIAN_FRONTEND=noninteractive dpkg --install "$tmp_deb_file"; then
    # could dpkg have failed because of a package dependency?
    apt-get update
    DEBIAN_FRONTEND=noninteractive apt-get --fix-broken --assume-yes install
  fi

  # Put the diverted kernel hooks back and drop the downloaded files.
  dpkg-divert --rename --local --remove /etc/kernel/postinst.d/initramfs-tools
  dpkg-divert --rename --local --remove /etc/kernel/postinst.d/zz-update-grub
  rm --force "$tmp_deb_file" "$tmp_driver_version_file"
  echo "Configuring installation directories... DONE."
}
#######################################
# Register the nvidia-container-runtime and nvidia-docker apt repositories,
# install both packages, and write a Docker daemon configuration that makes
# "nvidia" the default runtime.
#
# Globals:
#   LINUX_FLAVOR (read) - distribution component of the repository URLs
#   repo (written; loop variable)
# Side effects:
#   Replaces /etc/docker/daemon.json.
#######################################
install_nvidia_docker() {
  # Trust the signing key of each repository first...
  for repo in nvidia-container-runtime nvidia-docker; do
    wget -qO- "https://nvidia.github.io/${repo}/gpgkey" | apt-key add -
  done

  # ...then drop the matching sources list into apt's configuration.
  for repo in nvidia-container-runtime nvidia-docker; do
    wget "https://nvidia.github.io/${repo}/${LINUX_FLAVOR}/${repo}.list" \
      -O "/etc/apt/sources.list.d/${repo}.list"
  done

  apt-get update
  DEBIAN_FRONTEND=noninteractive apt-get --assume-yes install \
    nvidia-container-runtime nvidia-docker2

  # One argument per output line of the daemon configuration.
  printf '%s\n' \
    '{' \
    '"default-runtime": "nvidia",' \
    '"runtimes": {' \
    '"nvidia": {' \
    '"path": "nvidia-container-runtime",' \
    '"runtimeArgs": []' \
    '}' \
    '}' \
    '}' > /etc/docker/daemon.json
}
#######################################
# Check that the installed driver works by running nvidia-smi.
#
# Prepends the NVIDIA bin/ and lib64/ directories to PATH and LD_LIBRARY_PATH
# (both exported), and runs the create-uvm-dev-node helper first when
# /dev/nvidia-uvm is not a character device.
#
# Globals:
#   NVIDIA_INSTALL_DIR (read)
#   PATH, LD_LIBRARY_PATH (modified and exported)
# Outputs:
#   Progress messages and the nvidia-smi report on stdout.
# Returns:
#   The status of the final echo; a failing nvidia-smi only aborts the script
#   when errexit is active.
#######################################
verify_nvidia_installation() {
  echo "Verifying Nvidia installation..."
  export PATH="${NVIDIA_INSTALL_DIR}/bin:${PATH}"
  # Only add the ':' separator when LD_LIBRARY_PATH already had a value.
  export LD_LIBRARY_PATH="${NVIDIA_INSTALL_DIR}/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
  if [ ! -c "/dev/nvidia-uvm" ]; then
    # Quoted so an install dir containing whitespace is not word-split.
    "${NVIDIA_INSTALL_DIR}/sbin/create-uvm-dev-node"
  fi
  nvidia-smi
  echo "Verifying Nvidia installation... DONE."
}
#######################################
# Make the NVIDIA libraries visible to the dynamic linker under the root
# mount: list ${NVIDIA_INSTALL_DIR}/lib64 in etc/ld.so.conf and rebuild the
# cache with ldconfig, using ROOT_MOUNT_DIR as the root directory.
#
# Globals:
#   NVIDIA_INSTALL_DIR (read), ROOT_MOUNT_DIR (read)
# Outputs:
#   Progress messages on stdout.
#######################################
update_host_ld_cache() {
  echo "Updating host's ld cache..."
  # Append only when the exact line is missing, so repeated runs do not pile
  # up duplicate entries. A missing ld.so.conf makes grep fail (its error is
  # silenced), which correctly falls through to creating the file.
  if ! grep -qxF -- "${NVIDIA_INSTALL_DIR}/lib64" "${ROOT_MOUNT_DIR}/etc/ld.so.conf" 2>/dev/null; then
    echo "${NVIDIA_INSTALL_DIR}/lib64" >> "${ROOT_MOUNT_DIR}/etc/ld.so.conf"
  fi
  ldconfig -r "${ROOT_MOUNT_DIR}"
  echo "Updating host's ld cache... DONE."
}
#######################################
# Replace the container's /etc with the etc directory of the root mount.
#
# Unmounts /etc/hosts, /etc/resolv.conf and /etc/hostname, moves the
# container's own /etc aside to /etc.bak, and symlinks /etc to
# ${ROOT_MOUNT_DIR}/etc so that later writes under /etc land on the root mount.
#
# Globals:
#   ROOT_MOUNT_DIR (read)
#######################################
undo_docker_redirects() {
  # The three files must be unmounted before /etc can be renamed.
  umount /etc/hosts /etc/resolv.conf /etc/hostname
  mv /etc /etc.bak
  # Use ROOT_MOUNT_DIR rather than a literal /root, like the other functions.
  ln --symbolic "${ROOT_MOUNT_DIR}/etc" /etc
}
#######################################
# Point home/kubernetes/bin/nvidia under the root mount at the NVIDIA install
# directory, replacing any link that is already there.
#
# Globals:
#   NVIDIA_INSTALL_DIR (read), ROOT_MOUNT_DIR (read)
#######################################
create_nvidia_symlink() {
  # -s symbolic, -f replace an existing destination, -T always treat the
  # destination as the link name itself, never as a directory to link into.
  ln -s -f -T \
    "${NVIDIA_INSTALL_DIR}" \
    "${ROOT_MOUNT_DIR}/home/kubernetes/bin/nvidia"
}
#######################################
# Run every installation step, in order.
#
# Globals:
#   step (written; loop variable)
# Outputs:
#   A start banner on stdout, followed by whatever the steps print.
#######################################
main() {
  echo "Installing nvidia drivers..."
  # The order matters: /etc is redirected first, the driver and runtime are
  # installed next, and verification runs once the ld cache has been updated.
  for step in \
    undo_docker_redirects \
    configure_nvidia_installation_dirs \
    install_nvidia_docker \
    update_host_ld_cache \
    verify_nvidia_installation \
    create_nvidia_symlink
  do
    "$step"
  done
}
# Script entry point: run the installer, forwarding all command-line arguments.
main "$@"
@HenriTEL
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment