Nova:
- PCI PASSTHROUGH
- FLAVOR
Device:
product: GK110BGL [Tesla K40c] [10DE:1024]
vendor: NVIDIA Corporation [10DE]
physical id: 0
bus info: pci@0000:83:00.0
Before Queens vGPU:
[pci]
passthrough_whitelist={"vendor_id":"10de","product_id":"1024"}
alias={"vendor_id":"10de","product_id":"1024","name":"K40c_Tesla"}
(the alias name must match the flavor's pci_passthrough:alias property, e.g. 'K40c_Tesla:1' below)
Queens vGPU:
TBD
cmp03:
lspci | egrep NVIDIA
sudo lshw -numeric -C display
K8s:
- DeviceClass
Drivers:
- NVIDIA DOCKER2
- NVIDIA CUDA TOOLKIT/Drivers
kcm102:
lspci | egrep NVIDIA
sudo lshw -numeric -C display
cat /proc/driver/nvidia/version
nvidia-smi
cat /etc/docker/daemon.json
{
"runtimes": { <-----
"nvidia": {
"path": "/usr/bin/nvidia-container-runtime",
"runtimeArgs": []
}
},
"storage-driver": "overlay2",
"log-opts": {
"max-size": "50m",
"labels": "io.kubernetes.pod.name"
},
"iptables": false,
"bip": "172.31.255.1/24",
"log-driver": "json-file",
"default-runtime": "nvidia", <-----
"insecure-registries": [ <-----
"registry.services.baremetal.local:80"
]
}
In the poc-demo-helm repo:
cat ../charts/k8s-config/templates/device-plugin/nvidia.yaml:
# NVIDIA Kubernetes device plugin DaemonSet
# https://github.com/NVIDIA/k8s-device-plugin
{{- if .Values.enabled.gpu }}
apiVersion: extensions/v1beta1  # NOTE(review): deprecated for DaemonSet — use apps/v1 on Kubernetes >= 1.16
kind: DaemonSet
metadata:
name: nvidia-device-plugin-daemonset
namespace: kube-system
spec:
template:
metadata:
# Mark this pod as a critical add-on; when enabled, the critical add-on scheduler
# reserves resources for critical add-on pods so that they can be rescheduled after
# a failure. This annotation works in tandem with the toleration below.
annotations:
scheduler.alpha.kubernetes.io/critical-pod: ""
labels:
name: nvidia-device-plugin-ds
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- kcm102
tolerations:
# Allow this pod to be rescheduled while the node is in "critical add-ons only" mode.
# This, along with the annotation above marks this pod as a critical add-on.
- key: CriticalAddonsOnly
operator: Exists
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
containers:
- image: nvidia/k8s-device-plugin:1.11
name: nvidia-device-plugin-ctr
env:
# https://github.com/NVIDIA/k8s-device-plugin/issues/33
- name: DP_DISABLE_HEALTHCHECKS
value: "xids"
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
{{- end }}
cd ~/terraform-infra/vm-deploy-openstack
pipenv --py && pipenv shell
openstack flavor list
+--------------------------------------+-----------------+-------+------+-----------+-------+-----------+
| ID | Name | RAM | Disk | Ephemeral | VCPUs | Is Public |
+--------------------------------------+-----------------+-------+------+-----------+-------+-----------+
| 015babce-361b-4412-98cc-be2aed311fdc | k1.tiny | 4096 | 30 | 0 | 2 | True |
| 064b4941-66de-42d4-a070-ead81f009b31 | ml.medium.1xK40 | 8192 | 40 | 0 | 2 | True |
| 30f98ce0-3302-4fcd-aec0-a50fd45ab71f | small | 2048 | 20 | 0 | 1 | True |
| 4567c063-da5c-492d-a328-d307d32e34d5 | medium | 8192 | 40 | 0 | 2 | True |
| fd477f9f-5be8-440e-baf8-ba9c5364da73 | large | 16384 | 40 | 0 | 4 | True |
+--------------------------------------+-----------------+-------+------+-----------+-------+-----------+
openstack flavor show ml.medium.1xK40
+----------------------------+--------------------------------------+
| Field | Value |
+----------------------------+--------------------------------------+
| OS-FLV-DISABLED:disabled | False |
| OS-FLV-EXT-DATA:ephemeral | 0 |
| access_project_ids | None |
| disk | 40 |
| id | 064b4941-66de-42d4-a070-ead81f009b31 |
| name | ml.medium.1xK40 |
| os-flavor-access:is_public | True |
| properties | pci_passthrough:alias='K40c_Tesla:1' |
| ram | 8192 |
| rxtx_factor | 1.0 |
| swap | |
| vcpus | 2 |
+----------------------------+--------------------------------------+
terraform apply
Outputs:
access = <sensitive>
ctl_nodes = {
ctl01 = 172.16.209.216
ctl02 = 172.16.209.218
ctl03 = 172.16.209.219
}
gpu_nodes = {
gpu01 = 172.16.209.217
}
Check servers
openstack server list --all-projects --long | grep K40
Check HW
lspci | egrep NVIDIA
sudo lshw -numeric -C display
lspci | egrep NVIDIA
lshw -numeric -C display
cat /proc/driver/nvidia/version
nvidia-smi
cat /proc/driver/nvidia/version
NVRM version: NVIDIA UNIX x86_64 Kernel Module 396.44 Wed Jul 11 16:51:49 PDT 2018
GCC version: gcc version 5.4.0 20160609 (Ubuntu 5.4.0-6ubuntu1~16.04.10)
root@kcm102:~# nvidia-smi
Tue Oct 16 13:03:58 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 396.44 Driver Version: 396.44 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla V100-PCIE... On | 00000000:18:00.0 Off | 0 |
| N/A 38C P0 26W / 250W | 0MiB / 16160MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 1 Tesla V100-PCIE... On | 00000000:86:00.0 Off | 0 |
| N/A 41C P0 30W / 250W | 0MiB / 16160MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
kubectl exec -ti poc-group-charts-tensorflow-notebook-68fbf745db-hn67d -c jupyter /bin/bash
mount
root@poc-group-charts-tensorflow-notebook-68fbf745db-hn67d:/notebooks# nvidia-smi
Tue Oct 16 13:07:59 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 396.44 Driver Version: 396.44 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla V100-PCIE... On | 00000000:18:00.0 Off | 0 |
| N/A 38C P0 26W / 250W | 0MiB / 16160MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+