Nova:
- PCI PASSTHROUGH
- FLAVOR
Device:
product: GK110BGL [Tesla K40c] [10DE:1024]
vendor: NVIDIA Corporation [10DE]
physical id: 0
bus info: pci@0000:83:00.0
Before Queens vGPU:
[pci]
passthrough_whitelist={"vendor_id":"10de","product_id":"1024"}
alias={"vendor_id":"10de","product_id":"1024","name":"K40c_Tesla"}
(the alias name must match the flavor's pci_passthrough:alias property, e.g. 'K40c_Tesla:1' below)
Queens vGPU:
TBD
cmp03:
lspci | egrep NVIDIA
sudo lshw -numeric -C display
K8s:
- DeviceClass
Drivers:
- NVIDIA DOCKER2
- NVIDIA CUDA TOOLKIT/Drivers
kcm102:
lspci | egrep NVIDIA
sudo lshw -numeric -C display
cat /proc/driver/nvidia/version
nvidia-smi
cat /etc/docker/daemon.json
{
"runtimes": { <-----
"nvidia": {
"path": "/usr/bin/nvidia-container-runtime",
"runtimeArgs": []
}
},
"storage-driver": "overlay2",
"log-opts": {
"max-size": "50m",
"labels": "io.kubernetes.pod.name"
},
"iptables": false,
"bip": "172.31.255.1/24",
"log-driver": "json-file",
"default-runtime": "nvidia", <-----
"insecure-registries": [ <-----
"registry.services.baremetal.local:80"
]
}
In the poc-demo-helm repo:
cat ../charts/k8s-config/templates/device-plugin/nvidia.yaml:
# NVIDIA Kubernetes device plugin DaemonSet
# https://github.com/NVIDIA/k8s-device-plugin
{{- if .Values.enabled.gpu }}
apiVersion: extensions/v1beta1  # NOTE(review): deprecated for DaemonSet — use apps/v1 on Kubernetes >= 1.16
kind: DaemonSet
metadata:
name: nvidia-device-plugin-daemonset
namespace: kube-system
spec:
template:
metadata:
# Mark this pod as a critical add-on; when enabled, the critical add-on scheduler
# reserves resources for critical add-on pods so that they can be rescheduled after
# a failure. This annotation works in tandem with the toleration below.
annotations:
scheduler.alpha.kubernetes.io/critical-pod: ""
labels:
name: nvidia-device-plugin-ds
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- kcm102
tolerations:
# Allow this pod to be rescheduled while the node is in "critical add-ons only" mode.
# This, along with the annotation above marks this pod as a critical add-on.
- key: CriticalAddonsOnly
operator: Exists
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
containers:
- image: nvidia/k8s-device-plugin:1.11
name: nvidia-device-plugin-ctr
env:
# https://github.com/NVIDIA/k8s-device-plugin/issues/33
- name: DP_DISABLE_HEALTHCHECKS
value: "xids"
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
{{- end }}
cd ~/terraform-infra/vm-deploy-openstack
pipenv --py && pipenv shell
openstack flavor list
+--------------------------------------+-----------------+-------+------+-----------+-------+-----------+
| ID | Name | RAM | Disk | Ephemeral | VCPUs | Is Public |
+--------------------------------------+-----------------+-------+------+-----------+-------+-----------+
| 015babce-361b-4412-98cc-be2aed311fdc | k1.tiny | 4096 | 30 | 0 | 2 | True |
| 064b4941-66de-42d4-a070-ead81f009b31 | ml.medium.1xK40 | 8192 | 40 | 0 | 2 | True |
| 30f98ce0-3302-4fcd-aec0-a50fd45ab71f | small | 2048 | 20 | 0 | 1 | True |
| 4567c063-da5c-492d-a328-d307d32e34d5 | medium | 8192 | 40 | 0 | 2 | True |
| fd477f9f-5be8-440e-baf8-ba9c5364da73 | large | 16384 | 40 | 0 | 4 | True |
+--------------------------------------+-----------------+-------+------+-----------+-------+-----------+
openstack flavor show ml.medium.1xK40
+----------------------------+--------------------------------------+
| Field | Value |
+----------------------------+--------------------------------------+
| OS-FLV-DISABLED:disabled | False |
| OS-FLV-EXT-DATA:ephemeral | 0 |
| access_project_ids | None |
| disk | 40 |
| id | 064b4941-66de-42d4-a070-ead81f009b31 |
| name | ml.medium.1xK40 |
| os-flavor-access:is_public | True |
| properties | pci_passthrough:alias='K40c_Tesla:1' |
| ram | 8192 |
| rxtx_factor | 1.0 |
| swap | |
| vcpus | 2 |
+----------------------------+--------------------------------------+
terraform apply
Outputs:
access = <sensitive>
ctl_nodes = {
ctl01 = 172.16.209.216
ctl02 = 172.16.209.218
ctl03 = 172.16.209.219
}
gpu_nodes = {
gpu01 = 172.16.209.217
}
Check servers
openstack server list --all-projects --long | grep K40
Check HW
lspci | egrep NVIDIA
sudo lshw -numeric -C display
lspci | egrep NVIDIA
lshw -numeric -C display
cat /proc/driver/nvidia/version
nvidia-smi
cat /proc/driver/nvidia/version
NVRM version: NVIDIA UNIX x86_64 Kernel Module 396.44 Wed Jul 11 16:51:49 PDT 2018
GCC version: gcc version 5.4.0 20160609 (Ubuntu 5.4.0-6ubuntu1~16.04.10)
root@kcm102:~# nvidia-smi
Tue Oct 16 13:03:58 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 396.44 Driver Version: 396.44 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla V100-PCIE... On | 00000000:18:00.0 Off | 0 |
| N/A 38C P0 26W / 250W | 0MiB / 16160MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 1 Tesla V100-PCIE... On | 00000000:86:00.0 Off | 0 |
| N/A 41C P0 30W / 250W | 0MiB / 16160MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
kubectl exec -ti poc-group-charts-tensorflow-notebook-68fbf745db-hn67d -c jupyter /bin/bash
mount
root@poc-group-charts-tensorflow-notebook-68fbf745db-hn67d:/notebooks# nvidia-smi
Tue Oct 16 13:07:59 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 396.44 Driver Version: 396.44 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla V100-PCIE... On | 00000000:18:00.0 Off | 0 |
| N/A 38C P0 26W / 250W | 0MiB / 16160MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+