@sub-mod
Last active January 4, 2018 22:00
## Check the GPU model
--------------------------------------
# lspci -nn | grep VGA
00:02.0 VGA compatible controller [0300]: Cirrus Logic GD 5446 [1013:00b8]
00:1e.0 VGA compatible controller [0300]: NVIDIA Corporation GM204GL [Tesla M60] [10de:13f2] (rev a1)
# if lspci is missing, install it first: yum install pciutils
## Check the kernel version. Not every kernel version is supported by the NVIDIA driver
-------------------------------------------------------------------------------------
# uname -r
3.10.0-765.el7.x86_64
## Change the kernel version to the version which works (here 3.10.0-693)
--------------------------------------------------------------------------------
# grep '^menuentry' /boot/grub2/grub.cfg
menuentry 'Red Hat Enterprise Linux Server 7.4 Rescue f2732ce2113247b297338f3e384f122e (3.10.0-765.el7.x86_64)' --class red --class gnu-linux --class gnu --class os --unrestricted $menuentry_id_option 'gnulinux-3.10.0-693.el7.x86_64-advanced-de4def96-ff72-4eb9-ad5e-0847257d1866' {
menuentry 'Red Hat Enterprise Linux Server (3.10.0-765.el7.x86_64) 7.4 (Maipo)' --class red --class gnu-linux --class gnu --class os --unrestricted $menuentry_id_option 'gnulinux-3.10.0-693.el7.x86_64-advanced-de4def96-ff72-4eb9-ad5e-0847257d1866' {
menuentry 'Red Hat Enterprise Linux Server (3.10.0-693.el7.x86_64) 7.4 (Maipo)' --class red --class gnu-linux --class gnu --class os --unrestricted $menuentry_id_option 'gnulinux-3.10.0-693.el7.x86_64-advanced-de4def96-ff72-4eb9-ad5e-0847257d1866' {
menuentry 'Red Hat Enterprise Linux Server (0-rescue-164021837e1e44d996ee35429af77920) 7.4 (Maipo)' --class red --class gnu-linux --class gnu --class os --unrestricted $menuentry_id_option 'gnulinux-0-rescue-164021837e1e44d996ee35429af77920-advanced-de4def96-ff72-4eb9-ad5e-0847257d1866' {
#
Set "GRUB_DEFAULT=2" (menu entries count from 0, so entry 2 above is the 3.10.0-693 kernel)
# vi /etc/default/grub
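The same edit can be made without opening an editor. A minimal sketch, assuming a `GRUB_DEFAULT=` line already exists in the file; the preview below applies the substitution to a sample line first:

```shell
# Preview the substitution on a sample line; once it looks right, run the
# same sed with -i against the real file:
#   sed -i 's/^GRUB_DEFAULT=.*/GRUB_DEFAULT=2/' /etc/default/grub
echo 'GRUB_DEFAULT=saved' | sed 's/^GRUB_DEFAULT=.*/GRUB_DEFAULT=2/'
# → GRUB_DEFAULT=2
```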
# sudo grub2-mkconfig -o /boot/grub2/grub.cfg
Generating grub configuration file ...
Found linux image: /boot/vmlinuz-3.10.0-765.el7.x86_64
Found initrd image: /boot/initramfs-3.10.0-765.el7.x86_64.img
Found linux image: /boot/vmlinuz-3.10.0-693.el7.x86_64
Found initrd image: /boot/initramfs-3.10.0-693.el7.x86_64.img
Found linux image: /boot/vmlinuz-0-rescue-164021837e1e44d996ee35429af77920
Found initrd image: /boot/initramfs-0-rescue-164021837e1e44d996ee35429af77920.img
Found linux image: /boot/vmlinuz-0-rescue-f2732ce2113247b297338f3e384f122e
Found initrd image: /boot/initramfs-0-rescue-f2732ce2113247b297338f3e384f122e.img
done
#
# reboot
# uname -r
3.10.0-693.el7.x86_64
## Disable Nouveau Driver
--------------------------------------
Add "blacklist nouveau"
# vi /etc/modprobe.d/blacklist.conf
# mv /boot/initramfs-$(uname -r).img /boot/initramfs-$(uname -r).img.bak
# dracut -v /boot/initramfs-$(uname -r).img $(uname -r)
# reboot
## Install NVIDIA Drivers
--------------------------------------
# yum install "kernel-devel-uname-r == $(uname -r)"
# yum groupinstall "Development Tools"
# lspci -nn | grep VGA
00:02.0 VGA compatible controller [0300]: Cirrus Logic GD 5446 [1013:00b8]
00:1e.0 VGA compatible controller [0300]: NVIDIA Corporation GM204GL [Tesla M60] [10de:13f2] (rev a1)
# Download the Driver for Linux x86_64
wget http://us.download.nvidia.com/tesla/384.81/NVIDIA-Linux-x86_64-384.81.run
chmod +x NVIDIA-Linux-x86_64-384.81.run
# Download Tesla Driver for Linux RHEL 7
Make a note of the GPU product type. Here it is Tesla M60.
Download from http://www.nvidia.com/Download/index.aspx
Tesla M-Class, Product M60, RHEL 7, CUDA 9.0
Version: 384.81
Release Date: 2017.9.25
Operating System: Linux 64-bit
CUDA Toolkit: 9.0
Language: English (US)
Go to http://www.nvidia.com/content/DriverDownload-March2009/confirmation.php?url=/tesla/384.81/NVIDIA-Linux-x86_64-384.81.run&lang=us&type=Tesla
# ./NVIDIA-Linux-x86_64-384.81.run
Verifying archive integrity... OK
Uncompressing NVIDIA Accelerated Graphics Driver for Linux-x86_64 384.81 ........
#
# Check if nvidia-smi works
# nvidia-smi
Mon Nov 27 21:11:01 2017
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.81 Driver Version: 384.81 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla M60 Off | 00000000:00:1E.0 Off | 0 |
| N/A 34C P8 13W / 150W | 11MiB / 7613MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
## Install CUDA 9.0
----------------------------
yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
yum install -y https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-repo-rhel7-8.0.61-1.x86_64.rpm
yum --showduplicates list cuda
yum install -y cuda-9.0.176-1
echo -e "$CUDA_HOME \\n $CUDA_PATH \\n $LD_LIBRARY_PATH"
ls -l /usr/local/cuda
## Install cuDNN 7.0
----------------------------
export CUDA_HOME="/usr/local/cuda"
export CUDA_PATH="${CUDA_HOME}"
export PATH="${CUDA_HOME}/bin${PATH:+:${PATH}}"
export LD_LIBRARY_PATH="${CUDA_HOME}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}";
echo -e "$CUDA_HOME \\n $CUDA_PATH \\n $LD_LIBRARY_PATH"
wget http://perf1.perf.lab.eng.bos.redhat.com/jeder/NVIDIA/cuDNN/CUDA9/cuDNN_v7.0.3/cudnn-9.0-linux-x64-v7.tgz
tar -C /usr/local -xvf cudnn-9.0-linux-x64-v7.tgz
ls -l /usr/local/cuda/lib64/libcudnn.so.7
ls -l /usr/local/cuda/lib64/libcudnn_static.a
export LD_LIBRARY_PATH=/usr/local/cuda/lib64/
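These exports only last for the current shell. One way to persist them across logins is a profile.d snippet; a sketch, where the `cuda.sh` filename is an assumption:

```shell
# Drop the CUDA environment into profile.d so every login shell picks it up;
# the filename cuda.sh is arbitrary.
cat > /etc/profile.d/cuda.sh <<'EOF'
export CUDA_HOME="/usr/local/cuda"
export CUDA_PATH="${CUDA_HOME}"
export PATH="${CUDA_HOME}/bin${PATH:+:${PATH}}"
export LD_LIBRARY_PATH="${CUDA_HOME}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
EOF
```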
## Set up OpenShift
----------------------------
1. Find the GPU name
nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0 | sed -e 's/ /-/g'
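The trailing sed matters because Kubernetes label values cannot contain spaces; it rewrites them as dashes:

```shell
# "Tesla M60" becomes "Tesla-M60", a legal label value
echo 'Tesla M60' | sed -e 's/ /-/g'
# → Tesla-M60
```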
2. Label the node with the GPU name (here it is Tesla-M60)
oc label node ip-172-31-1-39.us-west-2.compute.internal alpha.kubernetes.io/nvidia-gpu-name='Tesla-M60' --overwrite
3. Modify the node config and restart the node service
Open /etc/origin/node/node-config.yaml, add the lines below, then run systemctl restart atomic-openshift-node
kubeletArguments:
  feature-gates:
  - Accelerators=true
4. Check the node logs for the feature-gate change
# journalctl -u atomic-openshift-node --since="5 minutes ago" | grep feature
Aug 22 14:57:31 ip-172-31-4-10.us-west-2.compute.internal atomic-openshift-node[27395]: I0822 14:57:31.486944 27395 feature_gate.go:144] feature gates: map[Accelerators:true]
5. Check that the GPU resource shows up on the node
# oc describe node ip-172-31-1-39.us-west-2.compute.internal | egrep -B1 'Name:|gpu:'
Name: ip-172-31-1-39.us-west-2.compute.internal
--
Capacity:
alpha.kubernetes.io/nvidia-gpu: 1
--
Allocatable:
alpha.kubernetes.io/nvidia-gpu: 1
6. Add node affinity and resources/limits to the DeploymentConfig
{
"kind": "DeploymentConfig",
"apiVersion": "v1",
"metadata": {
"name": "${APPLICATION_NAME}",
"labels": {
"appid": "tf-app-server-${APPLICATION_NAME}",
"appName": "${APPLICATION_NAME}"
}
},
"spec": {
"strategy": {
"type": "Rolling"
},
"triggers": [
{
"type": "ConfigChange"
},
{
"type": "ImageChange",
"imageChangeParams": {
"automatic": true,
"containerNames": [
"${APPLICATION_NAME}"
],
"from": {
"kind": "ImageStreamTag",
"name": "${APPLICATION_NAME}:latest"
}
}
}
],
"replicas": 1,
"selector": {
"deploymentconfig": "${APPLICATION_NAME}"
},
"template": {
"metadata": {
"labels": {
"appid": "tf-app-server-${APPLICATION_NAME}",
"deploymentconfig": "${APPLICATION_NAME}",
"appName": "${APPLICATION_NAME}"
}
},
"spec": {
"affinity": {
"nodeAffinity": {
"requiredDuringSchedulingIgnoredDuringExecution": {
"nodeSelectorTerms": [
{
"matchExpressions": [
{
"key": "alpha.kubernetes.io/nvidia-gpu-name",
"operator": "In",
"values": [
"Tesla-M60"
]
}
]
}
]
}
}
},
"containers": [
{
"env": [
{
"name": "MODEL_NAME",
"value": "${MODEL_NAME}"
},
{
"name": "MODEL_PATH",
"value": "${SOURCE_DIRECTORY}"
}
],
"name": "${APPLICATION_NAME}",
"image": "${APPLICATION_NAME}:latest",
"resources": {
"limits": {
"alpha.kubernetes.io/nvidia-gpu": "1"
}
},
"ports": [
{
"containerPort": 6006,
"protocol": "TCP"
}
]
}
]
}
}
}
},
7. Verify that the pods were scheduled onto the GPU node
oadm manage-node ip-172-31-1-39.us-west-2.compute.internal --list-pods
OR
oc get pod <tf-pods> --template={{.spec.nodeName}}