Update CentOS and reboot (you will need to log in again):
sudo yum update -y
sudo reboot
As: root user
Install Java and misc:
sudo yum install -y rsync net-tools sbt git bzip2 java-1.8.0-openjdk.x86_64 java-1.8.0-openjdk-devel
java -version
echo export JAVA_HOME=\"$(readlink -f $(which java) | grep -oP '.*(?=/bin)')\" >> /root/.bash_profile
source /root/.bash_profile
$JAVA_HOME/bin/java -version
Download and extract Spark:
curl http://mirrors.ocf.berkeley.edu/apache/spark/spark-2.3.0/spark-2.3.0-bin-hadoop2.7.tgz | tar -zx -C /usr/local --show-transformed --transform='s,/*[^/]*,spark,'
Add Spark to bash_profile
echo export SPARK_HOME=\"/usr/local/spark\" >> /root/.bash_profile
echo export SPARK_EXAMPLES_JAR=\"/usr/local/spark/examples/jars/spark-examples_2.11-2.0.0.jar\" >> /root/.bash_profile
echo "export PATH=$PATH:$HOME/.local/bin:$HOME/bin:/usr/local/spark/bin" >> /root/.bash_profile
source /root/.bash_profile
Check that Spark is installed correctly:
spark-shell
Create user <username>:
adduser <username>
Set password:
passwd <username>
Add user to wheel group for sudo privileges:
usermod -aG wheel <username>
Login as user:
su - <username>
Copy ~/.bash_profile from root:
sudo cp /root/.bash_profile ~/.bash_profile
source ~/.bash_profile
Or make sure spark and java paths are in new user's profile:
PATH=$PATH:$HOME/bin:$HOME/.local/bin:$HOME/bin:/usr/local/spark/bin
export PATH
export JAVA_HOME="/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.161-0.b14.el7_4.x86_64/jre"
export JRE_HOME="/usr/lib/jvm/jre"
export SPARK_HOME="/usr/local/spark"
export SPARK_EXAMPLES_JAR="/usr/local/spark/examples/jars/spark-examples_2.11-2.0.0.jar"
export JRE_HOME="/usr/lib/jvm/jre"
As: non-root user
Add git ssh:
mkdir /home/<centos_username>/.ssh/github
ssh-keygen -t rsa -b 4096 -C "your_email@example.com"
Enter file in which to save the key (/home/<centos_username>/.ssh/id_rsa): /home/<centos_username>/.ssh/github/id_rsa
Log in to https://www.github.com and add the SSH key from:
cat ~/.ssh/github/id_rsa.pub
Start ssh agent and add key:
NOTE: You may want to add this to ~/.bash_profile to avoid having to redo this step at every login
eval "$(ssh-agent -s)"
ssh-add ~/.ssh/github/id_rsa
Check for existing installation:
which gcc
Install gcc:
sudo yum group install "Development Tools"
whereis gcc
Install cuda: instructions
sudo yum install pciutils
Check compatibility:
lspci | grep -i nvidia
uname -m && cat /etc/*release
uname -r
Download development packages:
sudo yum install kernel-devel-$(uname -r) kernel-headers-$(uname -r)
Enable epel release
sudo yum install --enablerepo=extras epel-release
Install DKMS and libvdpau
sudo yum install kernel-debug-devel dkms
sudo yum install libvdpau
Download CUDA toolkit 9.0 and patches (IMPORTANT: CUDA toolkit 9.1 is not currently supported by TensorFlow):
wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-rhel7-9-0-local-9.0.176-1.x86_64-rpm
wget https://developer.nvidia.com/compute/cuda/9.0/Prod/patches/1/cuda-repo-rhel7-9-0-local-cublas-performance-update-1.0-1.x86_64-rpm
wget https://developer.nvidia.com/compute/cuda/9.0/Prod/patches/2/cuda-repo-rhel7-9-0-local-cublas-performance-update-2-1.0-1.x86_64-rpm
Install cuda toolkit and patches
sudo rpm -i cuda-repo-rhel7-9-0-local-9.0.176-1.x86_64-rpm
sudo rpm -i cuda-repo-rhel7-9-0-local-cublas-performance-update-1.0-1.x86_64-rpm
sudo rpm -i cuda-repo-rhel7-9-0-local-cublas-performance-update-2-1.0-1.x86_64-rpm
sudo yum clean all
sudo yum install cuda-9-0
sudo yum clean expire-cache
Environment Setup:
export PATH=/usr/local/cuda-9.0/bin${PATH:+:${PATH}}
export LD_LIBRARY_PATH=/usr/local/cuda-9.0/lib64\
${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
or to add to ~/.bash_profile:
echo 'export PATH=/usr/local/cuda-9.0/bin${PATH:+:${PATH}}' >> ~/.bash_profile
echo 'export LD_LIBRARY_PATH=/usr/local/cuda-9.0/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}' >> ~/.bash_profile
source ~/.bash_profile
Update /usr/lib/systemd/system/nvidia-persistenced.service using a text editor such as vim:
# NVIDIA Persistence Daemon Init Script
#
# Copyright (c) 2013 NVIDIA Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
# This is a sample systemd service file, designed to show how the NVIDIA
# Persistence Daemon can be started.
#
# Defaults
#[Unit]
#Description=NVIDIA Persistence Daemon
#Wants=syslog.target
#[Service]
#Type=forking
#ExecStart=/usr/bin/nvidia-persistenced --user root
#ExecStopPost=/bin/rm -rf /var/run/nvidia-persistenced
#[Install]
#WantedBy=multi-user.target
[Unit]
Description=NVIDIA Persistence Daemon
Wants=syslog.target
[Service]
Type=forking
PIDFile=/var/run/nvidia-persistenced/nvidia-persistenced.pid
Restart=always
ExecStart=/usr/bin/nvidia-persistenced --verbose
ExecStopPost=/bin/rm -rf /var/run/nvidia-persistenced
[Install]
WantedBy=multi-user.target
Comment out the following line in /lib/udev/rules.d/40-redhat.rules as shown below:
# Memory hotadd request
# SUBSYSTEM=="memory", ACTION=="add", PROGRAM="/bin/uname -p", RESULT!="s390*", ATTR{state}=="offline", ATTR{state}="online"
Reboot machine (from root):
reboot
After reboot start nvidia persistence daemon from root user:
/usr/bin/nvidia-persistenced --verbose
Confirm CUDA Installation
Log back in as user and test installation:
cuda-install-samples-9.0.sh ./
cd ~/NVIDIA_CUDA-9.0_Samples
sudo make
cd ~/NVIDIA_CUDA-9.0_Samples/1_Utilities/deviceQueryDrv
./deviceQueryDrv
If CUDA was installed correctly, the output of ./deviceQueryDrv should look something like this (note the driver version):
CUDA Device Query (Driver API) statically linked version
Detected 1 CUDA Capable device(s)
Device 0: "Tesla P100-PCIE-16GB"
CUDA Driver Version: 9.0
CUDA Capability Major/Minor version number: 6.0
Total amount of global memory: 16276 MBytes (17066885120 bytes)
(56) Multiprocessors, ( 64) CUDA Cores/MP: 3584 CUDA Cores
GPU Max Clock rate: 1329 MHz (1.33 GHz)
Memory Clock rate: 715 Mhz
Memory Bus Width: 4096-bit
L2 Cache Size: 4194304 bytes
Max Texture Dimension Sizes 1D=(131072) 2D=(131072, 65536) 3D=(16384, 16384, 16384)
Maximum Layered 1D Texture Size, (num) layers 1D=(32768), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(32768, 32768), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Texture alignment: 512 bytes
Maximum memory pitch: 2147483647 bytes
Concurrent copy and kernel execution: Yes with 2 copy engine(s)
Run time limit on kernels: No
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Concurrent kernel execution: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Enabled
Device supports Unified Addressing (UVA): Yes
Supports Cooperative Kernel Launch: Yes
Supports MultiDevice Co-op Kernel Launch: Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 0 / 7
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
Result = PASS
Install open MPI: instructions
wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.1.tar.gz
tar -xzf openmpi-3.0.1.tar.gz && cd openmpi-3.0.1
./configure --with-cuda
make -j$(nproc)
sudo make install
NOTE: The NVIDIA archives used in the following steps must be downloaded from NVIDIA's website in a browser (they cannot be fetched with wget), so we transfer them with scp from the local machine to the VM.
NOTE ALSO: Make sure they are uploaded to /usr/local/ on the VM.
Install NCCL2: instructions
Download NCCL 2.1.15 O/S agnostic and CUDA 9.0 from: https://developer.nvidia.com/nccl/nccl-download
To transfer locally downloaded NCCL files to VM:
scp -i ~/.ssh/<VM_key> /<local_path_to_nccl_file>/nccl_2.1.15-1+cuda9.0_x86_64.txz <USERNAME>@<VM_IP_ADDRESS>:/usr/local/nccl_2.1.15-1+cuda9.0_x86_64.txz
Extract the uploaded archive in /usr/local:
cd /usr/local
sudo tar xvf nccl_2.1.15-1+cuda9.0_x86_64.txz
Add to env variables:
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/nccl_2.1.15-1+cuda9.0_x86_64/lib
Or, to add it directly to ~/.bash_profile (preferred):
echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/nccl_2.1.15-1+cuda9.0_x86_64/lib' >> ~/.bash_profile
source ~/.bash_profile
Install cuDNN: instructions
Download: https://developer.nvidia.com/compute/machine-learning/cudnn/secure/v7.1.2/prod/9.0_20180316/cudnn-9.0-linux-x64-v7.1 from https://developer.nvidia.com/rdp/cudnn-download
cd /usr/local
sudo tar -xzvf cudnn-9.0-linux-x64-v7.1.tgz
sudo cp cuda/include/cudnn.h /usr/local/cuda/include
sudo cp cuda/lib64/libcudnn* /usr/local/cuda/lib64
sudo chmod a+r /usr/local/cuda/include/cudnn.h
sudo chmod a+r /usr/local/cuda/lib64/libcudnn*
Check Installation:
cat /usr/local/cuda/include/cudnn.h | grep CUDNN_MAJOR -A 2
Install latest miniconda:
wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
bash Miniconda3-latest-Linux-x86_64.sh
Do you wish the installer to prepend the Miniconda3 install location
to PATH in your /home/<username>/.bashrc ? [yes|no]
>> yes
Reload .bashrc
source ~/.bashrc
Check installation
which conda
Remove installer
rm -f Miniconda3-latest-Linux-x86_64.sh
Create Conda Env:
conda create -n <ENV_NAME> python=3.6
conda update conda
conda activate <ENV_NAME>
conda install -c conda-forge pyspark
pip install ipython ipykernel jupyterlab
python -m ipykernel install --user --name <ENV_NAME> --display-name "Python (<ENV_NAME>)"
Install Tensorflow: NOTE: tensorflow 1.7 currently doesn't work with cuDNN, so we need the nightly build
First install dependencies
sudo yum -y install gcc gcc-c++ python-pip python-devel atlas atlas-devel gcc-gfortran openssl-devel libffi-devel hdf5-devel
Specific tf-nightly-gpu version that was installed in this setup: 1.8.0.dev20180329
Activate the conda environment (conda activate w251), then:
pip install tf-nightly-gpu
pip install keras
pip install h5py
Pip install Horovod using NCCL and Allreduce options:
HOROVOD_NCCL_HOME=/usr/local/nccl_2.1.15-1+cuda9.0_x86_64 HOROVOD_GPU_ALLREDUCE=NCCL pip install --no-cache-dir horovod
Clone the horovod repo:
git clone git@github.com:uber/horovod.git
To run the examples in horovod/examples (see the Horovod documentation for instructions):
cd horovod/examples
mpirun -np 1 \
-H localhost:1 \
-bind-to none -map-by slot \
-x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH \
-mca pml ob1 -mca btl ^openib \
python tensorflow_mnist.py
If you are getting mpi errors when running examples, you might try the following:
Install mpi4py:
NOTE: This may affect relative paths of the Open MPI install we did system-wide. Make sure to do this inside a Conda environment
conda install -c conda-forge mpi4py
Alternatively, you can try running mpirun using an absolute path:
/usr/local/bin/mpirun -np 1 \
-H localhost:1 \
-bind-to none -map-by slot \
-x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH \
-mca pml ob1 -mca btl ^openib \
python tensorflow_mnist.py
You can also change the options. Look at mpirun --help for more info:
mpirun --allow-run-as-root \
-np 4 -H 4 \
-bind-to none -map-by slot \
-x LD_LIBRARY_PATH -x NCCL_DEBUG=INFO \
-x HOROVOD_FUSION_THRESHOLD=4194304 \
-x NCCL_SOCKET_IFNAME=eth0 \
python tf_cnn_benchmarks.py --num_batches 100 --display_every 1 \
--num_gpus 1 --model resnet50 --batch_size 128 \
--data_name imagenet --data_dir /data0/imagenet_data \
--variable_update horovod --horovod_device gpu \
--batch_group_size=4 --num_inter_threads=0 --optimizer momentum
Helpful commands:
Get list of cuda repos installed:
NOTE: This helps repair problems relating to CUDA versioning (i.e. 9.0 vs 9.1).
yum --disablerepo="*" --enablerepo="cuda*" list available
also:
modinfo nvidia
To remove a previous nvidia installation entirely:
sudo yum remove nvidia*
rm -rf /etc/yum.repos.d/cuda-<prev_vers>*
rm -rf /etc/yum.repos.d/cuda-9-1*