# morgangiraud/multi-gpu.sh

## multi-gpu.sh
# Script for Ubuntu: Nvidia Multi-GPU Installation and Testing (Adaptable for other distros)

# Step 0: Clean Nvidia Installation
# Use these commands when a previous Nvidia installation has to be removed completely,
# so that the new installation starts from a clean slate.
# First the driver packages, then every CUDA-related library package.
sudo apt-get remove --purge "*nvidia*"
sudo apt-get remove --purge \
  "*cuda*" "*cudnn*" "*cublas*" "*cufft*" "*cufile*" "*curand*" "*cusolver*" \
  "*cusparse*" "*gds-tools*" "*npp*" "*nvjpeg*" "nsight*" "*nvvm*" "*libnccl*"

# Check the result: any line printed below is a CUDA, cuDNN or Nvidia package
# that is still installed.
for pattern in cuda cudnn nvidia; do
  apt list --installed | grep "$pattern"
done

# Reboot so that the clean-up fully takes effect.
sudo reboot

# Step 1: Nvidia Driver Installation

# Optional reading: Nvidia publishes its driver lists at https://www.nvidia.com/en-us/drivers/unix/
# The graphics-drivers PPA (https://launchpad.net/~graphics-drivers/+archive/ubuntu/ppa) also carries Beta drivers.
# Recommendation: stay on a stable release unless you specifically need something newer.
sudo add-apt-repository ppa:graphics-drivers/ppa
sudo apt-get update

# List the driver packages on offer, then install the selected release
# (adjust the version number to the newest one shown by the search).
driver_package=nvidia-driver-550
apt search --names-only nvidia-driver
sudo apt install "${driver_package}"

# Reboot so that the driver change is applied.
sudo reboot

# Step 2: Verify Driver Installation
# Once the machine is back up, 'nvidia-smi' shows whether the driver is correctly
# installed and recognizes your GPU.
nvidia-smi

# Step 3: CUDA Toolkit Installation, install latest stable cuda-toolkit
# !!! Check https://developer.nvidia.com/cuda-downloads for potential updates to this part of the script !!!
# !!! This is the "over network" deb installation
# The keyring package registers Nvidia's CUDA apt repository; the toolkit is then
# installed from that repository.
keyring_deb=cuda-keyring_1.1-1_all.deb
toolkit_package=cuda-toolkit-12-4
wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/${keyring_deb}"
sudo dpkg -i "${keyring_deb}"
sudo apt-get update
apt search --names-only cuda-toolkit
sudo apt install "${toolkit_package}"

# Step 4: cuDNN Installation (relies on the deb source configured in Step 3)
# List the available cuDNN packages, then install the build made for CUDA 12.4.
cudnn_package=cudnn9-cuda-12-4
apt search --names-only cudnn
sudo apt install "${cudnn_package}"

# Reboot once more so that everything installed so far is properly initialized.
sudo reboot

# Run 'nvidia-smi' again to re-check the CUDA and driver installation.
nvidia-smi

# Stop here and resolve any issue that shows up before moving on.

# Step 5: NCCL and nvidia GPUDirect installation for Multi-GPU Communication
# List the NCCL packages offered by the configured repositories.
apt search --names-only nccl
# libnccl2 is the NCCL runtime library; libnccl-dev ships the headers (nccl.h)
# needed to compile the NCCL tests in Step 9; nvidia-gds provides GPUDirect Storage.
sudo apt install libnccl2 libnccl-dev nvidia-gds

# Step 6: Installation Validation
# Install git and cmake, the tools needed to clone and build software from source.
sudo apt install git cmake

# Clone the Nvidia CUDA samples at the release tag matching the toolkit installed
# in Step 3 (12.4). The default branch follows the newest CUDA release, and newer
# releases of the samples build with CMake only, so neither the 'make' call nor
# the ./bin/x86_64/linux/release paths used below would work there.
git clone --branch v12.4.1 https://github.com/nvidia/cuda-samples
cd cuda-samples

# Build the samples, one make job per available core to speed up the process.
make -j "$(nproc)"

# Run 'deviceQuery' to test if CUDA properly recognizes your GPU.
./bin/x86_64/linux/release/deviceQuery

# Optional: Explore and run other samples in the 'bin' directory.

# Step 7: P2P (Peer-to-Peer) Testing
# You can run these steps even if your GPUs do not support P2P; the results still
# give you an idea of how fast the GPUs can transfer data between each other.

# Run the P2P bandwidth and latency test (built as part of the samples above):
./bin/x86_64/linux/release/p2pBandwidthLatencyTest

# Step 8: Advanced P2P Testing

# Fetch nvbandwidth, Nvidia's more comprehensive bandwidth test, and build it
# in-tree with one make job per available core.
git clone https://github.com/nvidia/nvbandwidth
cd nvbandwidth
cmake .
make -j "$(nproc)"

# If the build above reported errors, this installs the missing dependencies.
./debian_install.sh

# Run the nvbandwidth tests for a thorough check of P2P communication.
./nvbandwidth

# Step 9: NCCL Testing

# Fetch the NCCL test suite and build it to verify that NCCL is installed and working.
git clone https://github.com/nvidia/nccl-tests
cd nccl-tests
make -j "$(nproc)"

# Run the all-reduce benchmark; set gpu_count (passed as -g) to the number of GPUs to use.
gpu_count=2
./build/all_reduce_perf -b 8 -e 128M -f 2 -g "${gpu_count}"

# Congratulations! If everything went smoothly, your machine is ML-ready!
	# Script for Ubuntu: Nvidia Multi-GPU Installation and Testing (Adaptable for other distros)

	# Step 0: Clean Nvidia Installation
	# If you need to completely remove a previous Nvidia installation, use these commands.
	# This ensures that you start with a clean slate for a new installation.
	# The patterns must keep their '*' wildcards (quoted, so the shell passes them to
	# apt-get untouched); without them only packages named exactly "nvidia", "cuda", ...
	# would be selected.
	sudo apt-get --purge remove "*nvidia*"
	sudo apt-get --purge remove "*cuda*" "*cudnn*" "*cublas*" "*cufft*" "*cufile*" "*curand*" "*cusolver*" "*cusparse*" "*gds-tools*" "*npp*" "*nvjpeg*" "nsight*" "*nvvm*" "*libnccl*"

	# Verify that the removal is complete by checking if any Nvidia, CUDA, or cuDNN packages are still installed.
	# The '|' must be a real, unescaped pipe so that grep filters the package list.
	apt list --installed | grep cuda
	apt list --installed | grep cudnn
	apt list --installed | grep nvidia

	# Make sure to reboot to clean everything
	sudo reboot

	# Step 1: Nvidia Driver Installation

	# Optional reading: Nvidia publishes its driver lists at https://www.nvidia.com/en-us/drivers/unix/
	# The graphics-drivers PPA (https://launchpad.net/~graphics-drivers/+archive/ubuntu/ppa) also carries Beta drivers.
	# Recommendation: stay on a stable release unless you specifically need something newer.
	sudo add-apt-repository ppa:graphics-drivers/ppa
	sudo apt-get update

	# List the driver packages on offer, then install the selected release
	# (adjust the version number to the newest one shown by the search).
	driver_package=nvidia-driver-550
	apt search --names-only nvidia-driver
	sudo apt install "${driver_package}"

	# Reboot so that the driver change is applied.
	sudo reboot

	# Step 2: Verify Driver Installation
	# Once the machine is back up, 'nvidia-smi' shows whether the driver is correctly
	# installed and recognizes your GPU.
	nvidia-smi

	# Step 3: CUDA Toolkit Installation, install latest stable cuda-toolkit
	# !!! Check https://developer.nvidia.com/cuda-downloads for potential updates to this part of the script !!!
	# !!! This is the "over network" deb installation
	# The keyring package registers Nvidia's CUDA apt repository; the toolkit is then
	# installed from that repository.
	keyring_deb=cuda-keyring_1.1-1_all.deb
	toolkit_package=cuda-toolkit-12-4
	wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/${keyring_deb}"
	sudo dpkg -i "${keyring_deb}"
	sudo apt-get update
	apt search --names-only cuda-toolkit
	sudo apt install "${toolkit_package}"

	# Step 4: cuDNN Installation (relies on the deb source configured in Step 3)
	# List the available cuDNN packages, then install the build made for CUDA 12.4.
	cudnn_package=cudnn9-cuda-12-4
	apt search --names-only cudnn
	sudo apt install "${cudnn_package}"

	# Reboot once more so that everything installed so far is properly initialized.
	sudo reboot

	# Run 'nvidia-smi' again to re-check the CUDA and driver installation.
	nvidia-smi

	# Stop here and resolve any issue that shows up before moving on.

	# Step 5: NCCL and nvidia GPUDirect installation for Multi-GPU Communication
	# List the NCCL packages offered by the configured repositories.
	apt search --names-only nccl
	# libnccl2 is the NCCL runtime library; libnccl-dev ships the headers (nccl.h)
	# needed to compile the NCCL tests in Step 9; nvidia-gds provides GPUDirect Storage.
	sudo apt install libnccl2 libnccl-dev nvidia-gds

	# Step 6: Installation Validation
	# Install git and cmake, the tools needed to clone and build software from source.
	sudo apt install git cmake

	# Clone the Nvidia CUDA samples at the release tag matching the toolkit installed
	# in Step 3 (12.4). The default branch follows the newest CUDA release, and newer
	# releases of the samples build with CMake only, so neither the 'make' call nor
	# the ./bin/x86_64/linux/release paths used below would work there.
	git clone --branch v12.4.1 https://github.com/nvidia/cuda-samples
	cd cuda-samples

	# Build the samples, one make job per available core to speed up the process.
	make -j "$(nproc)"

	# Run 'deviceQuery' to test if CUDA properly recognizes your GPU.
	./bin/x86_64/linux/release/deviceQuery

	# Optional: Explore and run other samples in the 'bin' directory.

	# Step 7: P2P (Peer-to-Peer) Testing
	# You can run these steps even if your GPUs do not support P2P; the results still
	# give you an idea of how fast the GPUs can transfer data between each other.

	# Run the P2P bandwidth and latency test (built as part of the samples above):
	./bin/x86_64/linux/release/p2pBandwidthLatencyTest

	# Step 8: Advanced P2P Testing

	# Fetch nvbandwidth, Nvidia's more comprehensive bandwidth test, and build it
	# in-tree with one make job per available core.
	git clone https://github.com/nvidia/nvbandwidth
	cd nvbandwidth
	cmake .
	make -j "$(nproc)"

	# If the build above reported errors, this installs the missing dependencies.
	./debian_install.sh

	# Run the nvbandwidth tests for a thorough check of P2P communication.
	./nvbandwidth

	# Step 9: NCCL Testing

	# Fetch the NCCL test suite and build it to verify that NCCL is installed and working.
	git clone https://github.com/nvidia/nccl-tests
	cd nccl-tests
	make -j "$(nproc)"

	# Run the all-reduce benchmark; set gpu_count (passed as -g) to the number of GPUs to use.
	gpu_count=2
	./build/all_reduce_perf -b 8 -e 128M -f 2 -g "${gpu_count}"

	# Congratulations! If everything went smoothly, your machine is ML-ready!