-
-
Save bilalmughal/0500f27454a508bd3552fcf03e3adadb to your computer and use it in GitHub Desktop.
#!/bin/bash
# Launch a g5g (Graviton + NVIDIA T4G) EC2 instance and bootstrap it for
# deep learning via the USER_DATA script defined below.
# Required environment variables: REGION, SECURITY_GROUPS, KEY_PAIR, SUBNET.
set -euo pipefail # Exit on any error, unset variable, or pipeline failure

# Validate required environment variables up front; ${VAR:-} keeps the
# checks themselves safe under `set -u`.
if [ -z "${REGION:-}" ] || [ -z "${SECURITY_GROUPS:-}" ] || [ -z "${KEY_PAIR:-}" ] || [ -z "${SUBNET:-}" ]; then
  {
    echo "Error: You must provide REGION, SECURITY_GROUPS, KEY_PAIR, and SUBNET as environment variables."
    echo "Example:"
    echo "  export REGION=us-east-1"
    echo "  export SECURITY_GROUPS=sg-12345678,sg-87654321"
    echo "  export KEY_PAIR=my-key-pair"
    echo "  export SUBNET=subnet-12345678"
  } >&2 # diagnostics belong on stderr
  exit 1
fi
USER_DATA=$(cat <<'EOF' | |
#!/bin/bash | |
set -e # Exit on any error | |
CUDA_HOME=/usr/local/cuda | |
HOME_DIR=/home/ec2-user | |
SRC_DIR=$HOME_DIR/sources | |
USER_EXEC="sudo -u ec2-user" | |
CPUS=$(nproc) | |
LOG_FILE="$HOME_DIR/install.log" | |
if [[ "$1" != "--stdout" ]]; then | |
exec >>"$LOG_FILE" 2>&1 | |
fi | |
# Create source directory | |
mkdir -p $SRC_DIR | |
pushd $SRC_DIR | |
# Install system utilities and updates | |
install_utils() { | |
echo "Installing utilities..." | |
dnf -y update | |
dnf -y groupinstall "Development Tools" | |
dnf install -y openssl-devel cmake3 rust cargo amazon-efs-utils htop iotop yasm nasm jq python3-pip python-devel cronie cronie-anacron | |
echo "Success : Updates and developer tools installed." | |
echo "PATH=$CUDA_HOME/bin:\$PATH" | sudo tee -a $HOME_DIR/.bashrc | |
echo "LD_LIBRARY_PATH=$CUDA_HOME/lib64:\$LD_LIBRARY_PATH" | sudo tee -a $HOME_DIR/.bashrc | |
echo "/usr/local/lib" | sudo tee /etc/ld.so.conf.d/usr-local-lib.conf | |
echo "/usr/local/lib64" | sudo tee -a /etc/ld.so.conf.d/usr-local-lib.conf | |
} | |
# Setup GPU, CUDA and CUDNN | |
setup_gpu() { | |
echo "Setting up GPU..." | |
wget https://us.download.nvidia.com/tesla/535.104.05/NVIDIA-Linux-aarch64-535.104.05.run | |
sh NVIDIA-Linux-aarch64-535.104.05.run --disable-nouveau --silent | |
wget https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux_sbsa.run | |
sh cuda_12.2.2_535.104.05_linux_sbsa.run --silent --override --toolkit --samples --toolkitpath=/usr/local/cuda-12.2 --samplespath=$CUDA_HOME --no-opengl-libs | |
wget https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-8.9.4.25_cuda12-archive.tar.xz | |
tar -xf cudnn-linux-sbsa-8.9.4.25_cuda12-archive.tar.xz | |
cp -P cudnn-linux-sbsa-8.9.4.25_cuda12-archive/include/* $CUDA_HOME/include/ | |
cp -P cudnn-linux-sbsa-8.9.4.25_cuda12-archive/lib/* $CUDA_HOME/lib64/ | |
chmod a+r $CUDA_HOME/lib64/* | |
ldconfig | |
rm -fr cu* NVIDIA* | |
} | |
# Install PyTorch from source | |
install_pytorch() { | |
echo "Installing PyTorch..." | |
wget https://github.com/ccache/ccache/releases/download/v4.8.3/ccache-4.8.3.tar.xz | |
tar -xf ccache-4.8.3.tar.xz | |
pushd ccache-4.8.3 | |
cmake . | |
make -j $CPUS | |
popd | |
dnf install -y numpy | |
pip3 install typing-extensions | |
git clone --recursive https://github.com/pytorch/pytorch.git | |
pushd pytorch | |
python3 setup.py install | |
popd | |
ldconfig | |
$USER_EXEC pip3 install sympy filelock fsspec networkx | |
} | |
mount -o remount,size=15G /tmp/ | |
# Execute Functions | |
install_utils | |
setup_gpu | |
source $HOME_DIR/.bashrc | |
install_pytorch | |
# Cleanup | |
popd | |
rm -fr $SRC_DIR | |
# Test the installation | |
$USER_EXEC python3 -c "import torch; print('Congratulations, you are all set to go.' if torch.cuda.is_available() else 'Something went wrong. Please check if you missed any steps.')" | |
EOF | |
) | |
# Amazon Linux 2023 aarch64 AMI. AMI IDs are region-specific and age out of
# the catalog (users reported "ami not found"), so allow an override via the
# AMI_ID environment variable while keeping the original default.
AMI_ID="${AMI_ID:-ami-0b9ce70cf1bc24fc3}"
aws ec2 run-instances \
    --image-id "$AMI_ID" \
    --instance-type g5g.4xlarge \
    --key-name "$KEY_PAIR" \
    --subnet-id "$SUBNET" \
    --security-group-ids $SECURITY_GROUPS \
    --region "$REGION" \
    --block-device-mappings '[{"DeviceName":"/dev/xvda","Ebs":{"VolumeSize":20,"VolumeType":"gp3"}}]' \
    --tag-specifications 'ResourceType=instance,Tags=[{Key=Name,Value=AMI-Builder}]' \
    --user-data "$USER_DATA"
# NOTE: SECURITY_GROUPS is intentionally unquoted so multiple space-separated
# group IDs split into separate CLI arguments. The usage example shows a
# comma-separated list — verify which form the CLI actually expects here.
Thank you for this script!
We need to add
sudo dnf install -y kernel-modules-extra.aarch64
to get past the error below. By the way, it took 2:25 to finish the install on g5g.xlarge :)
Verifying archive integrity... OK
Uncompressing NVIDIA Accelerated Graphics Driver for Linux-aarch64 535.104.05...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
ERROR: Unable to load the kernel module 'nvidia.ko'. This happens most frequently when this kernel module was built against the wrong or improperly configured kernel sources, with a version of gcc that differs from the one used to build the target kernel, or if another driver, such as nouveau, is present and prevents the NVIDIA kernel module from obtaining ownership of the NVIDIA device(s), or no NVIDIA device installed in this system is supported by this NVIDIA Linux graphics driver release.
Please see the log entries 'Kernel module load error' and 'Kernel messages' at the end of the file '/var/log/nvidia-installer.log' for more information.
ERROR: Installation has failed. Please see the file '/var/log/nvidia-installer.log' for details. You may find suggestions on fixing installation problems in the README available on the Linux driver download page at www.nvidia.com.
At the nvidia-installer log :
[ 164.692055] VFIO - User Level meta-driver version: 0.3
[ 164.982629] nvidia: loading out-of-tree module taints kernel.
[ 164.982643] nvidia: module license 'NVIDIA' taints kernel.
[ 164.982644] Disabling lock debugging due to kernel taint
[ 164.989702] nvidia: module verification failed: signature and/or required key missing - tainting kernel
[ 164.990223] nvidia: Unknown symbol drm_gem_object_free (err -2)
ERROR: Installation has failed. Please see the file '/var/log/nvidia-installer.log' for details. You may find suggestions on fixing installation problems in the README available on the Linux driver download page at www.nvidia.com.
Can you confirm you are using AMI_ID ami-0b9ce70cf1bc24fc3
and driver version NVIDIA-Linux-aarch64-535.104.05.run
?
It was the latest Amazon Linux 2023 AMI , ami-04c97e62cb19d53f1. driver version correct. Can we get cuda enabled torch with a package manager?
Use ami-0b9ce70cf1bc24fc3
as mentioned in the script. I will update the script later for the latest Amazon Linux 2023 version. Meanwhile, you can use the current one.
Regarding the Package manager, there were so many things breaking over and over again for me. That's why I opted for this route.
Thanks for this script!
Unfortunately it doesn't work for me:
- I get "ami not found" when I try to use
ami-0b9ce70cf1bc24fc3
- When I switch to the latest Amazon Linux AMI for ARM (
ami-0084b033b53412473
), I get the following error when installing the nvidia driver:
ERROR: Unable to load the kernel module 'nvidia.ko'. This happens most frequently when this kernel module was built against the wrong or improperly configured kernel sources, with a version of gcc that differs from the one used to build the target kernel, or if another driver, such as nouveau, is present and prevents the NVIDIA kernel module from obtaining ownership of the NVIDIA device(s), or no NVIDIA device installed in this system is supported by this NVIDIA Linux graphics driver release.
Thank you @maurera for updating me about this issue, i will check the script for latest AMI and update the script accordingly, if that works for you.
Great, thanks @bilalmughal.
I tried searching for the version myself. I browsed to https://www.nvidia.com/Download/index.aspx?lang=en-us and found the current version: https://us.download.nvidia.com/tesla/535.129.03/NVIDIA-Linux-aarch64-535.129.03.run
I then updated your script with this version number
sudo wget https://us.download.nvidia.com/tesla/535.129.03/NVIDIA-Linux-aarch64-535.129.03.run
sudo sh NVIDIA-Linux-aarch64-535.129.03.run --disable-nouveau --silent
But I still get the same error about kernel module 'nvidia.ko'
@maurera same issue here, but i get stuck on another part, to pass the NVIDIA.ko error: CC=/usr/bin/gcc ./NVIDIA-Linux-aarch64-535.129.03.run
Thank you @maurera and @menahem121, and i tested the above script. With the AMI mentioned in the script.
AMI_ID="ami-0b9ce70cf1bc24fc3"
The whole script works out of the box. But for newer AMIs of Amazon Linux 2023, the repository structure has changed, and to make it work with a newer AMI you need to install the additional package kernel-modules-extra
before installing the driver. Tested it with the following AMI
ami-0730971bf8e0532d6
6.1.66-93.164.amzn2023.aarch64
sudo dnf install kernel-modules-extra
@bilalmughal - I get "ami not found" from AMI_ID="ami-0b9ce70cf1bc24fc3". Do you have a link to this ami? I get the error from command line and also don't see any information in it from a google search or from the launch instances page in the AWS Management Console.
sudo dnf install kernel-modules-extra
seems to have fixed the cuda issue with the current AMI (ami-0084b033b53412473
). I'm currently waiting for torch to build (it seems like it's taking over 2 hours at the stage of building torch). I'm going to try saving the wheel file with python3 setup.py bdist_wheel
instead of python3 setup.py install
and store the built wheel file so that I don't have to wait the 2 hours again the next time I want to spin up a new instance.
@maurera Its a one time process, once everything is setup and installed. I will recommend you to create an ami and use that AMI for future deployments. This way instances will spawn quickly
the compilation works perfectly thank you @bilalmughal
i only have an issue when installing torchvision; after installing it, torch is losing access to cuda
python3 -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())"
2.3.0a0+git1deb75b 12.2 False
i tried uninstall and reinstalled it but still cant have access to cuda
@menahem121 same, tried with --no-deps
too
python3 -m venv .testing --system-site-packages
source .testing/bin/activate
pip3 install torchvision --no-deps
The lib that requires it fails with
RuntimeError: operator torchvision::nms does not exist
I tried building from source, and got almost close-ish
torch_DIR=/usr/local/lib64/python3.9/site-packages/torch/share/cmake/Torch/ cmake -DWITH_CUDA=on ..
-- Caffe2: CUDA detected: 12.2
-- Caffe2: CUDA nvcc is: /usr/local/cuda/bin/nvcc
-- Caffe2: CUDA toolkit directory: /usr/local/cuda
-- Caffe2: Header version is: 12.2
-- /usr/local/cuda/lib64/libnvrtc.so shorthash is 0924ef55
-- USE_CUDNN is set to 0. Compiling without cuDNN support
-- USE_CUSPARSELT is set to 0. Compiling without cuSPARSELt support
-- Autodetected CUDA architecture(s): 7.5
-- Added CUDA NVCC flags for: -gencode;arch=compute_75,code=sm_75
CMake Warning at /usr/local/lib64/python3.9/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:22 (message):
static library kineto_LIBRARY-NOTFOUND not found.
Call Stack (most recent call first):
/usr/local/lib64/python3.9/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:127 (append_torchlib_if_found)
CMakeLists.txt:24 (find_package)
CMake Error at /usr/share/cmake/Modules/FindPackageHandleStandardArgs.cmake:230 (message):
Could NOT find PNG (missing: PNG_LIBRARY PNG_PNG_INCLUDE_DIR)
Call Stack (most recent call first):
/usr/share/cmake/Modules/FindPackageHandleStandardArgs.cmake:594 (_FPHSA_FAILURE_MESSAGE)
/usr/share/cmake/Modules/FindPNG.cmake:159 (find_package_handle_standard_args)
CMakeLists.txt:28 (find_package)
@maurera Any chance we could use your magic and add a torchvision?
Just kidding, all i needed to was make sure libpng/jpg development packages were installed and
python3 setup.py install
from the cloned torchvision repo
# Install torchvision from source
install_torchvision() {
echo "Installing torchvision..."
dnf install -y libjpeg-turbo-devel libpng-devel
wget https://github.com/pytorch/vision/archive/refs/tags/v0.17.0.tar.gz
tar -xf v0.17.0.tar.gz
pushd vision-0.17.0
python3 setup.py install
popd
ldconfig
}
From interactive terminal post launch
[ec2-user /home/ec2-user]$ sudo su
[root /home/ec2-user]# dnf install -y libjpeg-turbo-devel libpng-devel
[root /home/ec2-user]# source /home/ec2-user/.bashrc
[root /home/ec2-user]# mkdir -p /home/ec2-user/sources && pushd /home/ec2-user/sources
[root /home/ec2-user/sources]# wget https://github.com/pytorch/vision/archive/refs/tags/v0.17.0.tar.gz
[root /home/ec2-user/sources]# tar -xf v0.17.0.tar.gz && pushd vision-0.17.0
[root /home/ec2-user/sources/vision-0.17.0]# python3 setup.py install
[root /home/ec2-user/sources/vision-0.17.0]# ldconfig
[root /home/ec2-user/sources/vision-0.17.0]# exit
[ec2-user /home/ec2-user]$ python3 -c "import torch; print('Congratulations, you are all set to go.' if torch.cuda.is_available() else 'Something went wrong. Please check if you missed any steps.')"
[ec2-user /home/ec2-user]$ python3 -c "import torchvision; print('Imported torchvision, you are probably all set to go.')"
EC2 Graviton Deep Learning Bootstrap Script
This Bash script is a part of comprehensive step-by-step guide to automate Amazon EC2 Graviton-based (ARM architecture) instance setup for deep learning applications with the latest Nvidia Driver, CUDA 12.2 toolkit, and CUDA Deep Neural Network Library. It performs the following tasks:
The script ends by running a simple Python test to verify if PyTorch with GPU support is installed correctly.
To run the script, populate the required environment variables (REGION, SECURITY_GROUPS, KEY_PAIR, SUBNET) and execute the script.