-
-
Save bilalmughal/0500f27454a508bd3552fcf03e3adadb to your computer and use it in GitHub Desktop.
#!/bin/bash
# Launch a g5g (AArch64 + NVIDIA) EC2 instance and bootstrap it with
# CUDA, cuDNN, and a from-source PyTorch build via user-data.
#
# Required environment variables: REGION, SECURITY_GROUPS, KEY_PAIR, SUBNET.
set -euo pipefail # Exit on any error, unset variable, or pipeline failure

# Fail fast with usage guidance if any launch parameter is missing.
# ${VAR:-} defaults keep the test safe under `set -u`.
if [[ -z "${REGION:-}" || -z "${SECURITY_GROUPS:-}" || -z "${KEY_PAIR:-}" || -z "${SUBNET:-}" ]]; then
    echo "Error: You must provide REGION, SECURITY_GROUPS, KEY_PAIR, and SUBNET as environment variables." >&2
    echo "Example:" >&2
    echo "  export REGION=us-east-1" >&2
    echo "  export SECURITY_GROUPS=sg-12345678,sg-87654321" >&2
    echo "  export KEY_PAIR=my-key-pair" >&2
    echo "  export SUBNET=subnet-12345678" >&2
    exit 1
fi
# The entire bootstrap script below runs on the instance at first boot via
# cloud-init. The quoted delimiter ('EOF') prevents any expansion here; all
# variables are resolved on the instance.
USER_DATA=$(cat <<'EOF'
#!/bin/bash
set -e # Exit on any error

# Locations and settings used throughout the bootstrap.
CUDA_HOME=/usr/local/cuda
HOME_DIR=/home/ec2-user
SRC_DIR="$HOME_DIR/sources"
USER_EXEC="sudo -u ec2-user"
CPUS=$(nproc)
LOG_FILE="$HOME_DIR/install.log"

# Cloud-init runs unattended, so capture all output in a log file for later
# inspection; pass --stdout when re-running interactively to see it live.
if [[ "${1:-}" != "--stdout" ]]; then
    exec >>"$LOG_FILE" 2>&1
fi

# Create source directory and work from it
mkdir -p "$SRC_DIR"
pushd "$SRC_DIR"
# Install OS updates, the development toolchain, and build prerequisites,
# then wire the CUDA toolchain into ec2-user's environment and register
# /usr/local library paths with the dynamic linker.
install_utils() {
    echo "Installing utilities..."
    dnf -y update
    dnf -y groupinstall "Development Tools"
    dnf install -y openssl-devel cmake3 rust cargo amazon-efs-utils htop iotop yasm nasm jq python3-pip python-devel cronie cronie-anacron
    echo "Success : Updates and developer tools installed."
    # The \$PATH / \$LD_LIBRARY_PATH are escaped so they expand at login
    # time, not while this script runs.
    echo "PATH=$CUDA_HOME/bin:\$PATH" | sudo tee -a "$HOME_DIR/.bashrc"
    echo "LD_LIBRARY_PATH=$CUDA_HOME/lib64:\$LD_LIBRARY_PATH" | sudo tee -a "$HOME_DIR/.bashrc"
    # Make libraries installed under /usr/local visible to ldconfig.
    echo "/usr/local/lib" | sudo tee /etc/ld.so.conf.d/usr-local-lib.conf
    echo "/usr/local/lib64" | sudo tee -a /etc/ld.so.conf.d/usr-local-lib.conf
}
# Setup GPU, CUDA and CUDNN: install the Tesla 535 driver, the CUDA 12.2
# toolkit (aarch64/sbsa), and cuDNN 8.9.4, then refresh the linker cache.
setup_gpu() {
    echo "Setting up GPU..."
    # NVIDIA datacenter driver (silent install, nouveau disabled).
    wget https://us.download.nvidia.com/tesla/535.104.05/NVIDIA-Linux-aarch64-535.104.05.run
    sh NVIDIA-Linux-aarch64-535.104.05.run --disable-nouveau --silent
    # CUDA toolkit + samples; --no-opengl-libs avoids clobbering GL on a
    # headless server.
    wget https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux_sbsa.run
    sh cuda_12.2.2_535.104.05_linux_sbsa.run --silent --override --toolkit --samples --toolkitpath=/usr/local/cuda-12.2 --samplespath="$CUDA_HOME" --no-opengl-libs
    # cuDNN is distributed as a tarball; copy headers/libs into the CUDA
    # tree (-P preserves the .so symlink structure).
    wget https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-8.9.4.25_cuda12-archive.tar.xz
    tar -xf cudnn-linux-sbsa-8.9.4.25_cuda12-archive.tar.xz
    cp -P cudnn-linux-sbsa-8.9.4.25_cuda12-archive/include/* "$CUDA_HOME/include/"
    cp -P cudnn-linux-sbsa-8.9.4.25_cuda12-archive/lib/* "$CUDA_HOME/lib64/"
    chmod a+r "$CUDA_HOME"/lib64/*
    ldconfig
    # Drop the installers and extracted archives to free disk space.
    rm -fr cu* NVIDIA*
}
# Install PyTorch from source (no official aarch64+CUDA wheel exists).
# Builds ccache first so repeated compilations are cached.
install_pytorch() {
    echo "Installing PyTorch..."
    # Build AND install ccache; the original script built it but never
    # installed it, so the PyTorch build could not pick it up.
    wget https://github.com/ccache/ccache/releases/download/v4.8.3/ccache-4.8.3.tar.xz
    tar -xf ccache-4.8.3.tar.xz
    pushd ccache-4.8.3
    cmake .
    make -j "$CPUS"
    make install
    popd
    # NOTE(review): package name "numpy" may need to be "python3-numpy" on
    # Amazon Linux 2023 — confirm against the distro repos.
    dnf install -y numpy
    pip3 install typing-extensions
    # --recursive pulls in PyTorch's many third-party submodules.
    git clone --recursive https://github.com/pytorch/pytorch.git
    pushd pytorch
    python3 setup.py install
    popd
    ldconfig
    # Runtime Python deps, installed for ec2-user rather than root.
    $USER_EXEC pip3 install sympy filelock fsspec networkx
}
mount -o remount,size=15G /tmp/ | |
# Execute Functions | |
install_utils | |
setup_gpu | |
source $HOME_DIR/.bashrc | |
install_pytorch | |
# Cleanup | |
popd | |
rm -fr $SRC_DIR | |
# Test the installation | |
$USER_EXEC python3 -c "import torch; print('Congratulations, you are all set to go.' if torch.cuda.is_available() else 'Something went wrong. Please check if you missed any steps.')" | |
EOF | |
) | |
# NOTE(review): AMI IDs are region-specific — confirm this image exists in
# $REGION (it must be an aarch64 image for the g5g instance family).
AMI_ID="ami-0b9ce70cf1bc24fc3"

# Launch the builder instance; the bootstrap script runs via user-data.
# SECURITY_GROUPS is deliberately unquoted so a space-separated list splits
# into multiple arguments. NOTE(review): the usage example shows a
# comma-separated list, but `--security-group-ids` expects space-separated
# IDs — verify the expected delimiter.
aws ec2 run-instances \
    --image-id "$AMI_ID" \
    --instance-type g5g.4xlarge \
    --key-name "$KEY_PAIR" \
    --subnet-id "$SUBNET" \
    --security-group-ids $SECURITY_GROUPS \
    --region "$REGION" \
    --block-device-mappings '[{"DeviceName":"/dev/xvda","Ebs":{"VolumeSize":20,"VolumeType":"gp3"}}]' \
    --tag-specifications 'ResourceType=instance,Tags=[{Key=Name,Value=AMI-Builder}]' \
    --user-data "$USER_DATA"
The compilation works perfectly, thank you @bilalmughal.
I only have one issue: after installing torchvision, torch loses access to CUDA.
python3 -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())"
2.3.0a0+git1deb75b 12.2 False
I tried uninstalling and reinstalling it, but I still can't get access to CUDA.
@menahem121 same, tried with --no-deps
too
python3 -m venv .testing --system-site-packages
source .testing/bin/activate
pip3 install torchvision --no-deps
The lib that requires it fails with
RuntimeError: operator torchvision::nms does not exist
I tried building from source, and got almost close-ish
torch_DIR=/usr/local/lib64/python3.9/site-packages/torch/share/cmake/Torch/ cmake -DWITH_CUDA=on ..
-- Caffe2: CUDA detected: 12.2
-- Caffe2: CUDA nvcc is: /usr/local/cuda/bin/nvcc
-- Caffe2: CUDA toolkit directory: /usr/local/cuda
-- Caffe2: Header version is: 12.2
-- /usr/local/cuda/lib64/libnvrtc.so shorthash is 0924ef55
-- USE_CUDNN is set to 0. Compiling without cuDNN support
-- USE_CUSPARSELT is set to 0. Compiling without cuSPARSELt support
-- Autodetected CUDA architecture(s): 7.5
-- Added CUDA NVCC flags for: -gencode;arch=compute_75,code=sm_75
CMake Warning at /usr/local/lib64/python3.9/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:22 (message):
static library kineto_LIBRARY-NOTFOUND not found.
Call Stack (most recent call first):
/usr/local/lib64/python3.9/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:127 (append_torchlib_if_found)
CMakeLists.txt:24 (find_package)
CMake Error at /usr/share/cmake/Modules/FindPackageHandleStandardArgs.cmake:230 (message):
Could NOT find PNG (missing: PNG_LIBRARY PNG_PNG_INCLUDE_DIR)
Call Stack (most recent call first):
/usr/share/cmake/Modules/FindPackageHandleStandardArgs.cmake:594 (_FPHSA_FAILURE_MESSAGE)
/usr/share/cmake/Modules/FindPNG.cmake:159 (find_package_handle_standard_args)
CMakeLists.txt:28 (find_package)
@maurera Any chance we could use your magic and add a torchvision?
Just kidding — all I needed to do was make sure the libpng/libjpeg development packages were installed and run
python3 setup.py install
from the cloned torchvision repo
# Install torchvision from source against the locally built PyTorch
# (a pip-installed torchvision would pull in a CPU-only torch wheel and
# break CUDA access).
install_torchvision() {
    echo "Installing torchvision..."
    # torchvision's image extensions require the JPEG/PNG dev headers.
    dnf install -y libjpeg-turbo-devel libpng-devel
    wget https://github.com/pytorch/vision/archive/refs/tags/v0.17.0.tar.gz
    tar -xf v0.17.0.tar.gz
    pushd vision-0.17.0
    python3 setup.py install
    popd
    # Drop the sources, matching the cleanup done by the other install steps.
    rm -fr v0.17.0.tar.gz vision-0.17.0
    ldconfig
}
From interactive terminal post launch
[ec2-user /home/ec2-user]$ sudo su
[root /home/ec2-user]# dnf install -y libjpeg-turbo-devel libpng-devel
[root /home/ec2-user]# source /home/ec2-user/.bashrc
[root /home/ec2-user]# mkdir -p /home/ec2-user/sources && pushd /home/ec2-user/sources
[root /home/ec2-user/sources]# wget https://github.com/pytorch/vision/archive/refs/tags/v0.17.0.tar.gz
[root /home/ec2-user/sources]# tar -xf v0.17.0.tar.gz && pushd vision-0.17.0
[root /home/ec2-user/sources/vision-0.17.0]# python3 setup.py install
[root /home/ec2-user/sources/vision-0.17.0]# ldconfig
[root /home/ec2-user/sources/vision-0.17.0]# exit
[ec2-user /home/ec2-user]$ python3 -c "import torch; print('Congratulations, you are all set to go.' if torch.cuda.is_available() else 'Something went wrong. Please check if you missed any steps.')"
[ec2-user /home/ec2-user]$ python3 -c "import torchvision; print('Imported torchvision, you are probably all set to go.')"
@maurera It's a one-time process; once everything is set up and installed, I recommend creating an AMI and using that AMI for future deployments. This way instances will spawn quickly.