Skip to content

Instantly share code, notes, and snippets.

@bilalmughal
Last active April 10, 2024 19:36
Show Gist options
  • Star 12 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save bilalmughal/0500f27454a508bd3552fcf03e3adadb to your computer and use it in GitHub Desktop.
Save bilalmughal/0500f27454a508bd3552fcf03e3adadb to your computer and use it in GitHub Desktop.
This script automates the setup of Amazon EC2 Graviton (ARM-based) instances for deep learning tasks. It installs essential utilities, sets up the latest Nvidia drivers, the CUDA 12.2 toolkit and the cuDNN library, and builds PyTorch from source. A step-by-step guide is available here: https://jumpshare.com/blog/deep-learning-on-a…
#!/bin/bash
set -e # Exit on any error

# Check if required arguments are provided: every launch parameter is read
# from the environment, and the first one found unset or empty aborts the
# script with a usage hint and exit status 1.
for required_var in REGION SECURITY_GROUPS KEY_PAIR SUBNET; do
  [[ -n "${!required_var}" ]] && continue
  printf '%s\n' \
    "Error: You must provide REGION, SECURITY_GROUPS, KEY_PAIR, and SUBNET as environment variables." \
    "Example:" \
    " export REGION=us-east-1" \
    " export SECURITY_GROUPS=sg-12345678,sg-87654321" \
    " export KEY_PAIR=my-key-pair" \
    " export SUBNET=subnet-12345678"
  exit 1
done
USER_DATA=$(cat <<'EOF'
#!/bin/bash
set -e # Exit on any error

# Fixed locations used throughout the rest of this script.
HOME_DIR=/home/ec2-user
SRC_DIR=$HOME_DIR/sources
LOG_FILE="$HOME_DIR/install.log"
CUDA_HOME=/usr/local/cuda

# Kept as a plain string on purpose: it is expanded unquoted as a command
# prefix so that it word-splits into "sudo", "-u", "ec2-user".
USER_EXEC="sudo -u ec2-user"
CPUS=$(nproc)

# Append stdout and stderr to the log file unless the first argument is
# --stdout.
[[ "$1" == "--stdout" ]] || exec >>"$LOG_FILE" 2>&1

# Create source directory and make it the working directory
mkdir -p "$SRC_DIR"
pushd "$SRC_DIR"
# Install system utilities and updates
#
# Updates all packages, installs the development toolchain and helper
# packages, appends the CUDA bin/lib64 directories to the search paths in
# ec2-user's .bashrc, and lists /usr/local/lib and /usr/local/lib64 in a
# dynamic-linker configuration file.
# Globals:
#   CUDA_HOME (read) - CUDA installation prefix
#   HOME_DIR  (read) - home directory whose .bashrc is extended
# Outputs:
#   Progress messages and the written lines (echoed by tee) on stdout.
install_utils() {
  echo "Installing utilities..."
  dnf -y update
  dnf -y groupinstall "Development Tools"
  dnf install -y openssl-devel cmake3 rust cargo amazon-efs-utils htop iotop yasm nasm jq python3-pip python-devel cronie cronie-anacron
  echo "Success : Updates and developer tools installed."

  # `export` matters here: a bare VAR=value assignment only reaches child
  # processes when VAR is already part of the environment. That holds for
  # PATH but normally not for LD_LIBRARY_PATH, so without `export` the CUDA
  # library directory would stay private to the shell that sourced .bashrc.
  # \$ keeps the reference to the previous value literal in the file.
  echo "export PATH=$CUDA_HOME/bin:\$PATH" | sudo tee -a "$HOME_DIR/.bashrc"
  echo "export LD_LIBRARY_PATH=$CUDA_HOME/lib64:\$LD_LIBRARY_PATH" | sudo tee -a "$HOME_DIR/.bashrc"

  # First line truncates/creates the file, second one appends to it.
  echo "/usr/local/lib" | sudo tee /etc/ld.so.conf.d/usr-local-lib.conf
  echo "/usr/local/lib64" | sudo tee -a /etc/ld.so.conf.d/usr-local-lib.conf
}
# Setup GPU, CUDA and CUDNN
#
# Downloads into the current directory and runs, in order: the NVIDIA driver
# installer, the CUDA 12.2.2 runfile (toolkit and samples), and the cuDNN
# archive, whose headers and libraries are copied under CUDA_HOME. Finishes
# by refreshing the linker cache and deleting the downloaded artefacts.
# Globals:
#   CUDA_HOME (read) - destination for the cuDNN files and the samples path
setup_gpu() {
  echo "Setting up GPU..."

  local driver_url=https://us.download.nvidia.com/tesla/535.104.05/NVIDIA-Linux-aarch64-535.104.05.run
  local cuda_url=https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux_sbsa.run
  local cudnn_url=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-8.9.4.25_cuda12-archive.tar.xz

  # wget stores each download under the last path component of its URL.
  local driver_run=${driver_url##*/}
  local cuda_run=${cuda_url##*/}
  local cudnn_tar=${cudnn_url##*/}
  local cudnn_dir=${cudnn_tar%.tar.xz}

  # NVIDIA driver
  wget "$driver_url"
  sh "$driver_run" --disable-nouveau --silent

  # CUDA toolkit
  wget "$cuda_url"
  sh "$cuda_run" --silent --override --toolkit --samples --toolkitpath=/usr/local/cuda-12.2 --samplespath="$CUDA_HOME" --no-opengl-libs

  # cuDNN: unpack, then copy headers and libraries (-P keeps symlinks as
  # symlinks) into the CUDA tree and make the libraries world-readable.
  wget "$cudnn_url"
  tar -xf "$cudnn_tar"
  cp -P "$cudnn_dir"/include/* "$CUDA_HOME/include/"
  cp -P "$cudnn_dir"/lib/* "$CUDA_HOME/lib64/"
  chmod a+r "$CUDA_HOME"/lib64/*
  ldconfig

  # Removes everything in the current directory whose name starts with
  # "cu" or "NVIDIA" (installers, archive and the unpacked cuDNN tree).
  rm -fr cu* NVIDIA*
}
# Install PyTorch from source
#
# Builds and installs ccache, installs the Python build prerequisites,
# clones PyTorch with all submodules and installs it via setup.py, then
# installs a few runtime Python packages for ec2-user.
# Globals:
#   CPUS      (read) - number of parallel make jobs
#   USER_EXEC (read) - command prefix used to run pip3 as ec2-user;
#                      expanded unquoted on purpose so it word-splits
# Outputs:
#   Progress message and build output on stdout/stderr.
install_pytorch() {
  echo "Installing PyTorch..."

  # ccache: compiling it is not enough - without `make install` the binary
  # stays inside the build tree where the PyTorch build cannot find it.
  wget https://github.com/ccache/ccache/releases/download/v4.8.3/ccache-4.8.3.tar.xz
  tar -xf ccache-4.8.3.tar.xz
  pushd ccache-4.8.3
  cmake .
  make -j "$CPUS"
  make install
  popd

  # Python-side build prerequisites
  dnf install -y numpy
  pip3 install typing-extensions

  # --recursive also fetches the submodules of the PyTorch repository.
  git clone --recursive https://github.com/pytorch/pytorch.git
  pushd pytorch
  python3 setup.py install
  popd
  ldconfig

  $USER_EXEC pip3 install sympy filelock fsspec networkx
}
# Remount /tmp with a 15G size limit before any installation work starts.
mount -o remount,size=15G /tmp/

# Execute Functions
install_utils
setup_gpu
# Reload .bashrc so this shell sees the search-path lines install_utils
# appended to it before PyTorch is built.
source "$HOME_DIR/.bashrc"
install_pytorch

# Cleanup: leave the source directory, then delete it with all downloads.
popd
rm -fr "$SRC_DIR"

# Test the installation: run as ec2-user and report whether torch can see
# a CUDA device.
verify_snippet="import torch; print('Congratulations, you are all set to go.' if torch.cuda.is_available() else 'Something went wrong. Please check if you missed any steps.')"
$USER_EXEC python3 -c "$verify_snippet"
EOF
)
# Image to launch. AMI IDs are region-specific, so AMI_ID may be supplied
# through the environment when REGION is not the region this default image
# belongs to; when it is unset or empty the original default is used.
AMI_ID="${AMI_ID:-ami-0b9ce70cf1bc24fc3}"

# --security-group-ids expects one argument per group ID. SECURITY_GROUPS is
# documented as a comma-separated list, so split it on commas and whitespace
# into an array instead of handing the CLI a single "sg-a,sg-b" token.
IFS=$', \t' read -r -a security_group_ids <<<"$SECURITY_GROUPS"

# Launch the builder instance; USER_DATA is executed on first boot and
# performs the driver/CUDA/PyTorch installation.
aws ec2 run-instances \
  --image-id "$AMI_ID" \
  --instance-type g5g.4xlarge \
  --key-name "$KEY_PAIR" \
  --subnet-id "$SUBNET" \
  --security-group-ids "${security_group_ids[@]}" \
  --region "$REGION" \
  --block-device-mappings '[{"DeviceName":"/dev/xvda","Ebs":{"VolumeSize":20,"VolumeType":"gp3"}}]' \
  --tag-specifications 'ResourceType=instance,Tags=[{Key=Name,Value=AMI-Builder}]' \
  --user-data "$USER_DATA"
@rromanchuk
Copy link

rromanchuk commented Feb 8, 2024

Just kidding, all I needed to do was make sure the libpng/libjpeg development packages were installed and

run python3 setup.py install from the cloned torchvision repo

# Install torchvision from source
#
# Installs the JPEG/PNG development packages, downloads the torchvision
# v0.17.0 source archive into the current directory, installs it via
# setup.py and refreshes the linker cache.
install_torchvision() {
    local vision_url=https://github.com/pytorch/vision/archive/refs/tags/v0.17.0.tar.gz
    # wget stores the download under the last path component of the URL.
    local tarball=${vision_url##*/}
    local src_dir=vision-0.17.0

    echo "Installing torchvision..."
    dnf install -y libjpeg-turbo-devel libpng-devel
    wget "$vision_url"
    tar -xf "$tarball"

    pushd "$src_dir"
    python3 setup.py install
    popd
    ldconfig
}

From interactive terminal post launch

[ec2-user /home/ec2-user]$ sudo su
[root /home/ec2-user]# dnf install -y libjpeg-turbo-devel libpng-devel
[root /home/ec2-user]# source /home/ec2-user/.bashrc
[root /home/ec2-user]# mkdir -p /home/ec2-user/sources && pushd /home/ec2-user/sources
[root /home/ec2-user/sources]# wget https://github.com/pytorch/vision/archive/refs/tags/v0.17.0.tar.gz
[root /home/ec2-user/sources]# tar -xf v0.17.0.tar.gz && pushd vision-0.17.0
[root /home/ec2-user/sources/vision-0.17.0]# python3 setup.py install
[root /home/ec2-user/sources/vision-0.17.0]# ldconfig
[root /home/ec2-user/sources/vision-0.17.0]# exit
[ec2-user /home/ec2-user]$  python3 -c "import torch; print('Congratulations, you are all set to go.' if torch.cuda.is_available() else 'Something went wrong. Please check if you missed any steps.')"
[ec2-user /home/ec2-user]$  python3 -c "import torchvision; print('Imported torchvision, you are probably all set to go.')"

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment