Last active
April 10, 2024 19:36
-
-
Save bilalmughal/0500f27454a508bd3552fcf03e3adadb to your computer and use it in GitHub Desktop.
This script automates the setup of an Amazon EC2 Graviton ARM-based instance for deep learning tasks. It takes care of installing essential utilities, setting up the latest Nvidia drivers, the CUDA 12.2 toolkit and the cuDNN library, and building PyTorch from source. The step-by-step guide can be found here: https://jumpshare.com/blog/deep-learning-on-a…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Launches an EC2 instance via `aws ec2 run-instances` and hands it a
# user-data script that installs the NVIDIA driver, CUDA, cuDNN and builds
# PyTorch from source.
#
# Required environment variables: REGION, SECURITY_GROUPS, KEY_PAIR, SUBNET.
set -e # Exit on any error

# Check if required arguments are provided.
# ${VAR:-} makes an unset variable read as empty, so this check also works
# when the caller has `set -u` enabled.
if [[ -z "${REGION:-}" || -z "${SECURITY_GROUPS:-}" || -z "${KEY_PAIR:-}" || -z "${SUBNET:-}" ]]; then
  {
    echo "Error: You must provide REGION, SECURITY_GROUPS, KEY_PAIR, and SUBNET as environment variables."
    echo "Example:"
    echo " export REGION=us-east-1"
    echo " export SECURITY_GROUPS=sg-12345678,sg-87654321"
    echo " export KEY_PAIR=my-key-pair"
    echo " export SUBNET=subnet-12345678"
  } >&2 # usage errors are diagnostics, so they go to stderr
  exit 1
fi
# Script passed to the instance as EC2 user data. The heredoc delimiter is
# quoted, so nothing inside is expanded on the launching machine; every
# $variable below is evaluated when the script runs on the instance.
USER_DATA=$(cat <<'EOF'
#!/bin/bash
set -e # Exit on any error

CUDA_HOME=/usr/local/cuda
HOME_DIR=/home/ec2-user
SRC_DIR=$HOME_DIR/sources
# Command prefix for running a command as ec2-user; it is expanded unquoted
# at its call sites so that it splits into separate words.
USER_EXEC="sudo -u ec2-user"
CPUS=$(nproc)
LOG_FILE="$HOME_DIR/install.log"

# Append all further output to the log file unless run with --stdout.
if [[ "${1:-}" != "--stdout" ]]; then
  exec >>"$LOG_FILE" 2>&1
fi

# Create source directory and work from inside it
mkdir -p "$SRC_DIR"
pushd "$SRC_DIR"
# Install system utilities and updates
#   Globals: CUDA_HOME (read), HOME_DIR (read)
#   Effects: updates/installs packages with dnf, appends the CUDA PATH and
#            LD_LIBRARY_PATH settings to $HOME_DIR/.bashrc, and lists
#            /usr/local/lib{,64} in /etc/ld.so.conf.d/usr-local-lib.conf.
install_utils() {
  echo "Installing utilities..."
  dnf -y update
  dnf -y groupinstall "Development Tools"
  dnf install -y openssl-devel cmake3 rust cargo amazon-efs-utils htop iotop \
    yasm nasm jq python3-pip python-devel cronie cronie-anacron
  echo "Success : Updates and developer tools installed."

  # $CUDA_HOME is expanded now; the escaped \$ references are written
  # literally so they are resolved each time .bashrc is sourced.
  # `export` is required for child processes (compilers, python) to see them.
  echo "export PATH=$CUDA_HOME/bin:\$PATH" | sudo tee -a "$HOME_DIR/.bashrc"
  # ${VAR:+:$VAR} appends the old value only when it is non-empty; a bare
  # trailing ':' would add an empty entry, which the dynamic loader treats
  # as the current working directory.
  echo "export LD_LIBRARY_PATH=$CUDA_HOME/lib64\${LD_LIBRARY_PATH:+:\$LD_LIBRARY_PATH}" | sudo tee -a "$HOME_DIR/.bashrc"

  # First write truncates the file, second appends.
  echo "/usr/local/lib" | sudo tee /etc/ld.so.conf.d/usr-local-lib.conf
  echo "/usr/local/lib64" | sudo tee -a /etc/ld.so.conf.d/usr-local-lib.conf
}
# Setup GPU, CUDA and CUDNN
#   Globals: CUDA_HOME (read)
#   Downloads the installers into the current working directory, runs them,
#   copies the cuDNN headers/libraries into $CUDA_HOME and then deletes the
#   downloaded artifacts again.
setup_gpu() {
  echo "Setting up GPU..."

  # Versions are named once so the URL, the downloaded file name and the
  # cleanup list cannot drift apart.
  local driver_version=535.104.05
  local cuda_version=12.2.2
  local cudnn_version=8.9.4.25
  local driver_run="NVIDIA-Linux-aarch64-${driver_version}.run"
  local cuda_run="cuda_${cuda_version}_${driver_version}_linux_sbsa.run"
  local cudnn_dir="cudnn-linux-sbsa-${cudnn_version}_cuda12-archive"
  local cudnn_tar="${cudnn_dir}.tar.xz"

  # NVIDIA driver
  wget "https://us.download.nvidia.com/tesla/${driver_version}/${driver_run}"
  sh "$driver_run" --disable-nouveau --silent

  # CUDA toolkit; ${cuda_version%.*} drops the patch level (12.2.2 -> 12.2)
  wget "https://developer.download.nvidia.com/compute/cuda/${cuda_version}/local_installers/${cuda_run}"
  sh "$cuda_run" --silent --override --toolkit --samples \
    "--toolkitpath=/usr/local/cuda-${cuda_version%.*}" \
    "--samplespath=$CUDA_HOME" --no-opengl-libs

  # cuDNN: unpack and copy into the CUDA tree; -P keeps symlinks as symlinks
  wget "https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/${cudnn_tar}"
  tar -xf "$cudnn_tar"
  cp -P "$cudnn_dir"/include/* "$CUDA_HOME/include/"
  cp -P "$cudnn_dir"/lib/* "$CUDA_HOME/lib64/"
  chmod a+r "$CUDA_HOME"/lib64/*
  ldconfig

  # Remove exactly what was downloaded/unpacked above rather than globbing,
  # so unrelated files in the working directory are never touched.
  rm -rf -- "$driver_run" "$cuda_run" "$cudnn_tar" "$cudnn_dir"
}
# Install PyTorch from source
#   Globals: CPUS (read), USER_EXEC (read)
#   Builds ccache, clones PyTorch (with submodules) into the current working
#   directory, runs its setup.py install, then installs a few Python packages
#   as the user selected by USER_EXEC.
install_pytorch() {
  echo "Installing PyTorch..."

  local ccache_version=4.8.3
  local ccache_dir="ccache-${ccache_version}"

  wget "https://github.com/ccache/ccache/releases/download/v${ccache_version}/${ccache_dir}.tar.xz"
  tar -xf "${ccache_dir}.tar.xz"
  # NOTE(review): ccache is compiled here but never installed (there is no
  # `make install`), so the binary only exists inside the build tree.
  # Confirm whether that is intended before relying on ccache.
  pushd "$ccache_dir"
  cmake .
  make -j "$CPUS"
  popd

  dnf install -y numpy
  pip3 install typing-extensions

  git clone --recursive https://github.com/pytorch/pytorch.git
  pushd pytorch
  python3 setup.py install
  popd
  ldconfig

  # USER_EXEC holds a multi-word command prefix, so it is deliberately left
  # unquoted to let the shell split it into separate words.
  # shellcheck disable=SC2086
  $USER_EXEC pip3 install sympy filelock fsspec networkx
}
mount -o remount,size=15G /tmp/ | |
# Execute Functions | |
install_utils | |
setup_gpu | |
source $HOME_DIR/.bashrc | |
install_pytorch | |
# Cleanup | |
popd | |
rm -fr $SRC_DIR | |
# Test the installation | |
$USER_EXEC python3 -c "import torch; print('Congratulations, you are all set to go.' if torch.cuda.is_available() else 'Something went wrong. Please check if you missed any steps.')" | |
EOF | |
) | |
# AMI to launch. AMI IDs are region-specific, so the default can be
# overridden through the environment when launching in another REGION.
# NOTE(review): confirm which region the default ID below belongs to.
AMI_ID="${AMI_ID:-ami-0b9ce70cf1bc24fc3}"

# Accept SECURITY_GROUPS as a comma- and/or space-separated list and pass
# every group ID to the CLI as its own argument.
IFS=', ' read -r -a security_group_ids <<<"$SECURITY_GROUPS"

# Launch the instance with a 20 GB gp3 root volume, tagged Name=AMI-Builder,
# and hand it the user-data script built above.
aws ec2 run-instances \
  --image-id "$AMI_ID" \
  --instance-type g5g.4xlarge \
  --key-name "$KEY_PAIR" \
  --subnet-id "$SUBNET" \
  --security-group-ids "${security_group_ids[@]}" \
  --region "$REGION" \
  --block-device-mappings '[{"DeviceName":"/dev/xvda","Ebs":{"VolumeSize":20,"VolumeType":"gp3"}}]' \
  --tag-specifications 'ResourceType=instance,Tags=[{Key=Name,Value=AMI-Builder}]' \
  --user-data "$USER_DATA"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Just kidding, all I needed to do was make sure the libpng/jpg development packages were installed and run
python3 setup.py install
from the cloned torchvision repo, from an interactive terminal post-launch.