Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rromanchuk/fd7895d4546d639a0ad3bf7270c0ad73 to your computer and use it in GitHub Desktop.
Save rromanchuk/fd7895d4546d639a0ad3bf7270c0ad73 to your computer and use it in GitHub Desktop.
This script automates the setup of an Amazon EC2 Graviton ARM-based instance for deep learning tasks. It takes care of installing essential utilities, setting up the latest Nvidia drivers, the CUDA 12.2 toolkit and the cuDNN library, and building PyTorch from source. The step-by-step guide can be found here: https://jumpshare.com/blog/deep-learning-on-a…
#!/bin/bash
#
# Launches a g5g.4xlarge EC2 instance whose user-data script installs the
# NVIDIA driver, CUDA, cuDNN and builds PyTorch from source.
#
# Required environment variables:
#   REGION           AWS region to launch the instance in
#   SECURITY_GROUPS  one or more security group IDs, separated by spaces
#   KEY_PAIR         name of the EC2 key pair
#   SUBNET           ID of the subnet to launch into
set -e # Exit on any error

# Check that every required setting is present. ${VAR:-} keeps the test
# well-defined even when the caller's shell runs with `set -u`.
if [[ -z "${REGION:-}" || -z "${SECURITY_GROUPS:-}" || -z "${KEY_PAIR:-}" || -z "${SUBNET:-}" ]]; then
  # Diagnostics belong on stderr so they are not mixed into captured output.
  {
    echo "Error: You must provide REGION, SECURITY_GROUPS, KEY_PAIR, and SUBNET as environment variables."
    echo "Example:"
    echo " export REGION=us-east-1"
    # The launch command relies on word splitting to pass several IDs, so the
    # example shows them space-separated rather than comma-separated.
    echo ' export SECURITY_GROUPS="sg-12345678 sg-87654321"'
    echo " export KEY_PAIR=my-key-pair"
    echo " export SUBNET=subnet-12345678"
  } >&2
  exit 1
fi
# Provisioning script handed to the instance as EC2 user data (consumed by the
# run-instances call via --user-data). The quoted 'EOF' delimiter keeps every
# $VAR and $(...) below literal, so they are evaluated on the instance rather
# than on the machine that launches it. The shebang must stay the first line
# of the embedded script.
USER_DATA=$(cat <<'EOF'
#!/bin/bash
set -e # Exit on any error

CUDA_HOME=/usr/local/cuda
HOME_DIR=/home/ec2-user
SRC_DIR="$HOME_DIR/sources"
# Command prefix for steps that should run as ec2-user instead of root.
USER_EXEC=(sudo -u ec2-user)
CPUS=$(nproc)
LOG_FILE="$HOME_DIR/install.log"

# Unless started with --stdout, append all output (stdout and stderr) to the
# install log.
if [[ "${1:-}" != "--stdout" ]]; then
  exec >>"$LOG_FILE" 2>&1
fi

# Create source directory; all downloads and builds happen inside it.
mkdir -p "$SRC_DIR"
pushd "$SRC_DIR"

# Install system updates and build tooling, then register the CUDA paths.
install_utils() {
  echo "Installing utilities..."
  dnf -y update
  dnf -y groupinstall "Development Tools"
  dnf install -y openssl-devel cmake3 rust cargo amazon-efs-utils htop iotop yasm nasm jq python3-pip python-devel cronie cronie-anacron
  echo "Success : Updates and developer tools installed."
  # `export` is required: a plain assignment in .bashrc would leave
  # LD_LIBRARY_PATH invisible to child processes such as python3.
  # ${VAR:+:$VAR} avoids a trailing ':' (an empty entry means the current
  # directory) when LD_LIBRARY_PATH was previously unset.
  echo "export PATH=$CUDA_HOME/bin:\$PATH" | sudo tee -a "$HOME_DIR/.bashrc"
  echo "export LD_LIBRARY_PATH=$CUDA_HOME/lib64\${LD_LIBRARY_PATH:+:\$LD_LIBRARY_PATH}" | sudo tee -a "$HOME_DIR/.bashrc"
  # Make the dynamic linker search /usr/local/lib and /usr/local/lib64.
  echo "/usr/local/lib" | sudo tee /etc/ld.so.conf.d/usr-local-lib.conf
  echo "/usr/local/lib64" | sudo tee -a /etc/ld.so.conf.d/usr-local-lib.conf
}

# Install the NVIDIA driver, the CUDA toolkit and cuDNN.
setup_gpu() {
  local driver_run=NVIDIA-Linux-aarch64-535.104.05.run
  local cuda_run=cuda_12.2.2_535.104.05_linux_sbsa.run
  local cudnn_dir=cudnn-linux-sbsa-8.9.4.25_cuda12-archive

  echo "Setting up GPU..."
  wget "https://us.download.nvidia.com/tesla/535.104.05/${driver_run}"
  sh "$driver_run" --disable-nouveau --silent

  wget "https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/${cuda_run}"
  sh "$cuda_run" --silent --override --toolkit --samples --toolkitpath=/usr/local/cuda-12.2 --samplespath="$CUDA_HOME" --no-opengl-libs

  # cuDNN ships as a plain archive: copy headers and libraries into the CUDA
  # tree. -P keeps the versioned library symlinks as symlinks.
  # NOTE(review): assumes the CUDA installer made $CUDA_HOME point at
  # /usr/local/cuda-12.2 - confirm.
  wget "https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/${cudnn_dir}.tar.xz"
  tar -xf "${cudnn_dir}.tar.xz"
  cp -P "$cudnn_dir"/include/* "$CUDA_HOME/include/"
  cp -P "$cudnn_dir"/lib/* "$CUDA_HOME/lib64/"
  chmod a+r "$CUDA_HOME"/lib64/*
  ldconfig

  # Drop the downloaded installers and the extracted cuDNN archive.
  rm -fr cu* NVIDIA*
}

# Build ccache, then build and install PyTorch from source.
install_pytorch() {
  echo "Installing PyTorch..."
  wget https://github.com/ccache/ccache/releases/download/v4.8.3/ccache-4.8.3.tar.xz
  tar -xf ccache-4.8.3.tar.xz
  pushd ccache-4.8.3
  cmake .
  make -j "$CPUS"
  # Without an install step the freshly built ccache binary is never on PATH
  # and the build above is wasted.
  make install
  popd

  dnf install -y numpy
  pip3 install typing-extensions
  git clone --recursive https://github.com/pytorch/pytorch.git
  pushd pytorch
  python3 setup.py install
  popd
  ldconfig
  "${USER_EXEC[@]}" pip3 install sympy filelock fsspec networkx
}

# Resize /tmp to 15G before the heavy steps run (presumably so installers and
# the build have enough scratch space).
mount -o remount,size=15G /tmp/

# Execute Functions
install_utils
setup_gpu
# Pick up the CUDA PATH / LD_LIBRARY_PATH entries written by install_utils.
source "$HOME_DIR/.bashrc"
install_pytorch

# Cleanup; ${SRC_DIR:?} aborts instead of expanding to an empty path.
popd
rm -fr "${SRC_DIR:?}"

# Test the installation
"${USER_EXEC[@]}" python3 -c "import torch; print('Congratulations, you are all set to go.' if torch.cuda.is_available() else 'Something went wrong. Please check if you missed any steps.')"
EOF
)
# Image the instance boots from.
# NOTE(review): AMI IDs are normally tied to a region - confirm this one
# exists in $REGION.
AMI_ID="ami-0b9ce70cf1bc24fc3"

# Split a list of security group IDs into the SECURITY_GROUP_IDS array.
# Arguments: $1 - IDs separated by commas and/or spaces
# The AWS CLI takes list values as separate words, so a comma-joined string
# passed as-is would be treated as one (malformed) ID.
parse_security_groups() {
  IFS=', ' read -r -a SECURITY_GROUP_IDS <<<"$1"
}
parse_security_groups "$SECURITY_GROUPS"

# Launch the builder instance: 100 GiB gp3 root volume, tagged
# Name=AMI-Builder, provisioned by the script held in $USER_DATA.
aws ec2 run-instances \
  --image-id "$AMI_ID" \
  --instance-type g5g.4xlarge \
  --key-name "$KEY_PAIR" \
  --subnet-id "$SUBNET" \
  --security-group-ids "${SECURITY_GROUP_IDS[@]}" \
  --region "$REGION" \
  --block-device-mappings '[{"DeviceName":"/dev/xvda","Ebs":{"VolumeSize":100,"VolumeType":"gp3"}}]' \
  --tag-specifications 'ResourceType=instance,Tags=[{Key=Name,Value=AMI-Builder}]' \
  --user-data "$USER_DATA"
# Manual walkthrough: the same provisioning steps entered by hand in an
# interactive shell. This reads as a session record rather than a script:
# a "# ..." line directly after a command shows what that command printed.
/bin/bash
whoami
# ec2-user
pwd
# /home/ec2-user
# Switch to root for the installation steps below.
sudo su
# Settings shared by the following commands.
# NOTE(review): USER_EXEC and LOG_FILE are assigned here but none of the
# commands in this walkthrough reference them.
CUDA_HOME=/usr/local/cuda
HOME_DIR=/home/ec2-user
SRC_DIR=$HOME_DIR/sources
USER_EXEC="sudo -u ec2-user"
CPUS=$(nproc)
LOG_FILE="$HOME_DIR/install.log"
# Work inside a dedicated sources directory.
mkdir -p $SRC_DIR
pushd $SRC_DIR
# System updates and build tooling.
dnf -y update
dnf -y groupinstall "Development Tools"
dnf install -y openssl-devel cmake3 rust cargo amazon-efs-utils htop iotop yasm nasm jq python3-pip python-devel cronie cronie-anacron
echo "Success : Updates and developer tools installed."
# Append the CUDA bin/lib64 directories to ec2-user's .bashrc. $CUDA_HOME is
# expanded now; the escaped \$PATH / \$LD_LIBRARY_PATH are written literally
# and expand each time .bashrc is read.
echo "PATH=$CUDA_HOME/bin:\$PATH" | sudo tee -a $HOME_DIR/.bashrc
echo "LD_LIBRARY_PATH=$CUDA_HOME/lib64:\$LD_LIBRARY_PATH" | sudo tee -a $HOME_DIR/.bashrc
# Register /usr/local/lib and /usr/local/lib64 in a new ld.so.conf.d file
# (first line creates/overwrites the file, second appends).
echo "/usr/local/lib" | sudo tee /etc/ld.so.conf.d/usr-local-lib.conf
echo "/usr/local/lib64" | sudo tee -a /etc/ld.so.conf.d/usr-local-lib.conf
# Print the resulting .bashrc. Everything below, down to the LD_LIBRARY_PATH
# line, is the file content as displayed - not commands that were typed.
# NOTE(review): the earlier `pushd $SRC_DIR` left the shell in the sources
# directory, so this relative path presumably was run from the home
# directory (or should read $HOME_DIR/.bashrc) - confirm.
cat .bashrc
# .bashrc
# Source global definitions
if [ -f /etc/bashrc ]; then
. /etc/bashrc
fi
# User specific environment
# Prepend the per-user bin directories only if PATH does not contain them yet.
if ! [[ "$PATH" =~ "$HOME/.local/bin:$HOME/bin:" ]]
then
PATH="$HOME/.local/bin:$HOME/bin:$PATH"
fi
export PATH
# Uncomment the following line if you don't like systemctl's auto-paging feature:
# export SYSTEMD_PAGER=
# User specific aliases and functions
# Source every regular file found in ~/.bashrc.d, if that directory exists.
if [ -d ~/.bashrc.d ]; then
for rc in ~/.bashrc.d/*; do
if [ -f "$rc" ]; then
. "$rc"
fi
done
fi
unset rc
# The two lines below match what the `tee -a` commands earlier in the
# walkthrough appended.
# NOTE(review): neither assignment uses `export`. PATH is exported by the
# `export PATH` line above, but LD_LIBRARY_PATH only reaches child processes
# if it was already exported elsewhere - confirm.
PATH=/usr/local/cuda/bin:$PATH
LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
# --- GPU stack: NVIDIA driver, CUDA toolkit, cuDNN ---
echo "Setting up GPU..."
# Download and run the NVIDIA 535.104.05 aarch64 driver installer
# non-interactively.
wget https://us.download.nvidia.com/tesla/535.104.05/NVIDIA-Linux-aarch64-535.104.05.run
sh NVIDIA-Linux-aarch64-535.104.05.run --disable-nouveau --silent
# Download and run the CUDA 12.2.2 (sbsa) installer; the toolkit goes to
# /usr/local/cuda-12.2.
wget https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux_sbsa.run
sh cuda_12.2.2_535.104.05_linux_sbsa.run --silent --override --toolkit --samples --toolkitpath=/usr/local/cuda-12.2 --samplespath=$CUDA_HOME --no-opengl-libs
# Download the cuDNN 8.9.4.25 archive and copy its headers and libraries into
# the CUDA tree (-P copies symlinks as symlinks).
# NOTE(review): assumes $CUDA_HOME (/usr/local/cuda) exists after the CUDA
# install, e.g. as a link to /usr/local/cuda-12.2 - confirm.
wget https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-8.9.4.25_cuda12-archive.tar.xz
tar -xf cudnn-linux-sbsa-8.9.4.25_cuda12-archive.tar.xz
cp -P cudnn-linux-sbsa-8.9.4.25_cuda12-archive/include/* $CUDA_HOME/include/
cp -P cudnn-linux-sbsa-8.9.4.25_cuda12-archive/lib/* $CUDA_HOME/lib64/
chmod a+r $CUDA_HOME/lib64/*
# Refresh the dynamic linker cache after adding libraries.
ldconfig
# --- PyTorch from source ---
echo "Installing PyTorch..."
# Build ccache 4.8.3 from its release tarball.
# NOTE(review): the build is not followed by an install step, so the
# resulting binary stays inside the build directory.
wget https://github.com/ccache/ccache/releases/download/v4.8.3/ccache-4.8.3.tar.xz
tar -xf ccache-4.8.3.tar.xz
pushd ccache-4.8.3
cmake .
make -j $CPUS
popd
# Python packages installed before the PyTorch build.
dnf install -y numpy
pip3 install typing-extensions
# Clone PyTorch with all submodules and install it via setup.py.
git clone --recursive https://github.com/pytorch/pytorch.git
pushd pytorch
python3 setup.py install
popd
ldconfig
# Remaining Python packages are installed as ec2-user rather than root.
sudo -u ec2-user pip3 install sympy filelock fsspec networkx
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment