@bilalmughal
Last active April 10, 2024 19:36
This script automates the setup of an Amazon EC2 Graviton ARM-based instance for deep learning tasks. It takes care of installing essential utilities, setting up the latest Nvidia driver, the CUDA 12.2 toolkit, and the cuDNN library, and building PyTorch from source. The step-by-step guide can be found here: https://jumpshare.com/blog/deep-learning-on-a…
#!/bin/bash
set -e # Exit on any error
# Check if required arguments are provided
if [ -z "$REGION" ] || [ -z "$SECURITY_GROUPS" ] || [ -z "$KEY_PAIR" ] || [ -z "$SUBNET" ]; then
    echo "Error: You must provide REGION, SECURITY_GROUPS, KEY_PAIR, and SUBNET as environment variables."
    echo "Example:"
    echo "  export REGION=us-east-1"
    echo "  export SECURITY_GROUPS=sg-12345678,sg-87654321"
    echo "  export KEY_PAIR=my-key-pair"
    echo "  export SUBNET=subnet-12345678"
    exit 1
fi
USER_DATA=$(cat <<'EOF'
#!/bin/bash
set -e # Exit on any error
CUDA_HOME=/usr/local/cuda
HOME_DIR=/home/ec2-user
SRC_DIR=$HOME_DIR/sources
USER_EXEC="sudo -u ec2-user"
CPUS=$(nproc)
LOG_FILE="$HOME_DIR/install.log"
if [[ "$1" != "--stdout" ]]; then
    exec >>"$LOG_FILE" 2>&1
fi
# Create source directory
mkdir -p $SRC_DIR
pushd $SRC_DIR
# Install system utilities and updates
install_utils() {
    echo "Installing utilities..."
    dnf -y update
    dnf -y groupinstall "Development Tools"
    dnf install -y openssl-devel cmake3 rust cargo amazon-efs-utils htop iotop yasm nasm jq python3-pip python-devel cronie cronie-anacron
    echo "Success: Updates and developer tools installed."
    echo "PATH=$CUDA_HOME/bin:\$PATH" | sudo tee -a $HOME_DIR/.bashrc
    echo "LD_LIBRARY_PATH=$CUDA_HOME/lib64:\$LD_LIBRARY_PATH" | sudo tee -a $HOME_DIR/.bashrc
    echo "/usr/local/lib" | sudo tee /etc/ld.so.conf.d/usr-local-lib.conf
    echo "/usr/local/lib64" | sudo tee -a /etc/ld.so.conf.d/usr-local-lib.conf
}
# Setup GPU, CUDA and CUDNN
setup_gpu() {
    echo "Setting up GPU..."
    wget https://us.download.nvidia.com/tesla/535.104.05/NVIDIA-Linux-aarch64-535.104.05.run
    sh NVIDIA-Linux-aarch64-535.104.05.run --disable-nouveau --silent
    wget https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux_sbsa.run
    sh cuda_12.2.2_535.104.05_linux_sbsa.run --silent --override --toolkit --samples --toolkitpath=/usr/local/cuda-12.2 --samplespath=$CUDA_HOME --no-opengl-libs
    wget https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-8.9.4.25_cuda12-archive.tar.xz
    tar -xf cudnn-linux-sbsa-8.9.4.25_cuda12-archive.tar.xz
    cp -P cudnn-linux-sbsa-8.9.4.25_cuda12-archive/include/* $CUDA_HOME/include/
    cp -P cudnn-linux-sbsa-8.9.4.25_cuda12-archive/lib/* $CUDA_HOME/lib64/
    chmod a+r $CUDA_HOME/lib64/*
    ldconfig
    rm -fr cu* NVIDIA*
}
# Install PyTorch from source
install_pytorch() {
    echo "Installing PyTorch..."
    wget https://github.com/ccache/ccache/releases/download/v4.8.3/ccache-4.8.3.tar.xz
    tar -xf ccache-4.8.3.tar.xz
    pushd ccache-4.8.3
    cmake .
    make -j $CPUS
    make install # install ccache so the PyTorch build can pick it up
    popd
    dnf install -y numpy
    pip3 install typing-extensions
    git clone --recursive https://github.com/pytorch/pytorch.git
    pushd pytorch
    python3 setup.py install
    popd
    ldconfig
    $USER_EXEC pip3 install sympy filelock fsspec networkx
}
mount -o remount,size=15G /tmp/
# Execute Functions
install_utils
setup_gpu
source $HOME_DIR/.bashrc
install_pytorch
# Cleanup
popd
rm -fr $SRC_DIR
# Test the installation
$USER_EXEC python3 -c "import torch; print('Congratulations, you are all set to go.' if torch.cuda.is_available() else 'Something went wrong. Please check if you missed any steps.')"
EOF
)
AMI_ID="ami-0b9ce70cf1bc24fc3"
aws ec2 run-instances \
    --image-id $AMI_ID \
    --instance-type g5g.4xlarge \
    --key-name $KEY_PAIR \
    --subnet-id $SUBNET \
    --security-group-ids $SECURITY_GROUPS \
    --region $REGION \
    --block-device-mappings '[{"DeviceName":"/dev/xvda","Ebs":{"VolumeSize":20,"VolumeType":"gp3"}}]' \
    --tag-specifications 'ResourceType=instance,Tags=[{Key=Name,Value=AMI-Builder}]' \
    --user-data "$USER_DATA"
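Once the instance launches, the user-data script logs everything it does to /home/ec2-user/install.log (the LOG_FILE set above). A sketch for following the bootstrap from your workstation, assuming the key-pair .pem file is local and the instance keeps the AMI-Builder name tag applied by the launch command:

```shell
# Look up the public IP of the running instance tagged AMI-Builder
IP=$(aws ec2 describe-instances --region "$REGION" \
    --filters "Name=tag:Name,Values=AMI-Builder" "Name=instance-state-name,Values=running" \
    --query 'Reservations[0].Instances[0].PublicIpAddress' --output text)

# Follow the bootstrap log written by the user-data script
ssh -i "$KEY_PAIR.pem" ec2-user@"$IP" 'sudo tail -f /home/ec2-user/install.log'
```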

bilalmughal commented Aug 30, 2023

EC2 Graviton Deep Learning Bootstrap Script

This Bash script is part of a comprehensive step-by-step guide that automates the setup of an Amazon EC2 Graviton-based (ARM architecture) instance for deep learning applications, with the latest Nvidia driver, the CUDA 12.2 toolkit, and the CUDA Deep Neural Network library (cuDNN). It performs the following tasks:

  1. Instance launch: launches a Graviton-based g5g.4xlarge EC2 instance using the latest Amazon Linux 2023 AMI for ARM.
  2. System updates and package installation: installs essential system utilities, updates, and developer tools.
  3. CUDA and cuDNN setup: downloads and installs the latest Nvidia CUDA toolkit and cuDNN library for sbsa.
  4. PyTorch: compiles and installs PyTorch from source.

The script ends by running a simple Python test to verify if PyTorch with GPU support is installed correctly.

To run the script, populate the required environment variables (REGION, SECURITY_GROUPS, KEY_PAIR, SUBNET) and execute the script.
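A minimal invocation sketch, assuming the launcher is saved as launch.sh (the filename is arbitrary; the variable values below are placeholders):

```shell
# Placeholder values; substitute your own AWS resources
export REGION=us-east-1
export SECURITY_GROUPS=sg-12345678
export KEY_PAIR=my-key-pair
export SUBNET=subnet-12345678

# Run the launcher script from this gist
bash launch.sh
```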


kumadam commented Nov 23, 2023

Thank you for this script!

We need to add
dnf -y install kernel-modules-extra.aarch64
to get past the error below. By the way, it took 2 hours 25 minutes to finish the install on a g5g.xlarge :)

Verifying archive integrity... OK
Uncompressing NVIDIA Accelerated Graphics Driver for Linux-aarch64 535.104.05................

ERROR: Unable to load the kernel module 'nvidia.ko'. This happens most frequently when this kernel module was built against the wrong or improperly configured kernel sources, with a version of gcc that differs from the one used to build the target kernel, or if another driver, such as nouveau, is present and prevents the NVIDIA kernel module from obtaining ownership of the NVIDIA device(s), or no NVIDIA device installed in this system is supported by this NVIDIA Linux graphics driver release.

Please see the log entries 'Kernel module load error' and 'Kernel messages' at the end of the file '/var/log/nvidia-installer.log' for more information.

ERROR: Installation has failed. Please see the file '/var/log/nvidia-installer.log' for details. You may find suggestions on fixing installation problems in the README available on the Linux driver download page at www.nvidia.com.

At the nvidia-installer log :

[ 164.692055] VFIO - User Level meta-driver version: 0.3
[ 164.982629] nvidia: loading out-of-tree module taints kernel.
[ 164.982643] nvidia: module license 'NVIDIA' taints kernel.
[ 164.982644] Disabling lock debugging due to kernel taint
[ 164.989702] nvidia: module verification failed: signature and/or required key missing - tainting kernel
[ 164.990223] nvidia: Unknown symbol drm_gem_object_free (err -2)
ERROR: Installation has failed. Please see the file '/var/log/nvidia-installer.log' for details. You may find suggestions on fixing installation problems in the README available on the Linux driver download page at www.nvidia.com.

@bilalmughal (Author)

Can you confirm you are using AMI_ID ami-0b9ce70cf1bc24fc3 and driver version NVIDIA-Linux-aarch64-535.104.05.run?


kumadam commented Nov 24, 2023

It was the latest Amazon Linux 2023 AMI, ami-04c97e62cb19d53f1. The driver version is correct. Can we get CUDA-enabled torch with a package manager?

@bilalmughal (Author)

Use ami-0b9ce70cf1bc24fc3 as mentioned in the script. I will update the script later for the latest Amazon Linux 2023 version; meanwhile, you can use the current one.
Regarding the package manager, too many things kept breaking over and over for me. That's why I opted for this route.


maurera commented Jan 12, 2024

Thanks for this script!

Unfortunately it doesn't work for me:

  1. I get "ami not found" when I try to use ami-0b9ce70cf1bc24fc3
  2. When I switch to the latest Amazon Linux AMI for ARM (ami-0084b033b53412473), I get the following error when installing the nvidia driver:

ERROR: Unable to load the kernel module 'nvidia.ko'. This happens most frequently when this kernel module was built against the wrong or improperly configured kernel sources, with a version of gcc that differs from the one used to build the target kernel, or if another driver, such as nouveau, is present and prevents the NVIDIA kernel module from obtaining ownership of the NVIDIA device(s), or no NVIDIA device installed in this system is supported by this NVIDIA Linux graphics driver release.

@bilalmughal (Author)

Thank you @maurera for letting me know about this issue. I will check the script against the latest AMI and update it accordingly, if that works for you.


maurera commented Jan 12, 2024

Great, thanks @bilalmughal.

I tried searching for the version myself. I browsed to https://www.nvidia.com/Download/index.aspx?lang=en-us and found the current version: https://us.download.nvidia.com/tesla/535.129.03/NVIDIA-Linux-aarch64-535.129.03.run

I then updated your script with this version number

sudo wget https://us.download.nvidia.com/tesla/535.129.03/NVIDIA-Linux-aarch64-535.129.03.run
sudo sh NVIDIA-Linux-aarch64-535.129.03.run --disable-nouveau --silent

But I still get the same error about kernel module 'nvidia.ko'

@menahem121

@maurera same issue here, but I got stuck at another step. To get past the nvidia.ko error: CC=/usr/bin/gcc ./NVIDIA-Linux-aarch64-535.129.03.run

@bilalmughal (Author)

Thank you @maurera and @menahem121. I tested the above script with the AMI mentioned in it (AMI_ID="ami-0b9ce70cf1bc24fc3"), and the whole script works out of the box. For newer Amazon Linux 2023 AMIs, however, the repository structure has changed; to make it work you need to install the additional package kernel-modules-extra before installing the driver. Tested with the following AMI:

ami-0730971bf8e0532d6
6.1.66-93.164.amzn2023.aarch64

sudo dnf install kernel-modules-extra


maurera commented Jan 15, 2024

@bilalmughal - I get "ami not found" from AMI_ID="ami-0b9ce70cf1bc24fc3". Do you have a link to this ami? I get the error from command line and also don't see any information in it from a google search or from the launch instances page in the AWS Management Console.

sudo dnf install kernel-modules-extra seems to have fixed the CUDA issue with the current AMI (ami-0084b033b53412473). I'm currently waiting for torch to build (it seems to be taking over 2 hours at the building-torch stage). I'm going to try python3 setup.py bdist_wheel instead of python3 setup.py install and keep the built wheel, so that I don't have to wait the 2 hours again the next time I spin up a new instance.
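The wheel-caching idea above can be sketched like this; the S3 bucket name is hypothetical, and the wheel filename depends on the Python and PyTorch versions actually built:

```shell
# Build a wheel instead of installing in place; output lands in pytorch/dist/
pushd pytorch
python3 setup.py bdist_wheel
popd

# Stash the wheel somewhere durable (bucket name is a placeholder)
aws s3 cp pytorch/dist/torch-*.whl s3://my-wheel-cache/

# On the next fresh instance, skip the long build and install the cached wheel
aws s3 cp s3://my-wheel-cache/ . --recursive --exclude '*' --include 'torch-*.whl'
pip3 install torch-*.whl
```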

@bilalmughal (Author)

@maurera It's a one-time process. Once everything is set up and installed, I recommend creating an AMI and using it for future deployments; that way instances will spawn quickly.
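Capturing the provisioned instance as an AMI is a single CLI call once the bootstrap finishes; the instance ID, image name, and region below are placeholders:

```shell
# Snapshot the fully provisioned instance into a reusable AMI
aws ec2 create-image \
    --region us-east-1 \
    --instance-id i-0123456789abcdef0 \
    --name "graviton-dl-pytorch" \
    --description "AL2023 ARM64 with Nvidia driver, CUDA 12.2, cuDNN, and PyTorch built from source"
```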

@menahem121

The compilation works perfectly, thank you @bilalmughal!
I only have an issue when installing torchvision: after installing it, torch loses access to CUDA.
python3 -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())"
2.3.0a0+git1deb75b 12.2 False
I tried uninstalling and reinstalling it, but still can't get access to CUDA.

@rromanchuk

@menahem121 same, tried with --no-deps too

python3 -m venv .testing --system-site-packages
source .testing/bin/activate
pip3 install torchvision --no-deps

The lib that requires it fails with

RuntimeError: operator torchvision::nms does not exist

I tried building from source, and got almost close-ish

torch_DIR=/usr/local/lib64/python3.9/site-packages/torch/share/cmake/Torch/ cmake -DWITH_CUDA=on ..

-- Caffe2: CUDA detected: 12.2
-- Caffe2: CUDA nvcc is: /usr/local/cuda/bin/nvcc
-- Caffe2: CUDA toolkit directory: /usr/local/cuda
-- Caffe2: Header version is: 12.2
-- /usr/local/cuda/lib64/libnvrtc.so shorthash is 0924ef55
-- USE_CUDNN is set to 0. Compiling without cuDNN support
-- USE_CUSPARSELT is set to 0. Compiling without cuSPARSELt support
-- Autodetected CUDA architecture(s): 7.5
-- Added CUDA NVCC flags for: -gencode;arch=compute_75,code=sm_75
CMake Warning at /usr/local/lib64/python3.9/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:22 (message):
static library kineto_LIBRARY-NOTFOUND not found.
Call Stack (most recent call first):
/usr/local/lib64/python3.9/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:127 (append_torchlib_if_found)
CMakeLists.txt:24 (find_package)

CMake Error at /usr/share/cmake/Modules/FindPackageHandleStandardArgs.cmake:230 (message):
Could NOT find PNG (missing: PNG_LIBRARY PNG_PNG_INCLUDE_DIR)
Call Stack (most recent call first):
/usr/share/cmake/Modules/FindPackageHandleStandardArgs.cmake:594 (_FPHSA_FAILURE_MESSAGE)
/usr/share/cmake/Modules/FindPNG.cmake:159 (find_package_handle_standard_args)
CMakeLists.txt:28 (find_package)

@maurera Any chance we could use your magic and add a torchvision?


rromanchuk commented Feb 8, 2024

Just kidding, all I needed to do was make sure the libjpeg/libpng development packages were installed, then run python3 setup.py install from the cloned torchvision repo:

# Install torchvision from source
install_torchvision() {
    echo "Installing torchvision..."
    dnf install -y libjpeg-turbo-devel libpng-devel
    wget https://github.com/pytorch/vision/archive/refs/tags/v0.17.0.tar.gz
    tar -xf v0.17.0.tar.gz
   
    pushd vision-0.17.0
    python3 setup.py install
    popd
    ldconfig
}

From interactive terminal post launch

[ec2-user /home/ec2-user]$ sudo su
[root /home/ec2-user]# dnf install -y libjpeg-turbo-devel libpng-devel
[root /home/ec2-user]# source /home/ec2-user/.bashrc
[root /home/ec2-user]# mkdir -p /home/ec2-user/sources && pushd /home/ec2-user/sources
[root /home/ec2-user/sources]# wget https://github.com/pytorch/vision/archive/refs/tags/v0.17.0.tar.gz
[root /home/ec2-user/sources]# tar -xf v0.17.0.tar.gz && pushd vision-0.17.0
[root /home/ec2-user/sources/vision-0.17.0]# python3 setup.py install
[root /home/ec2-user/sources/vision-0.17.0]# ldconfig
[root /home/ec2-user/sources/vision-0.17.0]# exit
[ec2-user /home/ec2-user]$  python3 -c "import torch; print('Congratulations, you are all set to go.' if torch.cuda.is_available() else 'Something went wrong. Please check if you missed any steps.')"
[ec2-user /home/ec2-user]$  python3 -c "import torchvision; print('Imported torchvision, you are probably all set to go.')"
