Skip to content

Instantly share code, notes, and snippets.

@ResidentMario
Last active February 13, 2024 13:55
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ResidentMario/9f41ac480f9efbf2ff1d05d450c29470 to your computer and use it in GitHub Desktop.
Save ResidentMario/9f41ac480f9efbf2ff1d05d450c29470 to your computer and use it in GitHub Desktop.
EC2 GPU Image Builder Script
#!/bin/bash
# https://gist.github.com/ResidentMario/9f41ac480f9efbf2ff1d05d450c29470
#
# Builds a Docker image on a GPU EC2 instance and pushes it to a registry.
#
# Usage: <this script> DOCKERFILE_REPO_PATH DOCKER_TAG
#   DOCKERFILE_REPO_PATH  local directory that is rsynced to the instance
#   DOCKER_TAG            tag passed to `docker build -t` and `docker push`
#
# Required environment variables:
#   EC2_SSH_KEY_NAME      key pair name handed to `aws ec2 run-instances`
#   EC2_SSH_KEY_FILEPATH  private key file used by ssh/rsync
#   DOCKER_USERNAME       user for `docker login` on the instance
#   DOCKER_PASSWORD       password for `docker login` on the instance
# Optional environment variables (default):
#   AWS_REGION (us-east-2), AWS_PROFILE (spell2), DOCKERFILE_NAME (Dockerfile)
set -e

# Validate before tracing is switched on, so that the emptiness test on
# DOCKER_PASSWORD does not print the secret into the xtrace output.
for required_var in EC2_SSH_KEY_NAME EC2_SSH_KEY_FILEPATH DOCKER_USERNAME DOCKER_PASSWORD; do
  if [[ -z "${!required_var}" ]]; then
    echo "$required_var environment variable not set, exiting." >&2
    exit 1
  fi
done
unset required_var

# Fail fast on missing arguments: without this the script would launch a
# (billable) GPU instance first and only break at the rsync/docker step.
if [[ -z "$1" || -z "$2" ]]; then
  echo "Usage: $0 DOCKERFILE_REPO_PATH DOCKER_TAG" >&2
  exit 1
fi

set -x

# Fall back to defaults when the optional variables are unset or empty.
AWS_REGION=${AWS_REGION:-us-east-2}
AWS_PROFILE=${AWS_PROFILE:-spell2}
DOCKERFILE_NAME=${DOCKERFILE_NAME:-Dockerfile}

DOCKERFILE_REPO_PATH=$1
DOCKER_TAG=$2
# Get the instance if it is already running, or launch it if it is not.
echo "Checking for running builder instances..."
# Look for instances tagged gpu-docker-builder=true with state code 16
# (the same filter is used again further down to find the running builder).
# Use the configured profile rather than a hard-coded one so that a
# caller-provided AWS_PROFILE is honoured consistently across all aws calls.
INSTANCES=$(aws ec2 describe-instances \
  --filters Name=tag:gpu-docker-builder,Values=true Name=instance-state-code,Values=16 \
  --profile "$AWS_PROFILE" \
  --region "$AWS_REGION" \
  --output json)
# Feed the JSON to jq verbatim; an unquoted echo would word-split and
# glob-expand it. With no reservations, `null | length` evaluates to 0.
INSTANCES_ALREADY_RUNNING=$(jq -r '.Reservations[0].Instances | length' <<<"$INSTANCES")
# Launch a new tagged builder instance when none is running, wait until it is
# in the "running" state, and open port 22 on its security group for this
# machine's public IP.
if [[ "$INSTANCES_ALREADY_RUNNING" -eq 0 ]]; then
  echo "No instance running yet, starting one now..."
  # Using the Ubuntu 18 Deep Learning AMI: https://aws.amazon.com/marketplace/pp/B077GFM7L7. This has
  # CUDA drivers and Docker already installed!
  #
  # If we just use base Linux we'll have to install NVIDIA CUDA drivers ourselves! Super inconvenient.
  # This AMI ID was taken from the marketplace page. No idea if this is something that can change.
  ML_AMI_ID=ami-0c18adb186cbc7a2f
  RESERVATIONS_DETAILS=$(aws ec2 run-instances \
    --image-id "$ML_AMI_ID" \
    --instance-type g4dn.xlarge \
    --count 1 \
    --key-name "$EC2_SSH_KEY_NAME" \
    --tag-specifications 'ResourceType=instance,Tags=[{Key=gpu-docker-builder,Value=true}]' \
    --profile "$AWS_PROFILE" \
    --region "$AWS_REGION" \
    --output json)
  # JSON is passed to jq through quoted here-strings so that it is never
  # word-split or glob-expanded by the shell.
  INSTANCE_DETAILS=$(jq -r '.Instances[0]' <<<"$RESERVATIONS_DETAILS")
  INSTANCE_ID=$(jq -r '.InstanceId' <<<"$INSTANCE_DETAILS")
  echo "Waiting for it to come up (go get a drink or something, this'll be a while..."
  INSTANCE_STATE="pending"
  while [[ "$INSTANCE_STATE" != "running" ]]; do
    >&2 echo "Machine is not up yet, sleeping another 10 seconds..."
    sleep 10
    INSTANCE_DETAILS=$(aws ec2 describe-instances \
      --filters "Name=instance-id,Values=$INSTANCE_ID" \
      --profile "$AWS_PROFILE" \
      --region "$AWS_REGION" \
      --output json)
    INSTANCE_STATE=$(jq -r '.Reservations[0].Instances[0].State.Name' <<<"$INSTANCE_DETAILS")
    # An instance that is going down will never reach "running"; bail out
    # instead of polling forever.
    case "$INSTANCE_STATE" in
      shutting-down | terminated | stopping | stopped)
        >&2 echo "Instance $INSTANCE_ID entered state '$INSTANCE_STATE' instead of 'running', exiting."
        exit 1
        ;;
    esac
  done
  echo "Instance launched, configuring security group to allow access from this IP..."
  echo "If this raises an error it's fine, ignore it. Something more elegant is a TODO."
  INSTANCE_SG_ID=$(jq -r '.Reservations[0].Instances[0].SecurityGroups[0].GroupId' <<<"$INSTANCE_DETAILS")
  # -f makes curl fail on an HTTP error instead of handing back an error page
  # that would then be used as the IP address; -sS hides the progress meter
  # but still reports errors.
  MY_IP=$(curl -fsS https://myexternalip.com/raw)
  # Deliberately best-effort (see the message above): failures of this call
  # are ignored.
  aws ec2 authorize-security-group-ingress \
    --group-id "$INSTANCE_SG_ID" \
    --protocol tcp \
    --port 22 \
    --cidr "$MY_IP/32" \
    --profile "$AWS_PROFILE" \
    --region "$AWS_REGION" || true
else
  echo "Instance already running, skipping initialization."
fi
# Look up the running, tagged builder instance and derive its SSH login.
RUNNING_INSTANCES=$(aws ec2 describe-instances \
  --filters Name=tag:gpu-docker-builder,Values=true Name=instance-state-code,Values=16 \
  --profile "$AWS_PROFILE" \
  --region "$AWS_REGION" \
  --output json)
INSTANCE_DETAILS=$(jq -r '.Reservations[0].Instances[0]' <<<"$RUNNING_INSTANCES")
# `// empty` turns a missing/null name into an empty string rather than the
# literal text "null", so the guard below can catch it.
INSTANCE_ADDR=$(jq -r '.PublicDnsName // empty' <<<"$INSTANCE_DETAILS")
if [[ -z "$INSTANCE_ADDR" ]]; then
  >&2 echo "Could not determine the public DNS name of a running builder instance, exiting."
  exit 1
fi
# NOTE(review): the launch section describes the AMI as an Ubuntu image;
# confirm that ec2-user is the correct login user for it.
SSH_LOGIN=ec2-user@$INSTANCE_ADDR
# Host keys are stored hashed (ssh-keyscan -H below), so a plain-text grep of
# known_hosts can never find them. ssh-keygen -F looks the host up correctly
# in both hashed and unhashed entries; empty output means "not known yet".
if [[ -n "$(ssh-keygen -F "$INSTANCE_ADDR" 2>/dev/null)" ]]; then
  ADD_HOST_PLEASE=0
else
  ADD_HOST_PLEASE=1
fi
# NOTE(aleksey): unfortunately the server is unresponsive on SSH until some time after it has
# launched? Super weird. ssh-keyscan yields nothing during that window, so retry for roughly two
# minutes before giving up; if it still fails, wait a little bit and rerun this script.
if [[ "$ADD_HOST_PLEASE" -eq 1 ]]; then
  HOST_KEYS=""
  for ((attempt = 1; attempt <= 12; attempt++)); do
    # `|| true`: a failed scan is expected while sshd is still starting.
    HOST_KEYS=$(ssh-keyscan -H "$INSTANCE_ADDR" 2>/dev/null) || true
    if [[ -n "$HOST_KEYS" ]]; then
      break
    fi
    >&2 echo "SSH is not answering on $INSTANCE_ADDR yet (attempt $attempt of 12), retrying in 10 seconds..."
    sleep 10
  done
  if [[ -z "$HOST_KEYS" ]]; then
    >&2 echo "Could not fetch SSH host keys from $INSTANCE_ADDR; wait a couple of minutes and rerun."
    exit 1
  fi
  printf '%s\n' "$HOST_KEYS" >> ~/.ssh/known_hosts
fi
# Copy the repository to a fresh directory on the instance, build the image
# there, then log in to the registry and push it.
echo "Setup all done! Chucking your Dockerfile build job at the instance..."
RANDOM_DIR=docker-build-$RANDOM
# rsync (source given without a trailing slash) creates a directory named
# after the last component of the source path, so derive the remote directory
# name from the resolved repo path rather than from $PWD. This keeps
# DOCKERFILE_HOME correct for ".", relative paths and trailing slashes.
REPO_ABS_PATH=$(cd "$DOCKERFILE_REPO_PATH" && pwd)
CURR_DIR=$(basename "$REPO_ABS_PATH")
ROOT_DIRECTORY=/home/ec2-user/$RANDOM_DIR
DOCKERFILE_HOME=$ROOT_DIRECTORY/$CURR_DIR
# For IdentitiesOnly cf. https://serverfault.com/a/989684/317372
rsync -avz -e "ssh -o 'IdentitiesOnly Yes' -i '$EC2_SSH_KEY_FILEPATH'" \
  "$REPO_ABS_PATH" \
  "$SSH_LOGIN:$ROOT_DIRECTORY"
# The remote shell re-parses the command string, so quote every interpolated
# value with %q instead of pasting it in raw.
REMOTE_BUILD_CMD=$(printf 'docker build -t %q -f %q %q' \
  "$DOCKER_TAG" "$DOCKERFILE_HOME/$DOCKERFILE_NAME" "$DOCKERFILE_HOME")
ssh -t -o IdentitiesOnly=yes -i "$EC2_SSH_KEY_FILEPATH" "$SSH_LOGIN" \
  "$REMOTE_BUILD_CMD"
# Hand the password to `docker login` on stdin instead of on the command
# line: it then appears neither in the xtrace output (redirections are not
# traced) nor in the remote process list. No -t here, because stdin carries
# the password and is not a terminal.
REMOTE_PUSH_CMD=$(printf 'docker login --username %q --password-stdin && docker push %q' \
  "$DOCKER_USERNAME" "$DOCKER_TAG")
ssh -o IdentitiesOnly=yes -i "$EC2_SSH_KEY_FILEPATH" "$SSH_LOGIN" \
  "$REMOTE_PUSH_CMD" <<<"$DOCKER_PASSWORD"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment