Last active
February 13, 2024 13:55
-
-
Save ResidentMario/9f41ac480f9efbf2ff1d05d450c29470 to your computer and use it in GitHub Desktop.
EC2 GPU Image Builder Script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# https://gist.github.com/ResidentMario/9f41ac480f9efbf2ff1d05d450c29470
#
# Builds a Docker image on a GPU EC2 builder instance and pushes it.
#
# Usage: <this script> DOCKERFILE_REPO_PATH DOCKER_TAG
#
# Required environment variables:
#   EC2_SSH_KEY_NAME      name of the EC2 key pair used when launching the instance
#   EC2_SSH_KEY_FILEPATH  local path of the matching private key
#   DOCKER_USERNAME       Docker registry login
#   DOCKER_PASSWORD       Docker registry password
# Optional environment variables (defaults in parentheses):
#   AWS_REGION (us-east-2), AWS_PROFILE (spell2), DOCKERFILE_NAME (Dockerfile)
set -e

# Validate BEFORE enabling xtrace: tracing a test on $DOCKER_PASSWORD would
# print the secret to the terminal / CI log.
for required_var in EC2_SSH_KEY_NAME EC2_SSH_KEY_FILEPATH DOCKER_USERNAME DOCKER_PASSWORD; do
  # ${!name} is indirect expansion: the value of the variable named by $required_var.
  if [[ -z "${!required_var}" ]]; then
    echo "$required_var environment variable not set, exiting." >&2
    exit 1
  fi
done

# Both positional arguments are mandatory; without them the later rsync and
# docker invocations would run with empty operands.
if [[ $# -lt 2 || -z "$1" || -z "$2" ]]; then
  echo "Usage: $0 DOCKERFILE_REPO_PATH DOCKER_TAG" >&2
  exit 1
fi

AWS_REGION=${AWS_REGION:-us-east-2}
AWS_PROFILE=${AWS_PROFILE:-spell2}
DOCKERFILE_NAME=${DOCKERFILE_NAME:-Dockerfile}

DOCKERFILE_REPO_PATH=$1
DOCKER_TAG=$2

# Trace everything from here on, as the original script did.
set -x
# Get the instance if it is already running, or launch it if it is not.
#
# Reads AWS_PROFILE, AWS_REGION and EC2_SSH_KEY_NAME. Builder instances are
# identified by the tag gpu-docker-builder=true; instance-state-code 16 is the
# EC2 "running" state.
echo "Checking for running builder instances..."
INSTANCES=$(aws ec2 describe-instances \
  --filters Name=tag:gpu-docker-builder,Values=true Name=instance-state-code,Values=16 \
  --profile "$AWS_PROFILE" \
  --region "$AWS_REGION" \
  --output json)
# With no matching reservation, .Reservations[0] is null and jq's `length`
# of null is 0, so this yields 0 when nothing is running.
INSTANCES_ALREADY_RUNNING=$(jq -r '.Reservations[0].Instances | length' <<<"$INSTANCES")

if [[ "$INSTANCES_ALREADY_RUNNING" -eq 0 ]]; then
  echo "No instance running yet, starting one now..."
  # Using the Ubuntu 18 Deep Learning AMI: https://aws.amazon.com/marketplace/pp/B077GFM7L7. This has
  # CUDA drivers and Docker already installed!
  #
  # If we just use base Linux we'll have to install NVIDIA CUDA drivers ourselves! Super inconvenient.
  # This AMI ID was taken from the marketplace page. No idea if this is something that can change.
  ML_AMI_ID=ami-0c18adb186cbc7a2f
  RESERVATIONS_DETAILS=$(aws ec2 run-instances \
    --image-id "$ML_AMI_ID" \
    --instance-type g4dn.xlarge \
    --count 1 \
    --key-name "$EC2_SSH_KEY_NAME" \
    --tag-specifications 'ResourceType=instance,Tags=[{Key=gpu-docker-builder,Value=true}]' \
    --profile "$AWS_PROFILE" \
    --region "$AWS_REGION" \
    --output json)
  INSTANCE_DETAILS=$(jq -r '.Instances[0]' <<<"$RESERVATIONS_DETAILS")
  INSTANCE_ID=$(jq -r '.InstanceId' <<<"$INSTANCE_DETAILS")

  echo "Waiting for it to come up (go get a drink or something, this'll be a while..."
  # Poll every 10 seconds until EC2 reports the instance state as "running".
  INSTANCE_STATE="pending"
  while [[ "$INSTANCE_STATE" != "running" ]]; do
    >&2 echo "Machine is not up yet, sleeping another 10 seconds..."
    sleep 10
    INSTANCE_DETAILS=$(aws ec2 describe-instances \
      --filters "Name=instance-id,Values=$INSTANCE_ID" \
      --profile "$AWS_PROFILE" \
      --region "$AWS_REGION" \
      --output json)
    INSTANCE_STATE=$(jq -r '.Reservations[0].Instances[0].State.Name' <<<"$INSTANCE_DETAILS")
  done

  echo "Instance launched, configuring security group to allow access from this IP..."
  echo "If this raises an error it's fine, ignore it. Something more elegant is a TODO."
  INSTANCE_SG_ID=$(jq -r '.Reservations[0].Instances[0].SecurityGroups[0].GroupId' <<<"$INSTANCE_DETAILS")
  # -f: fail on HTTP errors instead of capturing an error page as the "IP".
  MY_IP=$(curl -fsS https://myexternalip.com/raw)
  # Deliberately best-effort (hence `|| true`): the call errors out when the
  # ingress rule already exists, which is harmless here.
  aws ec2 authorize-security-group-ingress \
    --group-id "$INSTANCE_SG_ID" \
    --protocol tcp \
    --port 22 \
    --cidr "$MY_IP/32" \
    --profile "$AWS_PROFILE" \
    --region "$AWS_REGION" || true
else
  echo "Instance already running, skipping initialization."
fi
# Look up the running builder instance, derive its SSH login, and make sure
# its host key is present in ~/.ssh/known_hosts.
#
# Reads AWS_PROFILE and AWS_REGION; sets INSTANCE_ADDR and SSH_LOGIN.
RUNNING_INSTANCES=$(aws ec2 describe-instances \
  --filters Name=tag:gpu-docker-builder,Values=true Name=instance-state-code,Values=16 \
  --profile "$AWS_PROFILE" \
  --region "$AWS_REGION" \
  --output json)
INSTANCE_DETAILS=$(jq -r '.Reservations[0].Instances[0]' <<<"$RUNNING_INSTANCES")
INSTANCE_ADDR=$(jq -r '.PublicDnsName' <<<"$INSTANCE_DETAILS")

# jq prints the literal string "null" for a missing field; without this guard
# every later ssh/rsync call would target a bogus host.
if [[ -z "$INSTANCE_ADDR" || "$INSTANCE_ADDR" == "null" ]]; then
  echo "Could not determine the public DNS name of a running builder instance, exiting." >&2
  exit 1
fi

# NOTE(review): the launch step's comment mentions an Ubuntu AMI, whose default
# login is usually `ubuntu` rather than `ec2-user` - confirm against the AMI in use.
SSH_LOGIN=ec2-user@$INSTANCE_ADDR

# Entries are stored hashed (ssh-keyscan -H below), so a plain grep for the
# hostname can never match them; ssh-keygen -F understands hashed entries.
# Test its output rather than its exit status, and tolerate a missing file.
mkdir -p ~/.ssh
if [[ -n "$(ssh-keygen -F "$INSTANCE_ADDR" 2>/dev/null)" ]]; then
  ADD_HOST_PLEASE=0
else
  ADD_HOST_PLEASE=1
fi

# NOTE(aleksey): unfortunately the server is unresponsive on SSH until some time after it has
# launched? Super weird: right after launch ssh-keyscan may no-op and return a non-zero response.
# Instead of failing and asking for a manual rerun, retry for up to ~3 minutes.
if [[ "$ADD_HOST_PLEASE" -eq 1 ]]; then
  HOST_KEYS=""
  for attempt in {1..18}; do
    # `|| HOST_KEYS=""` keeps a failed scan from aborting the script under set -e.
    HOST_KEYS=$(ssh-keyscan -H "$INSTANCE_ADDR" 2>/dev/null) || HOST_KEYS=""
    if [[ -n "$HOST_KEYS" ]]; then
      break
    fi
    >&2 echo "SSH is not answering yet (attempt $attempt/18), sleeping another 10 seconds..."
    sleep 10
  done
  if [[ -z "$HOST_KEYS" ]]; then
    echo "ssh-keyscan could not fetch host keys from $INSTANCE_ADDR, exiting." >&2
    exit 1
  fi
  # Append only after a successful scan so known_hosts never gets partial output.
  printf '%s\n' "$HOST_KEYS" >> ~/.ssh/known_hosts
fi
# Copy the Dockerfile repository to the instance, build the image there, and
# push it to the registry.
#
# Reads DOCKERFILE_REPO_PATH, DOCKERFILE_NAME, DOCKER_TAG, DOCKER_USERNAME,
# DOCKER_PASSWORD, EC2_SSH_KEY_FILEPATH and SSH_LOGIN.
echo "Setup all done! Chucking your Dockerfile build job at the instance..."
RANDOM_DIR=docker-build-$RANDOM

# rsync (source given without a trailing slash) recreates the source directory
# under its own name on the remote side, so the remote path must be derived
# from the repository path itself - not from $PWD, which only matches when the
# script is run from inside the repository. Resolving to an absolute path also
# normalizes inputs such as "." or "repo/".
REPO_ABS_PATH=$(CDPATH= cd -- "$DOCKERFILE_REPO_PATH" && pwd -P)
CURR_DIR=$(basename -- "$REPO_ABS_PATH")
ROOT_DIRECTORY=/home/ec2-user/$RANDOM_DIR
DOCKERFILE_HOME=$ROOT_DIRECTORY/$CURR_DIR

# For IdentitiesOnly cf. https://serverfault.com/a/989684/317372
# Same options for every ssh invocation, matching the rsync transport below.
SSH_OPTS=(-o IdentitiesOnly=yes -i "$EC2_SSH_KEY_FILEPATH")

rsync -avz -e "ssh -o 'IdentitiesOnly Yes' -i '$EC2_SSH_KEY_FILEPATH'" \
  "$REPO_ABS_PATH" \
  "$SSH_LOGIN:$ROOT_DIRECTORY"

# %q shell-quotes each value so spaces or metacharacters in a tag or path are
# not re-interpreted by the remote shell.
printf -v BUILD_CMD 'docker build -t %q -f %q %q' \
  "$DOCKER_TAG" "$DOCKERFILE_HOME/$DOCKERFILE_NAME" "$DOCKERFILE_HOME"
ssh -t "${SSH_OPTS[@]}" "$SSH_LOGIN" "$BUILD_CMD"

# Feed the password on stdin (--password-stdin) instead of argv: it then shows
# up neither in the remote process list nor in the `set -x` trace, which prints
# command words but not redirections. No -t here because stdin carries the
# password rather than a terminal.
printf -v PUSH_CMD 'docker login --username %q --password-stdin && docker push %q' \
  "$DOCKER_USERNAME" "$DOCKER_TAG"
ssh "${SSH_OPTS[@]}" "$SSH_LOGIN" "$PUSH_CMD" <<<"$DOCKER_PASSWORD"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment