Skip to content

Instantly share code, notes, and snippets.

@omerh
Created September 27, 2023 14:48
Show Gist options
  • Save omerh/9dc5168a7575f7763326006309302e5b to your computer and use it in GitHub Desktop.
Save omerh/9dc5168a7575f7763326006309302e5b to your computer and use it in GitHub Desktop.
#!/bin/bash
# EC2 P5.48xlarge Instance based on Amazon Linux 2 PyTorch DLAMI
# set -x
# Install the dependencies
# sudo yum install -y jq curl
# ---------------------------------------------------------------------------
# Configuration
# Replace every <PLACEHOLDER> value below before running the script.
# ---------------------------------------------------------------------------

# AWS region all commands operate in.
REGION='<REGION>'

# Target VPC. You can list your VPCs with:
#   aws ec2 describe-vpcs --region $REGION | jq '.[][].VpcId'
VPC_ID='<VPC_ID>'

# Subnet the instance is launched into. Make sure to check the pool mapping
# to direct the customer to choose the AZ with capacity.
SUBNET_ID='<SUBNET_ID>'

# PUBLIC='<true|false>'
# Set to true when the subnet is public: a public IP is then added to the
# instance's interface 0. Set to false otherwise.
PUBLIC='true'

# Name of the EC2 key pair used for SSH.
# SSH_KEY='<SSH_KEY>'
SSH_KEY='us-east-1-default'

# Optional IAM instance profile.
# INSTANCE_PROFILE='<INSTANCE_PROFILE>'

# Root EBS volume settings. You need at least 20 GB for loading DLC.
ROOT_EBS_SIZE='100'
ROOT_EBS_IOPS='5000'
ROOT_EBS_THROUGHPUT='250'

# Number of instances.
INSTANCE_COUNT='1'
# Security group
# Create the "p5" security group in VPC_ID and store its ID in SG_ID. When the
# create call fails (for example because the group already exists), look up
# the existing group instead.
if ! SG_ID=$(aws ec2 create-security-group --vpc-id "${VPC_ID}" --region "${REGION}" \
  --group-name p5 --description p5 --output text --query 'GroupId'); then
  # Look the group up by name AND VPC: --group-names only resolves groups in
  # the default VPC, so it cannot find a group created in any other VPC.
  SG_ID=$(aws ec2 describe-security-groups --region "${REGION}" \
    --filters "Name=group-name,Values=p5" "Name=vpc-id,Values=${VPC_ID}" \
    --query 'SecurityGroups[0].GroupId' --output text)
fi
# A missing match is rendered as "None" by --output text.
if [[ -z "${SG_ID}" || "${SG_ID}" == "None" ]]; then
  echo "Failed to create or find security group p5 in VPC ${VPC_ID}" >&2
  exit 1
fi
# Authorize all traffic inbound/outbound to self, requirement for EFA to work properly.
# Output and errors are discarded as in the original script, presumably so
# that re-runs against already-existing rules stay quiet.
aws ec2 authorize-security-group-egress --group-id "${SG_ID}" --protocol all \
  --source-group "${SG_ID}" --region "${REGION}" > /dev/null 2>&1
aws ec2 authorize-security-group-ingress --group-id "${SG_ID}" --protocol all \
  --source-group "${SG_ID}" --region "${REGION}" > /dev/null 2>&1
# Authorize your IP
# Force IPv4 (-4) because the rule below is written as an IPv4 /32 CIDR, and
# fail on HTTP errors (-f) so an error page is never mistaken for an address.
MY_IP=$(curl -4 -fs ifconfig.co)
if [[ "${MY_IP}" =~ ^[0-9]+(\.[0-9]+){3}$ ]]; then
  aws ec2 authorize-security-group-ingress --group-id "${SG_ID}" --protocol tcp --port 22 \
    --region "${REGION}" --cidr "${MY_IP}/32" > /dev/null 2>&1
else
  # Best effort: keep going, but tell the user SSH access was not opened.
  echo "Warning: could not determine your public IPv4 address; SSH (tcp/22) ingress was not authorized" >&2
fi
# Create a cluster placement group
# Store the ID of the "p5" cluster placement group in CPG_ID.
# --output text keeps the ID free of the surrounding double quotes that the
# JSON output format would add to a --query result.
CPG_ID=""
if ! CPG_ID=$(aws ec2 create-placement-group --region "${REGION}" --group-name p5 \
  --strategy cluster --query 'PlacementGroup.GroupId' --output text) \
  || [[ -z "${CPG_ID}" || "${CPG_ID}" == "None" ]]; then
  # Creation failed (for example the group already exists) or returned no
  # ID: fall back to looking up the existing group by name.
  CPG_ID=$(aws ec2 describe-placement-groups --region "${REGION}" --group-names p5 \
    --query 'PlacementGroups[*].GroupId' --output text)
fi
if [[ -z "${CPG_ID}" || "${CPG_ID}" == "None" ]]; then
  echo "Failed to create or find placement group p5 in region ${REGION}" >&2
  exit 1
fi
# Get the appropriate AMI
# Pick the most recently created, available Amazon-owned image whose name
# matches the PyTorch 2.0.x (Amazon Linux 2) Deep Learning AMI pattern and
# store its ID in AMI_ID.
AMI_ID=$(aws ec2 describe-images --region "${REGION}" --owners amazon \
  --filters 'Name=name,Values=Deep Learning AMI GPU PyTorch 2.0.? (Amazon Linux 2) ????????' \
    'Name=state,Values=available' \
  --query 'reverse(sort_by(Images, &CreationDate))[:1].ImageId' --output text)
# Stop here when nothing matched (or the call failed) instead of handing an
# empty --image-id to run-instances later on.
if [[ -z "${AMI_ID}" || "${AMI_ID}" == "None" ]]; then
  echo "No matching Deep Learning AMI found in region ${REGION}" >&2
  exit 1
fi
# Optional switches for the run-instances command
# IAM Profile
# --iam-instance-profile Name=${INSTANCE_PROFILE} \
# Monitoring
# Enable enhanced monitoring
# --monitoring Enabled=true \
# Launch one p5.48xlarge Spot instance into the cluster placement group with
# an EFA interface on each of network cards 0-31, and store its ID in
# INSTANCE_ID. Exits with status 1 when the launch fails.

# Use the AMI's own root device name for the root volume mapping. A mapping
# for any other device name adds an extra volume instead of sizing the root
# volume.
ROOT_DEVICE_NAME=$(aws ec2 describe-images --region "${REGION}" --image-ids "${AMI_ID}" \
  --query 'Images[0].RootDeviceName' --output text)
if [[ -z "${ROOT_DEVICE_NAME}" || "${ROOT_DEVICE_NAME}" == "None" ]]; then
  # Lookup failed: fall back to the device name this script used before.
  echo "Warning: could not determine the root device of ${AMI_ID}; assuming /dev/sda1" >&2
  ROOT_DEVICE_NAME="/dev/sda1"
fi

# Build the 32 interface specifications. Network card 0 carries device
# index 0; every other card (1-31) uses device index 1.
EFA_INTERFACE_SETTINGS="SubnetId=${SUBNET_ID},Groups=${SG_ID},AssociatePublicIpAddress=false,InterfaceType=efa"
NETWORK_INTERFACES=("DeviceIndex=0,NetworkCardIndex=0,${EFA_INTERFACE_SETTINGS}")
for ((NETWORK_CARD = 1; NETWORK_CARD <= 31; NETWORK_CARD++)); do
  NETWORK_INTERFACES+=("DeviceIndex=1,NetworkCardIndex=${NETWORK_CARD},${EFA_INTERFACE_SETTINGS}")
done

# NOTE(review): --count stays at 1 and INSTANCE_COUNT is not used here; the
# query below captures a single instance ID only.
if ! INSTANCE_ID=$(aws ec2 run-instances \
  --region "${REGION}" \
  --count 1 \
  --image-id "${AMI_ID}" \
  --instance-type p5.48xlarge \
  --instance-market-options "MarketType=spot" \
  --placement "GroupId=${CPG_ID}" \
  --key-name "${SSH_KEY}" \
  --ebs-optimized \
  --block-device-mappings "DeviceName=${ROOT_DEVICE_NAME},Ebs={VolumeSize=${ROOT_EBS_SIZE},VolumeType=gp3,Iops=${ROOT_EBS_IOPS},Throughput=${ROOT_EBS_THROUGHPUT}}" \
  --network-interfaces "${NETWORK_INTERFACES[@]}" \
  --query 'Instances[0].InstanceId' --output text); then
  echo "Failed to launch instance, check limits or response error message" >&2
  exit 1
fi
# Wait for the instance to pass its status checks, then work out the address
# to connect to: an Elastic IP attached to the primary interface when PUBLIC
# is "true", otherwise the instance's private IP address.
echo "Waiting for instance $INSTANCE_ID to start"
if ! aws ec2 wait instance-status-ok --region "${REGION}" --instance-ids "${INSTANCE_ID}"; then
  # Keep going as the original script did, but do not hide the failure.
  echo "Warning: waiting for instance ${INSTANCE_ID} to reach status OK failed or timed out" >&2
fi
# Compare as a string rather than executing the variable's content.
if [[ "${PUBLIC}" == "true" ]]; then
  # Select the primary interface by its attachment indexes (network card 0,
  # device 0) rather than by its position in the returned list.
  ENI_ID=$(aws ec2 describe-instances --region "${REGION}" --instance-ids "${INSTANCE_ID}" \
    --query 'Reservations[0].Instances[0].NetworkInterfaces[?Attachment.DeviceIndex==`0` && Attachment.NetworkCardIndex==`0`] | [0].NetworkInterfaceId' \
    --output text)
  if [[ -z "${ENI_ID}" || "${ENI_ID}" == "None" ]]; then
    echo "Could not find the primary network interface of instance ${INSTANCE_ID}" >&2
    exit 1
  fi
  if ! EIP_ALLOCATION_ID=$(aws ec2 allocate-address --region "${REGION}" \
    --query 'AllocationId' --output text); then
    echo "Failed to allocate an Elastic IP address" >&2
    exit 1
  fi
  if ! aws ec2 associate-address --region "${REGION}" --allocation-id "${EIP_ALLOCATION_ID}" \
    --network-interface-id "${ENI_ID}" > /dev/null; then
    # Report the allocation so it can be released by hand.
    echo "Failed to associate Elastic IP ${EIP_ALLOCATION_ID} with interface ${ENI_ID}" >&2
    exit 1
  fi
  TARGET_IP=$(aws ec2 describe-addresses --filters "Name=allocation-id,Values=${EIP_ALLOCATION_ID}" \
    --region "${REGION}" --query 'Addresses[*].PublicIp' --output text)
else
  TARGET_IP=$(aws ec2 describe-instances --region "${REGION}" --instance-ids "${INSTANCE_ID}" \
    --query 'Reservations[0].Instances[0].PrivateIpAddress' --output text)
fi
echo "Instance $INSTANCE_ID is ready, To connect ec2-user@$TARGET_IP -i $SSH_KEY.pem"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment