Skip to content

Instantly share code, notes, and snippets.

@Kentzo
Last active October 23, 2019 02:16
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save Kentzo/2e4b1f713ded60de542b335e11991004 to your computer and use it in GitHub Desktop.
Save Kentzo/2e4b1f713ded60de542b335e11991004 to your computer and use it in GitHub Desktop.
Set up AWS environment for dlcourse.ai using Terraform
# See https://www.terraform.io
# ---
# Variables
# ---
# Deep Learning AMI https://aws.amazon.com/marketplace/pp/B077GCH38C
# AMI IDs are specific to a region; update this value whenever `region` changes.
variable "ami" {
  description = "ID of the AMI the GPU spot instance is launched from."
  default     = "ami-0656a055aec8320d6"
}
variable "region" {
  description = "AWS region used by the provider."
  default     = "us-west-1"
}
variable "availability_zone" {
  description = "Availability zone of the default subnet and of the spot instance."
  default     = "us-west-1a"
}
# Recommended GPU instance types for the Deep Learning AMI:
# https://docs.aws.amazon.com/dlami/latest/devguide/gpu.html
variable "instance_type" {
  description = "EC2 instance type requested for the spot instance."
  default     = "g3.4xlarge"
}
# Set this at least as high as the on-demand price of the selected instance type in the selected region
# https://aws.amazon.com/ec2/pricing/on-demand/
variable "spot_price" {
  description = "Maximum price to pay for the spot instance (passed to spot_price of the spot request)."
  default     = "1.5"
}
# Manually create an EBS volume in the selected availability zone (it must match the instance's zone) and set the default to its ID ("vol-...")
variable "volume" {
  description = "ID of the pre-existing EBS volume attached to the instance as the course data disk."
  default     = "vol-026f080e66a5c18dd"
}
# NOTE(review): this default is a credential committed in plain text; anyone
# who has seen this file can open the notebook server. Override it (e.g. with
# -var or a tfvars file) before applying.
variable "jupyter_token" {
  description = "Access token written into the Jupyter config and embedded in the jupyter output URL."
  default     = "f572d396fae9206628714fb2ce00f72e94f2258f"
}
# ---
# Config
# ---
# AWS provider: credentials come from the "default" profile, the region from
# the `region` variable.
provider "aws" {
  region  = "${var.region}"
  profile = "default"
}
# Adopts the default subnet of the chosen availability zone into Terraform.
# NOTE(review): no other resource visible in this file references this subnet.
resource "aws_default_subnet" "default" {
  availability_zone = "${var.availability_zone}"
}
# Adopts the region's default VPC; its ID is used by the security group.
resource "aws_default_vpc" "default" {
}
# Security group in the default VPC: inbound SSH (22) and JupyterLab (8080)
# from any IPv4 address, unrestricted outbound traffic.
# NOTE(review): both inbound ports are open to the whole internet; consider
# narrowing cidr_blocks to your own address range.
resource "aws_security_group" "ports" {
  name   = "ports"
  vpc_id = "${aws_default_vpc.default.id}"

  # SSH
  ingress {
    protocol    = "tcp"
    from_port   = 22
    to_port     = 22
    cidr_blocks = ["0.0.0.0/0"]
  }

  # JupyterLab (the port the jupyter.service unit listens on)
  ingress {
    protocol    = "tcp"
    from_port   = 8080
    to_port     = 8080
    cidr_blocks = ["0.0.0.0/0"]
  }

  # Protocol "-1" with ports 0-0 means all protocols and all ports.
  egress {
    protocol    = "-1"
    from_port   = 0
    to_port     = 0
    cidr_blocks = ["0.0.0.0/0"]
  }
}
# One-time spot request for the GPU instance. Terraform waits until the
# request is fulfilled. The instance configures itself on boot through the
# cloud-config passed as user_data:
#   * bootcmd / mounts - create an ext4 filesystem on /dev/xvdf when it has
#     none and mount it at /home/ubuntu/dlcourse.
#   * write_files      - systemd unit for JupyterLab, the Jupyter config
#     (including the access token) and a script that builds nvtop.
#   * runcmd           - clone the course repository if missing, build and
#     install nvtop, apply GPU settings, start JupyterLab.
#
# Details that matter in the cloud-config below:
#   * The `git clone` in runcmd must NOT be wrapped in quotes: a quoted
#     string is looked up by the shell as a single command name and fails
#     with "command not found". The repository is cloned over HTTPS because
#     a fresh instance has no SSH key or known_hosts entry for git@github.com.
#   * The unit file is mode 0644: systemd warns about unit files that carry
#     executable bits.
#   * The Jupyter config holds the access token, so it is readable by its
#     owner only (0600); the service runs as that owner (User=ubuntu).
#   * The nvtop script starts with a shebang so it can be executed directly.
#
# NOTE(review): the EBS volume is attached by aws_volume_attachment only after
# the instance exists, so on the very first boot bootcmd/mounts may run before
# /dev/xvdf is present - verify the mount after the first apply.
resource "aws_spot_instance_request" "gpu" {
  ami                  = "${var.ami}"
  instance_type        = "${var.instance_type}"
  availability_zone    = "${var.availability_zone}"
  spot_price           = "${var.spot_price}"
  spot_type            = "one-time"
  wait_for_fulfillment = true
  ebs_optimized        = true

  # Name of an EC2 key pair that must already exist in the selected region.
  key_name = "id_rsa"

  vpc_security_group_ids = ["${aws_security_group.ports.id}"]

  tags = {
    Name = "dlcourse.ai"
  }

  # The heredoc is passed verbatim, so the YAML must start at column 0 and
  # "#cloud-config" must be its first line.
  user_data = <<EOF
#cloud-config
package_update: true
packages:
  - htop
  - libncurses5-dev
  - libncursesw5-dev
bootcmd:
  - test -z "$(blkid /dev/xvdf)" && mkfs -t ext4 -L dlcourse /dev/xvdf
  - mkdir -p /home/ubuntu/dlcourse
mounts:
  - [ "/dev/xvdf", "/home/ubuntu/dlcourse", "ext4", "defaults,nofail", "0", "2" ]
write_files:
  - content: |
      [Unit]
      Description=Jupyter Notebook
      [Service]
      Type=simple
      PIDFile=/run/jupyter.pid
      ExecStart=/home/ubuntu/anaconda3/envs/pytorch_p36/bin/jupyter-lab --port 8080 --ip 0.0.0.0 --no-browser
      User=ubuntu
      Group=ubuntu
      WorkingDirectory=/home/ubuntu/dlcourse/dlcourse_ai
      Restart=always
      RestartSec=10
      Environment=TORCH_MODEL_ZOO=/home/ubuntu/dlcourse
      Environment=MPLCONFIGDIR=/home/ubuntu/dlcourse/matplotlib
      [Install]
      WantedBy=multi-user.target
    path: /etc/systemd/system/jupyter.service
    owner: root:root
    permissions: '0644'
  - content: |
      c.NotebookApp.kernel_spec_manager_class = 'environment_kernels.EnvironmentKernelSpecManager'
      c.NotebookApp.iopub_data_rate_limit = 10000000000
      c.NotebookApp.token = '${var.jupyter_token}'
    path: /home/ubuntu/.jupyter/jupyter_notebook_config.py
    owner: ubuntu:ubuntu
    permissions: '0600'
  - content: |
      #!/bin/sh
      set -e
      sudo apt-get install -y --reinstall cmake
      if [ ! -d "/home/ubuntu/dlcourse/nvtop" ]; then
        git clone https://github.com/Syllo/nvtop.git --depth 1 /home/ubuntu/dlcourse/nvtop
        mkdir -p /home/ubuntu/dlcourse/nvtop/build && cd /home/ubuntu/dlcourse/nvtop/build
        cmake .. -DCMAKE_BUILD_TYPE=Release
        make
      fi
      cd /home/ubuntu/dlcourse/nvtop/build
      sudo make install
    path: /tmp/install_nvtop.sh
    permissions: '0755'
runcmd:
  - test -d "/home/ubuntu/dlcourse/dlcourse_ai" || git clone --depth 1 https://github.com/sim0nsays/dlcourse_ai.git /home/ubuntu/dlcourse/dlcourse_ai
  - chown -R ubuntu:ubuntu /home/ubuntu/dlcourse
  - sudo -u ubuntu /tmp/install_nvtop.sh
  # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/optimize_gpu.html
  - sudo nvidia-persistenced
  - sudo nvidia-smi --auto-boost-default=0
  # - sudo nvidia-smi -ac 2505,1177
  - systemctl enable jupyter.service
  - systemctl daemon-reload
  - systemctl restart jupyter.service
EOF
}
# Attaches the pre-existing data volume to the fulfilled spot instance.
# It is attached as /dev/sdf; the instance's cloud-config in this file refers
# to the same disk as /dev/xvdf.
resource "aws_volume_attachment" "dlcourse_attachment" {
  instance_id = "${aws_spot_instance_request.gpu.spot_instance_id}"
  volume_id   = "${var.volume}"
  device_name = "/dev/sdf"

  # On destroy, only drop the attachment from Terraform state instead of
  # detaching the volume from the instance.
  skip_destroy = true
}
# Ready-to-open JupyterLab URL (port 8080) for the spot instance.
# Note that the access token is part of the printed value.
output "jupyter" {
  value = "http://${aws_spot_instance_request.gpu.public_dns}:8080/?token=${var.jupyter_token}"
}
# Command line for logging in to the spot instance as the ubuntu user.
output "ssh" {
  value = "ssh ubuntu@${aws_spot_instance_request.gpu.public_dns}"
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment