@richardliaw
Created July 27, 2023 18:19
# A unique identifier for the head node and workers of this cluster.
cluster_name: gpu-docker

# The maximum number of worker nodes to launch in addition to the head
# node.
max_workers: 2
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
    image: "rayproject/ray-ml:latest-gpu"
    # image: rayproject/ray:latest-gpu  # use this one if you don't need ML dependencies, it's faster to pull
    container_name: "ray_nvidia_docker"  # e.g. ray_docker
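    # Optional extras (an illustrative sketch, not part of the original gist):
    # the cluster launcher's docker section also accepts "pull_before_run" and
    # "run_options" for extra "docker run" flags, e.g. a larger shared-memory size.
    # pull_before_run: True
    # run_options:
    #     - --shm-size=8g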
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Cloud-provider specific configuration.
provider:
    type: aws
    region: us-west-2
    # Availability zone(s), comma-separated, that nodes may be launched in.
    # Nodes will be launched in the first listed availability zone and will
    # be tried in the subsequent availability zones if launching fails.
    availability_zone: us-west-2a,us-west-2b
# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: ubuntu
    # By default Ray creates a new private keypair, but you can also use your own.
    # If you do so, make sure to also set "KeyName" in the head and worker node
    # configurations below.
    # ssh_private_key: /path/to/your/key.pem
# The node config specifies the launch config and physical instance type.
available_node_types:
    # GPU head node.
    ray.head.gpu:
        # Provider-specific config for this node type, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
        # For more documentation on available fields, see:
        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
        node_config:
            InstanceType: p3.8xlarge
            # Default AMI. Uncomment to use a different AMI.
            # ImageId:
            # You can provision additional disk space with a config as follows.
            BlockDeviceMappings:
                - DeviceName: /dev/sda1
                  Ebs:
                      VolumeSize: 140
            # Additional options in the boto docs.
    # GPU workers.
    ray.worker.default:
        # Minimum and maximum number of worker nodes to start.
        min_workers: 1
        max_workers: 2
        node_config:
            InstanceType: p3.8xlarge
            # Default AMI. Uncomment to use a different AMI.
            # ImageId:
            # Run workers on spot by default. Comment this out to use on-demand.
            InstanceMarketOptions:
                MarketType: spot
                # Additional options can be found in the boto docs, e.g.
                #   SpotOptions:
                #       MaxPrice: MAX_HOURLY_PRICE
            # Additional options in the boto docs.
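    # An illustrative sketch (not in the original gist): additional worker types
    # can be listed alongside the GPU workers, e.g. a small CPU-only pool. The
    # type name and instance type below are assumptions, not part of this config.
    # ray.worker.cpu:
    #     min_workers: 0
    #     max_workers: 2
    #     node_config:
    #         InstanceType: m5.2xlarge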
# Specify the node type of the head node (as configured above).
head_node_type: ray.head.gpu

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
#    "/path1/on/remote/machine": "/path1/on/local/machine",
#    "/path2/on/remote/machine": "/path2/on/local/machine",
}
# List of shell commands to run to set up nodes.
# NOTE: rayproject/ray:latest has the latest Ray bundled.
setup_commands: []
    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
    # - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
    - pip install boto3>=1.4.8  # 1.4.8 adds InstanceMarketOptions

# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
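# ---------------------------------------------------------------------------
# Usage sketch (not part of the original gist; the filename below is
# illustrative). With this config saved locally and AWS credentials set up,
# the Ray cluster launcher CLI can create the cluster, run a command on the
# head node, open an SSH session into the head container, and tear it down:
#
#   ray up gpu-docker.yaml
#   ray exec gpu-docker.yaml 'nvidia-smi'
#   ray attach gpu-docker.yaml
#   ray down gpu-docker.yaml
# ---------------------------------------------------------------------------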