@augray
Created January 6, 2023 19:09

apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
  labels:
    controller-tools.k8s.io: "1.0"
  # A unique identifier for the head node and workers of this cluster.
  name: raycluster-complete
spec:
  rayVersion: '2.1.0'
  ######################headGroupSpec#################################
  # Ray head pod template and specs
  headGroupSpec:
    # Kubernetes Service type; valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'.
    serviceType: ClusterIP
    # The following params complete the ray start command: ray start --head --block --dashboard-host='0.0.0.0' ...
    rayStartParams:
      dashboard-host: '0.0.0.0'
      block: 'true'
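      # Each key/value above becomes a flag on the rendered command, so the head
      # runs (roughly): ray start --head --block --dashboard-host='0.0.0.0'.
      # Illustrative assumption, not part of this config: an extra entry such as
      # num-cpus: '1' would render as --num-cpus=1.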
    # Pod template
    template:
      metadata:
        # Custom labels. NOTE: To avoid conflicts with the KubeRay operator, do not define custom labels that start with `raycluster`.
        # Refer to https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
        labels: {}
      spec:
        containers:
        - name: ray-head
          # image: rayproject/ray:2.1.0
          image: 558717131297.dkr.ecr.us-west-2.amazonaws.com/sematic-dev@sha256:6f089f664f135fe62b9fac3ed0261c0a7b794ba518bc109f025e516555aa9791
          ports:
          - containerPort: 6379
            name: gcs
          - containerPort: 8265
            name: dashboard
          - containerPort: 10001
            name: client
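          # Port roles (the standard Ray defaults, named above): 6379 is the GCS
          # server that workers contact to join the cluster, 8265 serves the Ray
          # dashboard, and 10001 accepts Ray Client connections (ray://...).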
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh", "-c", "ray stop"]
          volumeMounts:
          - mountPath: /tmp/ray
            name: ray-logs
          # The resource requests and limits in this config are too small for production!
          # For an example with more realistic resource configuration, see
          # ray-cluster.autoscaler.large.yaml.
          # It is better to use a few large Ray pods than many small ones.
          # For production, it is ideal to size each Ray pod to take up the
          # entire Kubernetes node on which it is scheduled.
          resources:
            limits:
              cpu: "1"
              memory: "2G"
            requests:
              # For production use cases, we recommend specifying integer CPU requests and limits.
              # We also recommend setting requests equal to limits for both CPU and memory.
              # For this example, we use a 500m CPU request to accommodate resource-constrained local
              # Kubernetes testing environments such as KinD and minikube.
              cpu: "500m"
              memory: "2G"
        volumes:
        - name: ray-logs
          emptyDir: {}
  workerGroupSpecs:
  # The number of pod replicas in this worker group.
  - replicas: 1
    minReplicas: 1
    maxReplicas: 10
    # Logical group name; here it is called small-group, but the name can describe the group's function.
    groupName: small-group
    # If worker pods need to be added, we can increment the replicas.
    # If worker pods need to be removed, we decrement the replicas and populate the workersToDelete list.
    # The operator will remove pods from the list until the desired number of replicas is satisfied.
    # If the difference between the current replica count and the desired replicas is greater than the
    # number of entries in workersToDelete, random worker pods will be deleted.
    #scaleStrategy:
    #  workersToDelete:
    #  - raycluster-complete-worker-small-group-bdtwh
    #  - raycluster-complete-worker-small-group-hv457
    #  - raycluster-complete-worker-small-group-k8tj7
    # The following params complete the ray start command: ray start --block
    rayStartParams:
      block: 'true'
    # Pod template
    template:
      spec:
        containers:
        - name: ray-worker
          # image: rayproject/ray:2.1.0
          image: 558717131297.dkr.ecr.us-west-2.amazonaws.com/sematic-dev@sha256:6f089f664f135fe62b9fac3ed0261c0a7b794ba518bc109f025e516555aa9791
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh", "-c", "ray stop"]
          # volumeMounts are optional.
          # Refer to https://kubernetes.io/docs/concepts/storage/volumes/
          volumeMounts:
          - mountPath: /tmp/ray
            name: ray-logs
          # The resource requests and limits in this config are too small for production!
          # For an example with more realistic resource configuration, see
          # ray-cluster.autoscaler.large.yaml.
          # It is better to use a few large Ray pods than many small ones.
          # For production, it is ideal to size each Ray pod to take up the
          # entire Kubernetes node on which it is scheduled.
          resources:
            limits:
              cpu: "1"
              memory: "1G"
            requests:
              # For production use cases, we recommend specifying integer CPU requests and limits.
              # We also recommend setting requests equal to limits for both CPU and memory.
              # For this example, we use a 500m CPU request to accommodate resource-constrained local
              # Kubernetes testing environments such as KinD and minikube.
              cpu: "500m"
              # For production use cases, we recommend allocating at least 8 GB of memory for each Ray container.
              memory: "1G"
        initContainers:
        # The env var $RAY_IP is set by the operator if missing, with the value of the head service name.
        - name: init
          image: busybox:1.28
          # Change the cluster domain suffix (cluster.local) if your cluster uses a non-default one.
          command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"]
        # Volumes are optional.
        # Refer to https://kubernetes.io/docs/concepts/storage/volumes/
        volumes:
        - name: ray-logs
          emptyDir: {}
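# A minimal usage sketch, not part of the manifest. It assumes the KubeRay
# operator is already installed in the cluster and this file is saved as
# raycluster-complete.yaml (the filename is arbitrary):
#
#   kubectl apply -f raycluster-complete.yaml
#   kubectl get pods -l ray.io/cluster=raycluster-complete
#
# Once the head pod is ready, the dashboard and Ray Client ports can be reached
# via port-forwarding (raycluster-complete-head-svc is the name the operator
# conventionally gives the head Service):
#
#   kubectl port-forward service/raycluster-complete-head-svc 8265:8265 10001:10001
#
# and a Ray driver can then connect with ray.init("ray://127.0.0.1:10001").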