Created
June 26, 2024 04:26
-
-
Save asaiacai/8fb5bd877af7ce00f1ffcb4ad7aebfaa to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(base) gcpuser@k3s-ebd1-head-av78ndgn-compute:/usr/local/bin$ sky show-gpus --cloud kubernetes
No GPUs found in Kubernetes cluster. If your cluster contains GPUs, make sure nvidia.com/gpu resource is available on the nodes and the node labels for identifying GPUs (e.g., skypilot.co/accelerator) are setup correctly. To further debug, run: sky check
(base) gcpuser@k3s-ebd1-head-av78ndgn-compute:/usr/local/bin$ sky launch -c test2 --cloud kubernetes
Traceback (most recent call last): | |
File "/opt/conda/bin/sky", line 8, in <module> | |
sys.exit(cli()) | |
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1157, in __call__ | |
return self.main(*args, **kwargs) | |
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1078, in main | |
rv = self.invoke(ctx) | |
File "/home/gcpuser/skypilot/sky/utils/common_utils.py", line 367, in _record | |
return f(*args, **kwargs) | |
File "/home/gcpuser/skypilot/sky/cli.py", line 806, in invoke | |
return super().invoke(ctx) | |
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1688, in invoke | |
return _process_result(sub_ctx.command.invoke(sub_ctx)) | |
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1434, in invoke | |
return ctx.invoke(self.callback, **ctx.params) | |
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 783, in invoke | |
return __callback(*args, **kwargs) | |
File "/home/gcpuser/skypilot/sky/utils/common_utils.py", line 388, in _record | |
return f(*args, **kwargs) | |
File "/home/gcpuser/skypilot/sky/cli.py", line 1119, in launch | |
_launch_with_confirm(task, | |
File "/home/gcpuser/skypilot/sky/cli.py", line 572, in _launch_with_confirm | |
dag = sky.optimize(dag) | |
File "/home/gcpuser/skypilot/sky/optimizer.py", line 129, in optimize | |
unused_best_plan = Optimizer._optimize_dag( | |
File "/home/gcpuser/skypilot/sky/optimizer.py", line 1013, in _optimize_dag | |
Optimizer._estimate_nodes_cost_or_time(local_topo_order, | |
File "/home/gcpuser/skypilot/sky/optimizer.py", line 271, in _estimate_nodes_cost_or_time | |
_fill_in_launchable_resources( | |
File "/home/gcpuser/skypilot/sky/optimizer.py", line 1244, in _fill_in_launchable_resources | |
fuzzy_candidate_list) = cloud.get_feasible_launchable_resources( | |
File "/home/gcpuser/skypilot/sky/clouds/cloud.py", line 376, in get_feasible_launchable_resources | |
return self._get_feasible_launchable_resources(resources) | |
File "/home/gcpuser/skypilot/sky/clouds/kubernetes.py", line 387, in _get_feasible_launchable_resources | |
fits, reason = kubernetes_utils.check_instance_fits( | |
File "/home/gcpuser/skypilot/sky/provision/kubernetes/utils.py", line 399, in check_instance_fits | |
nodes = get_kubernetes_nodes() | |
File "/home/gcpuser/skypilot/sky/provision/kubernetes/utils.py", line 340, in get_kubernetes_nodes | |
raise exceptions.ResourcesUnavailableError( | |
sky.exceptions.ResourcesUnavailableError: Timed out when trying to get node info from Kubernetes cluster. Please check if the cluster is healthy and retry.
(base) gcpuser@k3s-ebd1-head-av78ndgn-compute:/usr/local/bin$ sky exec test "echo hi"
Task from command: echo hi
Executing task on cluster test...
E 06-26 04:17:05 subprocess_utils.py:84] The connection to the server 127.0.0.1:6443 was refused - did you specify the right host or port? | |
E 06-26 04:17:05 subprocess_utils.py:84] | |
I 06-26 04:17:05 cloud_vm_ray_backend.py:3466] | |
I 06-26 04:17:05 cloud_vm_ray_backend.py:3466] Cluster name: test | |
I 06-26 04:17:05 cloud_vm_ray_backend.py:3466] To log into the head VM: ssh test | |
I 06-26 04:17:05 cloud_vm_ray_backend.py:3466] To submit a job: sky exec test yaml_file | |
I 06-26 04:17:05 cloud_vm_ray_backend.py:3466] To stop the cluster: sky stop test | |
I 06-26 04:17:05 cloud_vm_ray_backend.py:3466] To teardown the cluster: sky down test | |
Clusters
NAME  LAUNCHED    RESOURCES                 STATUS  AUTOSTOP  COMMAND
test  6 mins ago  1x Kubernetes(2CPU--2GB)  UP      -         sky exec test echo hi
sky.exceptions.CommandError: Command $([ -s ~/.sky/python_path ] && cat ~/.sky/python_path 2> /dev/null || which python3) -u -c 'import os;import getpass;from sky.skylet import job_lib, log_lib, constants;job_owner_kwargs = {} if getattr(constants, "SKYLET_LIB_VERSION", 0) >= 1 else {"job_owner": getpass.getuser()};job_id = job_lib.add_job('"'"'sky-cmd'"'"', '"'"'gcpuser'"'"', '"'"'sky-2024-06-26-04-17-05-281904'"'"', '"'"'1x[CPU:1+]'"'"');print("Job ID: " + str(job_id), flush=True)' failed with return code 1.
Failed to fetch job id.
(base) gcpuser@k3s-ebd1-head-av78ndgn-compute:/usr/local/bin$ sky down test
Terminating 1 cluster: test. Proceed? [Y/n]: Y
Traceback (most recent call last): | |
File "/opt/conda/lib/python3.10/site-packages/urllib3/connection.py", line 203, in _new_conn | |
sock = connection.create_connection( | |
File "/opt/conda/lib/python3.10/site-packages/urllib3/util/connection.py", line 85, in create_connection | |
raise err | |
File "/opt/conda/lib/python3.10/site-packages/urllib3/util/connection.py", line 73, in create_connection | |
sock.connect(sa) | |
ConnectionRefusedError: [Errno 111] Connection refused | |
The above exception was the direct cause of the following exception: | |
Traceback (most recent call last): | |
File "/opt/conda/lib/python3.10/site-packages/urllib3/connectionpool.py", line 791, in urlopen | |
response = self._make_request( | |
File "/opt/conda/lib/python3.10/site-packages/urllib3/connectionpool.py", line 492, in _make_request | |
raise new_e | |
File "/opt/conda/lib/python3.10/site-packages/urllib3/connectionpool.py", line 468, in _make_request | |
self._validate_conn(conn) | |
File "/opt/conda/lib/python3.10/site-packages/urllib3/connectionpool.py", line 1097, in _validate_conn | |
conn.connect() | |
File "/opt/conda/lib/python3.10/site-packages/urllib3/connection.py", line 611, in connect | |
self.sock = sock = self._new_conn() | |
File "/opt/conda/lib/python3.10/site-packages/urllib3/connection.py", line 218, in _new_conn | |
raise NewConnectionError( | |
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPSConnection object at 0x7f92c84a1cf0>: Failed to establish a new connection: [Errno 111] Connection refused | |
The above exception was the direct cause of the following exception: | |
Traceback (most recent call last): | |
File "/opt/conda/bin/sky", line 8, in <module> | |
sys.exit(cli()) | |
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1157, in __call__ | |
return self.main(*args, **kwargs) | |
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1078, in main | |
rv = self.invoke(ctx) | |
File "/home/gcpuser/skypilot/sky/utils/common_utils.py", line 367, in _record | |
return f(*args, **kwargs) | |
File "/home/gcpuser/skypilot/sky/cli.py", line 806, in invoke | |
return super().invoke(ctx) | |
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1688, in invoke | |
return _process_result(sub_ctx.command.invoke(sub_ctx)) | |
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1434, in invoke | |
return ctx.invoke(self.callback, **ctx.params) | |
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 783, in invoke | |
return __callback(*args, **kwargs) | |
File "/home/gcpuser/skypilot/sky/utils/common_utils.py", line 388, in _record | |
return f(*args, **kwargs) | |
File "/home/gcpuser/skypilot/sky/cli.py", line 2563, in down | |
_down_or_stop_clusters(clusters, | |
File "/home/gcpuser/skypilot/sky/cli.py", line 2881, in _down_or_stop_clusters | |
subprocess_utils.run_in_parallel(_down_or_stop, clusters) | |
File "/home/gcpuser/skypilot/sky/utils/subprocess_utils.py", line 65, in run_in_parallel | |
return list(p.imap(func, args)) | |
File "/opt/conda/lib/python3.10/multiprocessing/pool.py", line 873, in next | |
raise value | |
File "/opt/conda/lib/python3.10/multiprocessing/pool.py", line 125, in worker | |
result = (True, func(*args, **kwds)) | |
File "/home/gcpuser/skypilot/sky/cli.py", line 2853, in _down_or_stop | |
core.down(name, purge=purge) | |
File "/home/gcpuser/skypilot/sky/utils/common_utils.py", line 388, in _record | |
return f(*args, **kwargs) | |
File "/home/gcpuser/skypilot/sky/core.py", line 401, in down | |
backend.teardown(handle, terminate=True, purge=purge) | |
File "/home/gcpuser/skypilot/sky/utils/common_utils.py", line 388, in _record | |
return f(*args, **kwargs) | |
File "/home/gcpuser/skypilot/sky/utils/common_utils.py", line 367, in _record | |
return f(*args, **kwargs) | |
File "/home/gcpuser/skypilot/sky/backends/backend.py", line 116, in teardown | |
self._teardown(handle, terminate, purge) | |
File "/home/gcpuser/skypilot/sky/backends/cloud_vm_ray_backend.py", line 3539, in _teardown | |
self.teardown_no_lock( | |
File "/home/gcpuser/skypilot/sky/backends/cloud_vm_ray_backend.py", line 3868, in teardown_no_lock | |
provisioner.teardown_cluster(repr(cloud), | |
File "/home/gcpuser/skypilot/sky/provision/provisioner.py", line 237, in teardown_cluster | |
provision.terminate_instances(cloud_name, cluster_name.name_on_cloud, | |
File "/home/gcpuser/skypilot/sky/provision/__init__.py", line 47, in _wrapper | |
return impl(*args, **kwargs) | |
File "/home/gcpuser/skypilot/sky/provision/kubernetes/instance.py", line 629, in terminate_instances | |
pods = _filter_pods(namespace, tag_filters, None) | |
File "/home/gcpuser/skypilot/sky/provision/kubernetes/instance.py", line 62, in _filter_pods | |
pod_list = kubernetes.core_api().list_namespaced_pod( | |
File "/opt/conda/lib/python3.10/contextlib.py", line 79, in inner | |
return func(*args, **kwds) | |
File "/opt/conda/lib/python3.10/site-packages/kubernetes/client/api/core_v1_api.py", line 15697, in list_namespaced_pod | |
return self.list_namespaced_pod_with_http_info(namespace, **kwargs) # noqa: E501 | |
File "/opt/conda/lib/python3.10/contextlib.py", line 79, in inner | |
return func(*args, **kwds) | |
File "/opt/conda/lib/python3.10/site-packages/kubernetes/client/api/core_v1_api.py", line 15812, in list_namespaced_pod_with_http_info | |
return self.api_client.call_api( | |
File "/opt/conda/lib/python3.10/site-packages/kubernetes/client/api_client.py", line 348, in call_api | |
return self.__call_api(resource_path, method, | |
File "/opt/conda/lib/python3.10/site-packages/kubernetes/client/api_client.py", line 180, in __call_api | |
response_data = self.request( | |
File "/opt/conda/lib/python3.10/site-packages/kubernetes/client/api_client.py", line 373, in request | |
return self.rest_client.GET(url, | |
File "/opt/conda/lib/python3.10/site-packages/kubernetes/client/rest.py", line 241, in GET | |
return self.request("GET", url, | |
File "/opt/conda/lib/python3.10/site-packages/kubernetes/client/rest.py", line 214, in request | |
r = self.pool_manager.request(method, url, | |
File "/opt/conda/lib/python3.10/site-packages/urllib3/_request_methods.py", line 110, in request | |
return self.request_encode_url( | |
File "/opt/conda/lib/python3.10/site-packages/urllib3/_request_methods.py", line 143, in request_encode_url | |
return self.urlopen(method, url, **extra_kw) | |
File "/opt/conda/lib/python3.10/site-packages/urllib3/poolmanager.py", line 443, in urlopen | |
response = conn.urlopen(method, u.request_uri, **kw) | |
File "/opt/conda/lib/python3.10/site-packages/urllib3/connectionpool.py", line 875, in urlopen | |
return self.urlopen( | |
File "/opt/conda/lib/python3.10/site-packages/urllib3/connectionpool.py", line 875, in urlopen | |
return self.urlopen( | |
File "/opt/conda/lib/python3.10/site-packages/urllib3/connectionpool.py", line 875, in urlopen | |
return self.urlopen( | |
File "/opt/conda/lib/python3.10/site-packages/urllib3/connectionpool.py", line 845, in urlopen | |
retries = retries.increment( | |
File "/opt/conda/lib/python3.10/site-packages/urllib3/util/retry.py", line 515, in increment | |
raise MaxRetryError(_pool, url, reason) from reason # type: ignore[arg-type] | |
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='127.0.0.1', port=6443): Max retries exceeded with url: /api/v1/namespaces/default/pods?fieldSelector=&labelSelector=ray-cluster-name%3Dtest-6daf (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f92c84a1cf0>: Failed to establish a new connection: [Errno 111] Connection refused'))
(base) gcpuser@k3s-ebd1-head-av78ndgn-compute:/usr/local/bin$ cd ~/skypilot
(base) gcpuser@k3s-ebd1-head-av78ndgn-compute:~/skypilot$ sky jobs launch "echo hihi"
Task from command: echo hihi
Managed job 'sky-cmd' will be launched on (estimated):
Traceback (most recent call last): | |
File "/opt/conda/bin/sky", line 8, in <module> | |
sys.exit(cli()) | |
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1157, in __call__ | |
return self.main(*args, **kwargs) | |
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1078, in main | |
rv = self.invoke(ctx) | |
File "/home/gcpuser/skypilot/sky/utils/common_utils.py", line 367, in _record | |
return f(*args, **kwargs) | |
File "/home/gcpuser/skypilot/sky/cli.py", line 806, in invoke | |
return super().invoke(ctx) | |
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1688, in invoke | |
return _process_result(sub_ctx.command.invoke(sub_ctx)) | |
File "/home/gcpuser/skypilot/sky/utils/common_utils.py", line 367, in _record | |
return f(*args, **kwargs) | |
File "/home/gcpuser/skypilot/sky/cli.py", line 806, in invoke | |
return super().invoke(ctx) | |
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1688, in invoke | |
return _process_result(sub_ctx.command.invoke(sub_ctx)) | |
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1434, in invoke | |
return ctx.invoke(self.callback, **ctx.params) | |
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 783, in invoke | |
return __callback(*args, **kwargs) | |
File "/home/gcpuser/skypilot/sky/utils/common_utils.py", line 388, in _record | |
return f(*args, **kwargs) | |
File "/home/gcpuser/skypilot/sky/utils/common_utils.py", line 388, in _record | |
return f(*args, **kwargs) | |
File "/home/gcpuser/skypilot/sky/cli.py", line 3569, in jobs_launch | |
dag = sky.optimize(dag) | |
File "/home/gcpuser/skypilot/sky/optimizer.py", line 129, in optimize | |
unused_best_plan = Optimizer._optimize_dag( | |
File "/home/gcpuser/skypilot/sky/optimizer.py", line 1013, in _optimize_dag | |
Optimizer._estimate_nodes_cost_or_time(local_topo_order, | |
File "/home/gcpuser/skypilot/sky/optimizer.py", line 271, in _estimate_nodes_cost_or_time | |
_fill_in_launchable_resources( | |
File "/home/gcpuser/skypilot/sky/optimizer.py", line 1244, in _fill_in_launchable_resources | |
fuzzy_candidate_list) = cloud.get_feasible_launchable_resources( | |
File "/home/gcpuser/skypilot/sky/clouds/cloud.py", line 376, in get_feasible_launchable_resources | |
return self._get_feasible_launchable_resources(resources) | |
File "/home/gcpuser/skypilot/sky/clouds/kubernetes.py", line 387, in _get_feasible_launchable_resources | |
fits, reason = kubernetes_utils.check_instance_fits( | |
File "/home/gcpuser/skypilot/sky/provision/kubernetes/utils.py", line 399, in check_instance_fits | |
nodes = get_kubernetes_nodes() | |
File "/home/gcpuser/skypilot/sky/provision/kubernetes/utils.py", line 340, in get_kubernetes_nodes | |
raise exceptions.ResourcesUnavailableError( | |
sky.exceptions.ResourcesUnavailableError: Timed out when trying to get node info from Kubernetes cluster. Please check if the cluster is healthy and retry.
(base) gcpuser@k3s-ebd1-head-av78ndgn-compute:~/skypilot$ sky serve up ~/skypilot/examples/serve/vllm.yaml
Service from YAML spec: /home/gcpuser/skypilot/examples/serve/vllm.yaml
Service Spec:
Readiness probe method: GET /v1/models
Readiness initial delay seconds: 1200
Replica autoscaling policy: Fixed 2 replicas
Spot Policy: No spot policy
Each replica will use the following resources (estimated):
sky.exceptions.ResourcesUnavailableError: Timed out when trying to get node info from Kubernetes cluster. Please check if the cluster is healthy and retry.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment