-
-
Save mackrorysd/dcb7291a0dcc6549f1227fd1d0bf5eb5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<info> [2021-04-05, 22:11:08] master configuration: {"config_file":"","log":{"level":"info","color":true},"db":{"user":"postgres","password":"********","migrations":"file:///usr/share/determined/master/static/migrations","host":"10.108.192.5","port":"5432","name":"postgres","ssl_mode":"disable","ssl_root_cert":""},"tensorboard_timeout":300,"security":{"default_task":{"id":0,"user_id":0,"user":"root","uid":0,"group":"root","gid":0},"tls":{"cert":"/etc/ssl/certs/determined.cer","key":"/etc/ssl/private/determined.key"}},"checkpoint_storage":{"bucket":"release-party","save_experiment_best":0,"save_trial_best":1,"save_trial_latest":1,"type":"gcs"},"task_container_defaults":{"shm_size_bytes":4294967296,"network_mode":"bridge","cpu_pod_spec":null,"gpu_pod_spec":null},"port":443,"harness_path":"/opt/determined","root":"/usr/share/determined/master","telemetry":{"enabled":false,"segment_master_key":"********","segment_webui_key":"********"},"enable_cors":false,"cluster_name":"","logging":{"type":"default"},"hyperparameter_importance":{"workers_limit":0,"queue_limit":16,"cores_per_worker":1,"max_trees":100},"resource_manager":{"default_cpu_resource_pool":"default-cpu-pool","default_gpu_resource_pool":"default-gpu-pool","scheduler":{"default_priority":42,"fitting_policy":"best","preemption":true,"type":"priority"},"type":"agent"},"resource_pools":[{"pool_name":"default-gpu-pool","description":"","provider":{"agent_docker_image":"determinedai/determined-ee-agent:0.14.6","agent_docker_network":"host","agent_docker_runtime":"runc","agent_fluent_image":"fluent/fluent-bit:1.6","base_config":{"minCpuPlatform":"Intel Broadwell"},"boot_disk_size":200,"boot_disk_source_image":"projects/determined-ai/global/images/det-environments-067db2b","container_startup_script":"export HOME=/root\napt-get update \u0026\u0026 apt-get install -y curl docker.io\ncurl -fsSL \"https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v1.5.0/docker-credential-gcr_linux_amd64-1.5.0.tar.gz\" | tar xz --to-stdout \u003e /usr/bin/docker-credential-gcr \u0026\u0026 chmod +x /usr/bin/docker-credential-gcr\ndocker-credential-gcr configure-docker\n","instance_type":{"gpu_num":8,"gpu_type":"nvidia-tesla-v100","machine_type":"n1-standard-32","preemptible":false},"label_key":"managed-by","label_value":"","master_cert_name":"gcloud.determined.ai","master_url":"https://internal-ip:443","max_agent_starting_period":"20m0s","max_idle_agent_period":"30m0s","max_instances":8,"min_instances":0,"name_prefix":"det-agent-release-party-","network_interface":{"external_ip":false,"network":"projects/dai-public/global/networks/restricted-shared-network","subnetwork":"projects/dai-public/regions/us-west1/subnetworks/restricted-shared-network"},"network_tags":["https-server"],"operation_timeout_period":"5m0s","project":"","service_account":{"email":"argo-determined-ai-vm-agent@determined-ai.iam.gserviceaccount.com","scopes":["https://www.googleapis.com/auth/cloud-platform"]},"startup_script":"systemctl stop apt-daily.service\nsystemctl kill --kill-who=all apt-daily.service\nwhile pgrep -f apt.systemd.daily \u003e /dev/null; do echo waiting; sleep 1; done\n\necho \"cc2c8fc7-a61f-4ed1-935f-d6e04445c656\" | docker login -u=\"determinedaicustomer\" --password-stdin\n\nexport HOME=/root\napt-get update \u0026\u0026 apt-get install -y curl docker.io\ncurl -fsSL \"https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v1.5.0/docker-credential-gcr_linux_amd64-1.5.0.tar.gz\" | tar xz --to-stdout \u003e /usr/bin/docker-credential-gcr \u0026\u0026 chmod +x /usr/bin/docker-credential-gcr\ndocker-credential-gcr configure-docker\ndocker pull determinedai/determined-ee-agent:0.14.6 \n\ncurl -sSO https://dl.google.com/cloudagents/add-monitoring-agent-repo.sh\nbash add-monitoring-agent-repo.sh\napt-get update\napt-get install -y stackdriver-agent\nservice stackdriver-agent start\n\ncurl -sSO https://dl.google.com/cloudagents/add-logging-agent-repo.sh\nbash add-logging-agent-repo.sh\napt-get update\napt-get install google-fluentd\napt-get install -y google-fluentd-catch-all-config\nservice google-fluentd start\nservice google-fluentd status\n\napt-get install -y python-pip\npip install --upgrade pip\npip install --upgrade setuptools\ngit clone https://github.com/GoogleCloudPlatform/tensorflow-inference-tensorrt5-t4-gpu.git\ncd tensorflow-inference-tensorrt5-t4-gpu/metrics_reporting\npip install -r ./requirements.txt\ncp report_gpu_metrics.py /root/\ncat \u003c\u003c-EOH \u003e /lib/systemd/system/gpu_utilization_agent.service\n[Unit]\nDescription=GPU Utilization Metric Agent\n[Service]\nPIDFile=/run/gpu_agent.pid\nExecStart=/bin/bash --login -c '/usr/bin/python /root/report_gpu_metrics.py'\nUser=root\nGroup=root\nWorkingDirectory=/\nRestart=always\n[Install]\nWantedBy=multi-user.target\nEOH\nsystemctl daemon-reload\nsystemctl --no-reload --now enable /lib/systemd/system/gpu_utilization_agent.service\n","type":"gcp","zone":""},"max_cpu_containers_per_agent":0},{"pool_name":"default-cpu-pool","description":"","provider":{"agent_docker_image":"determinedai/determined-ee-agent:0.14.6","agent_docker_network":"host","agent_docker_runtime":"runc","agent_fluent_image":"fluent/fluent-bit:1.6","base_config":{"minCpuPlatform":"Intel Broadwell"},"boot_disk_size":200,"boot_disk_source_image":"projects/determined-ai/global/images/det-environments-067db2b","container_startup_script":"export HOME=/root\napt-get update \u0026\u0026 apt-get install -y curl docker.io\ncurl -fsSL \"https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v1.5.0/docker-credential-gcr_linux_amd64-1.5.0.tar.gz\" | tar xz --to-stdout \u003e /usr/bin/docker-credential-gcr \u0026\u0026 chmod +x /usr/bin/docker-credential-gcr\ndocker-credential-gcr configure-docker\n","instance_type":{"gpu_num":0,"gpu_type":"nvidia-tesla-v100","machine_type":"n1-standard-32","preemptible":false},"label_key":"managed-by","label_value":"","master_cert_name":"gcloud.determined.ai","master_url":"https://internal-ip:443","max_agent_starting_period":"20m0s","max_idle_agent_period":"30m0s","max_instances":8,"min_instances":0,"name_prefix":"det-agent-release-party-","network_interface":{"external_ip":false,"network":"projects/dai-public/global/networks/restricted-shared-network","subnetwork":"projects/dai-public/regions/us-west1/subnetworks/restricted-shared-network"},"network_tags":["https-server"],"operation_timeout_period":"5m0s","project":"","service_account":{"email":"argo-determined-ai-vm-agent@determined-ai.iam.gserviceaccount.com","scopes":["https://www.googleapis.com/auth/cloud-platform"]},"startup_script":"systemctl stop apt-daily.service\nsystemctl kill --kill-who=all apt-daily.service\nwhile pgrep -f apt.systemd.daily \u003e /dev/null; do echo waiting; sleep 1; done\n\necho \"cc2c8fc7-a61f-4ed1-935f-d6e04445c656\" | docker login -u=\"determinedaicustomer\" --password-stdin\n\nexport HOME=/root\napt-get update \u0026\u0026 apt-get install -y curl docker.io\ncurl -fsSL \"https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v1.5.0/docker-credential-gcr_linux_amd64-1.5.0.tar.gz\" | tar xz --to-stdout \u003e /usr/bin/docker-credential-gcr \u0026\u0026 chmod +x /usr/bin/docker-credential-gcr\ndocker-credential-gcr configure-docker\ndocker pull determinedai/determined-ee-agent:0.14.6 \n\ncurl -sSO https://dl.google.com/cloudagents/add-monitoring-agent-repo.sh\nbash add-monitoring-agent-repo.sh\napt-get update\napt-get install -y stackdriver-agent\nservice stackdriver-agent start\n\ncurl -sSO https://dl.google.com/cloudagents/add-logging-agent-repo.sh\nbash add-logging-agent-repo.sh\napt-get update\napt-get install google-fluentd\napt-get install -y google-fluentd-catch-all-config\nservice google-fluentd start\nservice google-fluentd status\n","type":"gcp","zone":""},"max_cpu_containers_per_agent":100},{"pool_name":"preemptible-gpu-pool","description":"","provider":{"agent_docker_image":"determinedai/determined-ee-agent:0.14.6","agent_docker_network":"host","agent_docker_runtime":"runc","agent_fluent_image":"fluent/fluent-bit:1.6","base_config":{"minCpuPlatform":"Intel Broadwell"},"boot_disk_size":200,"boot_disk_source_image":"projects/determined-ai/global/images/det-environments-067db2b","container_startup_script":"export HOME=/root\napt-get update \u0026\u0026 apt-get install -y curl docker.io\ncurl -fsSL \"https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v1.5.0/docker-credential-gcr_linux_amd64-1.5.0.tar.gz\" | tar xz --to-stdout \u003e /usr/bin/docker-credential-gcr \u0026\u0026 chmod +x /usr/bin/docker-credential-gcr\ndocker-credential-gcr configure-docker\n","instance_type":{"gpu_num":8,"gpu_type":"nvidia-tesla-v100","machine_type":"n1-standard-32","preemptible":true},"label_key":"managed-by","label_value":"","master_cert_name":"gcloud.determined.ai","master_url":"https://internal-ip:443","max_agent_starting_period":"20m0s","max_idle_agent_period":"30m0s","max_instances":8,"min_instances":0,"name_prefix":"det-agent-release-party-","network_interface":{"external_ip":false,"network":"projects/dai-public/global/networks/restricted-shared-network","subnetwork":"projects/dai-public/regions/us-west1/subnetworks/restricted-shared-network"},"network_tags":["https-server"],"operation_timeout_period":"5m0s","project":"","service_account":{"email":"argo-determined-ai-vm-agent@determined-ai.iam.gserviceaccount.com","scopes":["https://www.googleapis.com/auth/cloud-platform"]},"startup_script":"systemctl stop apt-daily.service\nsystemctl kill --kill-who=all apt-daily.service\nwhile pgrep -f apt.systemd.daily \u003e /dev/null; do echo waiting; sleep 1; done\n\necho \"cc2c8fc7-a61f-4ed1-935f-d6e04445c656\" | docker login -u=\"determinedaicustomer\" --password-stdin\n\nexport HOME=/root\napt-get update \u0026\u0026 apt-get install -y curl docker.io\ncurl -fsSL \"https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v1.5.0/docker-credential-gcr_linux_amd64-1.5.0.tar.gz\" | tar xz --to-stdout \u003e /usr/bin/docker-credential-gcr \u0026\u0026 chmod +x /usr/bin/docker-credential-gcr\ndocker-credential-gcr configure-docker\ndocker pull determinedai/determined-ee-agent:0.14.6 \n\ncurl -sSO https://dl.google.com/cloudagents/add-monitoring-agent-repo.sh\nbash add-monitoring-agent-repo.sh\napt-get update\napt-get install -y stackdriver-agent\nservice stackdriver-agent start\n\ncurl -sSO https://dl.google.com/cloudagents/add-logging-agent-repo.sh\nbash add-logging-agent-repo.sh\napt-get update\napt-get install google-fluentd\napt-get install -y google-fluentd-catch-all-config\nservice google-fluentd start\nservice google-fluentd status\n\napt-get install -y python-pip\npip install --upgrade pip\npip install --upgrade setuptools\ngit clone https://github.com/GoogleCloudPlatform/tensorflow-inference-tensorrt5-t4-gpu.git\ncd tensorflow-inference-tensorrt5-t4-gpu/metrics_reporting\npip install -r ./requirements.txt\ncp report_gpu_metrics.py /root/\ncat \u003c\u003c-EOH \u003e /lib/systemd/system/gpu_utilization_agent.service\n[Unit]\nDescription=GPU Utilization Metric Agent\n[Service]\nPIDFile=/run/gpu_agent.pid\nExecStart=/bin/bash --login -c '/usr/bin/python /root/report_gpu_metrics.py'\nUser=root\nGroup=root\nWorkingDirectory=/\nRestart=always\n[Install]\nWantedBy=multi-user.target\nEOH\nsystemctl daemon-reload\nsystemctl --no-reload --now enable /lib/systemd/system/gpu_utilization_agent.service\n","type":"gcp","zone":""},"max_cpu_containers_per_agent":0}],"scim":{"enabled":true,"auth":{"type":"oauth"}},"saml":{"enabled":true,"provider":"Okta","idp_recipient_url":"https://gcloud.determined.ai/saml/sso","idp_sso_url":"https://dev-2564556.okta.com/app/dev-2564556_determinedai_1/exkg2xv5x7w517g3G5d6/sso/saml","idp_sso_descriptor_url":"http://www.okta.com/exkg2xv5x7w517g3G5d6","idp_cert_path":"/etc/determined/etc/idp.cert"}} | |
<info> [2021-04-05, 22:11:08] Determined master 0.14.6 (built with go1.16.2) | |
<info> [2021-04-05, 22:11:08] connecting to database 10.108.192.5:5432 | |
<info> [2021-04-05, 22:11:08] running migrations from file:///usr/share/determined/master/static/migrations | |
<info> [2021-04-05, 22:11:08] found golang-migrate version 20210322160616 | |
<info> [2021-04-05, 22:11:08] deleting all snapshots for terminal state experiments | |
<info> [2021-04-05, 22:11:08] creating resource pool: default-gpu-pool id="agentRM" system="master" type="agentResourceManager" | |
<info> [2021-04-05, 22:11:08] pool default-gpu-pool using global scheduling config id="agentRM" system="master" type="agentResourceManager" | |
<info> [2021-04-05, 22:11:08] creating resource pool: default-cpu-pool id="agentRM" system="master" type="agentResourceManager" | |
<info> [2021-04-05, 22:11:08] pool default-cpu-pool using global scheduling config id="agentRM" system="master" type="agentResourceManager" | |
<info> [2021-04-05, 22:11:08] creating resource pool: preemptible-gpu-pool id="agentRM" system="master" type="agentResourceManager" | |
<info> [2021-04-05, 22:11:08] pool preemptible-gpu-pool using global scheduling config id="agentRM" system="master" type="agentResourceManager" | |
<info> [2021-04-05, 22:11:08] initializing endpoints for agents | |
<info> [2021-04-05, 22:11:08] found provisioner configuration id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:08] connecting to GCP id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:08] found provisioner configuration id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:08] connecting to GCP id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:08] found provisioner configuration id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:08] connecting to GCP id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:09] telemetry reporting is disabled | |
<info> [2021-04-05, 22:11:09] restoring experiment experiment="604" | |
<info> [2021-04-05, 22:11:09] restoring experiment experiment="571" | |
<info> [2021-04-05, 22:11:09] restoring experiment experiment="350" | |
<info> [2021-04-05, 22:11:09] restoring experiment experiment="345" | |
<info> [2021-04-05, 22:11:09] restoring experiment experiment="569" | |
<info> [2021-04-05, 22:11:09] restoring experiment experiment="495" | |
<info> [2021-04-05, 22:11:09] OAuth is enabled at https://10.138.0.18:443/oauth2 | |
<info> [2021-04-05, 22:11:09] SCIM is enabled at https://10.138.0.18:443/scim/v2 | |
<info> [2021-04-05, 22:11:09] SAML is enabled | |
<info> [2021-04-05, 22:11:09] accepting incoming connections on port 443 | |
<info> [2021-04-05, 22:11:09] Subchannel Connectivity change to READY system="system" | |
<info> [2021-04-05, 22:11:09] pickfirstBalancer: HandleSubConnStateChange: 0xc0008102d0, {READY <nil>} system="system" | |
<info> [2021-04-05, 22:11:09] Channel Connectivity change to READY system="system" | |
<info> [2021-04-05, 22:11:09] no snapshot found experiment-id="604" | |
<info> [2021-04-05, 22:11:09] no snapshot found experiment-id="571" | |
<info> [2021-04-05, 22:11:09] restored experiment experiment="604" | |
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/604 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/604 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:09] restored experiment experiment="571" | |
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/571 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/571 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/604 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/571 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:09] no snapshot found experiment-id="345" | |
<info> [2021-04-05, 22:11:09] no snapshot found experiment-id="350" | |
<info> [2021-04-05, 22:11:09] no snapshot found experiment-id="495" | |
<info> [2021-04-05, 22:11:09] no snapshot found experiment-id="569" | |
<info> [2021-04-05, 22:11:09] restored experiment experiment="345" | |
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/345 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/345 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/345 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:09] restored experiment experiment="495" | |
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/495 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/495 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/495 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:09] restored experiment experiment="569" | |
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/569 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/569 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/569 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:09] restored experiment experiment="350" | |
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/350 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/350 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/350 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-05, 22:11:09] aggregated resource allocation statistics for 2021-04-02 00:00:00 +0000 UTC in 534.808897ms | |
<info> [2021-04-05, 22:11:09] aggregated resource allocation statistics for 2021-04-03 00:00:00 +0000 UTC in 189.601047ms | |
<info> [2021-04-05, 22:11:10] aggregated resource allocation statistics for 2021-04-04 00:00:00 +0000 UTC in 235.883175ms | |
<info> [2021-04-05, 22:11:10] scheduling next resource allocation aggregation in 1h49m50s at 2021-04-06 00:01:00 +0000 UTC id="allocation-aggregator" system="master" type="allocationAggregator" | |
<info> [2021-04-05, 22:12:57] finished unary call with code Unauthenticated error="rpc error: code = Unauthenticated desc = invalid credentials" grpc.code="Unauthenticated" grpc.method="GetAgents" grpc.service="determined.api.v1.Determined" grpc.start_time="2021-04-05T22:12:57Z" grpc.time_ms="0.052" span.kind="server" system="grpc" | |
<info> [2021-04-05, 22:13:03] finished unary call with code Unauthenticated error="rpc error: code = Unauthenticated desc = invalid credentials" grpc.code="Unauthenticated" grpc.method="GetAgents" grpc.service="determined.api.v1.Determined" grpc.start_time="2021-04-05T22:13:03Z" grpc.time_ms="0.059" span.kind="server" system="grpc" | |
<info> [2021-04-05, 22:13:09] finished unary call with code Unauthenticated error="rpc error: code = Unauthenticated desc = invalid credentials" grpc.code="Unauthenticated" grpc.method="GetAgents" grpc.service="determined.api.v1.Determined" grpc.start_time="2021-04-05T22:13:09Z" grpc.time_ms="0.043" span.kind="server" system="grpc" | |
<info> [2021-04-05, 22:17:30] user authorizing an OAuth application request_url="/oauth2/authorize?response_type=code&state=0oag2xv5yhSRnE4855d6%2Cdev-2564556%2C01de8e4cdd7d5bbbcc9b2be32e758750afa6500a0800ee3a71ede70b202fb6bb&client_id=47dabda2c57e0794c713d9a9173144d30910d0f3da875d7fbd9d3282b5625fca&redirect_uri=https%3A%2F%2Fsystem-admin.okta.com%2Fadmin%2Fapp%2Fcpc%2Fdev-2564556_determinedai_1%2Foauth%2Fcallback" username="hoang@determined.ai" | |
<error> [2021-04-05, 22:17:30] OAuth internal error occurred error="non-admin user hoang@determined.ai cannot authorize OAuth applications" | |
<error> [2021-04-05, 22:17:30] OAuth response error occurred error="server_error" response="&{server_error 0 The authorization server encountered an unexpected condition that prevented it from fulfilling the request 500 map[]}" | |
<info> [2021-04-05, 22:17:51] finished unary call with code Unauthenticated error="rpc error: code = Unauthenticated desc = invalid credentials" grpc.code="Unauthenticated" grpc.method="GetAgents" grpc.service="determined.api.v1.Determined" grpc.start_time="2021-04-05T22:17:51Z" grpc.time_ms="0.042" span.kind="server" system="grpc" | |
<info> [2021-04-05, 22:17:55] user authorizing an OAuth application request_url="/oauth2/authorize?response_type=code&state=0oag2xv5yhSRnE4855d6%2Cdev-2564556%2C01de8e4cdd7d5bbbcc9b2be32e758750afa6500a0800ee3a71ede70b202fb6bb&client_id=47dabda2c57e0794c713d9a9173144d30910d0f3da875d7fbd9d3282b5625fca&redirect_uri=https%3A%2F%2Fsystem-admin.okta.com%2Fadmin%2Fapp%2Fcpc%2Fdev-2564556_determinedai_1%2Foauth%2Fcallback" username="admin" | |
<error> [2021-04-05, 22:17:55] OAuth internal error occurred error="unknown OAuth client ID \"47dabda2c57e0794c713d9a9173144d30910d0f3da875d7fbd9d3282b5625fca\"" | |
<error> [2021-04-05, 22:17:55] OAuth response error occurred error="server_error" response="&{server_error 0 The authorization server encountered an unexpected condition that prevented it from fulfilling the request 500 map[]}" | |
<info> [2021-04-05, 22:17:55] user authorizing an OAuth application request_url="/oauth2/authorize?response_type=code&state=0oag2xv5yhSRnE4855d6%2Cdev-2564556%2C01de8e4cdd7d5bbbcc9b2be32e758750afa6500a0800ee3a71ede70b202fb6bb&client_id=47dabda2c57e0794c713d9a9173144d30910d0f3da875d7fbd9d3282b5625fca&redirect_uri=https%3A%2F%2Fsystem-admin.okta.com%2Fadmin%2Fapp%2Fcpc%2Fdev-2564556_determinedai_1%2Foauth%2Fcallback" username="admin" | |
<error> [2021-04-05, 22:17:56] OAuth internal error occurred error="unknown OAuth client ID \"47dabda2c57e0794c713d9a9173144d30910d0f3da875d7fbd9d3282b5625fca\"" | |
<error> [2021-04-05, 22:17:56] OAuth response error occurred error="server_error" response="&{server_error 0 The authorization server encountered an unexpected condition that prevented it from fulfilling the request 500 map[]}" | |
<info> [2021-04-05, 22:18:50] user authorizing an OAuth application request_url="/oauth2/authorize?response_type=code&state=0oag2xv5yhSRnE4855d6%2Cdev-2564556%2C01de8e4cdd7d5bbbcc9b2be32e758750afa6500a0800ee3a71ede70b202fb6bb&client_id=3b26fb2b19167f4d114f31d2a5a50766d3e4ccce1d5ecbcd75e43e934e6a4aee&redirect_uri=https%3A%2F%2Fsystem-admin.okta.com%2Fadmin%2Fapp%2Fcpc%2Fdev-2564556_determinedai_1%2Foauth%2Fcallback" username="admin" | |
<error> [2021-04-05, 22:18:50] OAuth internal error occurred error="unknown OAuth client ID \"3b26fb2b19167f4d114f31d2a5a50766d3e4ccce1d5ecbcd75e43e934e6a4aee\"" | |
<error> [2021-04-05, 22:18:50] OAuth response error occurred error="server_error" response="&{server_error 0 The authorization server encountered an unexpected condition that prevented it from fulfilling the request 500 map[]}" | |
<info> [2021-04-05, 22:20:41] user authorizing an OAuth application request_url="/oauth2/authorize?response_type=code&state=0oag2xv5yhSRnE4855d6%2Cdev-2564556%2C01de8e4cdd7d5bbbcc9b2be32e758750afa6500a0800ee3a71ede70b202fb6bb&client_id=370946fb2e6381dc5566015de3e4cce2ed23e7a54514f7946e3490fdf2c9e495&redirect_uri=https%3A%2F%2Fsystem-admin.okta.com%2Fadmin%2Fapp%2Fcpc%2Fdev-2564556_determinedai_1%2Foauth%2Fcallback" username="admin" | |
<error> [2021-04-05, 22:20:41] OAuth internal error occurred error="unknown OAuth client ID \"370946fb2e6381dc5566015de3e4cce2ed23e7a54514f7946e3490fdf2c9e495\"" | |
<error> [2021-04-05, 22:20:41] OAuth response error occurred error="server_error" response="&{server_error 0 The authorization server encountered an unexpected condition that prevented it from fulfilling the request 500 map[]}" | |
<info> [2021-04-05, 22:24:13] finished unary call with code Unauthenticated error="rpc error: code = Unauthenticated desc = invalid credentials" grpc.code="Unauthenticated" grpc.method="GetAgents" grpc.service="determined.api.v1.Determined" grpc.start_time="2021-04-05T22:24:13Z" grpc.time_ms="0.043" span.kind="server" system="grpc" | |
<info> [2021-04-06, 00:01:00] aggregated resource allocation statistics for 2021-04-05 00:00:00 +0000 UTC in 244.465402ms | |
<info> [2021-04-06, 00:01:00] scheduling next resource allocation aggregation in 24h0m0s at 2021-04-07 00:01:00 +0000 UTC id="allocation-aggregator" system="master" type="allocationAggregator" | |
<info> [2021-04-06, 13:38:26] finished unary call with code Unauthenticated error="rpc error: code = Unauthenticated desc = invalid credentials" grpc.code="Unauthenticated" grpc.method="GetAgents" grpc.service="determined.api.v1.Determined" grpc.start_time="2021-04-06T13:38:26Z" grpc.time_ms="0.042" span.kind="server" system="grpc" | |
<info> [2021-04-06, 17:48:48] finished unary call with code Unauthenticated error="rpc error: code = Unauthenticated desc = invalid credentials" grpc.code="Unauthenticated" grpc.method="CurrentUser" grpc.service="determined.api.v1.Determined" grpc.start_time="2021-04-06T17:48:48Z" grpc.time_ms="33.059" span.kind="server" system="grpc" | |
<info> [2021-04-06, 17:48:48] finished unary call with code Unauthenticated error="rpc error: code = Unauthenticated desc = invalid credentials" grpc.code="Unauthenticated" grpc.method="GetAgents" grpc.service="determined.api.v1.Determined" grpc.start_time="2021-04-06T17:48:48Z" grpc.time_ms="33.101" span.kind="server" system="grpc" | |
<info> [2021-04-06, 17:48:48] finished unary call with code Unauthenticated error="rpc error: code = Unauthenticated desc = invalid credentials" grpc.code="Unauthenticated" grpc.method="Logout" grpc.service="determined.api.v1.Determined" grpc.start_time="2021-04-06T17:48:48Z" grpc.time_ms="32.911" span.kind="server" system="grpc" | |
<info> [2021-04-06, 18:46:58] finished unary call with code Unauthenticated error="rpc error: code = Unauthenticated desc = invalid credentials" grpc.code="Unauthenticated" grpc.method="GetAgents" grpc.service="determined.api.v1.Determined" grpc.start_time="2021-04-06T18:46:58Z" grpc.time_ms="0.045" span.kind="server" system="grpc" | |
<info> [2021-04-06, 19:32:11] setting priority for group of /experiments/816 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:32:11] setting priority for group of /experiments/816 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:32:11] setting priority for group of /experiments/816 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<warning> [2021-04-06, 19:32:11] response already committed | |
<info> [2021-04-06, 19:32:12] experiment state changed to ACTIVE id="816" system="master" type="experiment" | |
<info> [2021-04-06, 19:32:12] resources are requested by /experiments/816/a35b9598-94c5-4dcd-9399-0d94d90c3bf9 (Task ID: de2356b3-dcf6-4864-9b86-92103c1203da) id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:32:16] decided to launch 1 instances (type n1-standard-32-nvidia-tesla-v100-8) id="provisioner" resource-pool="default-gpu-pool" system="master" type="Provisioner" | |
<info> [2021-04-06, 19:32:22] found state changes in 1 instances: det-agent-release-party-outgoing-mosquito (Starting) id="provisioner" resource-pool="default-gpu-pool" system="master" type="Provisioner" | |
<info> [2021-04-06, 19:33:27] setting priority for group of /experiments/817 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:33:27] setting priority for group of /experiments/817 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:33:27] setting priority for group of /experiments/817 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<warning> [2021-04-06, 19:33:27] response already committed | |
<info> [2021-04-06, 19:33:27] experiment state changed to ACTIVE id="817" system="master" type="experiment" | |
<info> [2021-04-06, 19:33:27] resources are requested by /experiments/817/f2a8e7ea-5649-400a-90bf-c979d2fda817 (Task ID: bae45824-da61-46aa-a6a3-8d7af98ca3a7) id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:33:28] decided to launch 1 instances (type n1-standard-32-nvidia-tesla-v100-8) id="provisioner" resource-pool="preemptible-gpu-pool" system="master" type="Provisioner" | |
<info> [2021-04-06, 19:33:34] found state changes in 1 instances: det-agent-release-party-tidy-crawdad (Starting) id="provisioner" resource-pool="preemptible-gpu-pool" system="master" type="Provisioner" | |
<info> [2021-04-06, 19:33:50] setting priority for group of /experiments/818 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<warning> [2021-04-06, 19:33:50] response already committed | |
<info> [2021-04-06, 19:33:50] setting priority for group of /experiments/818 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:33:50] setting priority for group of /experiments/818 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:33:50] experiment state changed to ACTIVE id="818" system="master" type="experiment" | |
<info> [2021-04-06, 19:33:50] resources are requested by /experiments/818/7a608ae4-8cbc-4d11-9e16-83b86c74f458 (Task ID: d2f3e747-060d-4db8-84bc-a41ae9e5ef8c) id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:34:07] setting priority for group of /experiments/819 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<warning> [2021-04-06, 19:34:07] response already committed | |
<info> [2021-04-06, 19:34:07] setting priority for group of /experiments/819 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:34:07] setting priority for group of /experiments/819 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:34:07] experiment state changed to ACTIVE id="819" system="master" type="experiment" | |
<info> [2021-04-06, 19:34:07] resources are requested by /experiments/819/fe51990b-8878-4df1-8a52-245e85da1301 (Task ID: 30de80e4-8db7-4e87-b300-f0a48d26ebbb) id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<warning> [2021-04-06, 19:34:17] GCE throws out warning (code DISK_SIZE_LARGER_THAN_IMAGE_SIZE) for operation "4583197155660990894" targeting "https://www.googleapis.com/compute/v1/projects/determined-ai/zones/us-west1-b/instances/det-agent-release-party-outgoing-mosquito": Disk size: '200 GB' is larger than image size: '100 GB'. You might need to resize the root repartition manually if the operating system does not support automatic resizing. See https://cloud.google.com/compute/docs/disks/add-persistent-disk#resize_pd for details. id="track-batch-operation-4e0de6fd-daf2-4445-aa2e-fb31a608eeec" system="master" type="gcpBatchOperationTracker" | |
<info> [2021-04-06, 19:34:17] inserted 1/1 GCE instances: det-agent-release-party-outgoing-mosquito id="provisioner" resource-pool="default-gpu-pool" system="master" type="Provisioner" | |
<info> [2021-04-06, 19:34:20] found state changes in 1 instances: det-agent-release-party-outgoing-mosquito (Running) id="provisioner" resource-pool="default-gpu-pool" system="master" type="Provisioner" | |
<warning> [2021-04-06, 19:35:51] GCE throws out warning (code DISK_SIZE_LARGER_THAN_IMAGE_SIZE) for operation "4591043566989543783" targeting "https://www.googleapis.com/compute/v1/projects/determined-ai/zones/us-west1-b/instances/det-agent-release-party-tidy-crawdad": Disk size: '200 GB' is larger than image size: '100 GB'. You might need to resize the root repartition manually if the operating system does not support automatic resizing. See https://cloud.google.com/compute/docs/disks/add-persistent-disk#resize_pd for details. id="track-batch-operation-d23ce758-5475-440d-8116-51298dbc15fc" system="master" type="gcpBatchOperationTracker" | |
<info> [2021-04-06, 19:35:51] inserted 1/1 GCE instances: det-agent-release-party-tidy-crawdad id="provisioner" resource-pool="preemptible-gpu-pool" system="master" type="Provisioner" | |
<info> [2021-04-06, 19:35:53] found state changes in 1 instances: det-agent-release-party-tidy-crawdad (Running) id="provisioner" resource-pool="preemptible-gpu-pool" system="master" type="Provisioner" | |
<info> [2021-04-06, 19:39:12] agent connected ip: 10.138.0.20 resource pool: default-gpu-pool slots: 8 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent" | |
<info> [2021-04-06, 19:39:12] adding agent: det-agent-release-party-outgoing-mosquito id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:39:12] adding device: gpu7 (Tesla V100-SXM2-16GB) on det-agent-release-party-outgoing-mosquito id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:39:12] adding device: gpu0 (Tesla V100-SXM2-16GB) on det-agent-release-party-outgoing-mosquito id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:39:12] adding device: gpu1 (Tesla V100-SXM2-16GB) on det-agent-release-party-outgoing-mosquito id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:39:12] adding device: gpu2 (Tesla V100-SXM2-16GB) on det-agent-release-party-outgoing-mosquito id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:39:12] adding device: gpu3 (Tesla V100-SXM2-16GB) on det-agent-release-party-outgoing-mosquito id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:39:12] adding device: gpu4 (Tesla V100-SXM2-16GB) on det-agent-release-party-outgoing-mosquito id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:39:12] adding device: gpu5 (Tesla V100-SXM2-16GB) on det-agent-release-party-outgoing-mosquito id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:39:12] adding device: gpu6 (Tesla V100-SXM2-16GB) on det-agent-release-party-outgoing-mosquito id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:39:13] allocated resources to /experiments/816/a35b9598-94c5-4dcd-9399-0d94d90c3bf9 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:39:13] allocated resources to /experiments/819/fe51990b-8878-4df1-8a52-245e85da1301 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:39:13] starting trial container: <RUN_STEP (100 Batches) (0 Prior Batches): (819,9184,1)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:39:13] starting trial container: <RUN_STEP (100 Batches) (0 Prior Batches): (816,9185,1)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:39:13] starting container id: 421d59a6-d124-4c9f-a638-b99b53c6cf44 slots: 4 task handler: /experiments/819/fe51990b-8878-4df1-8a52-245e85da1301 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent" | |
<info> [2021-04-06, 19:39:13] starting container id: e2d307a7-c947-4bf1-9b80-83965beb8198 slots: 1 task handler: /experiments/816/a35b9598-94c5-4dcd-9399-0d94d90c3bf9 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent" | |
<info> [2021-04-06, 19:40:55] agent connected ip: 10.138.0.21 resource pool: preemptible-gpu-pool slots: 8 id="det-agent-release-party-tidy-crawdad" system="master" type="agent" | |
<info> [2021-04-06, 19:40:55] adding agent: det-agent-release-party-tidy-crawdad id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:40:55] adding device: gpu7 (Tesla V100-SXM2-16GB) on det-agent-release-party-tidy-crawdad id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:40:55] adding device: gpu2 (Tesla V100-SXM2-16GB) on det-agent-release-party-tidy-crawdad id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:40:55] adding device: gpu0 (Tesla V100-SXM2-16GB) on det-agent-release-party-tidy-crawdad id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:40:55] adding device: gpu1 (Tesla V100-SXM2-16GB) on det-agent-release-party-tidy-crawdad id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:40:55] adding device: gpu4 (Tesla V100-SXM2-16GB) on det-agent-release-party-tidy-crawdad id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:40:55] adding device: gpu3 (Tesla V100-SXM2-16GB) on det-agent-release-party-tidy-crawdad id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:40:55] adding device: gpu5 (Tesla V100-SXM2-16GB) on det-agent-release-party-tidy-crawdad id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:40:55] adding device: gpu6 (Tesla V100-SXM2-16GB) on det-agent-release-party-tidy-crawdad id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:40:55] allocated resources to /experiments/817/f2a8e7ea-5649-400a-90bf-c979d2fda817 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:40:55] starting trial container: <RUN_STEP (100 Batches) (0 Prior Batches): (817,9186,1)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:40:55] starting container id: d92e3ab2-54ef-47db-aaef-f0583c907afc slots: 1 task handler: /experiments/817/f2a8e7ea-5649-400a-90bf-c979d2fda817 id="det-agent-release-party-tidy-crawdad" system="master" type="agent" | |
<info> [2021-04-06, 19:41:26] found container running: 421d59a6-d124-4c9f-a638-b99b53c6cf44 (rank 0) experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:41:26] pushing rendezvous information experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:41:26] found not all containers are connected experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:41:26] found container running: e2d307a7-c947-4bf1-9b80-83965beb8198 (rank 0) experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:41:26] pushing rendezvous information experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:41:26] found not all containers are connected experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:41:33] new connection from container 421d59a6-d124-4c9f-a638-b99b53c6cf44 trial 9184 (experiment 819) at 10.138.0.20:51316 | |
<info> [2021-04-06, 19:41:33] pushing rendezvous information experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:41:33] found all containers are connected successfully experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:41:33] new connection from container e2d307a7-c947-4bf1-9b80-83965beb8198 trial 9185 (experiment 816) at 10.138.0.20:58214 | |
<info> [2021-04-06, 19:41:33] pushing rendezvous information experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:41:33] found all containers are connected successfully experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:41:41] trial completed workload: <RUN_STEP (100 Batches) (0 Prior Batches): (816,9185,1)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:41:41] continuing trial: <RUN_STEP (100 Batches) (100 Prior Batches): (816,9185,2)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:41:41] creating notebook id="notebooks" system="master" type="notebookManager" | |
<info> [2021-04-06, 19:41:41] resources are requested by /notebooks/2464edd4-5da7-4d8d-b01a-a146a9115e58 (Task ID: 2464edd4-5da7-4d8d-b01a-a146a9115e58) id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:41:41] created notebook 2464edd4-5da7-4d8d-b01a-a146a9115e58 id="notebooks" system="master" type="notebookManager" | |
<info> [2021-04-06, 19:41:41] setting priority for group of /notebooks/2464edd4-5da7-4d8d-b01a-a146a9115e58 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:41:41] setting priority for group of /notebooks/2464edd4-5da7-4d8d-b01a-a146a9115e58 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:41:41] setting priority for group of /notebooks/2464edd4-5da7-4d8d-b01a-a146a9115e58 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:41:42] decided to launch 1 instances (type n1-standard-32-nvidia-tesla-v100-0) id="provisioner" resource-pool="default-cpu-pool" system="master" type="Provisioner" | |
<info> [2021-04-06, 19:41:43] trial completed workload: <RUN_STEP (100 Batches) (100 Prior Batches): (816,9185,2)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:41:43] continuing trial: <RUN_STEP (100 Batches) (200 Prior Batches): (816,9185,3)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<error> [2021-04-06, 19:41:43] error while actor was running error="websocket: close 1001 (going away)" id="websocket-2da0ee0a-c0fb-4f25-b936-54ad51dcad15" system="master" type="websocketActor" | |
<error> [2021-04-06, 19:41:43] websocket: close 1001 (going away) | |
<error> [2021-04-06, 19:41:43] http: connection has been hijacked | |
<info> [2021-04-06, 19:41:45] trial completed workload: <RUN_STEP (100 Batches) (200 Prior Batches): (816,9185,3)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:41:45] continuing trial: <RUN_STEP (100 Batches) (300 Prior Batches): (816,9185,4)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:41:46] creating notebook id="notebooks" system="master" type="notebookManager" | |
<info> [2021-04-06, 19:41:46] resources are requested by /notebooks/b53a120e-0541-4139-b208-dcc21a7848e2 (Task ID: b53a120e-0541-4139-b208-dcc21a7848e2) id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:41:46] created notebook b53a120e-0541-4139-b208-dcc21a7848e2 id="notebooks" system="master" type="notebookManager" | |
<info> [2021-04-06, 19:41:46] setting priority for group of /notebooks/b53a120e-0541-4139-b208-dcc21a7848e2 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:41:46] setting priority for group of /notebooks/b53a120e-0541-4139-b208-dcc21a7848e2 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:41:46] setting priority for group of /notebooks/b53a120e-0541-4139-b208-dcc21a7848e2 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:41:46] trial completed workload: <RUN_STEP (100 Batches) (0 Prior Batches): (819,9184,1)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:41:46] continuing trial: <RUN_STEP (100 Batches) (100 Prior Batches): (819,9184,2)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:41:47] trial completed workload: <RUN_STEP (100 Batches) (300 Prior Batches): (816,9185,4)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:41:47] continuing trial: <RUN_STEP (100 Batches) (400 Prior Batches): (816,9185,5)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:41:47] allocated resources to /notebooks/b53a120e-0541-4139-b208-dcc21a7848e2 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:41:47] starting container id: ae9091e2-e545-494b-b3ce-970e72ee73eb slots: 1 task handler: /notebooks/b53a120e-0541-4139-b208-dcc21a7848e2 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent" | |
<info> [2021-04-06, 19:41:48] found state changes in 1 instances: det-agent-release-party-accepted-sculpin (Starting) id="provisioner" resource-pool="default-cpu-pool" system="master" type="Provisioner" | |
<info> [2021-04-06, 19:41:48] trial completed workload: <RUN_STEP (100 Batches) (100 Prior Batches): (819,9184,2)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:41:48] continuing trial: <RUN_STEP (100 Batches) (200 Prior Batches): (819,9184,3)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<error> [2021-04-06, 19:41:48] error while actor was running error="websocket: close 1006 (abnormal closure): unexpected EOF" id="websocket-a71a73b6-cce3-4443-8b8a-610f694d92c6" system="master" type="websocketActor" | |
<error> [2021-04-06, 19:41:48] websocket: close 1006 (abnormal closure): unexpected EOF | |
<error> [2021-04-06, 19:41:48] http: connection has been hijacked | |
<info> [2021-04-06, 19:41:49] trial completed workload: <RUN_STEP (100 Batches) (400 Prior Batches): (816,9185,5)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:41:49] continuing trial: <RUN_STEP (100 Batches) (500 Prior Batches): (816,9185,6)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:41:50] trial completed workload: <RUN_STEP (100 Batches) (200 Prior Batches): (819,9184,3)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:41:50] continuing trial: <RUN_STEP (100 Batches) (300 Prior Batches): (819,9184,4)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:41:51] trial completed workload: <RUN_STEP (100 Batches) (500 Prior Batches): (816,9185,6)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:41:51] continuing trial: <RUN_STEP (100 Batches) (600 Prior Batches): (816,9185,7)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<warning> [2021-04-06, 19:41:52] GCE throws out warning (code DISK_SIZE_LARGER_THAN_IMAGE_SIZE) for operation "51051865676253080" targeting "https://www.googleapis.com/compute/v1/projects/determined-ai/zones/us-west1-b/instances/det-agent-release-party-accepted-sculpin": Disk size: '200 GB' is larger than image size: '100 GB'. You might need to resize the root repartition manually if the operating system does not support automatic resizing. See https://cloud.google.com/compute/docs/disks/add-persistent-disk#resize_pd for details. id="track-batch-operation-65300447-2877-4c33-bc79-9ee35fe401fd" system="master" type="gcpBatchOperationTracker" | |
<info> [2021-04-06, 19:41:52] inserted 1/1 GCE instances: det-agent-release-party-accepted-sculpin id="provisioner" resource-pool="default-cpu-pool" system="master" type="Provisioner" | |
<info> [2021-04-06, 19:41:52] trial completed workload: <RUN_STEP (100 Batches) (300 Prior Batches): (819,9184,4)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:41:52] continuing trial: <RUN_STEP (100 Batches) (400 Prior Batches): (819,9184,5)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:41:53] registering service: b53a120e-0541-4139-b208-dcc21a7848e2 (http://10.138.0.20:49157) id="proxy" system="master" type="Proxy" | |
<info> [2021-04-06, 19:41:53] trial completed workload: <RUN_STEP (100 Batches) (600 Prior Batches): (816,9185,7)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:41:53] continuing trial: <RUN_STEP (100 Batches) (700 Prior Batches): (816,9185,8)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:41:53] found state changes in 1 instances: det-agent-release-party-accepted-sculpin (Running) id="provisioner" resource-pool="default-cpu-pool" system="master" type="Provisioner" | |
<info> [2021-04-06, 19:41:55] trial completed workload: <RUN_STEP (100 Batches) (400 Prior Batches): (819,9184,5)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:41:55] continuing trial: <RUN_STEP (100 Batches) (500 Prior Batches): (819,9184,6)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:41:55] trial completed workload: <RUN_STEP (100 Batches) (700 Prior Batches): (816,9185,8)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:41:55] continuing trial: <RUN_STEP (100 Batches) (800 Prior Batches): (816,9185,9)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:41:56] trial completed workload: <RUN_STEP (100 Batches) (500 Prior Batches): (819,9184,6)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:41:56] continuing trial: <RUN_STEP (100 Batches) (600 Prior Batches): (819,9184,7)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:41:57] trial completed workload: <RUN_STEP (100 Batches) (800 Prior Batches): (816,9185,9)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:41:57] continuing trial: <RUN_STEP (37 Batches) (900 Prior Batches): (816,9185,10)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:41:58] trial completed workload: <RUN_STEP (37 Batches) (900 Prior Batches): (816,9185,10)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:41:58] continuing trial: <CHECKPOINT_MODEL (937 Prior Batches): (816,9185,10)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:41:59] trial completed workload: <RUN_STEP (100 Batches) (600 Prior Batches): (819,9184,7)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:41:59] continuing trial: <RUN_STEP (100 Batches) (700 Prior Batches): (819,9184,8)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:41:59] readiness check passed: notebook id="b53a120e-0541-4139-b208-dcc21a7848e2" system="master" type="command" | |
<info> [2021-04-06, 19:42:00] trial completed workload: <CHECKPOINT_MODEL (937 Prior Batches): (816,9185,10)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:42:00] continuing trial: <COMPUTE_VALIDATION_METRICS (937 Prior Batches): (816,9185,10)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:42:01] trial completed workload: <RUN_STEP (100 Batches) (700 Prior Batches): (819,9184,8)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:42:01] continuing trial: <RUN_STEP (100 Batches) (800 Prior Batches): (819,9184,9)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:42:03] trial completed workload: <RUN_STEP (100 Batches) (800 Prior Batches): (819,9184,9)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:42:03] continuing trial: <RUN_STEP (37 Batches) (900 Prior Batches): (819,9184,10)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:42:03] trial completed workload: <COMPUTE_VALIDATION_METRICS (937 Prior Batches): (816,9185,10)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:42:03] terminating gracefully because there are no more workloads experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:42:03] gracefully terminating trial experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:42:04] trial completed workload: <RUN_STEP (37 Batches) (900 Prior Batches): (819,9184,10)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:42:04] continuing trial: <CHECKPOINT_MODEL (937 Prior Batches): (819,9184,10)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:42:05] stopped container id: e2d307a7-c947-4bf1-9b80-83965beb8198 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent" | |
<info> [2021-04-06, 19:42:05] found container terminated: e2d307a7-c947-4bf1-9b80-83965beb8198 experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:42:05] forcibly terminating trial experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:42:05] killing container id: e2d307a7-c947-4bf1-9b80-83965beb8198 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent" | |
<info> [2021-04-06, 19:42:05] trial runner stopped successfully experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:42:05] trial stopped successfully experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial" | |
<info> [2021-04-06, 19:42:05] resources are released for /experiments/816/a35b9598-94c5-4dcd-9399-0d94d90c3bf9 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:42:05] resources are released for /experiments/816/a35b9598-94c5-4dcd-9399-0d94d90c3bf9 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:42:05] resources are released for /experiments/816/a35b9598-94c5-4dcd-9399-0d94d90c3bf9 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:42:05] resources are released for /experiments/816/a35b9598-94c5-4dcd-9399-0d94d90c3bf9 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:42:05] experiment state changed to STOPPING_COMPLETED id="816" system="master" type="experiment" | |
<info> [2021-04-06, 19:42:05] experiment state changed to COMPLETED id="816" system="master" type="experiment" | |
<info> [2021-04-06, 19:42:05] resources are requested by /experiment-816-checkpoint-gc (Task ID: 74b7e5de-2d6a-48fb-a244-dd0f52e36a5f) id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:42:05] experiment shut down successfully id="816" system="master" type="experiment" | |
<info> [2021-04-06, 19:42:06] trial completed workload: <CHECKPOINT_MODEL (937 Prior Batches): (819,9184,10)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:42:06] continuing trial: <COMPUTE_VALIDATION_METRICS (937 Prior Batches): (819,9184,10)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:42:07] trial completed workload: <COMPUTE_VALIDATION_METRICS (937 Prior Batches): (819,9184,10)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:42:08] terminating gracefully because there are no more workloads experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:42:08] gracefully terminating trial experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:42:09] stopped container id: 421d59a6-d124-4c9f-a638-b99b53c6cf44 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent" | |
<info> [2021-04-06, 19:42:09] found container terminated: 421d59a6-d124-4c9f-a638-b99b53c6cf44 experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:42:09] forcibly terminating trial experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:42:09] killing container id: 421d59a6-d124-4c9f-a638-b99b53c6cf44 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent" | |
<info> [2021-04-06, 19:42:09] trial runner stopped successfully experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:42:09] trial stopped successfully experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial" | |
<info> [2021-04-06, 19:42:09] resources are released for /experiments/819/fe51990b-8878-4df1-8a52-245e85da1301 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:42:09] resources are released for /experiments/819/fe51990b-8878-4df1-8a52-245e85da1301 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:42:09] resources are released for /experiments/819/fe51990b-8878-4df1-8a52-245e85da1301 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:42:09] resources are released for /experiments/819/fe51990b-8878-4df1-8a52-245e85da1301 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:42:09] experiment state changed to STOPPING_COMPLETED id="819" system="master" type="experiment" | |
<info> [2021-04-06, 19:42:10] experiment state changed to COMPLETED id="819" system="master" type="experiment" | |
<info> [2021-04-06, 19:42:10] resources are requested by /experiment-819-checkpoint-gc (Task ID: ba2e7c69-4816-4326-88c1-f7832c004617) id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:42:10] experiment shut down successfully id="819" system="master" type="experiment" | |
<info> [2021-04-06, 19:42:41] setting priority for group of /experiments/820 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:42:41] setting priority for group of /experiments/820 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:42:41] setting priority for group of /experiments/820 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<warning> [2021-04-06, 19:42:41] response already committed | |
<info> [2021-04-06, 19:42:42] experiment state changed to ACTIVE id="820" system="master" type="experiment" | |
<info> [2021-04-06, 19:42:42] resources are requested by /experiments/820/fbaae498-d290-41bd-8c46-f53db000e207 (Task ID: 764df640-c47c-471f-a3ea-3ca172a2833d) id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:42:42] allocated resources to /experiments/820/fbaae498-d290-41bd-8c46-f53db000e207 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:42:43] starting trial container: <RUN_STEP (100 Batches) (0 Prior Batches): (820,9187,1)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:42:43] starting container id: 31d0e91e-66b1-49a7-a309-7ed075437231 slots: 1 task handler: /experiments/820/fbaae498-d290-41bd-8c46-f53db000e207 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent" | |
<info> [2021-04-06, 19:42:49] setting priority for group of /experiments/821 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<warning> [2021-04-06, 19:42:49] response already committed | |
<info> [2021-04-06, 19:42:49] setting priority for group of /experiments/821 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:42:49] setting priority for group of /experiments/821 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:42:49] experiment state changed to ACTIVE id="821" system="master" type="experiment" | |
<info> [2021-04-06, 19:42:49] resources are requested by /experiments/821/a368d0de-0ec9-47c9-bf88-d4e79b2ae403 (Task ID: ccb4f15d-61bb-4d6b-bd31-298ba4e3c800) id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:42:50] allocated resources to /experiments/821/a368d0de-0ec9-47c9-bf88-d4e79b2ae403 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:42:50] starting trial container: <RUN_STEP (100 Batches) (0 Prior Batches): (821,9188,1)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:42:50] starting container id: 564d56f8-438e-4c30-8087-9c6a6ab9e5c9 slots: 4 task handler: /experiments/821/a368d0de-0ec9-47c9-bf88-d4e79b2ae403 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent" | |
<info> [2021-04-06, 19:43:05] found container running: d92e3ab2-54ef-47db-aaef-f0583c907afc (rank 0) experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:05] pushing rendezvous information experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:05] found not all containers are connected experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:12] new connection from container d92e3ab2-54ef-47db-aaef-f0583c907afc trial 9186 (experiment 817) at 10.138.0.21:53678 | |
<info> [2021-04-06, 19:43:12] pushing rendezvous information experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:12] found all containers are connected successfully experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:19] trial completed workload: <RUN_STEP (100 Batches) (0 Prior Batches): (817,9186,1)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:19] continuing trial: <RUN_STEP (100 Batches) (100 Prior Batches): (817,9186,2)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:21] trial completed workload: <RUN_STEP (100 Batches) (100 Prior Batches): (817,9186,2)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:21] continuing trial: <RUN_STEP (100 Batches) (200 Prior Batches): (817,9186,3)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:22] trial completed workload: <RUN_STEP (100 Batches) (200 Prior Batches): (817,9186,3)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:23] continuing trial: <RUN_STEP (100 Batches) (300 Prior Batches): (817,9186,4)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:24] trial completed workload: <RUN_STEP (100 Batches) (300 Prior Batches): (817,9186,4)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:24] continuing trial: <RUN_STEP (100 Batches) (400 Prior Batches): (817,9186,5)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:26] trial completed workload: <RUN_STEP (100 Batches) (400 Prior Batches): (817,9186,5)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:26] continuing trial: <RUN_STEP (100 Batches) (500 Prior Batches): (817,9186,6)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:28] trial completed workload: <RUN_STEP (100 Batches) (500 Prior Batches): (817,9186,6)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:28] continuing trial: <RUN_STEP (100 Batches) (600 Prior Batches): (817,9186,7)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:29] trial completed workload: <RUN_STEP (100 Batches) (600 Prior Batches): (817,9186,7)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:29] continuing trial: <RUN_STEP (100 Batches) (700 Prior Batches): (817,9186,8)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:31] trial completed workload: <RUN_STEP (100 Batches) (700 Prior Batches): (817,9186,8)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:31] continuing trial: <RUN_STEP (100 Batches) (800 Prior Batches): (817,9186,9)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:33] trial completed workload: <RUN_STEP (100 Batches) (800 Prior Batches): (817,9186,9)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:33] continuing trial: <RUN_STEP (37 Batches) (900 Prior Batches): (817,9186,10)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:33] trial completed workload: <RUN_STEP (37 Batches) (900 Prior Batches): (817,9186,10)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:34] continuing trial: <CHECKPOINT_MODEL (937 Prior Batches): (817,9186,10)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:36] trial completed workload: <CHECKPOINT_MODEL (937 Prior Batches): (817,9186,10)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:36] continuing trial: <COMPUTE_VALIDATION_METRICS (937 Prior Batches): (817,9186,10)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:38] trial completed workload: <COMPUTE_VALIDATION_METRICS (937 Prior Batches): (817,9186,10)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:39] terminating gracefully because there are no more workloads experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:39] gracefully terminating trial experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:40] stopped container id: d92e3ab2-54ef-47db-aaef-f0583c907afc id="det-agent-release-party-tidy-crawdad" system="master" type="agent" | |
<info> [2021-04-06, 19:43:40] found container terminated: d92e3ab2-54ef-47db-aaef-f0583c907afc experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:40] forcibly terminating trial experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:40] killing container id: d92e3ab2-54ef-47db-aaef-f0583c907afc id="det-agent-release-party-tidy-crawdad" system="master" type="agent" | |
<info> [2021-04-06, 19:43:40] trial runner stopped successfully experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:40] trial stopped successfully experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial" | |
<info> [2021-04-06, 19:43:40] resources are released for /experiments/817/f2a8e7ea-5649-400a-90bf-c979d2fda817 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:43:40] resources are released for /experiments/817/f2a8e7ea-5649-400a-90bf-c979d2fda817 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:43:40] resources are released for /experiments/817/f2a8e7ea-5649-400a-90bf-c979d2fda817 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:43:40] resources are released for /experiments/817/f2a8e7ea-5649-400a-90bf-c979d2fda817 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:43:40] experiment state changed to STOPPING_COMPLETED id="817" system="master" type="experiment" | |
<info> [2021-04-06, 19:43:41] experiment state changed to COMPLETED id="817" system="master" type="experiment" | |
<info> [2021-04-06, 19:43:41] resources are requested by /experiment-817-checkpoint-gc (Task ID: e0eb1f3f-c023-45b4-a714-58945499f6a9) id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:43:41] experiment shut down successfully id="817" system="master" type="experiment" | |
<info> [2021-04-06, 19:43:58] found container running: 31d0e91e-66b1-49a7-a309-7ed075437231 (rank 0) experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:43:58] pushing rendezvous information experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:43:58] found not all containers are connected experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:43:58] found container running: 564d56f8-438e-4c30-8087-9c6a6ab9e5c9 (rank 0) experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:43:58] pushing rendezvous information experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:43:58] found not all containers are connected experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:44:05] new connection from container 564d56f8-438e-4c30-8087-9c6a6ab9e5c9 trial 9188 (experiment 821) at 10.138.0.20:54010 | |
<info> [2021-04-06, 19:44:05] pushing rendezvous information experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:44:05] found all containers are connected successfully experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:44:05] new connection from container 31d0e91e-66b1-49a7-a309-7ed075437231 trial 9187 (experiment 820) at 10.138.0.20:60922 | |
<info> [2021-04-06, 19:44:05] pushing rendezvous information experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:44:05] found all containers are connected successfully experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:44:48] trial completed workload: <RUN_STEP (100 Batches) (0 Prior Batches): (820,9187,1)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:44:48] continuing trial: <RUN_STEP (100 Batches) (100 Prior Batches): (820,9187,2)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:45:41] trial completed workload: <RUN_STEP (100 Batches) (100 Prior Batches): (820,9187,2)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:45:41] continuing trial: <RUN_STEP (100 Batches) (200 Prior Batches): (820,9187,3)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:46:06] trial completed workload: <RUN_STEP (100 Batches) (0 Prior Batches): (821,9188,1)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:46:06] continuing trial: <RUN_STEP (100 Batches) (100 Prior Batches): (821,9188,2)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:46:34] trial completed workload: <RUN_STEP (100 Batches) (200 Prior Batches): (820,9187,3)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:46:34] continuing trial: <RUN_STEP (100 Batches) (300 Prior Batches): (820,9187,4)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:47:10] agent connected ip: 10.138.0.22 resource pool: default-cpu-pool slots: 0 id="det-agent-release-party-accepted-sculpin" system="master" type="agent" | |
<info> [2021-04-06, 19:47:10] adding agent: det-agent-release-party-accepted-sculpin id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:47:10] allocated resources to /notebooks/2464edd4-5da7-4d8d-b01a-a146a9115e58 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:47:10] allocated resources to /experiment-816-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:47:10] allocated resources to /experiment-819-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:47:10] allocated resources to /experiment-817-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:47:10] starting container id: 9ac1b21e-e2a0-4f98-b5be-369cc7d972b5 slots: 0 task handler: /notebooks/2464edd4-5da7-4d8d-b01a-a146a9115e58 id="det-agent-release-party-accepted-sculpin" system="master" type="agent" | |
<info> [2021-04-06, 19:47:10] starting checkpoint garbage collection id="experiment-816-checkpoint-gc" system="master" type="checkpointGCTask" | |
<info> [2021-04-06, 19:47:10] starting container id: d4b208a8-a2f7-40a5-8dff-f70a445650a4 slots: 0 task handler: /experiment-816-checkpoint-gc id="det-agent-release-party-accepted-sculpin" system="master" type="agent" | |
<info> [2021-04-06, 19:47:10] starting checkpoint garbage collection id="experiment-817-checkpoint-gc" system="master" type="checkpointGCTask" | |
<info> [2021-04-06, 19:47:10] starting container id: 1577b782-1e7c-4896-8748-bcf9c81760a7 slots: 0 task handler: /experiment-817-checkpoint-gc id="det-agent-release-party-accepted-sculpin" system="master" type="agent" | |
<info> [2021-04-06, 19:47:11] starting checkpoint garbage collection id="experiment-819-checkpoint-gc" system="master" type="checkpointGCTask" | |
<info> [2021-04-06, 19:47:11] starting container id: e1731976-ed74-4a2e-9d10-58aab2055114 slots: 0 task handler: /experiment-819-checkpoint-gc id="det-agent-release-party-accepted-sculpin" system="master" type="agent" | |
<info> [2021-04-06, 19:47:27] trial completed workload: <RUN_STEP (100 Batches) (300 Prior Batches): (820,9187,4)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:47:27] continuing trial: <RUN_STEP (100 Batches) (400 Prior Batches): (820,9187,5)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:47:49] trial completed workload: <RUN_STEP (100 Batches) (100 Prior Batches): (821,9188,2)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:47:49] continuing trial: <RUN_STEP (100 Batches) (200 Prior Batches): (821,9188,3)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:48:11] registering service: 2464edd4-5da7-4d8d-b01a-a146a9115e58 (http://10.138.0.22:49153) id="proxy" system="master" type="Proxy" | |
<info> [2021-04-06, 19:48:19] readiness check passed: notebook id="2464edd4-5da7-4d8d-b01a-a146a9115e58" system="master" type="command" | |
<info> [2021-04-06, 19:48:19] trial completed workload: <RUN_STEP (100 Batches) (400 Prior Batches): (820,9187,5)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:48:19] continuing trial: <RUN_STEP (100 Batches) (500 Prior Batches): (820,9187,6)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:48:21] stopped container id: 1577b782-1e7c-4896-8748-bcf9c81760a7 id="det-agent-release-party-accepted-sculpin" system="master" type="agent" | |
<info> [2021-04-06, 19:48:21] finished checkpoint garbage collection id="experiment-817-checkpoint-gc" system="master" type="checkpointGCTask" | |
<info> [2021-04-06, 19:48:21] stopped container id: e1731976-ed74-4a2e-9d10-58aab2055114 id="det-agent-release-party-accepted-sculpin" system="master" type="agent" | |
<info> [2021-04-06, 19:48:21] finished checkpoint garbage collection id="experiment-819-checkpoint-gc" system="master" type="checkpointGCTask" | |
<info> [2021-04-06, 19:48:21] stopped container id: d4b208a8-a2f7-40a5-8dff-f70a445650a4 id="det-agent-release-party-accepted-sculpin" system="master" type="agent" | |
<info> [2021-04-06, 19:48:21] finished checkpoint garbage collection id="experiment-816-checkpoint-gc" system="master" type="checkpointGCTask" | |
<info> [2021-04-06, 19:48:21] resources are released for /experiment-819-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:48:21] resources are released for /experiment-817-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:48:22] resources are released for /experiment-816-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 19:49:13] trial completed workload: <RUN_STEP (100 Batches) (500 Prior Batches): (820,9187,6)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:49:13] continuing trial: <RUN_STEP (100 Batches) (600 Prior Batches): (820,9187,7)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:49:31] trial completed workload: <RUN_STEP (100 Batches) (200 Prior Batches): (821,9188,3)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:49:31] continuing trial: <RUN_STEP (100 Batches) (300 Prior Batches): (821,9188,4)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:50:05] trial completed workload: <RUN_STEP (100 Batches) (600 Prior Batches): (820,9187,7)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:50:05] continuing trial: <RUN_STEP (100 Batches) (700 Prior Batches): (820,9187,8)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:50:58] trial completed workload: <RUN_STEP (100 Batches) (700 Prior Batches): (820,9187,8)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:50:58] continuing trial: <RUN_STEP (100 Batches) (800 Prior Batches): (820,9187,9)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:51:13] trial completed workload: <RUN_STEP (100 Batches) (300 Prior Batches): (821,9188,4)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:51:13] continuing trial: <RUN_STEP (100 Batches) (400 Prior Batches): (821,9188,5)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:51:51] trial completed workload: <RUN_STEP (100 Batches) (800 Prior Batches): (820,9187,9)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:51:51] continuing trial: <RUN_STEP (100 Batches) (900 Prior Batches): (820,9187,10)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:52:44] trial completed workload: <RUN_STEP (100 Batches) (900 Prior Batches): (820,9187,10)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:52:44] continuing trial: <RUN_STEP (100 Batches) (1000 Prior Batches): (820,9187,11)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:52:55] trial completed workload: <RUN_STEP (100 Batches) (400 Prior Batches): (821,9188,5)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:52:55] continuing trial: <RUN_STEP (100 Batches) (500 Prior Batches): (821,9188,6)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:53:37] trial completed workload: <RUN_STEP (100 Batches) (1000 Prior Batches): (820,9187,11)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:53:37] continuing trial: <RUN_STEP (100 Batches) (1100 Prior Batches): (820,9187,12)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:54:29] trial completed workload: <RUN_STEP (100 Batches) (1100 Prior Batches): (820,9187,12)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:54:29] continuing trial: <RUN_STEP (100 Batches) (1200 Prior Batches): (820,9187,13)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:54:38] trial completed workload: <RUN_STEP (100 Batches) (500 Prior Batches): (821,9188,6)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:54:38] continuing trial: <RUN_STEP (100 Batches) (600 Prior Batches): (821,9188,7)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:55:22] trial completed workload: <RUN_STEP (100 Batches) (1200 Prior Batches): (820,9187,13)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:55:22] continuing trial: <RUN_STEP (100 Batches) (1300 Prior Batches): (820,9187,14)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:56:14] trial completed workload: <RUN_STEP (100 Batches) (1300 Prior Batches): (820,9187,14)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:56:14] continuing trial: <RUN_STEP (100 Batches) (1400 Prior Batches): (820,9187,15)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:56:20] trial completed workload: <RUN_STEP (100 Batches) (600 Prior Batches): (821,9188,7)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:56:20] continuing trial: <RUN_STEP (100 Batches) (700 Prior Batches): (821,9188,8)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:57:07] trial completed workload: <RUN_STEP (100 Batches) (1400 Prior Batches): (820,9187,15)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:57:07] continuing trial: <RUN_STEP (100 Batches) (1500 Prior Batches): (820,9187,16)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:58:00] trial completed workload: <RUN_STEP (100 Batches) (1500 Prior Batches): (820,9187,16)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:58:00] continuing trial: <RUN_STEP (100 Batches) (1600 Prior Batches): (820,9187,17)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:58:02] trial completed workload: <RUN_STEP (100 Batches) (700 Prior Batches): (821,9188,8)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:58:02] continuing trial: <RUN_STEP (100 Batches) (800 Prior Batches): (821,9188,9)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:58:53] trial completed workload: <RUN_STEP (100 Batches) (1600 Prior Batches): (820,9187,17)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:58:53] continuing trial: <RUN_STEP (100 Batches) (1700 Prior Batches): (820,9187,18)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:59:44] trial completed workload: <RUN_STEP (100 Batches) (800 Prior Batches): (821,9188,9)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:59:44] continuing trial: <RUN_STEP (100 Batches) (900 Prior Batches): (821,9188,10)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 19:59:45] trial completed workload: <RUN_STEP (100 Batches) (1700 Prior Batches): (820,9187,18)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 19:59:45] continuing trial: <RUN_STEP (100 Batches) (1800 Prior Batches): (820,9187,19)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 20:00:39] trial completed workload: <RUN_STEP (100 Batches) (1800 Prior Batches): (820,9187,19)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 20:00:39] continuing trial: <RUN_STEP (100 Batches) (1900 Prior Batches): (820,9187,20)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 20:01:18] experiment state changed to STOPPING_CANCELED id="820" system="master" type="experiment" | |
<info> [2021-04-06, 20:01:18] gracefully terminating trial experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 20:01:18] received killing request experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 20:01:18] forcibly terminating trial experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 20:01:18] killing container id: 31d0e91e-66b1-49a7-a309-7ed075437231 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent" | |
<error> [2021-04-06, 20:01:18] error while actor was running error="websocket: close 1006 (abnormal closure): unexpected EOF" id="socket-31d0e91e-66b1-49a7-a309-7ed075437231" system="master" type="websocketActor" | |
<error> [2021-04-06, 20:01:18] websocket handler error: websocket: close 1006 (abnormal closure): unexpected EOF | |
<info> [2021-04-06, 20:01:18] found child actor failed, terminating forcibly experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 20:01:18] forcibly terminating trial experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 20:01:18] killing container id: 31d0e91e-66b1-49a7-a309-7ed075437231 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent" | |
<info> [2021-04-06, 20:01:19] stopped container id: 31d0e91e-66b1-49a7-a309-7ed075437231 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent" | |
<info> [2021-04-06, 20:01:19] found container terminated: 31d0e91e-66b1-49a7-a309-7ed075437231 experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 20:01:19] forcibly terminating trial experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 20:01:19] killing container id: 31d0e91e-66b1-49a7-a309-7ed075437231 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent" | |
<info> [2021-04-06, 20:01:19] ignoring trial runner failure since it was killed experiment-id="820" failure="container failed with non-zero exit code: container failed with non-zero exit code: 137 (exit code 137)" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 20:01:19] trial stopped successfully experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial" | |
<info> [2021-04-06, 20:01:19] resources are released for /experiments/820/fbaae498-d290-41bd-8c46-f53db000e207 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:01:19] resources are released for /experiments/820/fbaae498-d290-41bd-8c46-f53db000e207 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:01:19] resources are released for /experiments/820/fbaae498-d290-41bd-8c46-f53db000e207 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:01:19] resources are released for /experiments/820/fbaae498-d290-41bd-8c46-f53db000e207 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:01:19] experiment state changed to STOPPING_CANCELED id="821" system="master" type="experiment" | |
<info> [2021-04-06, 20:01:19] gracefully terminating trial experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 20:01:19] received killing request experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 20:01:19] forcibly terminating trial experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 20:01:19] killing container id: 564d56f8-438e-4c30-8087-9c6a6ab9e5c9 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent" | |
<error> [2021-04-06, 20:01:20] error while actor was running error="websocket: close 1006 (abnormal closure): unexpected EOF" id="socket-564d56f8-438e-4c30-8087-9c6a6ab9e5c9" system="master" type="websocketActor" | |
<error> [2021-04-06, 20:01:20] websocket handler error: websocket: close 1006 (abnormal closure): unexpected EOF | |
<info> [2021-04-06, 20:01:20] found child actor failed, terminating forcibly experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 20:01:20] forcibly terminating trial experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 20:01:20] killing container id: 564d56f8-438e-4c30-8087-9c6a6ab9e5c9 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent" | |
<info> [2021-04-06, 20:01:20] experiment state changed to CANCELED id="820" system="master" type="experiment" | |
<info> [2021-04-06, 20:01:20] resources are requested by /experiment-820-checkpoint-gc (Task ID: 041686f2-cb00-4aac-8f4e-1258373f9e46) id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:01:20] allocated resources to /experiment-820-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:01:20] experiment shut down successfully id="820" system="master" type="experiment" | |
<info> [2021-04-06, 20:01:20] starting checkpoint garbage collection id="experiment-820-checkpoint-gc" system="master" type="checkpointGCTask" | |
<info> [2021-04-06, 20:01:20] starting container id: 81fa7b78-9c02-4d35-a84d-56b2548a07c2 slots: 0 task handler: /experiment-820-checkpoint-gc id="det-agent-release-party-accepted-sculpin" system="master" type="agent" | |
<info> [2021-04-06, 20:01:20] stopped container id: 564d56f8-438e-4c30-8087-9c6a6ab9e5c9 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent" | |
<info> [2021-04-06, 20:01:20] found container terminated: 564d56f8-438e-4c30-8087-9c6a6ab9e5c9 experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 20:01:20] forcibly terminating trial experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 20:01:20] killing container id: 564d56f8-438e-4c30-8087-9c6a6ab9e5c9 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent" | |
<info> [2021-04-06, 20:01:20] ignoring trial runner failure since it was killed experiment-id="821" failure="container failed with non-zero exit code: container failed with non-zero exit code: 137 (exit code 137)" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 20:01:20] trial stopped successfully experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial" | |
<info> [2021-04-06, 20:01:20] resources are released for /experiments/821/a368d0de-0ec9-47c9-bf88-d4e79b2ae403 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:01:20] resources are released for /experiments/821/a368d0de-0ec9-47c9-bf88-d4e79b2ae403 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:01:20] resources are released for /experiments/821/a368d0de-0ec9-47c9-bf88-d4e79b2ae403 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:01:20] resources are released for /experiments/821/a368d0de-0ec9-47c9-bf88-d4e79b2ae403 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:01:21] experiment state changed to CANCELED id="821" system="master" type="experiment" | |
<info> [2021-04-06, 20:01:21] resources are requested by /experiment-821-checkpoint-gc (Task ID: e745a022-bd80-41a4-b689-062e26e22b89) id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:01:21] experiment shut down successfully id="821" system="master" type="experiment" | |
<info> [2021-04-06, 20:01:21] allocated resources to /experiment-821-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:01:21] starting checkpoint garbage collection id="experiment-821-checkpoint-gc" system="master" type="checkpointGCTask" | |
<info> [2021-04-06, 20:01:21] starting container id: 7bc7dabe-adb8-45fb-b57c-0579e3b8b9e0 slots: 0 task handler: /experiment-821-checkpoint-gc id="det-agent-release-party-accepted-sculpin" system="master" type="agent" | |
<info> [2021-04-06, 20:01:22] experiment state changed to STOPPING_CANCELED id="818" system="master" type="experiment" | |
<info> [2021-04-06, 20:01:22] aborting trial before resources are allocated experiment-id="818" id="7a608ae4-8cbc-4d11-9e16-83b86c74f458" system="master" type="trial" | |
<info> [2021-04-06, 20:01:22] trial runner is aborted successfully experiment-id="818" id="7a608ae4-8cbc-4d11-9e16-83b86c74f458" system="master" type="trial" | |
<info> [2021-04-06, 20:01:22] resources are released for /experiments/818/7a608ae4-8cbc-4d11-9e16-83b86c74f458 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:01:22] resources are released for /experiments/818/7a608ae4-8cbc-4d11-9e16-83b86c74f458 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:01:22] resources are released for /experiments/818/7a608ae4-8cbc-4d11-9e16-83b86c74f458 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:01:22] resources are released for /experiments/818/7a608ae4-8cbc-4d11-9e16-83b86c74f458 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:01:22] experiment state changed to CANCELED id="818" system="master" type="experiment" | |
<info> [2021-04-06, 20:01:22] resources are requested by /experiment-818-checkpoint-gc (Task ID: 4deb58f1-f950-4554-9729-f746ff69ba21) id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:01:22] experiment shut down successfully id="818" system="master" type="experiment" | |
<info> [2021-04-06, 20:01:22] allocated resources to /experiment-818-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:01:22] starting checkpoint garbage collection id="experiment-818-checkpoint-gc" system="master" type="checkpointGCTask" | |
<info> [2021-04-06, 20:01:22] starting container id: 005dc5c4-f179-46ab-8667-209e3bbd3c43 slots: 0 task handler: /experiment-818-checkpoint-gc id="det-agent-release-party-accepted-sculpin" system="master" type="agent" | |
<info> [2021-04-06, 20:01:28] task forcible terminating id="2464edd4-5da7-4d8d-b01a-a146a9115e58" system="master" type="command" | |
<info> [2021-04-06, 20:01:28] killing container id: 9ac1b21e-e2a0-4f98-b5be-369cc7d972b5 id="det-agent-release-party-accepted-sculpin" system="master" type="agent" | |
<info> [2021-04-06, 20:01:29] task forcible terminating id="b53a120e-0541-4139-b208-dcc21a7848e2" system="master" type="command" | |
<info> [2021-04-06, 20:01:29] killing container id: ae9091e2-e545-494b-b3ce-970e72ee73eb id="det-agent-release-party-outgoing-mosquito" system="master" type="agent" | |
<info> [2021-04-06, 20:01:29] stopped container id: ae9091e2-e545-494b-b3ce-970e72ee73eb id="det-agent-release-party-outgoing-mosquito" system="master" type="agent" | |
<info> [2021-04-06, 20:01:29] resources are released for /notebooks/b53a120e-0541-4139-b208-dcc21a7848e2 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:01:29] resources are released for /notebooks/b53a120e-0541-4139-b208-dcc21a7848e2 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:01:29] resources are released for /notebooks/b53a120e-0541-4139-b208-dcc21a7848e2 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:01:33] task forcible terminating id="2464edd4-5da7-4d8d-b01a-a146a9115e58" system="master" type="command" | |
<info> [2021-04-06, 20:01:33] killing container id: 9ac1b21e-e2a0-4f98-b5be-369cc7d972b5 id="det-agent-release-party-accepted-sculpin" system="master" type="agent" | |
<info> [2021-04-06, 20:01:46] stopped container id: 9ac1b21e-e2a0-4f98-b5be-369cc7d972b5 id="det-agent-release-party-accepted-sculpin" system="master" type="agent" | |
<info> [2021-04-06, 20:01:46] resources are released for /notebooks/2464edd4-5da7-4d8d-b01a-a146a9115e58 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:01:46] resources are released for /notebooks/2464edd4-5da7-4d8d-b01a-a146a9115e58 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:01:46] resources are released for /notebooks/2464edd4-5da7-4d8d-b01a-a146a9115e58 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:02:11] stopped container id: 005dc5c4-f179-46ab-8667-209e3bbd3c43 id="det-agent-release-party-accepted-sculpin" system="master" type="agent" | |
<info> [2021-04-06, 20:02:11] finished checkpoint garbage collection id="experiment-818-checkpoint-gc" system="master" type="checkpointGCTask" | |
<info> [2021-04-06, 20:02:11] resources are released for /experiment-818-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:02:26] stopped container id: 7bc7dabe-adb8-45fb-b57c-0579e3b8b9e0 id="det-agent-release-party-accepted-sculpin" system="master" type="agent" | |
<info> [2021-04-06, 20:02:26] finished checkpoint garbage collection id="experiment-821-checkpoint-gc" system="master" type="checkpointGCTask" | |
<info> [2021-04-06, 20:02:26] stopped container id: 81fa7b78-9c02-4d35-a84d-56b2548a07c2 id="det-agent-release-party-accepted-sculpin" system="master" type="agent" | |
<info> [2021-04-06, 20:02:26] finished checkpoint garbage collection id="experiment-820-checkpoint-gc" system="master" type="checkpointGCTask" | |
<info> [2021-04-06, 20:02:26] resources are released for /experiment-820-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:02:26] resources are released for /experiment-821-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:13:46] decided to terminate 1 instances: ,det-agent-release-party-tidy-crawdad (reason: long idle) id="provisioner" resource-pool="preemptible-gpu-pool" system="master" type="Provisioner" | |
<error> [2021-04-06, 20:13:48] error while actor was running error="websocket: close 1006 (abnormal closure): unexpected EOF" id="websocket-769d0dba-d358-4156-9cde-3b96b21487e9" system="master" type="websocketActor" | |
<error> [2021-04-06, 20:13:48] websocket: close 1006 (abnormal closure): unexpected EOF | |
<error> [2021-04-06, 20:13:48] http: connection has been hijacked | |
<error> [2021-04-06, 20:13:48] error while actor was running error="child failed: /agents/det-agent-release-party-tidy-crawdad/websocket-769d0dba-d358-4156-9cde-3b96b21487e9: websocket: close 1006 (abnormal closure): unexpected EOF" id="det-agent-release-party-tidy-crawdad" system="master" type="agent" | |
<info> [2021-04-06, 20:13:48] removing device: gpu4 (Tesla V100-SXM2-16GB) (det-agent-release-party-tidy-crawdad) id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:13:48] removing device: gpu5 (Tesla V100-SXM2-16GB) (det-agent-release-party-tidy-crawdad) id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:13:48] removing device: gpu6 (Tesla V100-SXM2-16GB) (det-agent-release-party-tidy-crawdad) id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:13:48] removing device: gpu7 (Tesla V100-SXM2-16GB) (det-agent-release-party-tidy-crawdad) id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:13:48] removing device: gpu0 (Tesla V100-SXM2-16GB) (det-agent-release-party-tidy-crawdad) id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:13:48] removing device: gpu1 (Tesla V100-SXM2-16GB) (det-agent-release-party-tidy-crawdad) id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:13:48] agent disconnected id="det-agent-release-party-tidy-crawdad" system="master" type="agent" | |
<info> [2021-04-06, 20:13:48] removing device: gpu2 (Tesla V100-SXM2-16GB) (det-agent-release-party-tidy-crawdad) id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:13:48] removing device: gpu3 (Tesla V100-SXM2-16GB) (det-agent-release-party-tidy-crawdad) id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:13:48] removing agent: det-agent-release-party-tidy-crawdad id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool" | |
<info> [2021-04-06, 20:13:52] found state changes in 1 instances: det-agent-release-party-tidy-crawdad (Stopping) id="provisioner" resource-pool="preemptible-gpu-pool" system="master" type="Provisioner" | |
<info> [2021-04-06, 20:14:36] deleted 1/1 GCE instances: det-agent-release-party-tidy-crawdad id="provisioner" resource-pool="preemptible-gpu-pool" system="master" type="Provisioner" | |
<info> [2021-04-06, 20:14:38] found state changes in 0 instances: id="provisioner" resource-pool="preemptible-gpu-pool" system="master" type="Provisioner" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment