Skip to content

Instantly share code, notes, and snippets.

@mackrorysd
Forked from stoksc/master-log.txt
Created December 21, 2023 18:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mackrorysd/dcb7291a0dcc6549f1227fd1d0bf5eb5 to your computer and use it in GitHub Desktop.
Save mackrorysd/dcb7291a0dcc6549f1227fd1d0bf5eb5 to your computer and use it in GitHub Desktop.
<info> [2021-04-05, 22:11:08] master configuration: {"config_file":"","log":{"level":"info","color":true},"db":{"user":"postgres","password":"********","migrations":"file:///usr/share/determined/master/static/migrations","host":"10.108.192.5","port":"5432","name":"postgres","ssl_mode":"disable","ssl_root_cert":""},"tensorboard_timeout":300,"security":{"default_task":{"id":0,"user_id":0,"user":"root","uid":0,"group":"root","gid":0},"tls":{"cert":"/etc/ssl/certs/determined.cer","key":"/etc/ssl/private/determined.key"}},"checkpoint_storage":{"bucket":"release-party","save_experiment_best":0,"save_trial_best":1,"save_trial_latest":1,"type":"gcs"},"task_container_defaults":{"shm_size_bytes":4294967296,"network_mode":"bridge","cpu_pod_spec":null,"gpu_pod_spec":null},"port":443,"harness_path":"/opt/determined","root":"/usr/share/determined/master","telemetry":{"enabled":false,"segment_master_key":"********","segment_webui_key":"********"},"enable_cors":false,"cluster_name":"","logging":{"type":"default"},"hyperparameter_importance":{"workers_limit":0,"queue_limit":16,"cores_per_worker":1,"max_trees":100},"resource_manager":{"default_cpu_resource_pool":"default-cpu-pool","default_gpu_resource_pool":"default-gpu-pool","scheduler":{"default_priority":42,"fitting_policy":"best","preemption":true,"type":"priority"},"type":"agent"},"resource_pools":[{"pool_name":"default-gpu-pool","description":"","provider":{"agent_docker_image":"determinedai/determined-ee-agent:0.14.6","agent_docker_network":"host","agent_docker_runtime":"runc","agent_fluent_image":"fluent/fluent-bit:1.6","base_config":{"minCpuPlatform":"Intel Broadwell"},"boot_disk_size":200,"boot_disk_source_image":"projects/determined-ai/global/images/det-environments-067db2b","container_startup_script":"export HOME=/root\napt-get update \u0026\u0026 apt-get install -y curl docker.io\ncurl -fsSL \"https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v1.5.0/docker-credential-gcr_linux_amd64-1.5.0.tar.gz\" | tar xz --to-stdout \u003e /usr/bin/docker-credential-gcr \u0026\u0026 chmod +x /usr/bin/docker-credential-gcr\ndocker-credential-gcr configure-docker\n","instance_type":{"gpu_num":8,"gpu_type":"nvidia-tesla-v100","machine_type":"n1-standard-32","preemptible":false},"label_key":"managed-by","label_value":"","master_cert_name":"gcloud.determined.ai","master_url":"https://internal-ip:443","max_agent_starting_period":"20m0s","max_idle_agent_period":"30m0s","max_instances":8,"min_instances":0,"name_prefix":"det-agent-release-party-","network_interface":{"external_ip":false,"network":"projects/dai-public/global/networks/restricted-shared-network","subnetwork":"projects/dai-public/regions/us-west1/subnetworks/restricted-shared-network"},"network_tags":["https-server"],"operation_timeout_period":"5m0s","project":"","service_account":{"email":"argo-determined-ai-vm-agent@determined-ai.iam.gserviceaccount.com","scopes":["https://www.googleapis.com/auth/cloud-platform"]},"startup_script":"systemctl stop apt-daily.service\nsystemctl kill --kill-who=all apt-daily.service\nwhile pgrep -f apt.systemd.daily \u003e /dev/null; do echo waiting; sleep 1; done\n\necho \"cc2c8fc7-a61f-4ed1-935f-d6e04445c656\" | docker login -u=\"determinedaicustomer\" --password-stdin\n\nexport HOME=/root\napt-get update \u0026\u0026 apt-get install -y curl docker.io\ncurl -fsSL \"https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v1.5.0/docker-credential-gcr_linux_amd64-1.5.0.tar.gz\" | tar xz --to-stdout \u003e /usr/bin/docker-credential-gcr \u0026\u0026 chmod +x /usr/bin/docker-credential-gcr\ndocker-credential-gcr configure-docker\ndocker pull determinedai/determined-ee-agent:0.14.6 \n\ncurl -sSO https://dl.google.com/cloudagents/add-monitoring-agent-repo.sh\nbash add-monitoring-agent-repo.sh\napt-get update\napt-get install -y stackdriver-agent\nservice stackdriver-agent start\n\ncurl -sSO https://dl.google.com/cloudagents/add-logging-agent-repo.sh\nbash add-logging-agent-repo.sh\napt-get update\napt-get install google-fluentd\napt-get install -y google-fluentd-catch-all-config\nservice google-fluentd start\nservice google-fluentd status\n\napt-get install -y python-pip\npip install --upgrade pip\npip install --upgrade setuptools\ngit clone https://github.com/GoogleCloudPlatform/tensorflow-inference-tensorrt5-t4-gpu.git\ncd tensorflow-inference-tensorrt5-t4-gpu/metrics_reporting\npip install -r ./requirements.txt\ncp report_gpu_metrics.py /root/\ncat \u003c\u003c-EOH \u003e /lib/systemd/system/gpu_utilization_agent.service\n[Unit]\nDescription=GPU Utilization Metric Agent\n[Service]\nPIDFile=/run/gpu_agent.pid\nExecStart=/bin/bash --login -c '/usr/bin/python /root/report_gpu_metrics.py'\nUser=root\nGroup=root\nWorkingDirectory=/\nRestart=always\n[Install]\nWantedBy=multi-user.target\nEOH\nsystemctl daemon-reload\nsystemctl --no-reload --now enable /lib/systemd/system/gpu_utilization_agent.service\n","type":"gcp","zone":""},"max_cpu_containers_per_agent":0},{"pool_name":"default-cpu-pool","description":"","provider":{"agent_docker_image":"determinedai/determined-ee-agent:0.14.6","agent_docker_network":"host","agent_docker_runtime":"runc","agent_fluent_image":"fluent/fluent-bit:1.6","base_config":{"minCpuPlatform":"Intel Broadwell"},"boot_disk_size":200,"boot_disk_source_image":"projects/determined-ai/global/images/det-environments-067db2b","container_startup_script":"export HOME=/root\napt-get update \u0026\u0026 apt-get install -y curl docker.io\ncurl -fsSL \"https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v1.5.0/docker-credential-gcr_linux_amd64-1.5.0.tar.gz\" | tar xz --to-stdout \u003e /usr/bin/docker-credential-gcr \u0026\u0026 chmod +x /usr/bin/docker-credential-gcr\ndocker-credential-gcr configure-docker\n","instance_type":{"gpu_num":0,"gpu_type":"nvidia-tesla-v100","machine_type":"n1-standard-32","preemptible":false},"label_key":"managed-by","label_value":"","master_cert_name":"gcloud.determined.ai","master_url":"https://internal-ip:443","max_agent_starting_period":"20m0s","max_idle_agent_period":"30m0s","max_instances":8,"min_instances":0,"name_prefix":"det-agent-release-party-","network_interface":{"external_ip":false,"network":"projects/dai-public/global/networks/restricted-shared-network","subnetwork":"projects/dai-public/regions/us-west1/subnetworks/restricted-shared-network"},"network_tags":["https-server"],"operation_timeout_period":"5m0s","project":"","service_account":{"email":"argo-determined-ai-vm-agent@determined-ai.iam.gserviceaccount.com","scopes":["https://www.googleapis.com/auth/cloud-platform"]},"startup_script":"systemctl stop apt-daily.service\nsystemctl kill --kill-who=all apt-daily.service\nwhile pgrep -f apt.systemd.daily \u003e /dev/null; do echo waiting; sleep 1; done\n\necho \"cc2c8fc7-a61f-4ed1-935f-d6e04445c656\" | docker login -u=\"determinedaicustomer\" --password-stdin\n\nexport HOME=/root\napt-get update \u0026\u0026 apt-get install -y curl docker.io\ncurl -fsSL \"https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v1.5.0/docker-credential-gcr_linux_amd64-1.5.0.tar.gz\" | tar xz --to-stdout \u003e /usr/bin/docker-credential-gcr \u0026\u0026 chmod +x /usr/bin/docker-credential-gcr\ndocker-credential-gcr configure-docker\ndocker pull determinedai/determined-ee-agent:0.14.6 \n\ncurl -sSO https://dl.google.com/cloudagents/add-monitoring-agent-repo.sh\nbash add-monitoring-agent-repo.sh\napt-get update\napt-get install -y stackdriver-agent\nservice stackdriver-agent start\n\ncurl -sSO https://dl.google.com/cloudagents/add-logging-agent-repo.sh\nbash add-logging-agent-repo.sh\napt-get update\napt-get install google-fluentd\napt-get install -y google-fluentd-catch-all-config\nservice google-fluentd start\nservice google-fluentd status\n","type":"gcp","zone":""},"max_cpu_containers_per_agent":100},{"pool_name":"preemptible-gpu-pool","description":"","provider":{"agent_docker_image":"determinedai/determined-ee-agent:0.14.6","agent_docker_network":"host","agent_docker_runtime":"runc","agent_fluent_image":"fluent/fluent-bit:1.6","base_config":{"minCpuPlatform":"Intel Broadwell"},"boot_disk_size":200,"boot_disk_source_image":"projects/determined-ai/global/images/det-environments-067db2b","container_startup_script":"export HOME=/root\napt-get update \u0026\u0026 apt-get install -y curl docker.io\ncurl -fsSL \"https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v1.5.0/docker-credential-gcr_linux_amd64-1.5.0.tar.gz\" | tar xz --to-stdout \u003e /usr/bin/docker-credential-gcr \u0026\u0026 chmod +x /usr/bin/docker-credential-gcr\ndocker-credential-gcr configure-docker\n","instance_type":{"gpu_num":8,"gpu_type":"nvidia-tesla-v100","machine_type":"n1-standard-32","preemptible":true},"label_key":"managed-by","label_value":"","master_cert_name":"gcloud.determined.ai","master_url":"https://internal-ip:443","max_agent_starting_period":"20m0s","max_idle_agent_period":"30m0s","max_instances":8,"min_instances":0,"name_prefix":"det-agent-release-party-","network_interface":{"external_ip":false,"network":"projects/dai-public/global/networks/restricted-shared-network","subnetwork":"projects/dai-public/regions/us-west1/subnetworks/restricted-shared-network"},"network_tags":["https-server"],"operation_timeout_period":"5m0s","project":"","service_account":{"email":"argo-determined-ai-vm-agent@determined-ai.iam.gserviceaccount.com","scopes":["https://www.googleapis.com/auth/cloud-platform"]},"startup_script":"systemctl stop apt-daily.service\nsystemctl kill --kill-who=all apt-daily.service\nwhile pgrep -f apt.systemd.daily \u003e /dev/null; do echo waiting; sleep 1; done\n\necho \"cc2c8fc7-a61f-4ed1-935f-d6e04445c656\" | docker login -u=\"determinedaicustomer\" --password-stdin\n\nexport HOME=/root\napt-get update \u0026\u0026 apt-get install -y curl docker.io\ncurl -fsSL \"https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v1.5.0/docker-credential-gcr_linux_amd64-1.5.0.tar.gz\" | tar xz --to-stdout \u003e /usr/bin/docker-credential-gcr \u0026\u0026 chmod +x /usr/bin/docker-credential-gcr\ndocker-credential-gcr configure-docker\ndocker pull determinedai/determined-ee-agent:0.14.6 \n\ncurl -sSO https://dl.google.com/cloudagents/add-monitoring-agent-repo.sh\nbash add-monitoring-agent-repo.sh\napt-get update\napt-get install -y stackdriver-agent\nservice stackdriver-agent start\n\ncurl -sSO https://dl.google.com/cloudagents/add-logging-agent-repo.sh\nbash add-logging-agent-repo.sh\napt-get update\napt-get install google-fluentd\napt-get install -y google-fluentd-catch-all-config\nservice google-fluentd start\nservice google-fluentd status\n\napt-get install -y python-pip\npip install --upgrade pip\npip install --upgrade setuptools\ngit clone https://github.com/GoogleCloudPlatform/tensorflow-inference-tensorrt5-t4-gpu.git\ncd tensorflow-inference-tensorrt5-t4-gpu/metrics_reporting\npip install -r ./requirements.txt\ncp report_gpu_metrics.py /root/\ncat \u003c\u003c-EOH \u003e /lib/systemd/system/gpu_utilization_agent.service\n[Unit]\nDescription=GPU Utilization Metric Agent\n[Service]\nPIDFile=/run/gpu_agent.pid\nExecStart=/bin/bash --login -c '/usr/bin/python /root/report_gpu_metrics.py'\nUser=root\nGroup=root\nWorkingDirectory=/\nRestart=always\n[Install]\nWantedBy=multi-user.target\nEOH\nsystemctl daemon-reload\nsystemctl --no-reload --now enable /lib/systemd/system/gpu_utilization_agent.service\n","type":"gcp","zone":""},"max_cpu_containers_per_agent":0}],"scim":{"enabled":true,"auth":{"type":"oauth"}},"saml":{"enabled":true,"provider":"Okta","idp_recipient_url":"https://gcloud.determined.ai/saml/sso","idp_sso_url":"https://dev-2564556.okta.com/app/dev-2564556_determinedai_1/exkg2xv5x7w517g3G5d6/sso/saml","idp_sso_descriptor_url":"http://www.okta.com/exkg2xv5x7w517g3G5d6","idp_cert_path":"/etc/determined/etc/idp.cert"}}
<info> [2021-04-05, 22:11:08] Determined master 0.14.6 (built with go1.16.2)
<info> [2021-04-05, 22:11:08] connecting to database 10.108.192.5:5432
<info> [2021-04-05, 22:11:08] running migrations from file:///usr/share/determined/master/static/migrations
<info> [2021-04-05, 22:11:08] found golang-migrate version 20210322160616
<info> [2021-04-05, 22:11:08] deleting all snapshots for terminal state experiments
<info> [2021-04-05, 22:11:08] creating resource pool: default-gpu-pool id="agentRM" system="master" type="agentResourceManager"
<info> [2021-04-05, 22:11:08] pool default-gpu-pool using global scheduling config id="agentRM" system="master" type="agentResourceManager"
<info> [2021-04-05, 22:11:08] creating resource pool: default-cpu-pool id="agentRM" system="master" type="agentResourceManager"
<info> [2021-04-05, 22:11:08] pool default-cpu-pool using global scheduling config id="agentRM" system="master" type="agentResourceManager"
<info> [2021-04-05, 22:11:08] creating resource pool: preemptible-gpu-pool id="agentRM" system="master" type="agentResourceManager"
<info> [2021-04-05, 22:11:08] pool preemptible-gpu-pool using global scheduling config id="agentRM" system="master" type="agentResourceManager"
<info> [2021-04-05, 22:11:08] initializing endpoints for agents
<info> [2021-04-05, 22:11:08] found provisioner configuration id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:08] connecting to GCP id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:08] found provisioner configuration id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:08] connecting to GCP id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:08] found provisioner configuration id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:08] connecting to GCP id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:09] telemetry reporting is disabled
<info> [2021-04-05, 22:11:09] restoring experiment experiment="604"
<info> [2021-04-05, 22:11:09] restoring experiment experiment="571"
<info> [2021-04-05, 22:11:09] restoring experiment experiment="350"
<info> [2021-04-05, 22:11:09] restoring experiment experiment="345"
<info> [2021-04-05, 22:11:09] restoring experiment experiment="569"
<info> [2021-04-05, 22:11:09] restoring experiment experiment="495"
<info> [2021-04-05, 22:11:09] OAuth is enabled at https://10.138.0.18:443/oauth2
<info> [2021-04-05, 22:11:09] SCIM is enabled at https://10.138.0.18:443/scim/v2
<info> [2021-04-05, 22:11:09] SAML is enabled
<info> [2021-04-05, 22:11:09] accepting incoming connections on port 443
<info> [2021-04-05, 22:11:09] Subchannel Connectivity change to READY system="system"
<info> [2021-04-05, 22:11:09] pickfirstBalancer: HandleSubConnStateChange: 0xc0008102d0, {READY <nil>} system="system"
<info> [2021-04-05, 22:11:09] Channel Connectivity change to READY system="system"
<info> [2021-04-05, 22:11:09] no snapshot found experiment-id="604"
<info> [2021-04-05, 22:11:09] no snapshot found experiment-id="571"
<info> [2021-04-05, 22:11:09] restored experiment experiment="604"
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/604 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/604 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:09] restored experiment experiment="571"
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/571 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/571 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/604 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/571 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:09] no snapshot found experiment-id="345"
<info> [2021-04-05, 22:11:09] no snapshot found experiment-id="350"
<info> [2021-04-05, 22:11:09] no snapshot found experiment-id="495"
<info> [2021-04-05, 22:11:09] no snapshot found experiment-id="569"
<info> [2021-04-05, 22:11:09] restored experiment experiment="345"
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/345 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/345 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/345 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:09] restored experiment experiment="495"
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/495 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/495 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/495 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:09] restored experiment experiment="569"
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/569 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/569 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/569 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:09] restored experiment experiment="350"
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/350 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/350 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:09] setting priority for group of /experiments/350 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-05, 22:11:09] aggregated resource allocation statistics for 2021-04-02 00:00:00 +0000 UTC in 534.808897ms
<info> [2021-04-05, 22:11:09] aggregated resource allocation statistics for 2021-04-03 00:00:00 +0000 UTC in 189.601047ms
<info> [2021-04-05, 22:11:10] aggregated resource allocation statistics for 2021-04-04 00:00:00 +0000 UTC in 235.883175ms
<info> [2021-04-05, 22:11:10] scheduling next resource allocation aggregation in 1h49m50s at 2021-04-06 00:01:00 +0000 UTC id="allocation-aggregator" system="master" type="allocationAggregator"
<info> [2021-04-05, 22:12:57] finished unary call with code Unauthenticated error="rpc error: code = Unauthenticated desc = invalid credentials" grpc.code="Unauthenticated" grpc.method="GetAgents" grpc.service="determined.api.v1.Determined" grpc.start_time="2021-04-05T22:12:57Z" grpc.time_ms="0.052" span.kind="server" system="grpc"
<info> [2021-04-05, 22:13:03] finished unary call with code Unauthenticated error="rpc error: code = Unauthenticated desc = invalid credentials" grpc.code="Unauthenticated" grpc.method="GetAgents" grpc.service="determined.api.v1.Determined" grpc.start_time="2021-04-05T22:13:03Z" grpc.time_ms="0.059" span.kind="server" system="grpc"
<info> [2021-04-05, 22:13:09] finished unary call with code Unauthenticated error="rpc error: code = Unauthenticated desc = invalid credentials" grpc.code="Unauthenticated" grpc.method="GetAgents" grpc.service="determined.api.v1.Determined" grpc.start_time="2021-04-05T22:13:09Z" grpc.time_ms="0.043" span.kind="server" system="grpc"
<info> [2021-04-05, 22:17:30] user authorizing an OAuth application request_url="/oauth2/authorize?response_type=code&state=0oag2xv5yhSRnE4855d6%2Cdev-2564556%2C01de8e4cdd7d5bbbcc9b2be32e758750afa6500a0800ee3a71ede70b202fb6bb&client_id=47dabda2c57e0794c713d9a9173144d30910d0f3da875d7fbd9d3282b5625fca&redirect_uri=https%3A%2F%2Fsystem-admin.okta.com%2Fadmin%2Fapp%2Fcpc%2Fdev-2564556_determinedai_1%2Foauth%2Fcallback" username="hoang@determined.ai"
<error> [2021-04-05, 22:17:30] OAuth internal error occurred error="non-admin user hoang@determined.ai cannot authorize OAuth applications"
<error> [2021-04-05, 22:17:30] OAuth response error occurred error="server_error" response="&{server_error 0 The authorization server encountered an unexpected condition that prevented it from fulfilling the request 500 map[]}"
<info> [2021-04-05, 22:17:51] finished unary call with code Unauthenticated error="rpc error: code = Unauthenticated desc = invalid credentials" grpc.code="Unauthenticated" grpc.method="GetAgents" grpc.service="determined.api.v1.Determined" grpc.start_time="2021-04-05T22:17:51Z" grpc.time_ms="0.042" span.kind="server" system="grpc"
<info> [2021-04-05, 22:17:55] user authorizing an OAuth application request_url="/oauth2/authorize?response_type=code&state=0oag2xv5yhSRnE4855d6%2Cdev-2564556%2C01de8e4cdd7d5bbbcc9b2be32e758750afa6500a0800ee3a71ede70b202fb6bb&client_id=47dabda2c57e0794c713d9a9173144d30910d0f3da875d7fbd9d3282b5625fca&redirect_uri=https%3A%2F%2Fsystem-admin.okta.com%2Fadmin%2Fapp%2Fcpc%2Fdev-2564556_determinedai_1%2Foauth%2Fcallback" username="admin"
<error> [2021-04-05, 22:17:55] OAuth internal error occurred error="unknown OAuth client ID \"47dabda2c57e0794c713d9a9173144d30910d0f3da875d7fbd9d3282b5625fca\""
<error> [2021-04-05, 22:17:55] OAuth response error occurred error="server_error" response="&{server_error 0 The authorization server encountered an unexpected condition that prevented it from fulfilling the request 500 map[]}"
<info> [2021-04-05, 22:17:55] user authorizing an OAuth application request_url="/oauth2/authorize?response_type=code&state=0oag2xv5yhSRnE4855d6%2Cdev-2564556%2C01de8e4cdd7d5bbbcc9b2be32e758750afa6500a0800ee3a71ede70b202fb6bb&client_id=47dabda2c57e0794c713d9a9173144d30910d0f3da875d7fbd9d3282b5625fca&redirect_uri=https%3A%2F%2Fsystem-admin.okta.com%2Fadmin%2Fapp%2Fcpc%2Fdev-2564556_determinedai_1%2Foauth%2Fcallback" username="admin"
<error> [2021-04-05, 22:17:56] OAuth internal error occurred error="unknown OAuth client ID \"47dabda2c57e0794c713d9a9173144d30910d0f3da875d7fbd9d3282b5625fca\""
<error> [2021-04-05, 22:17:56] OAuth response error occurred error="server_error" response="&{server_error 0 The authorization server encountered an unexpected condition that prevented it from fulfilling the request 500 map[]}"
<info> [2021-04-05, 22:18:50] user authorizing an OAuth application request_url="/oauth2/authorize?response_type=code&state=0oag2xv5yhSRnE4855d6%2Cdev-2564556%2C01de8e4cdd7d5bbbcc9b2be32e758750afa6500a0800ee3a71ede70b202fb6bb&client_id=3b26fb2b19167f4d114f31d2a5a50766d3e4ccce1d5ecbcd75e43e934e6a4aee&redirect_uri=https%3A%2F%2Fsystem-admin.okta.com%2Fadmin%2Fapp%2Fcpc%2Fdev-2564556_determinedai_1%2Foauth%2Fcallback" username="admin"
<error> [2021-04-05, 22:18:50] OAuth internal error occurred error="unknown OAuth client ID \"3b26fb2b19167f4d114f31d2a5a50766d3e4ccce1d5ecbcd75e43e934e6a4aee\""
<error> [2021-04-05, 22:18:50] OAuth response error occurred error="server_error" response="&{server_error 0 The authorization server encountered an unexpected condition that prevented it from fulfilling the request 500 map[]}"
<info> [2021-04-05, 22:20:41] user authorizing an OAuth application request_url="/oauth2/authorize?response_type=code&state=0oag2xv5yhSRnE4855d6%2Cdev-2564556%2C01de8e4cdd7d5bbbcc9b2be32e758750afa6500a0800ee3a71ede70b202fb6bb&client_id=370946fb2e6381dc5566015de3e4cce2ed23e7a54514f7946e3490fdf2c9e495&redirect_uri=https%3A%2F%2Fsystem-admin.okta.com%2Fadmin%2Fapp%2Fcpc%2Fdev-2564556_determinedai_1%2Foauth%2Fcallback" username="admin"
<error> [2021-04-05, 22:20:41] OAuth internal error occurred error="unknown OAuth client ID \"370946fb2e6381dc5566015de3e4cce2ed23e7a54514f7946e3490fdf2c9e495\""
<error> [2021-04-05, 22:20:41] OAuth response error occurred error="server_error" response="&{server_error 0 The authorization server encountered an unexpected condition that prevented it from fulfilling the request 500 map[]}"
<info> [2021-04-05, 22:24:13] finished unary call with code Unauthenticated error="rpc error: code = Unauthenticated desc = invalid credentials" grpc.code="Unauthenticated" grpc.method="GetAgents" grpc.service="determined.api.v1.Determined" grpc.start_time="2021-04-05T22:24:13Z" grpc.time_ms="0.043" span.kind="server" system="grpc"
<info> [2021-04-06, 00:01:00] aggregated resource allocation statistics for 2021-04-05 00:00:00 +0000 UTC in 244.465402ms
<info> [2021-04-06, 00:01:00] scheduling next resource allocation aggregation in 24h0m0s at 2021-04-07 00:01:00 +0000 UTC id="allocation-aggregator" system="master" type="allocationAggregator"
<info> [2021-04-06, 13:38:26] finished unary call with code Unauthenticated error="rpc error: code = Unauthenticated desc = invalid credentials" grpc.code="Unauthenticated" grpc.method="GetAgents" grpc.service="determined.api.v1.Determined" grpc.start_time="2021-04-06T13:38:26Z" grpc.time_ms="0.042" span.kind="server" system="grpc"
<info> [2021-04-06, 17:48:48] finished unary call with code Unauthenticated error="rpc error: code = Unauthenticated desc = invalid credentials" grpc.code="Unauthenticated" grpc.method="CurrentUser" grpc.service="determined.api.v1.Determined" grpc.start_time="2021-04-06T17:48:48Z" grpc.time_ms="33.059" span.kind="server" system="grpc"
<info> [2021-04-06, 17:48:48] finished unary call with code Unauthenticated error="rpc error: code = Unauthenticated desc = invalid credentials" grpc.code="Unauthenticated" grpc.method="GetAgents" grpc.service="determined.api.v1.Determined" grpc.start_time="2021-04-06T17:48:48Z" grpc.time_ms="33.101" span.kind="server" system="grpc"
<info> [2021-04-06, 17:48:48] finished unary call with code Unauthenticated error="rpc error: code = Unauthenticated desc = invalid credentials" grpc.code="Unauthenticated" grpc.method="Logout" grpc.service="determined.api.v1.Determined" grpc.start_time="2021-04-06T17:48:48Z" grpc.time_ms="32.911" span.kind="server" system="grpc"
<info> [2021-04-06, 18:46:58] finished unary call with code Unauthenticated error="rpc error: code = Unauthenticated desc = invalid credentials" grpc.code="Unauthenticated" grpc.method="GetAgents" grpc.service="determined.api.v1.Determined" grpc.start_time="2021-04-06T18:46:58Z" grpc.time_ms="0.045" span.kind="server" system="grpc"
<info> [2021-04-06, 19:32:11] setting priority for group of /experiments/816 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:32:11] setting priority for group of /experiments/816 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:32:11] setting priority for group of /experiments/816 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<warning> [2021-04-06, 19:32:11] response already committed
<info> [2021-04-06, 19:32:12] experiment state changed to ACTIVE id="816" system="master" type="experiment"
<info> [2021-04-06, 19:32:12] resources are requested by /experiments/816/a35b9598-94c5-4dcd-9399-0d94d90c3bf9 (Task ID: de2356b3-dcf6-4864-9b86-92103c1203da) id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:32:16] decided to launch 1 instances (type n1-standard-32-nvidia-tesla-v100-8) id="provisioner" resource-pool="default-gpu-pool" system="master" type="Provisioner"
<info> [2021-04-06, 19:32:22] found state changes in 1 instances: det-agent-release-party-outgoing-mosquito (Starting) id="provisioner" resource-pool="default-gpu-pool" system="master" type="Provisioner"
<info> [2021-04-06, 19:33:27] setting priority for group of /experiments/817 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:33:27] setting priority for group of /experiments/817 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:33:27] setting priority for group of /experiments/817 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<warning> [2021-04-06, 19:33:27] response already committed
<info> [2021-04-06, 19:33:27] experiment state changed to ACTIVE id="817" system="master" type="experiment"
<info> [2021-04-06, 19:33:27] resources are requested by /experiments/817/f2a8e7ea-5649-400a-90bf-c979d2fda817 (Task ID: bae45824-da61-46aa-a6a3-8d7af98ca3a7) id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:33:28] decided to launch 1 instances (type n1-standard-32-nvidia-tesla-v100-8) id="provisioner" resource-pool="preemptible-gpu-pool" system="master" type="Provisioner"
<info> [2021-04-06, 19:33:34] found state changes in 1 instances: det-agent-release-party-tidy-crawdad (Starting) id="provisioner" resource-pool="preemptible-gpu-pool" system="master" type="Provisioner"
<info> [2021-04-06, 19:33:50] setting priority for group of /experiments/818 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<warning> [2021-04-06, 19:33:50] response already committed
<info> [2021-04-06, 19:33:50] setting priority for group of /experiments/818 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:33:50] setting priority for group of /experiments/818 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:33:50] experiment state changed to ACTIVE id="818" system="master" type="experiment"
<info> [2021-04-06, 19:33:50] resources are requested by /experiments/818/7a608ae4-8cbc-4d11-9e16-83b86c74f458 (Task ID: d2f3e747-060d-4db8-84bc-a41ae9e5ef8c) id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:34:07] setting priority for group of /experiments/819 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<warning> [2021-04-06, 19:34:07] response already committed
<info> [2021-04-06, 19:34:07] setting priority for group of /experiments/819 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:34:07] setting priority for group of /experiments/819 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:34:07] experiment state changed to ACTIVE id="819" system="master" type="experiment"
<info> [2021-04-06, 19:34:07] resources are requested by /experiments/819/fe51990b-8878-4df1-8a52-245e85da1301 (Task ID: 30de80e4-8db7-4e87-b300-f0a48d26ebbb) id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<warning> [2021-04-06, 19:34:17] GCE throws out warning (code DISK_SIZE_LARGER_THAN_IMAGE_SIZE) for operation "4583197155660990894" targeting "https://www.googleapis.com/compute/v1/projects/determined-ai/zones/us-west1-b/instances/det-agent-release-party-outgoing-mosquito": Disk size: '200 GB' is larger than image size: '100 GB'. You might need to resize the root repartition manually if the operating system does not support automatic resizing. See https://cloud.google.com/compute/docs/disks/add-persistent-disk#resize_pd for details. id="track-batch-operation-4e0de6fd-daf2-4445-aa2e-fb31a608eeec" system="master" type="gcpBatchOperationTracker"
<info> [2021-04-06, 19:34:17] inserted 1/1 GCE instances: det-agent-release-party-outgoing-mosquito id="provisioner" resource-pool="default-gpu-pool" system="master" type="Provisioner"
<info> [2021-04-06, 19:34:20] found state changes in 1 instances: det-agent-release-party-outgoing-mosquito (Running) id="provisioner" resource-pool="default-gpu-pool" system="master" type="Provisioner"
<warning> [2021-04-06, 19:35:51] GCE throws out warning (code DISK_SIZE_LARGER_THAN_IMAGE_SIZE) for operation "4591043566989543783" targeting "https://www.googleapis.com/compute/v1/projects/determined-ai/zones/us-west1-b/instances/det-agent-release-party-tidy-crawdad": Disk size: '200 GB' is larger than image size: '100 GB'. You might need to resize the root repartition manually if the operating system does not support automatic resizing. See https://cloud.google.com/compute/docs/disks/add-persistent-disk#resize_pd for details. id="track-batch-operation-d23ce758-5475-440d-8116-51298dbc15fc" system="master" type="gcpBatchOperationTracker"
<info> [2021-04-06, 19:35:51] inserted 1/1 GCE instances: det-agent-release-party-tidy-crawdad id="provisioner" resource-pool="preemptible-gpu-pool" system="master" type="Provisioner"
<info> [2021-04-06, 19:35:53] found state changes in 1 instances: det-agent-release-party-tidy-crawdad (Running) id="provisioner" resource-pool="preemptible-gpu-pool" system="master" type="Provisioner"
<info> [2021-04-06, 19:39:12] agent connected ip: 10.138.0.20 resource pool: default-gpu-pool slots: 8 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent"
<info> [2021-04-06, 19:39:12] adding agent: det-agent-release-party-outgoing-mosquito id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:39:12] adding device: gpu7 (Tesla V100-SXM2-16GB) on det-agent-release-party-outgoing-mosquito id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:39:12] adding device: gpu0 (Tesla V100-SXM2-16GB) on det-agent-release-party-outgoing-mosquito id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:39:12] adding device: gpu1 (Tesla V100-SXM2-16GB) on det-agent-release-party-outgoing-mosquito id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:39:12] adding device: gpu2 (Tesla V100-SXM2-16GB) on det-agent-release-party-outgoing-mosquito id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:39:12] adding device: gpu3 (Tesla V100-SXM2-16GB) on det-agent-release-party-outgoing-mosquito id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:39:12] adding device: gpu4 (Tesla V100-SXM2-16GB) on det-agent-release-party-outgoing-mosquito id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:39:12] adding device: gpu5 (Tesla V100-SXM2-16GB) on det-agent-release-party-outgoing-mosquito id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:39:12] adding device: gpu6 (Tesla V100-SXM2-16GB) on det-agent-release-party-outgoing-mosquito id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:39:13] allocated resources to /experiments/816/a35b9598-94c5-4dcd-9399-0d94d90c3bf9 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:39:13] allocated resources to /experiments/819/fe51990b-8878-4df1-8a52-245e85da1301 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:39:13] starting trial container: <RUN_STEP (100 Batches) (0 Prior Batches): (819,9184,1)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:39:13] starting trial container: <RUN_STEP (100 Batches) (0 Prior Batches): (816,9185,1)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:39:13] starting container id: 421d59a6-d124-4c9f-a638-b99b53c6cf44 slots: 4 task handler: /experiments/819/fe51990b-8878-4df1-8a52-245e85da1301 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent"
<info> [2021-04-06, 19:39:13] starting container id: e2d307a7-c947-4bf1-9b80-83965beb8198 slots: 1 task handler: /experiments/816/a35b9598-94c5-4dcd-9399-0d94d90c3bf9 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent"
<info> [2021-04-06, 19:40:55] agent connected ip: 10.138.0.21 resource pool: preemptible-gpu-pool slots: 8 id="det-agent-release-party-tidy-crawdad" system="master" type="agent"
<info> [2021-04-06, 19:40:55] adding agent: det-agent-release-party-tidy-crawdad id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:40:55] adding device: gpu7 (Tesla V100-SXM2-16GB) on det-agent-release-party-tidy-crawdad id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:40:55] adding device: gpu2 (Tesla V100-SXM2-16GB) on det-agent-release-party-tidy-crawdad id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:40:55] adding device: gpu0 (Tesla V100-SXM2-16GB) on det-agent-release-party-tidy-crawdad id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:40:55] adding device: gpu1 (Tesla V100-SXM2-16GB) on det-agent-release-party-tidy-crawdad id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:40:55] adding device: gpu4 (Tesla V100-SXM2-16GB) on det-agent-release-party-tidy-crawdad id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:40:55] adding device: gpu3 (Tesla V100-SXM2-16GB) on det-agent-release-party-tidy-crawdad id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:40:55] adding device: gpu5 (Tesla V100-SXM2-16GB) on det-agent-release-party-tidy-crawdad id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:40:55] adding device: gpu6 (Tesla V100-SXM2-16GB) on det-agent-release-party-tidy-crawdad id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:40:55] allocated resources to /experiments/817/f2a8e7ea-5649-400a-90bf-c979d2fda817 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:40:55] starting trial container: <RUN_STEP (100 Batches) (0 Prior Batches): (817,9186,1)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:40:55] starting container id: d92e3ab2-54ef-47db-aaef-f0583c907afc slots: 1 task handler: /experiments/817/f2a8e7ea-5649-400a-90bf-c979d2fda817 id="det-agent-release-party-tidy-crawdad" system="master" type="agent"
<info> [2021-04-06, 19:41:26] found container running: 421d59a6-d124-4c9f-a638-b99b53c6cf44 (rank 0) experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:41:26] pushing rendezvous information experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:41:26] found not all containers are connected experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:41:26] found container running: e2d307a7-c947-4bf1-9b80-83965beb8198 (rank 0) experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:41:26] pushing rendezvous information experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:41:26] found not all containers are connected experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:41:33] new connection from container 421d59a6-d124-4c9f-a638-b99b53c6cf44 trial 9184 (experiment 819) at 10.138.0.20:51316
<info> [2021-04-06, 19:41:33] pushing rendezvous information experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:41:33] found all containers are connected successfully experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:41:33] new connection from container e2d307a7-c947-4bf1-9b80-83965beb8198 trial 9185 (experiment 816) at 10.138.0.20:58214
<info> [2021-04-06, 19:41:33] pushing rendezvous information experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:41:33] found all containers are connected successfully experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:41:41] trial completed workload: <RUN_STEP (100 Batches) (0 Prior Batches): (816,9185,1)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:41:41] continuing trial: <RUN_STEP (100 Batches) (100 Prior Batches): (816,9185,2)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:41:41] creating notebook id="notebooks" system="master" type="notebookManager"
<info> [2021-04-06, 19:41:41] resources are requested by /notebooks/2464edd4-5da7-4d8d-b01a-a146a9115e58 (Task ID: 2464edd4-5da7-4d8d-b01a-a146a9115e58) id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:41:41] created notebook 2464edd4-5da7-4d8d-b01a-a146a9115e58 id="notebooks" system="master" type="notebookManager"
<info> [2021-04-06, 19:41:41] setting priority for group of /notebooks/2464edd4-5da7-4d8d-b01a-a146a9115e58 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:41:41] setting priority for group of /notebooks/2464edd4-5da7-4d8d-b01a-a146a9115e58 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:41:41] setting priority for group of /notebooks/2464edd4-5da7-4d8d-b01a-a146a9115e58 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:41:42] decided to launch 1 instances (type n1-standard-32-nvidia-tesla-v100-0) id="provisioner" resource-pool="default-cpu-pool" system="master" type="Provisioner"
<info> [2021-04-06, 19:41:43] trial completed workload: <RUN_STEP (100 Batches) (100 Prior Batches): (816,9185,2)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:41:43] continuing trial: <RUN_STEP (100 Batches) (200 Prior Batches): (816,9185,3)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<error> [2021-04-06, 19:41:43] error while actor was running error="websocket: close 1001 (going away)" id="websocket-2da0ee0a-c0fb-4f25-b936-54ad51dcad15" system="master" type="websocketActor"
<error> [2021-04-06, 19:41:43] websocket: close 1001 (going away)
<error> [2021-04-06, 19:41:43] http: connection has been hijacked
<info> [2021-04-06, 19:41:45] trial completed workload: <RUN_STEP (100 Batches) (200 Prior Batches): (816,9185,3)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:41:45] continuing trial: <RUN_STEP (100 Batches) (300 Prior Batches): (816,9185,4)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:41:46] creating notebook id="notebooks" system="master" type="notebookManager"
<info> [2021-04-06, 19:41:46] resources are requested by /notebooks/b53a120e-0541-4139-b208-dcc21a7848e2 (Task ID: b53a120e-0541-4139-b208-dcc21a7848e2) id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:41:46] created notebook b53a120e-0541-4139-b208-dcc21a7848e2 id="notebooks" system="master" type="notebookManager"
<info> [2021-04-06, 19:41:46] setting priority for group of /notebooks/b53a120e-0541-4139-b208-dcc21a7848e2 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:41:46] setting priority for group of /notebooks/b53a120e-0541-4139-b208-dcc21a7848e2 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:41:46] setting priority for group of /notebooks/b53a120e-0541-4139-b208-dcc21a7848e2 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:41:46] trial completed workload: <RUN_STEP (100 Batches) (0 Prior Batches): (819,9184,1)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:41:46] continuing trial: <RUN_STEP (100 Batches) (100 Prior Batches): (819,9184,2)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:41:47] trial completed workload: <RUN_STEP (100 Batches) (300 Prior Batches): (816,9185,4)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:41:47] continuing trial: <RUN_STEP (100 Batches) (400 Prior Batches): (816,9185,5)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:41:47] allocated resources to /notebooks/b53a120e-0541-4139-b208-dcc21a7848e2 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:41:47] starting container id: ae9091e2-e545-494b-b3ce-970e72ee73eb slots: 1 task handler: /notebooks/b53a120e-0541-4139-b208-dcc21a7848e2 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent"
<info> [2021-04-06, 19:41:48] found state changes in 1 instances: det-agent-release-party-accepted-sculpin (Starting) id="provisioner" resource-pool="default-cpu-pool" system="master" type="Provisioner"
<info> [2021-04-06, 19:41:48] trial completed workload: <RUN_STEP (100 Batches) (100 Prior Batches): (819,9184,2)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:41:48] continuing trial: <RUN_STEP (100 Batches) (200 Prior Batches): (819,9184,3)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<error> [2021-04-06, 19:41:48] error while actor was running error="websocket: close 1006 (abnormal closure): unexpected EOF" id="websocket-a71a73b6-cce3-4443-8b8a-610f694d92c6" system="master" type="websocketActor"
<error> [2021-04-06, 19:41:48] websocket: close 1006 (abnormal closure): unexpected EOF
<error> [2021-04-06, 19:41:48] http: connection has been hijacked
<info> [2021-04-06, 19:41:49] trial completed workload: <RUN_STEP (100 Batches) (400 Prior Batches): (816,9185,5)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:41:49] continuing trial: <RUN_STEP (100 Batches) (500 Prior Batches): (816,9185,6)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:41:50] trial completed workload: <RUN_STEP (100 Batches) (200 Prior Batches): (819,9184,3)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:41:50] continuing trial: <RUN_STEP (100 Batches) (300 Prior Batches): (819,9184,4)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:41:51] trial completed workload: <RUN_STEP (100 Batches) (500 Prior Batches): (816,9185,6)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:41:51] continuing trial: <RUN_STEP (100 Batches) (600 Prior Batches): (816,9185,7)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<warning> [2021-04-06, 19:41:52] GCE throws out warning (code DISK_SIZE_LARGER_THAN_IMAGE_SIZE) for operation "51051865676253080" targeting "https://www.googleapis.com/compute/v1/projects/determined-ai/zones/us-west1-b/instances/det-agent-release-party-accepted-sculpin": Disk size: '200 GB' is larger than image size: '100 GB'. You might need to resize the root repartition manually if the operating system does not support automatic resizing. See https://cloud.google.com/compute/docs/disks/add-persistent-disk#resize_pd for details. id="track-batch-operation-65300447-2877-4c33-bc79-9ee35fe401fd" system="master" type="gcpBatchOperationTracker"
<info> [2021-04-06, 19:41:52] inserted 1/1 GCE instances: det-agent-release-party-accepted-sculpin id="provisioner" resource-pool="default-cpu-pool" system="master" type="Provisioner"
<info> [2021-04-06, 19:41:52] trial completed workload: <RUN_STEP (100 Batches) (300 Prior Batches): (819,9184,4)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:41:52] continuing trial: <RUN_STEP (100 Batches) (400 Prior Batches): (819,9184,5)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:41:53] registering service: b53a120e-0541-4139-b208-dcc21a7848e2 (http://10.138.0.20:49157) id="proxy" system="master" type="Proxy"
<info> [2021-04-06, 19:41:53] trial completed workload: <RUN_STEP (100 Batches) (600 Prior Batches): (816,9185,7)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:41:53] continuing trial: <RUN_STEP (100 Batches) (700 Prior Batches): (816,9185,8)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:41:53] found state changes in 1 instances: det-agent-release-party-accepted-sculpin (Running) id="provisioner" resource-pool="default-cpu-pool" system="master" type="Provisioner"
<info> [2021-04-06, 19:41:55] trial completed workload: <RUN_STEP (100 Batches) (400 Prior Batches): (819,9184,5)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:41:55] continuing trial: <RUN_STEP (100 Batches) (500 Prior Batches): (819,9184,6)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:41:55] trial completed workload: <RUN_STEP (100 Batches) (700 Prior Batches): (816,9185,8)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:41:55] continuing trial: <RUN_STEP (100 Batches) (800 Prior Batches): (816,9185,9)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:41:56] trial completed workload: <RUN_STEP (100 Batches) (500 Prior Batches): (819,9184,6)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:41:56] continuing trial: <RUN_STEP (100 Batches) (600 Prior Batches): (819,9184,7)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:41:57] trial completed workload: <RUN_STEP (100 Batches) (800 Prior Batches): (816,9185,9)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:41:57] continuing trial: <RUN_STEP (37 Batches) (900 Prior Batches): (816,9185,10)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:41:58] trial completed workload: <RUN_STEP (37 Batches) (900 Prior Batches): (816,9185,10)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:41:58] continuing trial: <CHECKPOINT_MODEL (937 Prior Batches): (816,9185,10)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:41:59] trial completed workload: <RUN_STEP (100 Batches) (600 Prior Batches): (819,9184,7)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:41:59] continuing trial: <RUN_STEP (100 Batches) (700 Prior Batches): (819,9184,8)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:41:59] readiness check passed: notebook id="b53a120e-0541-4139-b208-dcc21a7848e2" system="master" type="command"
<info> [2021-04-06, 19:42:00] trial completed workload: <CHECKPOINT_MODEL (937 Prior Batches): (816,9185,10)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:42:00] continuing trial: <COMPUTE_VALIDATION_METRICS (937 Prior Batches): (816,9185,10)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:42:01] trial completed workload: <RUN_STEP (100 Batches) (700 Prior Batches): (819,9184,8)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:42:01] continuing trial: <RUN_STEP (100 Batches) (800 Prior Batches): (819,9184,9)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:42:03] trial completed workload: <RUN_STEP (100 Batches) (800 Prior Batches): (819,9184,9)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:42:03] continuing trial: <RUN_STEP (37 Batches) (900 Prior Batches): (819,9184,10)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:42:03] trial completed workload: <COMPUTE_VALIDATION_METRICS (937 Prior Batches): (816,9185,10)> experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:42:03] terminating gracefully because there are no more workloads experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:42:03] gracefully terminating trial experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:42:04] trial completed workload: <RUN_STEP (37 Batches) (900 Prior Batches): (819,9184,10)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:42:04] continuing trial: <CHECKPOINT_MODEL (937 Prior Batches): (819,9184,10)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:42:05] stopped container id: e2d307a7-c947-4bf1-9b80-83965beb8198 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent"
<info> [2021-04-06, 19:42:05] found container terminated: e2d307a7-c947-4bf1-9b80-83965beb8198 experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:42:05] forcibly terminating trial experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:42:05] killing container id: e2d307a7-c947-4bf1-9b80-83965beb8198 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent"
<info> [2021-04-06, 19:42:05] trial runner stopped successfully experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:42:05] trial stopped successfully experiment-id="816" id="a35b9598-94c5-4dcd-9399-0d94d90c3bf9" system="master" trial-id="9185" type="trial"
<info> [2021-04-06, 19:42:05] resources are released for /experiments/816/a35b9598-94c5-4dcd-9399-0d94d90c3bf9 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:42:05] resources are released for /experiments/816/a35b9598-94c5-4dcd-9399-0d94d90c3bf9 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:42:05] resources are released for /experiments/816/a35b9598-94c5-4dcd-9399-0d94d90c3bf9 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:42:05] resources are released for /experiments/816/a35b9598-94c5-4dcd-9399-0d94d90c3bf9 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:42:05] experiment state changed to STOPPING_COMPLETED id="816" system="master" type="experiment"
<info> [2021-04-06, 19:42:05] experiment state changed to COMPLETED id="816" system="master" type="experiment"
<info> [2021-04-06, 19:42:05] resources are requested by /experiment-816-checkpoint-gc (Task ID: 74b7e5de-2d6a-48fb-a244-dd0f52e36a5f) id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:42:05] experiment shut down successfully id="816" system="master" type="experiment"
<info> [2021-04-06, 19:42:06] trial completed workload: <CHECKPOINT_MODEL (937 Prior Batches): (819,9184,10)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:42:06] continuing trial: <COMPUTE_VALIDATION_METRICS (937 Prior Batches): (819,9184,10)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:42:07] trial completed workload: <COMPUTE_VALIDATION_METRICS (937 Prior Batches): (819,9184,10)> experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:42:08] terminating gracefully because there are no more workloads experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:42:08] gracefully terminating trial experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:42:09] stopped container id: 421d59a6-d124-4c9f-a638-b99b53c6cf44 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent"
<info> [2021-04-06, 19:42:09] found container terminated: 421d59a6-d124-4c9f-a638-b99b53c6cf44 experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:42:09] forcibly terminating trial experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:42:09] killing container id: 421d59a6-d124-4c9f-a638-b99b53c6cf44 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent"
<info> [2021-04-06, 19:42:09] trial runner stopped successfully experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:42:09] trial stopped successfully experiment-id="819" id="fe51990b-8878-4df1-8a52-245e85da1301" system="master" trial-id="9184" type="trial"
<info> [2021-04-06, 19:42:09] resources are released for /experiments/819/fe51990b-8878-4df1-8a52-245e85da1301 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:42:09] resources are released for /experiments/819/fe51990b-8878-4df1-8a52-245e85da1301 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:42:09] resources are released for /experiments/819/fe51990b-8878-4df1-8a52-245e85da1301 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:42:09] resources are released for /experiments/819/fe51990b-8878-4df1-8a52-245e85da1301 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:42:09] experiment state changed to STOPPING_COMPLETED id="819" system="master" type="experiment"
<info> [2021-04-06, 19:42:10] experiment state changed to COMPLETED id="819" system="master" type="experiment"
<info> [2021-04-06, 19:42:10] resources are requested by /experiment-819-checkpoint-gc (Task ID: ba2e7c69-4816-4326-88c1-f7832c004617) id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:42:10] experiment shut down successfully id="819" system="master" type="experiment"
<info> [2021-04-06, 19:42:41] setting priority for group of /experiments/820 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:42:41] setting priority for group of /experiments/820 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:42:41] setting priority for group of /experiments/820 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<warning> [2021-04-06, 19:42:41] response already committed
<info> [2021-04-06, 19:42:42] experiment state changed to ACTIVE id="820" system="master" type="experiment"
<info> [2021-04-06, 19:42:42] resources are requested by /experiments/820/fbaae498-d290-41bd-8c46-f53db000e207 (Task ID: 764df640-c47c-471f-a3ea-3ca172a2833d) id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:42:42] allocated resources to /experiments/820/fbaae498-d290-41bd-8c46-f53db000e207 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:42:43] starting trial container: <RUN_STEP (100 Batches) (0 Prior Batches): (820,9187,1)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:42:43] starting container id: 31d0e91e-66b1-49a7-a309-7ed075437231 slots: 1 task handler: /experiments/820/fbaae498-d290-41bd-8c46-f53db000e207 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent"
<info> [2021-04-06, 19:42:49] setting priority for group of /experiments/821 to 42 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<warning> [2021-04-06, 19:42:49] response already committed
<info> [2021-04-06, 19:42:49] setting priority for group of /experiments/821 to 42 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:42:49] setting priority for group of /experiments/821 to 42 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:42:49] experiment state changed to ACTIVE id="821" system="master" type="experiment"
<info> [2021-04-06, 19:42:49] resources are requested by /experiments/821/a368d0de-0ec9-47c9-bf88-d4e79b2ae403 (Task ID: ccb4f15d-61bb-4d6b-bd31-298ba4e3c800) id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:42:50] allocated resources to /experiments/821/a368d0de-0ec9-47c9-bf88-d4e79b2ae403 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:42:50] starting trial container: <RUN_STEP (100 Batches) (0 Prior Batches): (821,9188,1)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:42:50] starting container id: 564d56f8-438e-4c30-8087-9c6a6ab9e5c9 slots: 4 task handler: /experiments/821/a368d0de-0ec9-47c9-bf88-d4e79b2ae403 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent"
<info> [2021-04-06, 19:43:05] found container running: d92e3ab2-54ef-47db-aaef-f0583c907afc (rank 0) experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:05] pushing rendezvous information experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:05] found not all containers are connected experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:12] new connection from container d92e3ab2-54ef-47db-aaef-f0583c907afc trial 9186 (experiment 817) at 10.138.0.21:53678
<info> [2021-04-06, 19:43:12] pushing rendezvous information experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:12] found all containers are connected successfully experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:19] trial completed workload: <RUN_STEP (100 Batches) (0 Prior Batches): (817,9186,1)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:19] continuing trial: <RUN_STEP (100 Batches) (100 Prior Batches): (817,9186,2)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:21] trial completed workload: <RUN_STEP (100 Batches) (100 Prior Batches): (817,9186,2)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:21] continuing trial: <RUN_STEP (100 Batches) (200 Prior Batches): (817,9186,3)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:22] trial completed workload: <RUN_STEP (100 Batches) (200 Prior Batches): (817,9186,3)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:23] continuing trial: <RUN_STEP (100 Batches) (300 Prior Batches): (817,9186,4)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:24] trial completed workload: <RUN_STEP (100 Batches) (300 Prior Batches): (817,9186,4)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:24] continuing trial: <RUN_STEP (100 Batches) (400 Prior Batches): (817,9186,5)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:26] trial completed workload: <RUN_STEP (100 Batches) (400 Prior Batches): (817,9186,5)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:26] continuing trial: <RUN_STEP (100 Batches) (500 Prior Batches): (817,9186,6)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:28] trial completed workload: <RUN_STEP (100 Batches) (500 Prior Batches): (817,9186,6)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:28] continuing trial: <RUN_STEP (100 Batches) (600 Prior Batches): (817,9186,7)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:29] trial completed workload: <RUN_STEP (100 Batches) (600 Prior Batches): (817,9186,7)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:29] continuing trial: <RUN_STEP (100 Batches) (700 Prior Batches): (817,9186,8)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:31] trial completed workload: <RUN_STEP (100 Batches) (700 Prior Batches): (817,9186,8)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:31] continuing trial: <RUN_STEP (100 Batches) (800 Prior Batches): (817,9186,9)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:33] trial completed workload: <RUN_STEP (100 Batches) (800 Prior Batches): (817,9186,9)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:33] continuing trial: <RUN_STEP (37 Batches) (900 Prior Batches): (817,9186,10)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:33] trial completed workload: <RUN_STEP (37 Batches) (900 Prior Batches): (817,9186,10)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:34] continuing trial: <CHECKPOINT_MODEL (937 Prior Batches): (817,9186,10)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:36] trial completed workload: <CHECKPOINT_MODEL (937 Prior Batches): (817,9186,10)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:36] continuing trial: <COMPUTE_VALIDATION_METRICS (937 Prior Batches): (817,9186,10)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:38] trial completed workload: <COMPUTE_VALIDATION_METRICS (937 Prior Batches): (817,9186,10)> experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:39] terminating gracefully because there are no more workloads experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:39] gracefully terminating trial experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:40] stopped container id: d92e3ab2-54ef-47db-aaef-f0583c907afc id="det-agent-release-party-tidy-crawdad" system="master" type="agent"
<info> [2021-04-06, 19:43:40] found container terminated: d92e3ab2-54ef-47db-aaef-f0583c907afc experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:40] forcibly terminating trial experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:40] killing container id: d92e3ab2-54ef-47db-aaef-f0583c907afc id="det-agent-release-party-tidy-crawdad" system="master" type="agent"
<info> [2021-04-06, 19:43:40] trial runner stopped successfully experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:40] trial stopped successfully experiment-id="817" id="f2a8e7ea-5649-400a-90bf-c979d2fda817" system="master" trial-id="9186" type="trial"
<info> [2021-04-06, 19:43:40] resources are released for /experiments/817/f2a8e7ea-5649-400a-90bf-c979d2fda817 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:43:40] resources are released for /experiments/817/f2a8e7ea-5649-400a-90bf-c979d2fda817 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:43:40] resources are released for /experiments/817/f2a8e7ea-5649-400a-90bf-c979d2fda817 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:43:40] resources are released for /experiments/817/f2a8e7ea-5649-400a-90bf-c979d2fda817 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:43:40] experiment state changed to STOPPING_COMPLETED id="817" system="master" type="experiment"
<info> [2021-04-06, 19:43:41] experiment state changed to COMPLETED id="817" system="master" type="experiment"
<info> [2021-04-06, 19:43:41] resources are requested by /experiment-817-checkpoint-gc (Task ID: e0eb1f3f-c023-45b4-a714-58945499f6a9) id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:43:41] experiment shut down successfully id="817" system="master" type="experiment"
<info> [2021-04-06, 19:43:58] found container running: 31d0e91e-66b1-49a7-a309-7ed075437231 (rank 0) experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:43:58] pushing rendezvous information experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:43:58] found not all containers are connected experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:43:58] found container running: 564d56f8-438e-4c30-8087-9c6a6ab9e5c9 (rank 0) experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:43:58] pushing rendezvous information experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:43:58] found not all containers are connected experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:44:05] new connection from container 564d56f8-438e-4c30-8087-9c6a6ab9e5c9 trial 9188 (experiment 821) at 10.138.0.20:54010
<info> [2021-04-06, 19:44:05] pushing rendezvous information experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:44:05] found all containers are connected successfully experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:44:05] new connection from container 31d0e91e-66b1-49a7-a309-7ed075437231 trial 9187 (experiment 820) at 10.138.0.20:60922
<info> [2021-04-06, 19:44:05] pushing rendezvous information experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:44:05] found all containers are connected successfully experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:44:48] trial completed workload: <RUN_STEP (100 Batches) (0 Prior Batches): (820,9187,1)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:44:48] continuing trial: <RUN_STEP (100 Batches) (100 Prior Batches): (820,9187,2)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:45:41] trial completed workload: <RUN_STEP (100 Batches) (100 Prior Batches): (820,9187,2)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:45:41] continuing trial: <RUN_STEP (100 Batches) (200 Prior Batches): (820,9187,3)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:46:06] trial completed workload: <RUN_STEP (100 Batches) (0 Prior Batches): (821,9188,1)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:46:06] continuing trial: <RUN_STEP (100 Batches) (100 Prior Batches): (821,9188,2)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:46:34] trial completed workload: <RUN_STEP (100 Batches) (200 Prior Batches): (820,9187,3)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:46:34] continuing trial: <RUN_STEP (100 Batches) (300 Prior Batches): (820,9187,4)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:47:10] agent connected ip: 10.138.0.22 resource pool: default-cpu-pool slots: 0 id="det-agent-release-party-accepted-sculpin" system="master" type="agent"
<info> [2021-04-06, 19:47:10] adding agent: det-agent-release-party-accepted-sculpin id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:47:10] allocated resources to /notebooks/2464edd4-5da7-4d8d-b01a-a146a9115e58 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:47:10] allocated resources to /experiment-816-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:47:10] allocated resources to /experiment-819-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:47:10] allocated resources to /experiment-817-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:47:10] starting container id: 9ac1b21e-e2a0-4f98-b5be-369cc7d972b5 slots: 0 task handler: /notebooks/2464edd4-5da7-4d8d-b01a-a146a9115e58 id="det-agent-release-party-accepted-sculpin" system="master" type="agent"
<info> [2021-04-06, 19:47:10] starting checkpoint garbage collection id="experiment-816-checkpoint-gc" system="master" type="checkpointGCTask"
<info> [2021-04-06, 19:47:10] starting container id: d4b208a8-a2f7-40a5-8dff-f70a445650a4 slots: 0 task handler: /experiment-816-checkpoint-gc id="det-agent-release-party-accepted-sculpin" system="master" type="agent"
<info> [2021-04-06, 19:47:10] starting checkpoint garbage collection id="experiment-817-checkpoint-gc" system="master" type="checkpointGCTask"
<info> [2021-04-06, 19:47:10] starting container id: 1577b782-1e7c-4896-8748-bcf9c81760a7 slots: 0 task handler: /experiment-817-checkpoint-gc id="det-agent-release-party-accepted-sculpin" system="master" type="agent"
<info> [2021-04-06, 19:47:11] starting checkpoint garbage collection id="experiment-819-checkpoint-gc" system="master" type="checkpointGCTask"
<info> [2021-04-06, 19:47:11] starting container id: e1731976-ed74-4a2e-9d10-58aab2055114 slots: 0 task handler: /experiment-819-checkpoint-gc id="det-agent-release-party-accepted-sculpin" system="master" type="agent"
<info> [2021-04-06, 19:47:27] trial completed workload: <RUN_STEP (100 Batches) (300 Prior Batches): (820,9187,4)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:47:27] continuing trial: <RUN_STEP (100 Batches) (400 Prior Batches): (820,9187,5)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:47:49] trial completed workload: <RUN_STEP (100 Batches) (100 Prior Batches): (821,9188,2)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:47:49] continuing trial: <RUN_STEP (100 Batches) (200 Prior Batches): (821,9188,3)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:48:11] registering service: 2464edd4-5da7-4d8d-b01a-a146a9115e58 (http://10.138.0.22:49153) id="proxy" system="master" type="Proxy"
<info> [2021-04-06, 19:48:19] readiness check passed: notebook id="2464edd4-5da7-4d8d-b01a-a146a9115e58" system="master" type="command"
<info> [2021-04-06, 19:48:19] trial completed workload: <RUN_STEP (100 Batches) (400 Prior Batches): (820,9187,5)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:48:19] continuing trial: <RUN_STEP (100 Batches) (500 Prior Batches): (820,9187,6)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:48:21] stopped container id: 1577b782-1e7c-4896-8748-bcf9c81760a7 id="det-agent-release-party-accepted-sculpin" system="master" type="agent"
<info> [2021-04-06, 19:48:21] finished checkpoint garbage collection id="experiment-817-checkpoint-gc" system="master" type="checkpointGCTask"
<info> [2021-04-06, 19:48:21] stopped container id: e1731976-ed74-4a2e-9d10-58aab2055114 id="det-agent-release-party-accepted-sculpin" system="master" type="agent"
<info> [2021-04-06, 19:48:21] finished checkpoint garbage collection id="experiment-819-checkpoint-gc" system="master" type="checkpointGCTask"
<info> [2021-04-06, 19:48:21] stopped container id: d4b208a8-a2f7-40a5-8dff-f70a445650a4 id="det-agent-release-party-accepted-sculpin" system="master" type="agent"
<info> [2021-04-06, 19:48:21] finished checkpoint garbage collection id="experiment-816-checkpoint-gc" system="master" type="checkpointGCTask"
<info> [2021-04-06, 19:48:21] resources are released for /experiment-819-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:48:21] resources are released for /experiment-817-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:48:22] resources are released for /experiment-816-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 19:49:13] trial completed workload: <RUN_STEP (100 Batches) (500 Prior Batches): (820,9187,6)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:49:13] continuing trial: <RUN_STEP (100 Batches) (600 Prior Batches): (820,9187,7)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:49:31] trial completed workload: <RUN_STEP (100 Batches) (200 Prior Batches): (821,9188,3)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:49:31] continuing trial: <RUN_STEP (100 Batches) (300 Prior Batches): (821,9188,4)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:50:05] trial completed workload: <RUN_STEP (100 Batches) (600 Prior Batches): (820,9187,7)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:50:05] continuing trial: <RUN_STEP (100 Batches) (700 Prior Batches): (820,9187,8)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:50:58] trial completed workload: <RUN_STEP (100 Batches) (700 Prior Batches): (820,9187,8)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:50:58] continuing trial: <RUN_STEP (100 Batches) (800 Prior Batches): (820,9187,9)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:51:13] trial completed workload: <RUN_STEP (100 Batches) (300 Prior Batches): (821,9188,4)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:51:13] continuing trial: <RUN_STEP (100 Batches) (400 Prior Batches): (821,9188,5)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:51:51] trial completed workload: <RUN_STEP (100 Batches) (800 Prior Batches): (820,9187,9)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:51:51] continuing trial: <RUN_STEP (100 Batches) (900 Prior Batches): (820,9187,10)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:52:44] trial completed workload: <RUN_STEP (100 Batches) (900 Prior Batches): (820,9187,10)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:52:44] continuing trial: <RUN_STEP (100 Batches) (1000 Prior Batches): (820,9187,11)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:52:55] trial completed workload: <RUN_STEP (100 Batches) (400 Prior Batches): (821,9188,5)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:52:55] continuing trial: <RUN_STEP (100 Batches) (500 Prior Batches): (821,9188,6)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:53:37] trial completed workload: <RUN_STEP (100 Batches) (1000 Prior Batches): (820,9187,11)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:53:37] continuing trial: <RUN_STEP (100 Batches) (1100 Prior Batches): (820,9187,12)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:54:29] trial completed workload: <RUN_STEP (100 Batches) (1100 Prior Batches): (820,9187,12)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:54:29] continuing trial: <RUN_STEP (100 Batches) (1200 Prior Batches): (820,9187,13)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:54:38] trial completed workload: <RUN_STEP (100 Batches) (500 Prior Batches): (821,9188,6)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:54:38] continuing trial: <RUN_STEP (100 Batches) (600 Prior Batches): (821,9188,7)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:55:22] trial completed workload: <RUN_STEP (100 Batches) (1200 Prior Batches): (820,9187,13)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:55:22] continuing trial: <RUN_STEP (100 Batches) (1300 Prior Batches): (820,9187,14)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:56:14] trial completed workload: <RUN_STEP (100 Batches) (1300 Prior Batches): (820,9187,14)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:56:14] continuing trial: <RUN_STEP (100 Batches) (1400 Prior Batches): (820,9187,15)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:56:20] trial completed workload: <RUN_STEP (100 Batches) (600 Prior Batches): (821,9188,7)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:56:20] continuing trial: <RUN_STEP (100 Batches) (700 Prior Batches): (821,9188,8)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:57:07] trial completed workload: <RUN_STEP (100 Batches) (1400 Prior Batches): (820,9187,15)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:57:07] continuing trial: <RUN_STEP (100 Batches) (1500 Prior Batches): (820,9187,16)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:58:00] trial completed workload: <RUN_STEP (100 Batches) (1500 Prior Batches): (820,9187,16)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:58:00] continuing trial: <RUN_STEP (100 Batches) (1600 Prior Batches): (820,9187,17)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:58:02] trial completed workload: <RUN_STEP (100 Batches) (700 Prior Batches): (821,9188,8)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:58:02] continuing trial: <RUN_STEP (100 Batches) (800 Prior Batches): (821,9188,9)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:58:53] trial completed workload: <RUN_STEP (100 Batches) (1600 Prior Batches): (820,9187,17)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:58:53] continuing trial: <RUN_STEP (100 Batches) (1700 Prior Batches): (820,9187,18)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:59:44] trial completed workload: <RUN_STEP (100 Batches) (800 Prior Batches): (821,9188,9)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:59:44] continuing trial: <RUN_STEP (100 Batches) (900 Prior Batches): (821,9188,10)> experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 19:59:45] trial completed workload: <RUN_STEP (100 Batches) (1700 Prior Batches): (820,9187,18)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 19:59:45] continuing trial: <RUN_STEP (100 Batches) (1800 Prior Batches): (820,9187,19)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 20:00:39] trial completed workload: <RUN_STEP (100 Batches) (1800 Prior Batches): (820,9187,19)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 20:00:39] continuing trial: <RUN_STEP (100 Batches) (1900 Prior Batches): (820,9187,20)> experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 20:01:18] experiment state changed to STOPPING_CANCELED id="820" system="master" type="experiment"
<info> [2021-04-06, 20:01:18] gracefully terminating trial experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 20:01:18] received killing request experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 20:01:18] forcibly terminating trial experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 20:01:18] killing container id: 31d0e91e-66b1-49a7-a309-7ed075437231 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent"
<error> [2021-04-06, 20:01:18] error while actor was running error="websocket: close 1006 (abnormal closure): unexpected EOF" id="socket-31d0e91e-66b1-49a7-a309-7ed075437231" system="master" type="websocketActor"
<error> [2021-04-06, 20:01:18] websocket handler error: websocket: close 1006 (abnormal closure): unexpected EOF
<info> [2021-04-06, 20:01:18] found child actor failed, terminating forcibly experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 20:01:18] forcibly terminating trial experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 20:01:18] killing container id: 31d0e91e-66b1-49a7-a309-7ed075437231 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent"
<info> [2021-04-06, 20:01:19] stopped container id: 31d0e91e-66b1-49a7-a309-7ed075437231 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent"
<info> [2021-04-06, 20:01:19] found container terminated: 31d0e91e-66b1-49a7-a309-7ed075437231 experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 20:01:19] forcibly terminating trial experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 20:01:19] killing container id: 31d0e91e-66b1-49a7-a309-7ed075437231 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent"
<info> [2021-04-06, 20:01:19] ignoring trial runner failure since it was killed experiment-id="820" failure="container failed with non-zero exit code: container failed with non-zero exit code: 137 (exit code 137)" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 20:01:19] trial stopped successfully experiment-id="820" id="fbaae498-d290-41bd-8c46-f53db000e207" system="master" trial-id="9187" type="trial"
<info> [2021-04-06, 20:01:19] resources are released for /experiments/820/fbaae498-d290-41bd-8c46-f53db000e207 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:01:19] resources are released for /experiments/820/fbaae498-d290-41bd-8c46-f53db000e207 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:01:19] resources are released for /experiments/820/fbaae498-d290-41bd-8c46-f53db000e207 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:01:19] resources are released for /experiments/820/fbaae498-d290-41bd-8c46-f53db000e207 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:01:19] experiment state changed to STOPPING_CANCELED id="821" system="master" type="experiment"
<info> [2021-04-06, 20:01:19] gracefully terminating trial experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 20:01:19] received killing request experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 20:01:19] forcibly terminating trial experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 20:01:19] killing container id: 564d56f8-438e-4c30-8087-9c6a6ab9e5c9 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent"
<error> [2021-04-06, 20:01:20] error while actor was running error="websocket: close 1006 (abnormal closure): unexpected EOF" id="socket-564d56f8-438e-4c30-8087-9c6a6ab9e5c9" system="master" type="websocketActor"
<error> [2021-04-06, 20:01:20] websocket handler error: websocket: close 1006 (abnormal closure): unexpected EOF
<info> [2021-04-06, 20:01:20] found child actor failed, terminating forcibly experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 20:01:20] forcibly terminating trial experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 20:01:20] killing container id: 564d56f8-438e-4c30-8087-9c6a6ab9e5c9 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent"
<info> [2021-04-06, 20:01:20] experiment state changed to CANCELED id="820" system="master" type="experiment"
<info> [2021-04-06, 20:01:20] resources are requested by /experiment-820-checkpoint-gc (Task ID: 041686f2-cb00-4aac-8f4e-1258373f9e46) id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:01:20] allocated resources to /experiment-820-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:01:20] experiment shut down successfully id="820" system="master" type="experiment"
<info> [2021-04-06, 20:01:20] starting checkpoint garbage collection id="experiment-820-checkpoint-gc" system="master" type="checkpointGCTask"
<info> [2021-04-06, 20:01:20] starting container id: 81fa7b78-9c02-4d35-a84d-56b2548a07c2 slots: 0 task handler: /experiment-820-checkpoint-gc id="det-agent-release-party-accepted-sculpin" system="master" type="agent"
<info> [2021-04-06, 20:01:20] stopped container id: 564d56f8-438e-4c30-8087-9c6a6ab9e5c9 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent"
<info> [2021-04-06, 20:01:20] found container terminated: 564d56f8-438e-4c30-8087-9c6a6ab9e5c9 experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 20:01:20] forcibly terminating trial experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 20:01:20] killing container id: 564d56f8-438e-4c30-8087-9c6a6ab9e5c9 id="det-agent-release-party-outgoing-mosquito" system="master" type="agent"
<info> [2021-04-06, 20:01:20] ignoring trial runner failure since it was killed experiment-id="821" failure="container failed with non-zero exit code: container failed with non-zero exit code: 137 (exit code 137)" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 20:01:20] trial stopped successfully experiment-id="821" id="a368d0de-0ec9-47c9-bf88-d4e79b2ae403" system="master" trial-id="9188" type="trial"
<info> [2021-04-06, 20:01:20] resources are released for /experiments/821/a368d0de-0ec9-47c9-bf88-d4e79b2ae403 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:01:20] resources are released for /experiments/821/a368d0de-0ec9-47c9-bf88-d4e79b2ae403 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:01:20] resources are released for /experiments/821/a368d0de-0ec9-47c9-bf88-d4e79b2ae403 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:01:20] resources are released for /experiments/821/a368d0de-0ec9-47c9-bf88-d4e79b2ae403 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:01:21] experiment state changed to CANCELED id="821" system="master" type="experiment"
<info> [2021-04-06, 20:01:21] resources are requested by /experiment-821-checkpoint-gc (Task ID: e745a022-bd80-41a4-b689-062e26e22b89) id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:01:21] experiment shut down successfully id="821" system="master" type="experiment"
<info> [2021-04-06, 20:01:21] allocated resources to /experiment-821-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:01:21] starting checkpoint garbage collection id="experiment-821-checkpoint-gc" system="master" type="checkpointGCTask"
<info> [2021-04-06, 20:01:21] starting container id: 7bc7dabe-adb8-45fb-b57c-0579e3b8b9e0 slots: 0 task handler: /experiment-821-checkpoint-gc id="det-agent-release-party-accepted-sculpin" system="master" type="agent"
<info> [2021-04-06, 20:01:22] experiment state changed to STOPPING_CANCELED id="818" system="master" type="experiment"
<info> [2021-04-06, 20:01:22] aborting trial before resources are allocated experiment-id="818" id="7a608ae4-8cbc-4d11-9e16-83b86c74f458" system="master" type="trial"
<info> [2021-04-06, 20:01:22] trial runner is aborted successfully experiment-id="818" id="7a608ae4-8cbc-4d11-9e16-83b86c74f458" system="master" type="trial"
<info> [2021-04-06, 20:01:22] resources are released for /experiments/818/7a608ae4-8cbc-4d11-9e16-83b86c74f458 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:01:22] resources are released for /experiments/818/7a608ae4-8cbc-4d11-9e16-83b86c74f458 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:01:22] resources are released for /experiments/818/7a608ae4-8cbc-4d11-9e16-83b86c74f458 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:01:22] resources are released for /experiments/818/7a608ae4-8cbc-4d11-9e16-83b86c74f458 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:01:22] experiment state changed to CANCELED id="818" system="master" type="experiment"
<info> [2021-04-06, 20:01:22] resources are requested by /experiment-818-checkpoint-gc (Task ID: 4deb58f1-f950-4554-9729-f746ff69ba21) id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:01:22] experiment shut down successfully id="818" system="master" type="experiment"
<info> [2021-04-06, 20:01:22] allocated resources to /experiment-818-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:01:22] starting checkpoint garbage collection id="experiment-818-checkpoint-gc" system="master" type="checkpointGCTask"
<info> [2021-04-06, 20:01:22] starting container id: 005dc5c4-f179-46ab-8667-209e3bbd3c43 slots: 0 task handler: /experiment-818-checkpoint-gc id="det-agent-release-party-accepted-sculpin" system="master" type="agent"
<info> [2021-04-06, 20:01:28] task forcible terminating id="2464edd4-5da7-4d8d-b01a-a146a9115e58" system="master" type="command"
<info> [2021-04-06, 20:01:28] killing container id: 9ac1b21e-e2a0-4f98-b5be-369cc7d972b5 id="det-agent-release-party-accepted-sculpin" system="master" type="agent"
<info> [2021-04-06, 20:01:29] task forcible terminating id="b53a120e-0541-4139-b208-dcc21a7848e2" system="master" type="command"
<info> [2021-04-06, 20:01:29] killing container id: ae9091e2-e545-494b-b3ce-970e72ee73eb id="det-agent-release-party-outgoing-mosquito" system="master" type="agent"
<info> [2021-04-06, 20:01:29] stopped container id: ae9091e2-e545-494b-b3ce-970e72ee73eb id="det-agent-release-party-outgoing-mosquito" system="master" type="agent"
<info> [2021-04-06, 20:01:29] resources are released for /notebooks/b53a120e-0541-4139-b208-dcc21a7848e2 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:01:29] resources are released for /notebooks/b53a120e-0541-4139-b208-dcc21a7848e2 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:01:29] resources are released for /notebooks/b53a120e-0541-4139-b208-dcc21a7848e2 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:01:33] task forcible terminating id="2464edd4-5da7-4d8d-b01a-a146a9115e58" system="master" type="command"
<info> [2021-04-06, 20:01:33] killing container id: 9ac1b21e-e2a0-4f98-b5be-369cc7d972b5 id="det-agent-release-party-accepted-sculpin" system="master" type="agent"
<info> [2021-04-06, 20:01:46] stopped container id: 9ac1b21e-e2a0-4f98-b5be-369cc7d972b5 id="det-agent-release-party-accepted-sculpin" system="master" type="agent"
<info> [2021-04-06, 20:01:46] resources are released for /notebooks/2464edd4-5da7-4d8d-b01a-a146a9115e58 id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:01:46] resources are released for /notebooks/2464edd4-5da7-4d8d-b01a-a146a9115e58 id="default-gpu-pool" resource-pool="default-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:01:46] resources are released for /notebooks/2464edd4-5da7-4d8d-b01a-a146a9115e58 id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:02:11] stopped container id: 005dc5c4-f179-46ab-8667-209e3bbd3c43 id="det-agent-release-party-accepted-sculpin" system="master" type="agent"
<info> [2021-04-06, 20:02:11] finished checkpoint garbage collection id="experiment-818-checkpoint-gc" system="master" type="checkpointGCTask"
<info> [2021-04-06, 20:02:11] resources are released for /experiment-818-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:02:26] stopped container id: 7bc7dabe-adb8-45fb-b57c-0579e3b8b9e0 id="det-agent-release-party-accepted-sculpin" system="master" type="agent"
<info> [2021-04-06, 20:02:26] finished checkpoint garbage collection id="experiment-821-checkpoint-gc" system="master" type="checkpointGCTask"
<info> [2021-04-06, 20:02:26] stopped container id: 81fa7b78-9c02-4d35-a84d-56b2548a07c2 id="det-agent-release-party-accepted-sculpin" system="master" type="agent"
<info> [2021-04-06, 20:02:26] finished checkpoint garbage collection id="experiment-820-checkpoint-gc" system="master" type="checkpointGCTask"
<info> [2021-04-06, 20:02:26] resources are released for /experiment-820-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:02:26] resources are released for /experiment-821-checkpoint-gc id="default-cpu-pool" resource-pool="default-cpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:13:46] decided to terminate 1 instances: ,det-agent-release-party-tidy-crawdad (reason: long idle) id="provisioner" resource-pool="preemptible-gpu-pool" system="master" type="Provisioner"
<error> [2021-04-06, 20:13:48] error while actor was running error="websocket: close 1006 (abnormal closure): unexpected EOF" id="websocket-769d0dba-d358-4156-9cde-3b96b21487e9" system="master" type="websocketActor"
<error> [2021-04-06, 20:13:48] websocket: close 1006 (abnormal closure): unexpected EOF
<error> [2021-04-06, 20:13:48] http: connection has been hijacked
<error> [2021-04-06, 20:13:48] error while actor was running error="child failed: /agents/det-agent-release-party-tidy-crawdad/websocket-769d0dba-d358-4156-9cde-3b96b21487e9: websocket: close 1006 (abnormal closure): unexpected EOF" id="det-agent-release-party-tidy-crawdad" system="master" type="agent"
<info> [2021-04-06, 20:13:48] removing device: gpu4 (Tesla V100-SXM2-16GB) (det-agent-release-party-tidy-crawdad) id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:13:48] removing device: gpu5 (Tesla V100-SXM2-16GB) (det-agent-release-party-tidy-crawdad) id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:13:48] removing device: gpu6 (Tesla V100-SXM2-16GB) (det-agent-release-party-tidy-crawdad) id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:13:48] removing device: gpu7 (Tesla V100-SXM2-16GB) (det-agent-release-party-tidy-crawdad) id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:13:48] removing device: gpu0 (Tesla V100-SXM2-16GB) (det-agent-release-party-tidy-crawdad) id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:13:48] removing device: gpu1 (Tesla V100-SXM2-16GB) (det-agent-release-party-tidy-crawdad) id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:13:48] agent disconnected id="det-agent-release-party-tidy-crawdad" system="master" type="agent"
<info> [2021-04-06, 20:13:48] removing device: gpu2 (Tesla V100-SXM2-16GB) (det-agent-release-party-tidy-crawdad) id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:13:48] removing device: gpu3 (Tesla V100-SXM2-16GB) (det-agent-release-party-tidy-crawdad) id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:13:48] removing agent: det-agent-release-party-tidy-crawdad id="preemptible-gpu-pool" resource-pool="preemptible-gpu-pool" system="master" type="ResourcePool"
<info> [2021-04-06, 20:13:52] found state changes in 1 instances: det-agent-release-party-tidy-crawdad (Stopping) id="provisioner" resource-pool="preemptible-gpu-pool" system="master" type="Provisioner"
<info> [2021-04-06, 20:14:36] deleted 1/1 GCE instances: det-agent-release-party-tidy-crawdad id="provisioner" resource-pool="preemptible-gpu-pool" system="master" type="Provisioner"
<info> [2021-04-06, 20:14:38] found state changes in 0 instances: id="provisioner" resource-pool="preemptible-gpu-pool" system="master" type="Provisioner"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment