Skip to content

Instantly share code, notes, and snippets.

@kk17
Created September 7, 2022 02:19
Show Gist options
  • Save kk17/b159d821e1c52fed7727308a81ef2211 to your computer and use it in GitHub Desktop.
Save kk17/b159d821e1c52fed7727308a81ef2211 to your computer and use it in GitHub Desktop.
train_on_ray2 Tuner Error
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-09-06 11:09:38.638392: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n",
"2022-09-06 11:09:38.638426: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n"
]
}
],
"source": [
"import tensorflow as tf\n",
"from tensorflow.keras.callbacks import Callback\n",
"from tensorflow import keras\n",
"from tensorflow.keras import layers\n",
"from tensorflow.keras.models import Sequential\n",
"\n",
"import ray\n",
"from ray.data.datasource import ImageFolderDatasource\n",
"from ray.data.preprocessors import OrdinalEncoder\n",
"\n",
"from ray import train\n",
"from ray.air import session\n",
"from ray.air.config import ScalingConfig\n",
"from ray.air.callbacks.keras import Callback as KerasCallback\n",
"from ray.train.tensorflow import (\n",
" TensorflowTrainer,\n",
" TensorflowCheckpoint,\n",
" prepare_dataset_shard,\n",
")\n",
"import configparser\n",
"import os\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# load dataset"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Runtime environment passed to ray.init(). It is optional: it exists because this notebook\n",
"# connects to a remote test Ray cluster that reads data from AWS S3, so the AWS credentials\n",
"# are forwarded to the cluster as environment variables.\n",
"# The 'pip' entry lists packages to install on the remote cluster. It is not needed for a local\n",
"# cluster or when the cluster already has the packages (the test cluster has the same packages\n",
"# as this Jupyter environment); it is kept here as a demonstration only.\n",
"# https://docs.ray.io/en/latest/ray-core/handling-dependencies.html#runtime-environments\n",
"env_vars = {}\n",
"runtime_env = {\"env_vars\": env_vars, \"pip\": ['tensorflow']}\n",
"os.environ['LD_LIBRARY_PATH'] = '/usr/lib/hadoop/lib/native'\n",
"try:\n",
"    # The dataset can live on HDFS or AWS S3. S3 credentials can be stored in ~/.aws/credentials;\n",
"    # files under the home directory are persisted.\n",
"    aws_config = configparser.RawConfigParser()\n",
"    path = os.path.join(os.path.expanduser('~'), '.aws/credentials')\n",
"    aws_config.read(path)\n",
"    # Read both values before touching env_vars so a half-filled credential pair is never forwarded.\n",
"    aws_credentials = {\n",
"        'AWS_ACCESS_KEY_ID': aws_config.get('test', 'aws_access_key_id'),\n",
"        'AWS_SECRET_ACCESS_KEY': aws_config.get('test', 'aws_secret_access_key'),\n",
"    }\n",
"except configparser.Error as err:\n",
"    # Still best effort (a missing file or missing 'test' profile is not fatal), but no longer\n",
"    # silent, and unrelated exceptions such as KeyboardInterrupt are no longer swallowed.\n",
"    print('AWS credentials not loaded, continuing without them:', err)\n",
"else:\n",
"    env_vars.update(aws_credentials)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Connect to the Ray cluster. With no address (or address='auto') Ray checks whether RAY_ADDRESS\n",
"# points at a cluster; this Jupyter environment already sets RAY_ADDRESS to the target cluster.\n",
"# If no address is found, a local Ray cluster is started and connected to. This is Ray's\n",
"# interactive usage mode. After ray.init the current process acts as the driver of the Ray job\n",
"# (to be verified).\n",
"# Guarded so that re-running this cell in a live kernel does not call ray.init a second time.\n",
"if not ray.is_initialized():\n",
"    ray.init(runtime_env=runtime_env)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'accelerator_type:T4': 1.0,\n",
" 'object_store_memory': 11675090439.0,\n",
" 'GPU': 1.0,\n",
" 'CPU': 13.0,\n",
" 'node:10.59.84.90': 1.0,\n",
" 'memory': 39728447488.0,\n",
" 'node:10.59.105.207': 1.0,\n",
" 'node:10.59.208.210': 1.0,\n",
" 'node:10.59.244.124': 1.0}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 可以调用api获取当前集群的资源,也可以从ray Dashboard查看,测试集群 http://ray-dashboard.test.niubit.io/#/node\n",
"ray.available_resources()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Read an image-directory dataset from local or remote storage directly with ray.data.\n",
"# The ray.xx APIs need a connection to a Ray cluster; as described above, if ray.init has not\n",
"# been called, calling a ray.data API starts a local Ray cluster automatically.\n",
"# The result is a Ray Dataset, see https://docs.ray.io/en/latest/data/dataset.html.\n",
"# It is converted to a tf.data.Dataset later, when the TensorFlow model is trained.\n",
"import pyarrow as pa\n",
"def load_image_dataset(directory, image_size):\n",
"    \"\"\"Read the image folder at `directory` into a Ray Dataset with encoded labels.\n",
"\n",
"    Args:\n",
"        directory: passed to ImageFolderDatasource as `root`.\n",
"        image_size: passed to ImageFolderDatasource as `size`.\n",
"\n",
"    Returns:\n",
"        The dataset after an OrdinalEncoder has been fitted on, and applied to,\n",
"        its 'label' column.\n",
"    \"\"\"\n",
"    images = ray.data.read_datasource(\n",
"        ImageFolderDatasource(), root=directory, size=image_size\n",
"    )\n",
"    label_encoder = OrdinalEncoder(columns=[\"label\"])\n",
"    return label_encoder.fit_transform(images)\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"image_height: 800\n",
"image_width: 360\n"
]
}
],
"source": [
"# 一些图片数据集相关的参数\n",
"batch_size = 16\n",
"img_height = 2400//3\n",
"img_width = 1080//3\n",
"image_size = (img_height, img_width)\n",
"image_shape = (img_height, img_width, 3)\n",
"num_classes = 2\n",
"print(\"image_height:\", img_height)\n",
"print(\"image_width:\", img_width)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"local_train_data_dir = './demo-data-train'\n",
"local_val_data_dir = './demo-data-test'\n",
"s3_train_data_dir = \"s3://xxx/demo-data-train\"\n",
"s3_val_data_dir = \"s3://xxx/ray_test/demo_scripts/demo-data-test\"\n",
"hdfs_train_data_dir = \"hdfs://xxx/demo-data-train\"\n",
"hdfs_val_data_dir = \"hdfs://xxx/demo-data-test\""
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Read->Map_Batches: 100%|██████████| 20/20 [00:13<00:00, 1.46it/s]\n",
"Read->Map_Batches: 100%|██████████| 20/20 [00:09<00:00, 2.01it/s]\n"
]
}
],
"source": [
"# 实际开始load数据,如果没有连接到ray集群,会自动生成创建本地集群。因为我们想用测试集群,所以需要在此之前手动调用ray.init\n",
"train_ds, val_ds = load_image_dataset(hdfs_train_data_dir, image_size=image_size).train_test_split(test_size=0.2)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# 可以查看dataset信息\n",
"# train_ds"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# train_ds.take(1)[0]['image'].shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Training the model"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-09-06 11:10:50.803218: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/hadoop/lib/native\n",
"2022-09-06 11:10:50.803327: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)\n",
"2022-09-06 11:10:50.803358: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (jupyter-kyle-2echen): /proc/driver/nvidia/version does not exist\n",
"2022-09-06 11:10:50.822268: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA\n",
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
]
}
],
"source": [
"### Data augmentation step\n",
"# Preprocessing pipeline: rescale pixels by 1/255, random zoom, centre crop to 600x300, then a\n",
"# random 256x256 crop. The layers are created under the CPU device scope.\n",
"# NOTE: the pipeline is currently not part of the model; its entry in\n",
"# build_and_compile_cnn_model is commented out.\n",
"with tf.device('/CPU:0'):\n",
"    data_augmentation = Sequential(\n",
"        [\n",
"            # image_shape is (img_height, img_width, 3), defined with the dataset parameters.\n",
"            layers.Rescaling(1./255, input_shape=image_shape),\n",
"            # Same layer as layers.experimental.preprocessing.RandomZoom, spelled like its\n",
"            # neighbours (layers.Rescaling / CenterCrop / RandomCrop).\n",
"            layers.RandomZoom([-0.1, 0.1]),\n",
"            layers.CenterCrop(600, 300),\n",
"            layers.RandomCrop(256, 256)\n",
"        ]\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Build the TensorFlow Keras model\n",
"def build_and_compile_cnn_model():\n",
"    \"\"\"Assemble the Keras CNN classifier.\n",
"\n",
"    Three Conv2D + MaxPooling2D stages (16, 32 and 64 filters) are followed by a\n",
"    Flatten / Dropout / Dense(128) / Dropout head and a final Dense layer with\n",
"    `num_classes` units and no activation.\n",
"\n",
"    Returns:\n",
"        The Sequential model. Despite the function name, `compile` is not called\n",
"        here; that is left to the caller.\n",
"    \"\"\"\n",
"    # data_augmentation is deliberately not prepended to the stack (disabled).\n",
"    stack = []\n",
"    for filters in (16, 32, 64):\n",
"        stack.append(layers.Conv2D(filters, 3, padding='same', activation='relu'))\n",
"        stack.append(layers.MaxPooling2D())\n",
"    stack.append(layers.Flatten())\n",
"    stack.append(layers.Dropout(0.5))\n",
"    stack.append(layers.Dense(128, activation='relu'))\n",
"    stack.append(layers.Dropout(0.5))\n",
"    stack.append(layers.Dense(num_classes))\n",
"    return Sequential(stack)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Convert the image Ray Dataset into a tf.data.Dataset\n",
"def to_tf_dataset(dataset, batch_size, image_shape):\n",
"    \"\"\"Wrap a Ray Dataset in a generator-backed tf.data.Dataset of (image, label) batches.\n",
"\n",
"    Args:\n",
"        dataset: object providing `iter_tf_batches`; its batches must expose the\n",
"            'image' and 'label' keys.\n",
"        batch_size: forwarded to `iter_tf_batches`.\n",
"        image_shape: per-image shape; the image spec is (None, *image_shape).\n",
"\n",
"    Returns:\n",
"        The result of `prepare_dataset_shard` applied to the generator dataset.\n",
"    \"\"\"\n",
"    def to_tensor_iterator():\n",
"        data_iterator = dataset.iter_tf_batches(\n",
"            batch_size=batch_size, dtypes=tf.float32\n",
"        )\n",
"        for d in data_iterator:\n",
"            yield d[\"image\"], d[\"label\"]\n",
"\n",
"    output_signature = (\n",
"        tf.TensorSpec(shape=(None, *image_shape), dtype=tf.float32),\n",
"        # `(None)` is just `None`, i.e. an unknown-rank spec; `(None,)` declares the intended\n",
"        # 1-D batch of labels (assumes one scalar label per image).\n",
"        # NOTE(review): batches are requested as float32 above while labels are declared int32\n",
"        # here - confirm this dtype pair is intended.\n",
"        tf.TensorSpec(shape=(None,), dtype=tf.int32),\n",
"    )\n",
"    tf_dataset = tf.data.Dataset.from_generator(\n",
"        to_tensor_iterator, output_signature=output_signature\n",
"    )\n",
"    return prepare_dataset_shard(tf_dataset)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# tf_ds = to_tf_dataset(dataset=val_ds, batch_size=batch_size, image_shape=image_shape)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# next(iter(tf_ds))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Training function run on every worker. The config is user-defined; callbacks and model saving\n",
"# still need further investigation.\n",
"\n",
"def train_loop_per_worker(config):\n",
"    \"\"\"Build the CNN and fit it on this worker's dataset shards, one `fit` call per epoch.\n",
"\n",
"    Args:\n",
"        config: dict with the keys 'batch_size', 'image_shape', 'lr' and 'num_epochs'.\n",
"\n",
"    Returns:\n",
"        A list holding the `History.history` dict of every `fit` call (one per epoch).\n",
"    \"\"\"\n",
"    tf.get_logger().setLevel('ERROR')\n",
"    batch_size = config[\"batch_size\"]\n",
"    image_shape = config[\"image_shape\"]\n",
"    lr = config[\"lr\"]\n",
"    epochs = config[\"num_epochs\"]\n",
"\n",
"    # Get the Ray Dataset shard for this data parallel worker,\n",
"    # and convert it to a Tensorflow Dataset.\n",
"    train_data = train.get_dataset_shard(\"train\")\n",
"    val_data = train.get_dataset_shard(\"val\")\n",
"\n",
"    strategy = tf.distribute.MultiWorkerMirroredStrategy()\n",
"    with strategy.scope():\n",
"        # Model building/compiling need to be within `strategy.scope()`.\n",
"        multi_worker_model = build_and_compile_cnn_model()\n",
"\n",
"        multi_worker_model.compile(\n",
"            optimizer=tf.keras.optimizers.SGD(learning_rate=lr),\n",
"            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n",
"            # The sparse loss above takes integer class ids, so the accuracy metric must be the\n",
"            # sparse variant too; 'categorical_accuracy' expects one-hot targets.\n",
"            metrics=[\"sparse_categorical_accuracy\"],\n",
"        )\n",
"\n",
"    # NOTE(review): `fit` is called once per epoch below. Keras callbacks presumably reset their\n",
"    # state at the start of every `fit` call, in which case the patience counters of\n",
"    # EarlyStopping / ReduceLROnPlateau never accumulate across epochs - confirm.\n",
"    callbacks = [\n",
"        KerasCallback(),\n",
"        tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights = True),\n",
"        tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5),\n",
"        # tf.keras.callbacks.TensorBoard(log_dir='./logs'),\n",
"    ]\n",
"\n",
"    results = []\n",
"    for _ in range(epochs):\n",
"        # The generator-backed datasets are rebuilt for every epoch.\n",
"        train_tf_dataset = to_tf_dataset(dataset=train_data, batch_size=batch_size, image_shape=image_shape)\n",
"        val_tf_dataset = to_tf_dataset(dataset=val_data, batch_size=batch_size, image_shape=image_shape)\n",
"        history = multi_worker_model.fit(\n",
"            train_tf_dataset,\n",
"            validation_data=val_tf_dataset,\n",
"            callbacks=callbacks,\n",
"            verbose=0,\n",
"        )\n",
"        # Previously `results` was returned empty because nothing was ever appended.\n",
"        results.append(history.history)\n",
"    return results"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## MLflow"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-09-06 11:10:52,263\tWARNING mlflow.py:80 -- When using MLflowLoggerCallback with Ray Client, it is recommended to use a remote tracking server. If you are using a MLflow tracking server backed by the local filesystem, then it must be setup on the server side and not on the client side.\n"
]
}
],
"source": [
"from ray.air.callbacks.mlflow import MLflowLoggerCallback\n",
"\n",
"tracking_uri = \"http://mlflow.xxx\"\n",
"mlflow_callback = MLflowLoggerCallback(\n",
" experiment_name=\"demo_model\",\n",
" tags={\"type\": \"keras\"},\n",
" save_artifact=True,\n",
" tracking_uri=tracking_uri\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Trainer"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"from ray.air.config import RunConfig, ScalingConfig\n",
"from ray.tune.stopper import TimeoutStopper\n",
"from datetime import timedelta\n",
"\n",
"# Uses the Ray 2.0 trainer API. Ray 1.x has a corresponding trainer API as well, but the official\n",
"# site does not recommend it; see train_on_ray.ipynb for the Ray 1.x version.\n",
"def build_trainer(num_workers = 1, num_cluster_gpus = 1.0):\n",
"    \"\"\"Create the TensorflowTrainer for this notebook's datasets and training loop.\n",
"\n",
"    Args:\n",
"        num_workers: number of data parallel training workers; must be >= 1.\n",
"        num_cluster_gpus: GPU count that is divided evenly between the workers.\n",
"            Defaults to 1.0, the value that used to be hard-coded.\n",
"\n",
"    Returns:\n",
"        A TensorflowTrainer wired to train_loop_per_worker, train_ds / val_ds and the\n",
"        MLflow logger callback, with a 10 minute TimeoutStopper.\n",
"\n",
"    Raises:\n",
"        ValueError: if num_workers is smaller than 1.\n",
"    \"\"\"\n",
"    # Fail with a clear message instead of a ZeroDivisionError in the GPU split below.\n",
"    if num_workers < 1:\n",
"        raise ValueError('num_workers must be >= 1, got %r' % (num_workers,))\n",
"\n",
"    stopper = TimeoutStopper(timedelta(minutes=10))\n",
"\n",
"    trainer = TensorflowTrainer(\n",
"        train_loop_per_worker=train_loop_per_worker,\n",
"        train_loop_config={\n",
"            \"batch_size\": batch_size,\n",
"            \"image_shape\": image_shape,\n",
"            \"num_epochs\": 10,\n",
"            \"lr\": 0.001\n",
"        },\n",
"        scaling_config=ScalingConfig(\n",
"            num_workers=num_workers,  # Number of data parallel training workers\n",
"            use_gpu=True,\n",
"            # Each worker gets an equal (possibly fractional) share of the GPUs.\n",
"            resources_per_worker={\"GPU\": num_cluster_gpus/num_workers}\n",
"        ),\n",
"        run_config=RunConfig(\n",
"            name=\"my_train_run\",\n",
"            callbacks=[mlflow_callback],\n",
"            stop=stopper,\n",
"            verbose=1\n",
"        ),\n",
"        datasets={\"train\": train_ds, \"val\": val_ds},\n",
"    )\n",
"    return trainer\n",
"\n",
"trainer = build_trainer()\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(pid=16397, ip=10.59.208.210)\u001b[0m 2022-09-06 04:11:11.559432: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Current time: 2022-09-06 04:11:13 (running for 00:00:03.89)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Memory usage on this node: 11.9/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Result logdir: /home/ray/ray_results/my_train_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m 2022-09-06 04:11:15.215909: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m 2022-09-06 04:11:16.733473: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m 2022-09-06 04:11:16.741428: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m 2022-09-06 04:11:16.742063: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m 2022-09-06 04:11:16.743378: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m 2022-09-06 04:11:16.743724: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m 2022-09-06 04:11:16.744335: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m 2022-09-06 04:11:16.744920: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m 2022-09-06 04:11:17.369606: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m 2022-09-06 04:11:17.370228: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m 2022-09-06 04:11:17.370752: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m 2022-09-06 04:11:17.371252: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13795 MB memory: -> device: 0, name: Tesla T4, pci bus id: 0000:00:1e.0, compute capability: 7.5\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m 2022-09-06 04:11:17.372217: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m 2022-09-06 04:11:17.372826: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m 2022-09-06 04:11:17.373356: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m 2022-09-06 04:11:17.373924: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m 2022-09-06 04:11:17.374436: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m 2022-09-06 04:11:17.374917: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:worker/replica:0/task:0/device:GPU:0 with 13795 MB memory: -> device: 0, name: Tesla T4, pci bus id: 0000:00:1e.0, compute capability: 7.5\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m 2022-09-06 04:11:17.379158: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:272] Initialize GrpcChannelCache for job worker -> {0 -> 10.59.208.210:37137}\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m 2022-09-06 04:11:17.379228: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:272] Initialize GrpcChannelCache for job worker -> {0 -> 10.59.208.210:37137}\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m 2022-09-06 04:11:17.379852: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:438] Started server with target: grpc://10.59.208.210:37137\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Current time: 2022-09-06 04:11:18 (running for 00:00:08.89)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Memory usage on this node: 11.9/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Result logdir: /home/ray/ray_results/my_train_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m 2022-09-06 04:11:21.624355: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8101\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Current time: 2022-09-06 04:11:23 (running for 00:00:13.90)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Memory usage on this node: 12.0/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Result logdir: /home/ray/ray_results/my_train_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Current time: 2022-09-06 04:11:34 (running for 00:00:25.31)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Memory usage on this node: 12.0/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Result logdir: /home/ray/ray_results/my_train_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Current time: 2022-09-06 04:11:43 (running for 00:00:33.47)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Memory usage on this node: 12.2/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Result logdir: /home/ray/ray_results/my_train_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Current time: 2022-09-06 04:11:48 (running for 00:00:38.84)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Memory usage on this node: 12.0/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Result logdir: /home/ray/ray_results/my_train_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Current time: 2022-09-06 04:15:52 (running for 00:04:43.22)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Memory usage on this node: 11.8/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Result logdir: /home/ray/ray_results/my_train_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m 2022-09-06 04:15:52,813\tWARNING util.py:220 -- The `callbacks.on_trial_result` operation took 240.720 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m 2022-09-06 04:15:52,814\tWARNING util.py:220 -- The `process_trial_result` operation took 240.721 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m 2022-09-06 04:15:52,814\tWARNING util.py:220 -- Processing trial results took 240.721 s, which may be a performance bottleneck. Please consider reporting results less frequently to Ray Tune.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m 2022-09-06 04:15:52,814\tWARNING util.py:220 -- The `process_trial_result` operation took 240.721 s, which may be a performance bottleneck.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Current time: 2022-09-06 04:15:57 (running for 00:04:48.28)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Memory usage on this node: 12.0/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Result logdir: /home/ray/ray_results/my_train_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m 2022-09-06 04:15:57,872\tWARNING util.py:220 -- The `process_trial_save` operation took 5.047 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m 2022-09-06 04:15:57,872\tWARNING trial_runner.py:950 -- Consider turning off forced head-worker trial checkpoint syncs by setting sync_on_checkpoint=False. Note that this may result in faulty trial restoration if a failure occurs while the checkpoint is being synced from the worker to the head node.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m 2022-09-06 04:16:01,755\tWARNING util.py:220 -- The `process_trial_save` operation took 3.223 s, which may be a performance bottleneck.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Current time: 2022-09-06 04:16:05 (running for 00:04:56.32)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Memory usage on this node: 12.0/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Result logdir: /home/ray/ray_results/my_train_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m 2022-09-06 04:16:05,913\tWARNING util.py:220 -- The `process_trial_save` operation took 3.470 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m 2022-09-06 04:16:09,816\tWARNING util.py:220 -- The `process_trial_save` operation took 3.269 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m 2022-09-06 04:16:10,646\tWARNING util.py:220 -- The `callbacks.on_trial_result` operation took 0.604 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m 2022-09-06 04:16:10,647\tWARNING util.py:220 -- The `process_trial_result` operation took 0.606 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m 2022-09-06 04:16:10,647\tWARNING util.py:220 -- Processing trial results took 0.606 s, which may be a performance bottleneck. Please consider reporting results less frequently to Ray Tune.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m 2022-09-06 04:16:10,647\tWARNING util.py:220 -- The `process_trial_result` operation took 0.606 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m E0906 04:16:14.104124775 16459 chttp2_transport.cc:1103] Received a GOAWAY with error code ENHANCE_YOUR_CALM and debug data equal to \"too_many_pings\"\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m 2022-09-06 04:16:14,095\tWARNING util.py:220 -- The `process_trial_save` operation took 3.445 s, which may be a performance bottleneck.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Current time: 2022-09-06 04:16:14 (running for 00:05:04.50)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Memory usage on this node: 12.0/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Result logdir: /home/ray/ray_results/my_train_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m Exception ignored in: <function Pool.__del__ at 0x7fec9aef60d0>\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m Traceback (most recent call last):\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/pool.py\", line 268, in __del__\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m self._change_notifier.put(None)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/queues.py\", line 368, in put\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m self._writer.send_bytes(obj)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/connection.py\", line 200, in send_bytes\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m self._send_bytes(m[offset:offset + size])\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/connection.py\", line 411, in _send_bytes\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m self._send(header + buf)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/connection.py\", line 368, in _send\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m n = write(self._handle, buf)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m OSError: [Errno 9] Bad file descriptor\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m Exception ignored in: <function Pool.__del__ at 0x7fec9aef60d0>\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m Traceback (most recent call last):\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/pool.py\", line 268, in __del__\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m self._change_notifier.put(None)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/queues.py\", line 368, in put\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m self._writer.send_bytes(obj)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/connection.py\", line 200, in send_bytes\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m self._send_bytes(m[offset:offset + size])\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/connection.py\", line 411, in _send_bytes\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m self._send(header + buf)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/connection.py\", line 368, in _send\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m n = write(self._handle, buf)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=16431, ip=10.59.208.210)\u001b[0m OSError: [Errno 9] Bad file descriptor\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m 2022-09-06 04:16:16,624\tWARNING util.py:220 -- The `process_trial_save` operation took 2.506 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m /home/ray/anaconda3/lib/python3.8/site-packages/mlflow/store/artifact/hdfs_artifact_repo.py:181: FutureWarning: pyarrow.hdfs.connect is deprecated as of 2.0.0, please use pyarrow.fs.HadoopFileSystem instead.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m connected = pa.hdfs.connect(\n",
"2022-09-06 11:16:40,256\tERROR checkpoint_manager.py:133 -- The requested checkpoint is not available on this node, most likely because you are using Ray client or disabled checkpoint synchronization. To avoid this, enable checkpoint synchronization to cloud storage by specifying a `SyncConfig`. The checkpoint may be available on a different node - please check this location on worker nodes: /home/ray/ray_results/my_train_run/TensorflowTrainer_9ab58_00000_0_2022-09-06_04-11-09/checkpoint_000009\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Current time: 2022-09-06 04:16:40 (running for 00:05:30.50)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Memory usage on this node: 12.8/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Resources requested: 0/13 CPUs, 0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Result logdir: /home/ray/ray_results/my_train_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Number of trials: 1/1 (1 TERMINATED)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Current time: 2022-09-06 04:16:40 (running for 00:05:30.50)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Memory usage on this node: 12.8/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Resources requested: 0/13 CPUs, 0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Result logdir: /home/ray/ray_results/my_train_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m Number of trials: 1/1 (1 TERMINATED)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m \n",
"Last result: {'loss': nan, 'categorical_accuracy': 0.0, 'val_loss': nan, 'val_categorical_accuracy': 1.0, '_timestamp': 1662462969, '_time_this_iter_s': 3.9068286418914795, '_training_iteration': 10, 'time_this_iter_s': 3.927581310272217, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 10, 'trial_id': '9ab58_00000', 'experiment_id': '04c4813cb2864b5aa636f7c2db7bd0e6', 'date': '2022-09-06_04-16-10', 'timestamp': 1662462970, 'time_total_s': 296.5531668663025, 'pid': 16397, 'hostname': 'ray-cluster-kuberay-worker-workergroup-7qjzd', 'node_ip': '10.59.208.210', 'config': {}, 'time_since_restore': 296.5531668663025, 'timesteps_since_restore': 0, 'iterations_since_restore': 10, 'warmup_time': 0.006241798400878906, 'experiment_tag': '0'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=3150)\u001b[0m 2022-09-06 04:16:40,210\tINFO tune.py:758 -- Total run time: 333.26 seconds (330.50 seconds for the tuning loop).\n"
]
}
],
"source": [
"from pprint import pprint\n",
"\n",
"# Launch the training run and block until it finishes.\n",
"# NOTE: the log of this run reports that the final checkpoint is not available\n",
"# on this node (Ray client / checkpoint sync disabled); configure a `SyncConfig`\n",
"# pointing at cloud storage if `result.checkpoint` is needed locally.\n",
"result = trainer.fit()\n",
"\n",
"# Pretty-print the metrics dict one key per line; the previous f-string\n",
"# rendered the whole dict as a single, very long line.\n",
"print(\"Last result:\")\n",
"pprint(result.metrics)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Tune"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:16:54,530\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(pid=17380, ip=10.59.208.210)\u001b[0m 2022-09-06 04:16:56.089196: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Current time: 2022-09-06 04:16:58 (running for 00:00:03.94)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Memory usage on this node: 13.0/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Result logdir: /home/ray/ray_results/my_tune_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Number of trials: 2/2 (1 PENDING, 1 RUNNING)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m 2022-09-06 04:16:59.717878: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m 2022-09-06 04:17:01.221640: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m 2022-09-06 04:17:01.230588: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m 2022-09-06 04:17:01.231194: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m 2022-09-06 04:17:01.232550: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m 2022-09-06 04:17:01.232950: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m 2022-09-06 04:17:01.233768: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m 2022-09-06 04:17:01.234315: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m 2022-09-06 04:17:01.856984: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m 2022-09-06 04:17:01.857675: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m 2022-09-06 04:17:01.858269: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m 2022-09-06 04:17:01.858807: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13795 MB memory: -> device: 0, name: Tesla T4, pci bus id: 0000:00:1e.0, compute capability: 7.5\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m 2022-09-06 04:17:01.859791: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m 2022-09-06 04:17:01.860392: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m 2022-09-06 04:17:01.860961: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m 2022-09-06 04:17:01.861615: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m 2022-09-06 04:17:01.862159: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m 2022-09-06 04:17:01.862645: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:worker/replica:0/task:0/device:GPU:0 with 13795 MB memory: -> device: 0, name: Tesla T4, pci bus id: 0000:00:1e.0, compute capability: 7.5\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m 2022-09-06 04:17:01.866988: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:272] Initialize GrpcChannelCache for job worker -> {0 -> 10.59.208.210:51561}\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m 2022-09-06 04:17:01.867071: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:272] Initialize GrpcChannelCache for job worker -> {0 -> 10.59.208.210:51561}\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m 2022-09-06 04:17:01.867697: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:438] Started server with target: grpc://10.59.208.210:51561\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Current time: 2022-09-06 04:17:03 (running for 00:00:08.95)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Memory usage on this node: 13.0/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Result logdir: /home/ray/ray_results/my_tune_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Number of trials: 2/2 (1 PENDING, 1 RUNNING)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:03,018\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m 2022-09-06 04:17:06.112282: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8101\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Current time: 2022-09-06 04:17:08 (running for 00:00:13.95)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Memory usage on this node: 13.0/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Result logdir: /home/ray/ray_results/my_tune_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Number of trials: 2/2 (1 PENDING, 1 RUNNING)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:08,025\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:11,173\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Current time: 2022-09-06 04:17:19 (running for 00:00:25.34)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Memory usage on this node: 12.1/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Result logdir: /home/ray/ray_results/my_tune_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Number of trials: 2/2 (1 PENDING, 1 RUNNING)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:19,408\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:20,088\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:23,554\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:24,205\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Current time: 2022-09-06 04:17:27 (running for 00:00:33.74)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Memory usage on this node: 12.1/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Result logdir: /home/ray/ray_results/my_tune_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Number of trials: 2/2 (1 PENDING, 1 RUNNING)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:27,809\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:28,539\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:31,842\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:32,551\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Current time: 2022-09-06 04:17:36 (running for 00:00:41.93)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Memory usage on this node: 12.1/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Result logdir: /home/ray/ray_results/my_tune_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Number of trials: 2/2 (1 PENDING, 1 RUNNING)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:36,000\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:36,952\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:40,398\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Current time: 2022-09-06 04:17:41 (running for 00:00:47.05)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Memory usage on this node: 12.1/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Result logdir: /home/ray/ray_results/my_tune_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Number of trials: 2/2 (1 PENDING, 1 RUNNING)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:41,119\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:44,664\tWARNING util.py:220 -- The `process_trial_save` operation took 3.543 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:44,664\tWARNING trial_runner.py:950 -- Consider turning off forced head-worker trial checkpoint syncs by setting sync_on_checkpoint=False. Note that this may result in faulty trial restoration if a failure occurs while the checkpoint is being synced from the worker to the head node.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:44,702\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:45,458\tWARNING util.py:220 -- The `callbacks.on_trial_result` operation took 0.566 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:45,459\tWARNING util.py:220 -- The `process_trial_result` operation took 0.567 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:45,459\tWARNING util.py:220 -- Processing trial results took 0.567 s, which may be a performance bottleneck. Please consider reporting results less frequently to Ray Tune.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:45,459\tWARNING util.py:220 -- The `process_trial_result` operation took 0.567 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:45,498\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Current time: 2022-09-06 04:17:48 (running for 00:00:54.91)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Memory usage on this node: 12.1/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Result logdir: /home/ray/ray_results/my_tune_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Number of trials: 2/2 (1 PENDING, 1 RUNNING)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:48,945\tWARNING util.py:220 -- The `process_trial_save` operation took 3.447 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:48,987\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:49,710\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:53,190\tWARNING util.py:220 -- The `process_trial_save` operation took 3.480 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:53,238\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:53,937\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Current time: 2022-09-06 04:17:57 (running for 00:01:03.31)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Memory usage on this node: 12.4/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Result logdir: /home/ray/ray_results/my_tune_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Number of trials: 2/2 (1 PENDING, 1 RUNNING)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:57,333\tWARNING util.py:220 -- The `process_trial_save` operation took 3.396 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:57,380\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:17:57,937\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m Exception ignored in: <function Pool.__del__ at 0x7f98b19b90d0>\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m Traceback (most recent call last):\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/pool.py\", line 268, in __del__\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m self._change_notifier.put(None)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/queues.py\", line 368, in put\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m self._writer.send_bytes(obj)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/connection.py\", line 200, in send_bytes\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m self._send_bytes(m[offset:offset + size])\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/connection.py\", line 411, in _send_bytes\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m self._send(header + buf)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/connection.py\", line 368, in _send\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m n = write(self._handle, buf)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m OSError: [Errno 9] Bad file descriptor\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m Exception ignored in: <function Pool.__del__ at 0x7f98b19b90d0>\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m Traceback (most recent call last):\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/pool.py\", line 268, in __del__\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m self._change_notifier.put(None)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/queues.py\", line 368, in put\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m self._writer.send_bytes(obj)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/connection.py\", line 200, in send_bytes\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m self._send_bytes(m[offset:offset + size])\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/connection.py\", line 411, in _send_bytes\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m self._send(header + buf)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/connection.py\", line 368, in _send\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m n = write(self._handle, buf)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=17414, ip=10.59.208.210)\u001b[0m OSError: [Errno 9] Bad file descriptor\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:18:00,455\tWARNING util.py:220 -- The `process_trial_save` operation took 2.517 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m /home/ray/anaconda3/lib/python3.8/site-packages/mlflow/store/artifact/hdfs_artifact_repo.py:181: FutureWarning: pyarrow.hdfs.connect is deprecated as of 2.0.0, please use pyarrow.fs.HadoopFileSystem instead.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m connected = pa.hdfs.connect(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Current time: 2022-09-06 04:18:25 (running for 00:01:31.78)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Memory usage on this node: 12.9/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Resources requested: 0/13 CPUs, 0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Result logdir: /home/ray/ray_results/my_tune_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Number of trials: 2/2 (1 PENDING, 1 TERMINATED)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:18:25,850\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:18:26,698\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(pid=18363, ip=10.59.208.210)\u001b[0m 2022-09-06 04:18:28.223785: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Current time: 2022-09-06 04:18:31 (running for 00:01:37.68)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Memory usage on this node: 12.7/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Result logdir: /home/ray/ray_results/my_tune_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Number of trials: 2/2 (1 RUNNING, 1 TERMINATED)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:18:31,749\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m 2022-09-06 04:18:31.858132: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m 2022-09-06 04:18:33.321128: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m 2022-09-06 04:18:33.332169: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m 2022-09-06 04:18:33.332799: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m 2022-09-06 04:18:33.334139: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m 2022-09-06 04:18:33.334503: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m 2022-09-06 04:18:33.335093: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m 2022-09-06 04:18:33.335804: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m 2022-09-06 04:18:33.950805: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m 2022-09-06 04:18:33.951397: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m 2022-09-06 04:18:33.951987: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m 2022-09-06 04:18:33.952498: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13795 MB memory: -> device: 0, name: Tesla T4, pci bus id: 0000:00:1e.0, compute capability: 7.5\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m 2022-09-06 04:18:33.953394: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m 2022-09-06 04:18:33.953973: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m 2022-09-06 04:18:33.954482: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m 2022-09-06 04:18:33.955028: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m 2022-09-06 04:18:33.955542: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m 2022-09-06 04:18:33.956054: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:worker/replica:0/task:0/device:GPU:0 with 13795 MB memory: -> device: 0, name: Tesla T4, pci bus id: 0000:00:1e.0, compute capability: 7.5\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m 2022-09-06 04:18:33.960180: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:272] Initialize GrpcChannelCache for job worker -> {0 -> 10.59.208.210:56255}\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m 2022-09-06 04:18:33.960247: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:272] Initialize GrpcChannelCache for job worker -> {0 -> 10.59.208.210:56255}\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m 2022-09-06 04:18:33.960923: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:438] Started server with target: grpc://10.59.208.210:56255\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Current time: 2022-09-06 04:18:36 (running for 00:01:42.72)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Memory usage on this node: 12.7/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Result logdir: /home/ray/ray_results/my_tune_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Number of trials: 2/2 (1 RUNNING, 1 TERMINATED)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:18:36,792\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m 2022-09-06 04:18:38.148832: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8101\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Current time: 2022-09-06 04:18:41 (running for 00:01:47.78)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Memory usage on this node: 12.7/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Result logdir: /home/ray/ray_results/my_tune_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Number of trials: 2/2 (1 RUNNING, 1 TERMINATED)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:18:41,849\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:18:43,262\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:18:51,258\tWARNING util.py:220 -- The `process_trial_save` operation took 7.996 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:18:51,315\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Current time: 2022-09-06 04:18:51 (running for 00:01:57.24)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Memory usage on this node: 12.8/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Result logdir: /home/ray/ray_results/my_tune_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Number of trials: 2/2 (1 RUNNING, 1 TERMINATED)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:18:51,993\tWARNING util.py:220 -- The `callbacks.on_trial_result` operation took 0.505 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:18:51,994\tWARNING util.py:220 -- The `process_trial_result` operation took 0.506 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:18:51,994\tWARNING util.py:220 -- Processing trial results took 0.506 s, which may be a performance bottleneck. Please consider reporting results less frequently to Ray Tune.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:18:51,994\tWARNING util.py:220 -- The `process_trial_result` operation took 0.506 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:18:52,040\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:18:55,759\tWARNING util.py:220 -- The `process_trial_save` operation took 3.718 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:18:55,826\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Current time: 2022-09-06 04:18:56 (running for 00:02:02.40)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Memory usage on this node: 12.8/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Result logdir: /home/ray/ray_results/my_tune_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Number of trials: 2/2 (1 RUNNING, 1 TERMINATED)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:18:56,476\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:00,093\tWARNING util.py:220 -- The `process_trial_save` operation took 3.615 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:00,146\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:00,829\tWARNING util.py:220 -- The `callbacks.on_trial_result` operation took 0.502 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:00,830\tWARNING util.py:220 -- The `process_trial_result` operation took 0.504 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:00,830\tWARNING util.py:220 -- Processing trial results took 0.504 s, which may be a performance bottleneck. Please consider reporting results less frequently to Ray Tune.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:00,831\tWARNING util.py:220 -- The `process_trial_result` operation took 0.505 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:00,881\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:04,318\tWARNING util.py:220 -- The `process_trial_save` operation took 3.437 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:04,385\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Current time: 2022-09-06 04:19:04 (running for 00:02:10.31)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Memory usage on this node: 12.7/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Result logdir: /home/ray/ray_results/my_tune_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Number of trials: 2/2 (1 RUNNING, 1 TERMINATED)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:05,022\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:08,647\tWARNING util.py:220 -- The `process_trial_save` operation took 3.625 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:08,699\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:09,383\tWARNING util.py:220 -- The `callbacks.on_trial_result` operation took 0.527 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:09,384\tWARNING util.py:220 -- The `process_trial_result` operation took 0.528 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:09,385\tWARNING util.py:220 -- Processing trial results took 0.529 s, which may be a performance bottleneck. Please consider reporting results less frequently to Ray Tune.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:09,385\tWARNING util.py:220 -- The `process_trial_result` operation took 0.529 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:09,434\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Current time: 2022-09-06 04:19:09 (running for 00:02:15.36)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Memory usage on this node: 12.7/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Result logdir: /home/ray/ray_results/my_tune_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Number of trials: 2/2 (1 RUNNING, 1 TERMINATED)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:12,829\tWARNING util.py:220 -- The `process_trial_save` operation took 3.394 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:12,893\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:13,841\tWARNING util.py:220 -- The `callbacks.on_trial_result` operation took 0.794 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:13,842\tWARNING util.py:220 -- The `process_trial_result` operation took 0.796 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:13,842\tWARNING util.py:220 -- Processing trial results took 0.796 s, which may be a performance bottleneck. Please consider reporting results less frequently to Ray Tune.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:13,842\tWARNING util.py:220 -- The `process_trial_result` operation took 0.796 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:13,896\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Current time: 2022-09-06 04:19:19 (running for 00:02:25.57)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Memory usage on this node: 12.9/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Result logdir: /home/ray/ray_results/my_tune_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Number of trials: 2/2 (1 RUNNING, 1 TERMINATED)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:19,593\tWARNING util.py:220 -- The `process_trial_save` operation took 5.696 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:19,641\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:20,358\tWARNING util.py:220 -- The `callbacks.on_trial_result` operation took 0.555 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:20,359\tWARNING util.py:220 -- The `process_trial_result` operation took 0.556 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:20,360\tWARNING util.py:220 -- Processing trial results took 0.556 s, which may be a performance bottleneck. Please consider reporting results less frequently to Ray Tune.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:20,360\tWARNING util.py:220 -- The `process_trial_result` operation took 0.557 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:20,410\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:23,804\tWARNING util.py:220 -- The `process_trial_save` operation took 3.393 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:23,872\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:24,542\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:28,039\tWARNING util.py:220 -- The `process_trial_save` operation took 3.497 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:28,089\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Current time: 2022-09-06 04:19:28 (running for 00:02:34.02)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Memory usage on this node: 12.8/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Result logdir: /home/ray/ray_results/my_tune_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Number of trials: 2/2 (1 RUNNING, 1 TERMINATED)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:28,779\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:32,299\tWARNING util.py:220 -- The `process_trial_save` operation took 3.519 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:32,360\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Current time: 2022-09-06 04:19:33 (running for 00:02:39.09)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Memory usage on this node: 12.8/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Resources requested: 1.0/13 CPUs, 1.0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Result logdir: /home/ray/ray_results/my_tune_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Number of trials: 2/2 (1 RUNNING, 1 TERMINATED)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:33,161\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m Exception ignored in: <function Pool.__del__ at 0x7f994c82a0d0>\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m Traceback (most recent call last):\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/pool.py\", line 268, in __del__\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m self._change_notifier.put(None)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/queues.py\", line 368, in put\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m self._writer.send_bytes(obj)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/connection.py\", line 200, in send_bytes\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m self._send_bytes(m[offset:offset + size])\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/connection.py\", line 411, in _send_bytes\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m self._send(header + buf)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/connection.py\", line 368, in _send\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m n = write(self._handle, buf)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m OSError: [Errno 9] Bad file descriptor\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m Exception ignored in: <function Pool.__del__ at 0x7f994c82a0d0>\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m Traceback (most recent call last):\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/pool.py\", line 268, in __del__\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m self._change_notifier.put(None)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/queues.py\", line 368, in put\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m self._writer.send_bytes(obj)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/connection.py\", line 200, in send_bytes\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m self._send_bytes(m[offset:offset + size])\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/connection.py\", line 411, in _send_bytes\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m self._send(header + buf)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/multiprocessing/connection.py\", line 368, in _send\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m n = write(self._handle, buf)\n",
"\u001b[2m\u001b[36m(RayTrainWorker pid=18397, ip=10.59.208.210)\u001b[0m OSError: [Errno 9] Bad file descriptor\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:35,817\tWARNING util.py:220 -- The `process_trial_save` operation took 2.655 s, which may be a performance bottleneck.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m /home/ray/anaconda3/lib/python3.8/site-packages/mlflow/store/artifact/hdfs_artifact_repo.py:181: FutureWarning: pyarrow.hdfs.connect is deprecated as of 2.0.0, please use pyarrow.fs.HadoopFileSystem instead.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m connected = pa.hdfs.connect(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Current time: 2022-09-06 04:19:49 (running for 00:02:55.05)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Memory usage on this node: 12.8/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Resources requested: 0/13 CPUs, 0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Result logdir: /home/ray/ray_results/my_tune_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Number of trials: 2/2 (2 TERMINATED)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m == Status ==\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Current time: 2022-09-06 04:19:49 (running for 00:02:55.09)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Memory usage on this node: 12.8/15.2 GiB\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Using FIFO scheduling algorithm.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Resources requested: 0/13 CPUs, 0/1 GPUs, 0.0/37.0 GiB heap, 0.0/10.87 GiB objects (0.0/1.0 accelerator_type:T4)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Result logdir: /home/ray/ray_results/my_tune_run\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m Number of trials: 2/2 (2 TERMINATED)\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:49,119\tWARNING trial_runner.py:879 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:49,161\tWARNING tune.py:730 -- Trial Runner checkpointing failed: Lineage-based serialization is not supported for this dataset, which means that it cannot be used as a tunable hyperparameter. Lineage-based serialization is explicitly NOT supported for unioned or zipped datasets (see docstrings for those methods), and is only supported for Datasets created from data that we know will still exist at deserialization time, e.g. external data in persistent cloud object stores or in-memory data from long-lived clusters. Concretely, all ray.data.read_*() APIs should support lineage-based serialization, while all of the ray.data.from_*() APIs do not. To allow this Dataset to be serialized to storage, write the data to an external store (such as AWS S3, GCS, or Azure Blob Storage) using the Dataset.write_*() APIs, and serialize a new dataset reading from the external store using the ray.data.read_*() APIs.\n",
"\u001b[2m\u001b[36m(TunerInternal pid=4606)\u001b[0m 2022-09-06 04:19:49,275\tINFO tune.py:758 -- Total run time: 177.96 seconds (175.05 seconds for the tuning loop).\n"
]
},
{
"ename": "TuneError",
"evalue": "Tune run failed. Please use tuner = Tuner.restore(\"/home/ray/ray_results/my_tune_run\") to resume.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mRayTaskError(ValueError)\u001b[0m Traceback (most recent call last)",
"File \u001b[0;32m/opt/conda/envs/py38-ml/lib/python3.8/site-packages/ray/tune/tuner.py:246\u001b[0m, in \u001b[0;36mTuner.fit\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 245\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 246\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mray\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_remote_tuner\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mremote\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 247\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n",
"File \u001b[0;32m/opt/conda/envs/py38-ml/lib/python3.8/site-packages/ray/_private/client_mode_hook.py:104\u001b[0m, in \u001b[0;36mclient_mode_hook.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m func\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minit\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m is_client_mode_enabled_by_default:\n\u001b[0;32m--> 104\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mray\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;18;43m__name__\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 105\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
"File \u001b[0;32m/opt/conda/envs/py38-ml/lib/python3.8/site-packages/ray/util/client/api.py:42\u001b[0m, in \u001b[0;36m_ClientAPI.get\u001b[0;34m(self, vals, timeout)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[38;5;124;03m\"\"\"get is the hook stub passed on to replace `ray.get`\u001b[39;00m\n\u001b[1;32m 37\u001b[0m \n\u001b[1;32m 38\u001b[0m \u001b[38;5;124;03mArgs:\u001b[39;00m\n\u001b[1;32m 39\u001b[0m \u001b[38;5;124;03m vals: [Client]ObjectRef or list of these refs to retrieve.\u001b[39;00m\n\u001b[1;32m 40\u001b[0m \u001b[38;5;124;03m timeout: Optional timeout in milliseconds\u001b[39;00m\n\u001b[1;32m 41\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m---> 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mworker\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvals\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/opt/conda/envs/py38-ml/lib/python3.8/site-packages/ray/util/client/worker.py:434\u001b[0m, in \u001b[0;36mWorker.get\u001b[0;34m(self, vals, timeout)\u001b[0m\n\u001b[1;32m 433\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 434\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get\u001b[49m\u001b[43m(\u001b[49m\u001b[43mto_get\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mop_timeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 435\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n",
"File \u001b[0;32m/opt/conda/envs/py38-ml/lib/python3.8/site-packages/ray/util/client/worker.py:462\u001b[0m, in \u001b[0;36mWorker._get\u001b[0;34m(self, ref, timeout)\u001b[0m\n\u001b[1;32m 461\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[0;32m--> 462\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m err\n\u001b[1;32m 463\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunk\u001b[38;5;241m.\u001b[39mtotal_size \u001b[38;5;241m>\u001b[39m OBJECT_TRANSFER_WARNING_SIZE \u001b[38;5;129;01mand\u001b[39;00m log_once(\n\u001b[1;32m 464\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mclient_object_transfer_size_warning\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 465\u001b[0m ):\n",
"\u001b[0;31mRayTaskError(ValueError)\u001b[0m: \u001b[36mray::TunerInternal.fit()\u001b[39m (pid=4606, ip=10.59.244.124, repr=<ray.tune.impl.tuner_internal.TunerInternal object at 0x7f1811e40700>)\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/impl/tuner_internal.py\", line 283, in fit\n analysis = self._fit_internal(trainable, param_space)\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/impl/tuner_internal.py\", line 380, in _fit_internal\n analysis = run(\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/tune.py\", line 771, in run\n return ExperimentAnalysis(\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/analysis/experiment_analysis.py\", line 89, in __init__\n self._load_checkpoints(experiment_checkpoint_path)\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/analysis/experiment_analysis.py\", line 130, in _load_checkpoints\n latest_checkpoint = self._get_latest_checkpoint(experiment_checkpoint_path)\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/analysis/experiment_analysis.py\", line 184, in _get_latest_checkpoint\n raise ValueError(\nValueError: The file `/home/ray/ray_results/my_tune_run/experiment_state-2022-09-06_04-16-53.json` does not exist and cannot be loaded for experiment analysis.",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001b[0;31mTuneError\u001b[0m Traceback (most recent call last)",
"Input \u001b[0;32mIn [20]\u001b[0m, in \u001b[0;36m<cell line: 26>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 15\u001b[0m stopper \u001b[38;5;241m=\u001b[39m TimeoutStopper(timedelta(minutes\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m20\u001b[39m))\n\u001b[1;32m 17\u001b[0m tuner \u001b[38;5;241m=\u001b[39m Tuner(\n\u001b[1;32m 18\u001b[0m trainable\u001b[38;5;241m=\u001b[39mbuild_trainer(),\n\u001b[1;32m 19\u001b[0m param_space\u001b[38;5;241m=\u001b[39mparam_space,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 24\u001b[0m verbose \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 25\u001b[0m ))\n\u001b[0;32m---> 26\u001b[0m analysis \u001b[38;5;241m=\u001b[39m \u001b[43mtuner\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28mprint\u001b[39m(analysis\u001b[38;5;241m.\u001b[39mget_best_result(metric\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mval_loss\u001b[39m\u001b[38;5;124m\"\u001b[39m, mode\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmin\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mconfig)\n",
"File \u001b[0;32m/opt/conda/envs/py38-ml/lib/python3.8/site-packages/ray/tune/tuner.py:248\u001b[0m, in \u001b[0;36mTuner.fit\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 246\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ray\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_remote_tuner\u001b[38;5;241m.\u001b[39mfit\u001b[38;5;241m.\u001b[39mremote())\n\u001b[1;32m 247\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m--> 248\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m TuneError(\n\u001b[1;32m 249\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTune run failed. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 250\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mPlease use tuner = Tuner.restore(\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 251\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mexperiment_checkpoint_dir\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m) to resume.\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 252\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n",
"\u001b[0;31mTuneError\u001b[0m: Tune run failed. Please use tuner = Tuner.restore(\"/home/ray/ray_results/my_tune_run\") to resume."
]
}
],
"source": [
"from ray import tune\n",
"from ray.tune.tuner import Tuner\n",
"\n",
"\n",
"param_space = {\n",
" \"train_loop_config\" : {\n",
" \"lr\": tune.grid_search([0.001, 0.01]),\n",
" \"batch_size\": batch_size,\n",
" \"image_shape\": image_shape,\n",
" \"num_epochs\": 10,\n",
" },\n",
" \"datasets\": {\"train\": train_ds, \"val\": val_ds}\n",
"}\n",
"\n",
"stopper = TimeoutStopper(timedelta(minutes=20))\n",
"\n",
"tuner = Tuner(\n",
" trainable=build_trainer(),\n",
" param_space=param_space,\n",
" run_config=RunConfig(\n",
" name=\"my_tune_run\",\n",
" callbacks=[mlflow_callback],\n",
" stop = stopper,\n",
" verbose = 1\n",
" ))\n",
"analysis = tuner.fit()\n",
"print(analysis.get_best_result(metric=\"val_loss\", mode=\"min\").config)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 断开远程ray集群连接,停止driver,可以及时释放driver资源。如果是本地集群还会关闭集群。\n",
"# 只有调用了ray.shutdown()才可以重新进行ray.init()。\n",
"ray.shutdown()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## References\n",
"\n",
"- [Ray 2.0.0 documentation](https://docs.ray.io/en/latest/index.html)\n",
"- [Ray AIR Trainer](https://docs.ray.io/en/latest/ray-air/package-ref.html?highlight=mlflow#trainer)\n",
"- [Ray AIR Tuner](https://docs.ray.io/en/latest/ray-air/package-ref.html?highlight=mlflow#tuner)\n",
"- [Ray AIR mlflow intergration](https://docs.ray.io/en/latest/ray-air/package-ref.html?highlight=mlflow#mlflow)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "py38-ml",
"language": "python",
"name": "py38-ml"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
},
"vscode": {
"interpreter": {
"hash": "d6a2992697e4c4895ef60090a0228ef0ab508f2664f2639f6710c9329411c8b9"
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}
@xwjiang2010
Copy link

Can you try removing datasets from param_space?

param_space = {
    "train_loop_config" : {
        "lr": tune.grid_search([0.001, 0.01]),
        "batch_size": batch_size,
        "image_shape": image_shape,
        "num_epochs": 10,
    },
    "datasets": {"train": train_ds, "val": val_ds}
}

basically the above blob becomes

param_space = {
    "train_loop_config" : {
        "lr": tune.grid_search([0.001, 0.01]),
        "batch_size": batch_size,
        "image_shape": image_shape,
        "num_epochs": 10,
    },
}

@xwjiang2010
Copy link

The idea is that if you don't need to tune the dataset, you only need to supply it through the Trainer() args, as you did. There is no need to pass it through param_space again. The same applies to every other parameter that is not being tuned.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment