skliarpawlo/huggingface_example.py

## huggingface_example.py
from datasets import load_dataset
import transformers
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

import ray
from ray import tune, air
from ray.train.huggingface import HuggingFaceTrainer
from ray.air.config import ScalingConfig
import os

# If using GPUs, set this to True.
# Read below both when building TrainingArguments (``no_cuda``) and when
# building the Ray ScalingConfig (``use_gpu``).
use_gpu = False

# Checkpoint name used to load the model configuration and to name the
# training output directory.
model_checkpoint = "gpt2"
# Checkpoint name used to load the tokenizer.
tokenizer_checkpoint = "sgugger/gpt2-like-tokenizer"
# Length of each chunk produced by ``group_texts``.
block_size = 128


# NOTE: these calls run at import time, so importing this module (e.g.
# ``from huggingface_example import trainer``) also loads the dataset and
# the tokenizer.
datasets = load_dataset("wikitext", "wikitext-2-raw-v1")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module-level tokenizer.

    Args:
        examples: Mapping that provides a ``"text"`` entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that entry.
    """
    texts = examples["text"]
    return tokenizer(texts)

# Tokenize the loaded datasets in batches with a single process; the raw
# "text" column is removed from the result.
tokenized_datasets = datasets.map(
    tokenize_function, batched=True, num_proc=1, remove_columns=["text"]
)


def group_texts(examples):
    """Concatenate every column of a batch and re-split it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of lists (one inner
            list per example). It must contain an ``"input_ids"`` column.

    Returns:
        A dict with the same columns, where each column is a list of chunks
        of exactly ``block_size`` items (``block_size`` is the module-level
        constant), plus a ``"labels"`` column that is a shallow copy of the
        chunked ``"input_ids"``. Items left over after the last full chunk
        are dropped.
    """
    # Function-scope import keeps this block self-contained.
    from itertools import chain

    # Concatenate all texts. ``chain.from_iterable`` flattens in linear
    # time, whereas ``sum(lists, [])`` re-copies the accumulated list on
    # every addition and is quadratic in the total number of tokens.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # The length of the first column is used as the length of the batch.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model
    # supported it.
    # Instead of this drop, you can customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [
            t[i : i + block_size]
            for i in range(0, total_length, block_size)
        ]
        for k, t in concatenated_examples.items()
    }
    # Labels start out as the same chunks as the inputs.
    result["labels"] = result["input_ids"].copy()
    return result

# Re-chunk the tokenized data with ``group_texts``, 1000 examples per batch,
# using a single process.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=1,
)

# Convert the "train" and "validation" splits into Ray datasets; they are
# handed to the HuggingFaceTrainer below.
ray_train_ds = ray.data.from_huggingface(lm_datasets["train"])
ray_evaluation_ds = ray.data.from_huggingface(
    lm_datasets["validation"]
)

def trainer_init_per_worker(train_dataset, eval_dataset, **config):
    """Build the ``transformers.Trainer`` used for one training run.

    Args:
        train_dataset: Dataset passed to the trainer as ``train_dataset``.
        eval_dataset: Dataset passed to the trainer as ``eval_dataset``.
        **config: Optional hyperparameters. ``learning_rate`` and
            ``weight_decay`` are read; when either is missing or ``None``
            the ``transformers.TrainingArguments`` default is used instead
            of forwarding ``None``.

    Returns:
        A ``transformers.Trainer`` wrapping a model built from the
        ``model_checkpoint`` configuration.
    """
    # Fall back to the TrainingArguments defaults so that running without a
    # ``trainer_init_config`` (no Tuner search space) does not hand ``None``
    # to the optimizer.
    learning_rate = config.get('learning_rate')
    if learning_rate is None:
        learning_rate = 5e-5
    weight_decay = config.get('weight_decay')
    if weight_decay is None:
        weight_decay = 0.0

    model_config = AutoConfig.from_pretrained(model_checkpoint)
    model = AutoModelForCausalLM.from_config(model_config)
    args = transformers.TrainingArguments(
        output_dir=f"/tmp/{model_checkpoint}-wikitext2",

        # Save and log every 2 steps, keeping a single saved checkpoint.
        save_steps=2,
        logging_steps=2,
        metric_for_best_model='loss',
        save_total_limit=1,

        learning_rate=learning_rate,
        weight_decay=weight_decay,
        # NOTE: per the TrainingArguments documentation a positive
        # ``max_steps`` overrides ``num_train_epochs``, so training stops
        # after 30 steps.
        max_steps=30,
        num_train_epochs=3,
        no_cuda=(not use_gpu),
    )
    return transformers.Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

# Three training workers; GPU usage follows the module-level ``use_gpu``.
scaling_config = ScalingConfig(num_workers=3, use_gpu=use_gpu)
# Built at module level so other code can import it
# (``from huggingface_example import trainer``). No ``trainer_init_config``
# is set here; the Tuner in the ``__main__`` block supplies it through its
# ``param_space``.
trainer = HuggingFaceTrainer(
    trainer_init_per_worker=trainer_init_per_worker,
    scaling_config=scaling_config,
    datasets={"train": ray_train_ds, "evaluation": ray_evaluation_ds},
)


if __name__ == '__main__':
    # Experiment settings come from environment variables; a missing
    # variable fails fast with a KeyError before any trial is started.
    upload_dir = os.environ['UPLOAD_DIR']
    name = os.environ['EXPERIMENT_NAME']

    tuner = tune.Tuner(
        trainer,
        # 2 x 2 grid over the hyperparameters read by
        # ``trainer_init_per_worker``.
        param_space={
            'trainer_init_config': {
                'weight_decay': tune.grid_search([0.01, 0.02]),
                'learning_rate': tune.grid_search([2e-5, 2e-4]),
            },
        },
        tune_config=tune.TuneConfig(
            num_samples=1,
            max_concurrent_trials=20,
        ),
        run_config=air.RunConfig(
            name=name,
            local_dir='/tmp/experiment_dir',
            sync_config=tune.SyncConfig(
                upload_dir=upload_dir,
            ),
            # Keep the two checkpoints with the lowest reported loss.
            checkpoint_config=air.CheckpointConfig(
                num_to_keep=2,
                checkpoint_score_attribute='loss',
                checkpoint_score_order='min',
            ),
            failure_config=air.FailureConfig(
                max_failures=1,
            ),
        ),
    )

    results = tuner.fit()

    # Surface per-trial failures so the job log shows why trials errored.
    trial_errors = results.errors
    if trial_errors:
        print(f"{len(trial_errors)} of {len(results)} trial(s) failed:")
        for trial_error in trial_errors:
            print(f"  {trial_error!r}")

    # get_best_result() raises RuntimeError when no trial reported the
    # metric (e.g. every trial errored). Exit non-zero with an explicit
    # message instead of an unexplained traceback.
    try:
        best_result = results.get_best_result(metric="loss", mode="min")
    except RuntimeError as err:
        raise SystemExit(
            f"No trial reported the 'loss' metric; cannot pick a best "
            f"result: {err}"
        ) from err
    print(best_result.config)

## main.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "a7519b16-21f4-4033-a1ef-6a5a81874e93",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ray/anaconda3/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "# 1. Run the script huggingface_example.py as a job\n",
    "import ray\n",
    "from ray.job_submission import JobSubmissionClient\n",
    "import random\n",
    "\n",
    "RAY_HEAD_IP = '<head ip>'\n",
    "S3_BUCKET = '<some s3 bucket>'\n",
    "upload_dir = f\"s3://{S3_BUCKET}/examples/example-huggingface-tune\"\n",
    "\n",
    "runtime_env = {\n",
    "    'working_dir': '/home/jovyan/sparta/examples/ray/huggingface_trainer',\n",
    "    # 'pip': [\n",
    "    #     'datasets', \n",
    "    #     'transformers',\n",
    "    #     'torch',\n",
    "    # ], \n",
    "    'env_vars': {\n",
    "        'S3_BUCKET': S3_BUCKET,\n",
    "        'UPLOAD_DIR': upload_dir,\n",
    "        'EXPERIMENT_NAME': 'experiment-1',\n",
    "        'CHECKSUM': str(random.randint(1, 1000)),\n",
    "    },\n",
    "}\n",
    "\n",
    "upload_dir = f\"s3://{S3_BUCKET}/examples/example-huggingface-tune\"\n",
    "\n",
    "client = JobSubmissionClient(f\"http://{RAY_HEAD_IP}:8265\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "0b2b6ec8-fd10-497b-a32f-ff2ef607040f",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "raysubmit_2ic6Q1xczLeM81AS\n"
     ]
    }
   ],
   "source": [
    "job_id = client.submit_job(\n",
    "    entrypoint=\"python huggingface_example.py\",\n",
    "    runtime_env=runtime_env,\n",
    ")\n",
    "print(job_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b7cbd870-df93-4ee6-8559-c27de78ba7b6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<JobStatus.PENDING: 'PENDING'>"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "client.get_job_status(job_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "ff04d651-028f-43b5-8eb6-92c8977aa885",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# client.stop_job(job_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "649e7c2d-7146-490b-bea9-3e01c9fe66c4",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Using FIFO scheduling algorithm.\n",
      "Resources requested: 4.0/7 CPUs, 0/0 GPUs, 0.0/46.57 GiB heap, 0.0/13.81 GiB objects\n",
      "Result logdir: /tmp/experiment_dir/experiment-1\n",
      "Number of trials: 4/4 (3 ERROR, 1 RUNNING)\n",
      "+--------------------------------+----------+-------------------+------------------------+------------------------+\n",
      "| Trial name                     | status   | loc               |   trainer_init_config/ |   trainer_init_config/ |\n",
      "|                                |          |                   |          learning_rate |           weight_decay |\n",
      "|--------------------------------+----------+-------------------+------------------------+------------------------|\n",
      "| HuggingFaceTrainer_ff7fc_00003 | RUNNING  | 10.85.50.212:3386 |                 0.0002 |                   0.02 |\n",
      "| HuggingFaceTrainer_ff7fc_00000 | ERROR    | 10.85.51.17:1827  |                 2e-05  |                   0.01 |\n",
      "| HuggingFaceTrainer_ff7fc_00001 | ERROR    | 10.85.51.17:2329  |                 0.0002 |                   0.01 |\n",
      "| HuggingFaceTrainer_ff7fc_00002 | ERROR    | 10.85.53.81:5966  |                 2e-05  |                   0.02 |\n",
      "+--------------------------------+----------+-------------------+------------------------+------------------------+\n",
      "Number of errored trials: 4\n",
      "+--------------------------------+--------------+------------------------------------------------------------------------------------------------------------------------------------------+\n",
      "| Trial name                     |   # failures | error file                                                                                                                               |\n",
      "|--------------------------------+--------------+------------------------------------------------------------------------------------------------------------------------------------------|\n",
      "| HuggingFaceTrainer_ff7fc_00000 |            2 | /tmp/experiment_dir/experiment-1/HuggingFaceTrainer_ff7fc_00000_0_learning_rate=0.0000,weight_decay=0.0100_2023-05-06_05-40-18/error.txt |\n",
      "| HuggingFaceTrainer_ff7fc_00001 |            2 | /tmp/experiment_dir/experiment-1/HuggingFaceTrainer_ff7fc_00001_1_learning_rate=0.0002,weight_decay=0.0100_2023-05-06_05-40-35/error.txt |\n",
      "| HuggingFaceTrainer_ff7fc_00002 |            2 | /tmp/experiment_dir/experiment-1/HuggingFaceTrainer_ff7fc_00002_2_learning_rate=0.0000,weight_decay=0.0200_2023-05-06_05-40-51/error.txt |\n",
      "| HuggingFaceTrainer_ff7fc_00003 |            1 | /tmp/experiment_dir/experiment-1/HuggingFaceTrainer_ff7fc_00003_3_learning_rate=0.0002,weight_decay=0.0200_2023-05-06_05-42-13/error.txt |\n",
      "+--------------------------------+--------------+------------------------------------------------------------------------------------------------------------------------------------------+\n",
      "\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=3534, ip=10.85.50.212)\u001b[0m /tmp/ray/session_2023-05-06_05-36-47_937693_8/runtime_resources/pip/e7bbfb436fcfd4ffbf565474a140c798bb4e314a/virtualenv/lib/python3.8/site-packages/transformers/optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=3534, ip=10.85.50.212)\u001b[0m   warnings.warn(\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=6628, ip=10.85.53.81)\u001b[0m /tmp/ray/session_2023-05-06_05-36-47_937693_8/runtime_resources/pip/e7bbfb436fcfd4ffbf565474a140c798bb4e314a/virtualenv/lib/python3.8/site-packages/transformers/optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=6628, ip=10.85.53.81)\u001b[0m   warnings.warn(\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=6627, ip=10.85.53.81)\u001b[0m /tmp/ray/session_2023-05-06_05-36-47_937693_8/runtime_resources/pip/e7bbfb436fcfd4ffbf565474a140c798bb4e314a/virtualenv/lib/python3.8/site-packages/transformers/optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=6627, ip=10.85.53.81)\u001b[0m   warnings.warn(\n",
      "2023-05-06 05:43:36,354\tERROR trial_runner.py:1062 -- Trial HuggingFaceTrainer_ff7fc_00003: Error processing event.\n",
      "ray.exceptions.RayTaskError(RuntimeError): \u001b[36mray::_Inner.train()\u001b[39m (pid=3386, ip=10.85.50.212, repr=HuggingFaceTrainer)\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py\", line 368, in train\n",
      "    raise skipped from exception_cause(skipped)\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/utils.py\", line 54, in check_for_failure\n",
      "    ray.get(object_ref)\n",
      "ray.exceptions.RayTaskError(RuntimeError): \u001b[36mray::RayTrainWorker._RayTrainWorker__execute()\u001b[39m (pid=6628, ip=10.85.53.81, repr=<ray.train._internal.worker_group.RayTrainWorker object at 0x7fa9304d2eb0>)\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/worker_group.py\", line 31, in __execute\n",
      "    raise skipped from exception_cause(skipped)\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/utils.py\", line 129, in discard_return_wrapper\n",
      "    train_func(*args, **kwargs)\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/huggingface/huggingface_trainer.py\", line 417, in _huggingface_train_loop_per_worker\n",
      "    trainer.train()\n",
      "  File \"/tmp/ray/session_2023-05-06_05-36-47_937693_8/runtime_resources/pip/e7bbfb436fcfd4ffbf565474a140c798bb4e314a/virtualenv/lib/python3.8/site-packages/transformers/trainer.py\", line 1662, in train\n",
      "    return inner_training_loop(\n",
      "  File \"/tmp/ray/session_2023-05-06_05-36-47_937693_8/runtime_resources/pip/e7bbfb436fcfd4ffbf565474a140c798bb4e314a/virtualenv/lib/python3.8/site-packages/transformers/trainer.py\", line 1749, in _inner_training_loop\n",
      "    model = self._wrap_model(self.model_wrapped)\n",
      "  File \"/tmp/ray/session_2023-05-06_05-36-47_937693_8/runtime_resources/pip/e7bbfb436fcfd4ffbf565474a140c798bb4e314a/virtualenv/lib/python3.8/site-packages/transformers/trainer.py\", line 1569, in _wrap_model\n",
      "    model = nn.parallel.DistributedDataParallel(\n",
      "  File \"/tmp/ray/session_2023-05-06_05-36-47_937693_8/runtime_resources/pip/e7bbfb436fcfd4ffbf565474a140c798bb4e314a/virtualenv/lib/python3.8/site-packages/torch/nn/parallel/distributed.py\", line 676, in __init__\n",
      "    _sync_module_states(\n",
      "  File \"/tmp/ray/session_2023-05-06_05-36-47_937693_8/runtime_resources/pip/e7bbfb436fcfd4ffbf565474a140c798bb4e314a/virtualenv/lib/python3.8/site-packages/torch/distributed/utils.py\", line 142, in _sync_module_states\n",
      "    _sync_params_and_buffers(\n",
      "  File \"/tmp/ray/session_2023-05-06_05-36-47_937693_8/runtime_resources/pip/e7bbfb436fcfd4ffbf565474a140c798bb4e314a/virtualenv/lib/python3.8/site-packages/torch/distributed/utils.py\", line 160, in _sync_params_and_buffers\n",
      "    dist._broadcast_coalesced(\n",
      "RuntimeError: Invalid scalar type\n",
      "Result for HuggingFaceTrainer_ff7fc_00003:\n",
      "  date: 2023-05-06_05-43-25\n",
      "  experiment_id: 20f76ecf6eb44e2996e4ef2cfcd632b0\n",
      "  hostname: pavlo-cluster-kuberay-worker-cpugroup-4ft4f\n",
      "  node_ip: 10.85.50.212\n",
      "  pid: 3386\n",
      "  timestamp: 1683377005\n",
      "  trial_id: ff7fc_00003\n",
      "  \n",
      "== Status ==\n",
      "Current time: 2023-05-06 05:43:41 (running for 00:04:31.25)\n",
      "Memory usage on this node: 4.0/68.6 GiB \n",
      "Using FIFO scheduling algorithm.\n",
      "Resources requested: 0/7 CPUs, 0/0 GPUs, 0.0/46.57 GiB heap, 0.0/13.81 GiB objects\n",
      "Result logdir: /tmp/experiment_dir/experiment-1\n",
      "Number of trials: 4/4 (4 ERROR)\n",
      "+--------------------------------+----------+-------------------+------------------------+------------------------+\n",
      "| Trial name                     | status   | loc               |   trainer_init_config/ |   trainer_init_config/ |\n",
      "|                                |          |                   |          learning_rate |           weight_decay |\n",
      "|--------------------------------+----------+-------------------+------------------------+------------------------|\n",
      "| HuggingFaceTrainer_ff7fc_00000 | ERROR    | 10.85.51.17:1827  |                 2e-05  |                   0.01 |\n",
      "| HuggingFaceTrainer_ff7fc_00001 | ERROR    | 10.85.51.17:2329  |                 0.0002 |                   0.01 |\n",
      "| HuggingFaceTrainer_ff7fc_00002 | ERROR    | 10.85.53.81:5966  |                 2e-05  |                   0.02 |\n",
      "| HuggingFaceTrainer_ff7fc_00003 | ERROR    | 10.85.50.212:3386 |                 0.0002 |                   0.02 |\n",
      "+--------------------------------+----------+-------------------+------------------------+------------------------+\n",
      "Number of errored trials: 4\n",
      "+--------------------------------+--------------+------------------------------------------------------------------------------------------------------------------------------------------+\n",
      "| Trial name                     |   # failures | error file                                                                                                                               |\n",
      "|--------------------------------+--------------+------------------------------------------------------------------------------------------------------------------------------------------|\n",
      "| HuggingFaceTrainer_ff7fc_00000 |            2 | /tmp/experiment_dir/experiment-1/HuggingFaceTrainer_ff7fc_00000_0_learning_rate=0.0000,weight_decay=0.0100_2023-05-06_05-40-18/error.txt |\n",
      "| HuggingFaceTrainer_ff7fc_00001 |            2 | /tmp/experiment_dir/experiment-1/HuggingFaceTrainer_ff7fc_00001_1_learning_rate=0.0002,weight_decay=0.0100_2023-05-06_05-40-35/error.txt |\n",
      "| HuggingFaceTrainer_ff7fc_00002 |            2 | /tmp/experiment_dir/experiment-1/HuggingFaceTrainer_ff7fc_00002_2_learning_rate=0.0000,weight_decay=0.0200_2023-05-06_05-40-51/error.txt |\n",
      "| HuggingFaceTrainer_ff7fc_00003 |            2 | /tmp/experiment_dir/experiment-1/HuggingFaceTrainer_ff7fc_00003_3_learning_rate=0.0002,weight_decay=0.0200_2023-05-06_05-42-13/error.txt |\n",
      "+--------------------------------+--------------+------------------------------------------------------------------------------------------------------------------------------------------+\n",
      "\n",
      "2023-05-06 05:43:41,218\tERROR tune.py:794 -- Trials did not complete: [HuggingFaceTrainer_ff7fc_00000, HuggingFaceTrainer_ff7fc_00001, HuggingFaceTrainer_ff7fc_00002, HuggingFaceTrainer_ff7fc_00003]\n",
      "2023-05-06 05:43:41,218\tINFO tune.py:798 -- Total run time: 271.44 seconds (266.39 seconds for the tuning loop).\n",
      "2023-05-06 05:43:41,220\tWARNING experiment_analysis.py:621 -- Could not find best trial. Did you pass the correct `metric` parameter?\n",
      "Traceback (most recent call last):\n",
      "  File \"huggingface_example.py\", line 134, in <module>\n",
      "    print(results.get_best_result(metric=\"loss\", mode=\"min\").config)\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/result_grid.py\", line 134, in get_best_result\n",
      "    raise RuntimeError(error_msg)\n",
      "RuntimeError: No best trial found for the given metric: loss. This means that no trial has reported this metric, or all values reported for this metric are NaN. To not ignore NaN values, you can set the `filter_nan_and_inf` arg to False.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print('\\n'.join(client.get_job_logs(job_id).split('\\n')[-100:]))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "337c95aa-e474-4eb8-965d-4712daf9599f",
   "metadata": {},
   "source": [
    "#### Check results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "8ce0f4d4-0e27-4e2f-baa4-3a913edb11d6",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-05-06 07:48:35,727\tINFO packaging.py:520 -- Creating a file package for local directory '/home/jovyan/sparta/examples/ray/huggingface_trainer'.\n",
      "2023-05-06 07:48:35,731\tINFO packaging.py:347 -- Pushing file package 'gcs://_ray_pkg_3fe65fa36e07269d.zip' (0.14MiB) to Ray cluster...\n",
      "2023-05-06 07:48:35,736\tINFO packaging.py:360 -- Successfully pushed file package 'gcs://_ray_pkg_3fe65fa36e07269d.zip'.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "    <div style=\"margin-left: 50px;display: flex;flex-direction: row;align-items: center\">\n",
       "        <h3 style=\"color: var(--jp-ui-font-color0)\">Ray</h3>\n",
       "        <svg version=\"1.1\" id=\"ray\" width=\"3em\" viewBox=\"0 0 144.5 144.6\" style=\"margin-left: 3em;margin-right: 3em\">\n",
       "            <g id=\"layer-1\">\n",
       "                <path fill=\"#00a2e9\" class=\"st0\" d=\"M97.3,77.2c-3.8-1.1-6.2,0.9-8.3,5.1c-3.5,6.8-9.9,9.9-17.4,9.6S58,88.1,54.8,81.2c-1.4-3-3-4-6.3-4.1\n",
       "                    c-5.6-0.1-9.9,0.1-13.1,6.4c-3.8,7.6-13.6,10.2-21.8,7.6C5.2,88.4-0.4,80.5,0,71.7c0.1-8.4,5.7-15.8,13.8-18.2\n",
       "                    c8.4-2.6,17.5,0.7,22.3,8c1.3,1.9,1.3,5.2,3.6,5.6c3.9,0.6,8,0.2,12,0.2c1.8,0,1.9-1.6,2.4-2.8c3.5-7.8,9.7-11.8,18-11.9\n",
       "                    c8.2-0.1,14.4,3.9,17.8,11.4c1.3,2.8,2.9,3.6,5.7,3.3c1-0.1,2,0.1,3,0c2.8-0.5,6.4,1.7,8.1-2.7s-2.3-5.5-4.1-7.5\n",
       "                    c-5.1-5.7-10.9-10.8-16.1-16.3C84,38,81.9,37.1,78,38.3C66.7,42,56.2,35.7,53,24.1C50.3,14,57.3,2.8,67.7,0.5\n",
       "                    C78.4-2,89,4.7,91.5,15.3c0.1,0.3,0.1,0.5,0.2,0.8c0.7,3.4,0.7,6.9-0.8,9.8c-1.7,3.2-0.8,5,1.5,7.2c6.7,6.5,13.3,13,19.8,19.7\n",
       "                    c1.8,1.8,3,2.1,5.5,1.2c9.1-3.4,17.9-0.6,23.4,7c4.8,6.9,4.6,16.1-0.4,22.9c-5.4,7.2-14.2,9.9-23.1,6.5c-2.3-0.9-3.5-0.6-5.1,1.1\n",
       "                    c-6.7,6.9-13.6,13.7-20.5,20.4c-1.8,1.8-2.5,3.2-1.4,5.9c3.5,8.7,0.3,18.6-7.7,23.6c-7.9,5-18.2,3.8-24.8-2.9\n",
       "                    c-6.4-6.4-7.4-16.2-2.5-24.3c4.9-7.8,14.5-11,23.1-7.8c3,1.1,4.7,0.5,6.9-1.7C91.7,98.4,98,92.3,104.2,86c1.6-1.6,4.1-2.7,2.6-6.2\n",
       "                    c-1.4-3.3-3.8-2.5-6.2-2.6C99.8,77.2,98.9,77.2,97.3,77.2z M72.1,29.7c5.5,0.1,9.9-4.3,10-9.8c0-0.1,0-0.2,0-0.3\n",
       "                    C81.8,14,77,9.8,71.5,10.2c-5,0.3-9,4.2-9.3,9.2c-0.2,5.5,4,10.1,9.5,10.3C71.8,29.7,72,29.7,72.1,29.7z M72.3,62.3\n",
       "                    c-5.4-0.1-9.9,4.2-10.1,9.7c0,0.2,0,0.3,0,0.5c0.2,5.4,4.5,9.7,9.9,10c5.1,0.1,9.9-4.7,10.1-9.8c0.2-5.5-4-10-9.5-10.3\n",
       "                    C72.6,62.3,72.4,62.3,72.3,62.3z M115,72.5c0.1,5.4,4.5,9.7,9.8,9.9c5.6-0.2,10-4.8,10-10.4c-0.2-5.4-4.6-9.7-10-9.7\n",
       "                    c-5.3-0.1-9.8,4.2-9.9,9.5C115,72.1,115,72.3,115,72.5z M19.5,62.3c-5.4,0.1-9.8,4.4-10,9.8c-0.1,5.1,5.2,10.4,10.2,10.3\n",
       "                    c5.6-0.2,10-4.9,9.8-10.5c-0.1-5.4-4.5-9.7-9.9-9.6C19.6,62.3,19.5,62.3,19.5,62.3z M71.8,134.6c5.9,0.2,10.3-3.9,10.4-9.6\n",
       "                    c0.5-5.5-3.6-10.4-9.1-10.8c-5.5-0.5-10.4,3.6-10.8,9.1c0,0.5,0,0.9,0,1.4c-0.2,5.3,4,9.8,9.3,10\n",
       "                    C71.6,134.6,71.7,134.6,71.8,134.6z\"/>\n",
       "            </g>\n",
       "        </svg>\n",
       "        <table>\n",
       "            <tr>\n",
       "                <td style=\"text-align: left\"><b>Python version:</b></td>\n",
       "                <td style=\"text-align: left\"><b>3.8.13</b></td>\n",
       "            </tr>\n",
       "            <tr>\n",
       "                <td style=\"text-align: left\"><b>Ray version:</b></td>\n",
       "                <td style=\"text-align: left\"><b> 3.0.0.dev0</b></td>\n",
       "            </tr>\n",
       "            <tr>\n",
       "    <td style=\"text-align: left\"><b>Dashboard:</b></td>\n",
       "    <td style=\"text-align: left\"><b><a href=\"http://10.85.55.163:8265\" target=\"_blank\">http://10.85.55.163:8265</a></b></td>\n",
       "</tr>\n",
       "\n",
       "        </table>\n",
       "    </div>\n",
       "</div>\n"
      ],
      "text/plain": [
       "ClientContext(dashboard_url='10.85.55.163:8265', python_version='3.8.13', ray_version='3.0.0.dev0', ray_commit='9432c9ec6e083177bda0cf6fa6e64c940f44156f', protocol_version='2022-12-06', _num_clients=1, _context_to_restore=<ray.util.client._ClientContext object at 0x7feeff1aaaf0>)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import ray\n",
    "from ray import tune\n",
    "ray.init(\n",
    "    address=f'ray://{RAY_HEAD_IP}:10001',\n",
    "    runtime_env=runtime_env,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d4733e65-25f4-42f2-8030-64b3e9a1d9e6",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading builder script: 100%|██████████| 8.48k/8.48k [00:00<00:00, 4.94MB/s]\n",
      "Downloading metadata: 100%|██████████| 6.84k/6.84k [00:00<00:00, 4.44MB/s]\n",
      "Downloading readme: 100%|██████████| 9.25k/9.25k [00:00<00:00, 6.20MB/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading and preparing dataset wikitext/wikitext-2-raw-v1 to /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading data: 100%|██████████| 4.72M/4.72M [00:00<00:00, 71.9MB/s]\n",
      "                                                                                       \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset wikitext downloaded and prepared to /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 3/3 [00:00<00:00, 644.32it/s]\n",
      "Downloading (…)okenizer_config.json: 100%|██████████| 236/236 [00:00<00:00, 35.0kB/s]\n",
      "Downloading (…)olve/main/vocab.json: 100%|██████████| 396k/396k [00:00<00:00, 40.5MB/s]\n",
      "Downloading (…)olve/main/merges.txt: 100%|██████████| 232k/232k [00:00<00:00, 70.7MB/s]\n",
      "Downloading (…)/main/tokenizer.json: 100%|██████████| 678k/678k [00:00<00:00, 132MB/s]\n",
      "Downloading (…)cial_tokens_map.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 52.5kB/s]\n",
      "2023-05-06 07:49:05,838\tWARNING datastream.py:251 -- \u001b[33m[IMPORTANT]: Ray Data strict mode is on by default in Ray 2.5. When in strict mode, data schemas are required, standalone Python objects are no longer supported, and the default batch format changes to `numpy` from `pandas`. To disable strict mode temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes. Strict mode will not be possible to disable in future releases.\n",
      "\n",
      "Learn more here: https://docs.ray.io/en/master/data/faq.html#what-is-strict-mode\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "from huggingface_example import trainer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "399e48db-060b-4cc3-95b9-0abf99375eed",
   "metadata": {},
   "outputs": [],
   "source": [
    "tuner = tune.Tuner.restore(\n",
    "    f's3://{S3_BUCKET}/examples/example-huggingface-tune/experiment-1/',\n",
    "    trainable=trainer,\n",
    "    resume_unfinished=False,\n",
    "    restart_errored=False,\n",
    "    resume_errored=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "30d4ebd7-2158-4d4f-909c-d1c3793b552a",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[33m[IMPORTANT]: Ray Data strict mode is on by default in Ray 2.5. When in strict mode, data schemas are required, standalone Python objects are no longer supported, and the default batch format changes to `numpy` from `pandas`. To disable strict mode temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes. Strict mode will not be possible to disable in future releases.\n",
      "\n",
      "Learn more here: https://docs.ray.io/en/master/data/faq.html#what-is-strict-mode\u001b[0m\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m Found cached dataset wikitext (/home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)\n",
      "100%|██████████| 3/3 [00:00<00:00, 1044.66it/s]\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m Loading cached processed dataset at /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-5c8dd88d71bb2d99.arrow\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m Loading cached processed dataset at /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-74437d7c87fbab78.arrow\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m Loading cached processed dataset at /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-4bde759af7353b26.arrow\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m Loading cached processed dataset at /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-5e93c765ae22bb1c.arrow\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m Loading cached processed dataset at /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-ac91e1da5e5aa5f6.arrow\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m Loading cached processed dataset at /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-a1cf333f1d657df0.arrow\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m 2023-05-06 07:49:56,113\tWARNING datastream.py:251 -- \u001b[33m[IMPORTANT]: Ray Data strict mode is on by default in Ray 2.5. When in strict mode, data schemas are required, standalone Python objects are no longer supported, and the default batch format changes to `numpy` from `pandas`. To disable strict mode temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes. Strict mode will not be possible to disable in future releases.\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m \n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m Learn more here: https://docs.ray.io/en/master/data/faq.html#what-is-strict-mode\u001b[0m\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m 2023-05-06 07:49:56,532\tINFO experiment_analysis.py:966 -- No trial data passed in during `ExperimentAnalysis` initialization -- you are most likely loading the experiment after it has completed.\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m Loading trial data from the experiment checkpoint file. This may result in loading some stale information, since checkpointing is periodic.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div class=\"tuneStatus\">\n",
       "  <div style=\"display: flex;flex-direction: row\">\n",
       "    <div style=\"display: flex;flex-direction: column;\">\n",
       "      <h3>Tune Status</h3>\n",
       "      <table>\n",
       "<tbody>\n",
       "<tr><td>Current time:</td><td>2023-05-06 07:51:09</td></tr>\n",
       "<tr><td>Running for: </td><td>00:01:10.68        </td></tr>\n",
       "<tr><td>Memory:      </td><td>5.0/68.6 GiB       </td></tr>\n",
       "</tbody>\n",
       "</table>\n",
       "    </div>\n",
       "    <div class=\"vDivider\"></div>\n",
       "    <div class=\"systemInfo\">\n",
       "      <h3>System Info</h3>\n",
       "      Using FIFO scheduling algorithm.<br>Logical resource usage: 4.0/3 CPUs, 0/0 GPUs\n",
       "    </div>\n",
       "    <div class=\"vDivider\"></div>\n",
       "<div class=\"messages\">\n",
       "  <h3>Messages</h3>\n",
       "  \n",
       "  \n",
       "  Number of errored trials: 1<br><table>\n",
       "<thead>\n",
       "<tr><th>Trial name                    </th><th style=\"text-align: right;\">  # failures</th><th>error file                                                                                       </th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "<tr><td>HuggingFaceTrainer_45442_00000</td><td style=\"text-align: right;\">           1</td><td>/home/ray/ray_results/experiment-1/HuggingFaceTrainer_45442_00000_0_2023-05-06_07-49-58/error.txt</td></tr>\n",
       "</tbody>\n",
       "</table>\n",
       "</div>\n",
       "<style>\n",
       ".messages {\n",
       "  color: var(--jp-ui-font-color1);\n",
       "  display: flex;\n",
       "  flex-direction: column;\n",
       "  padding-left: 1em;\n",
       "  overflow-y: auto;\n",
       "}\n",
       ".messages h3 {\n",
       "  font-weight: bold;\n",
       "}\n",
       ".vDivider {\n",
       "  border-left-width: var(--jp-border-width);\n",
       "  border-left-color: var(--jp-border-color0);\n",
       "  border-left-style: solid;\n",
       "  margin: 0.5em 1em 0.5em 1em;\n",
       "}\n",
       "</style>\n",
       "\n",
       "  </div>\n",
       "  <div class=\"hDivider\"></div>\n",
       "  <div class=\"trialStatus\">\n",
       "    <h3>Trial Status</h3>\n",
       "    <table>\n",
       "<thead>\n",
       "<tr><th>Trial name                    </th><th>status  </th><th>loc             </th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "<tr><td>HuggingFaceTrainer_45442_00000</td><td>PENDING </td><td>10.85.48.134:176</td></tr>\n",
       "</tbody>\n",
       "</table>\n",
       "  </div>\n",
       "</div>\n",
       "<style>\n",
       ".tuneStatus {\n",
       "  color: var(--jp-ui-font-color1);\n",
       "}\n",
       ".tuneStatus .systemInfo {\n",
       "  display: flex;\n",
       "  flex-direction: column;\n",
       "}\n",
       ".tuneStatus td {\n",
       "  white-space: nowrap;\n",
       "}\n",
       ".tuneStatus .trialStatus {\n",
       "  display: flex;\n",
       "  flex-direction: column;\n",
       "}\n",
       ".tuneStatus h3 {\n",
       "  font-weight: bold;\n",
       "}\n",
       ".tuneStatus .hDivider {\n",
       "  border-bottom-width: var(--jp-border-width);\n",
       "  border-bottom-color: var(--jp-border-color0);\n",
       "  border-bottom-style: solid;\n",
       "}\n",
       ".tuneStatus .vDivider {\n",
       "  border-left-width: var(--jp-border-width);\n",
       "  border-left-color: var(--jp-border-color0);\n",
       "  border-left-style: solid;\n",
       "  margin: 0.5em 1em 0.5em 1em;\n",
       "}\n",
       "</style>\n"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m 2023-05-06 07:49:56,747\tWARNING experiment_analysis.py:910 -- Failed to read the results for 1 trials:\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m - /tmp/experiment_analysis_xq6c_pmm/HuggingFaceTrainer_6acd8_00000_0_2023-05-06_07-43-51\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m 2023-05-06 07:49:57,938\tINFO experiment_state.py:388 -- Trying to find and download experiment checkpoint at s3://prod-green-sparta/examples/example-huggingface-tune/experiment-1\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m 2023-05-06 07:49:58,347\tWARNING experiment_state.py:401 -- Got error when trying to sync down: Sync process failed: prod-green-sparta/examples/example-huggingface-tune/experiment-1/ \n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m Please check this error message for potential access problems - if a directory was not found, that is expected at this stage when you're starting a new experiment.\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m 2023-05-06 07:49:58,347\tINFO experiment_state.py:408 -- No remote checkpoint was found or an error occurred when trying to download the experiment checkpoint. Please check the previous warning message for more details. Ray Tune will now start a new experiment.\n",
      "Downloading builder script: 100%|██████████| 8.48k/8.48k [00:00<00:00, 12.8MB/s]\n",
      "Downloading metadata: 100%|██████████| 6.84k/6.84k [00:00<00:00, 11.1MB/s]\n",
      "Downloading readme: 100%|██████████| 9.25k/9.25k [00:00<00:00, 14.8MB/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[2m\u001b[36m(TrainTrainable pid=176, ip=10.85.48.134)\u001b[0m Downloading and preparing dataset wikitext/wikitext-2-raw-v1 to /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading data:   0%|          | 0.00/4.72M [00:00<?, ?B/s]\n",
      "Downloading data: 100%|██████████| 4.72M/4.72M [00:00<00:00, 103MB/s]\n",
      "Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]\n",
      "Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]\n",
      "Generating train split:  20%|██        | 7506/36718 [00:00<00:00, 74955.71 examples/s]\n",
      "Generating train split:  51%|█████     | 18798/36718 [00:00<00:00, 75167.24 examples/s]\n",
      "Generating train split:  82%|████████▏ | 30017/36718 [00:00<00:00, 74978.43 examples/s]\n",
      "100%|██████████| 3/3 [00:00<00:00, 1078.41it/s]                                        \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[2m\u001b[36m(TrainTrainable pid=176, ip=10.85.48.134)\u001b[0m Dataset wikitext downloaded and prepared to /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading (…)okenizer_config.json: 100%|██████████| 236/236 [00:00<00:00, 38.8kB/s]\n",
      "Downloading (…)olve/main/vocab.json: 100%|██████████| 396k/396k [00:00<00:00, 26.7MB/s]\n",
      "Downloading (…)olve/main/merges.txt: 100%|██████████| 232k/232k [00:00<00:00, 66.1MB/s]\n",
      "Downloading (…)/main/tokenizer.json: 100%|██████████| 678k/678k [00:00<00:00, 78.9MB/s]\n",
      "Downloading (…)cial_tokens_map.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 91.8kB/s]\n",
      "Map:   0%|          | 0/4358 [00:00<?, ? examples/s]0m \n",
      "Map:  23%|██▎       | 1000/4358 [00:00<00:00, 9960.47 examples/s]\n",
      "Map:  69%|██████▉   | 3000/4358 [00:00<00:00, 10230.25 examples/s]\n",
      "Map:   0%|          | 0/36718 [00:00<?, ? examples/s]             \n",
      "Map:   5%|▌         | 2000/36718 [00:00<00:05, 6183.18 examples/s]\n",
      "Map:  11%|█         | 4000/36718 [00:00<00:03, 8288.63 examples/s]\n",
      "Map:  16%|█▋        | 6000/36718 [00:00<00:03, 9763.49 examples/s]\n",
      "Map:  22%|██▏       | 8000/36718 [00:00<00:02, 9652.65 examples/s]\n",
      "Map:  27%|██▋       | 10000/36718 [00:01<00:02, 10076.29 examples/s]\n",
      "Map:  33%|███▎      | 12000/36718 [00:01<00:02, 10275.84 examples/s]\n",
      "Map:  38%|███▊      | 14000/36718 [00:01<00:02, 10393.79 examples/s]\n",
      "Map:  44%|████▎     | 16000/36718 [00:01<00:01, 10378.47 examples/s]\n",
      "Map:  49%|████▉     | 18000/36718 [00:01<00:01, 10445.27 examples/s]\n",
      "Map:  54%|█████▍    | 20000/36718 [00:01<00:01, 10750.11 examples/s]\n",
      "Map:  60%|█████▉    | 22000/36718 [00:02<00:01, 10629.08 examples/s]\n",
      "Map:  65%|██████▌   | 24000/36718 [00:02<00:01, 10983.25 examples/s]\n",
      "Map:  71%|███████   | 26000/36718 [00:02<00:00, 11181.15 examples/s]\n",
      "Map:  76%|███████▋  | 28000/36718 [00:02<00:00, 10730.15 examples/s]\n",
      "Map:  82%|████████▏ | 30000/36718 [00:02<00:00, 10863.51 examples/s]\n",
      "Map:  87%|████████▋ | 32000/36718 [00:03<00:00, 10368.40 examples/s]\n",
      "Map:  93%|█████████▎| 34000/36718 [00:03<00:00, 10230.98 examples/s]\n",
      "Map:  98%|█████████▊| 36000/36718 [00:03<00:00, 10282.45 examples/s]\n",
      "Map:   0%|          | 0/3760 [00:00<?, ? examples/s]                \n",
      "Map:  53%|█████▎    | 2000/3760 [00:00<00:00, 6387.63 examples/s]\n",
      "Map:  80%|███████▉  | 3000/3760 [00:00<00:00, 7390.47 examples/s]\n",
      "Map:   0%|          | 0/4358 [00:00<?, ? examples/s]             \n",
      "Map:  23%|██▎       | 1000/4358 [00:00<00:00, 3684.38 examples/s]\n",
      "Map:  46%|████▌     | 2000/4358 [00:00<00:00, 3890.21 examples/s]\n",
      "Map:  69%|██████▉   | 3000/4358 [00:00<00:00, 3835.12 examples/s]\n",
      "Map:   0%|          | 0/36718 [00:00<?, ? examples/s]            \n",
      "Map:   3%|▎         | 1000/36718 [00:00<00:07, 4561.11 examples/s]\n",
      "Map:   5%|▌         | 2000/36718 [00:00<00:08, 4010.55 examples/s]\n",
      "Map:   8%|▊         | 3000/36718 [00:00<00:08, 4002.65 examples/s]\n",
      "Map:  11%|█         | 4000/36718 [00:00<00:08, 4066.98 examples/s]\n",
      "Map:  14%|█▎        | 5000/36718 [00:01<00:07, 4404.48 examples/s]\n",
      "Map:  16%|█▋        | 6000/36718 [00:01<00:06, 4503.59 examples/s]\n",
      "Map:  19%|█▉        | 7000/36718 [00:01<00:07, 4059.40 examples/s]\n",
      "Map:  22%|██▏       | 8000/36718 [00:01<00:07, 4095.26 examples/s]\n",
      "Map:  25%|██▍       | 9000/36718 [00:02<00:06, 4108.21 examples/s]\n",
      "Map:  27%|██▋       | 10000/36718 [00:02<00:06, 4125.97 examples/s]\n",
      "Map:  30%|██▉       | 11000/36718 [00:02<00:06, 4093.67 examples/s]\n",
      "Map:  33%|███▎      | 12000/36718 [00:02<00:06, 4059.35 examples/s]\n",
      "Map:  35%|███▌      | 13000/36718 [00:03<00:05, 3981.54 examples/s]\n",
      "Map:  38%|███▊      | 14000/36718 [00:03<00:05, 4160.78 examples/s]\n",
      "Map:  41%|████      | 15000/36718 [00:03<00:05, 4183.95 examples/s]\n",
      "Map:  44%|████▎     | 16000/36718 [00:03<00:05, 4072.50 examples/s]\n",
      "Map:  46%|████▋     | 17000/36718 [00:04<00:04, 4060.12 examples/s]\n",
      "Map:  49%|████▉     | 18000/36718 [00:04<00:04, 4133.24 examples/s]\n",
      "Map:  52%|█████▏    | 19000/36718 [00:04<00:04, 4136.90 examples/s]\n",
      "Map:  54%|█████▍    | 20000/36718 [00:04<00:03, 4238.01 examples/s]\n",
      "Map:  57%|█████▋    | 21000/36718 [00:05<00:03, 4093.51 examples/s]\n",
      "Map:  60%|█████▉    | 22000/36718 [00:05<00:03, 3982.07 examples/s]\n",
      "Map:  63%|██████▎   | 23000/36718 [00:05<00:03, 4081.90 examples/s]\n",
      "Map:  65%|██████▌   | 24000/36718 [00:05<00:02, 4239.97 examples/s]\n",
      "Map:  68%|██████▊   | 25000/36718 [00:06<00:02, 4424.35 examples/s]\n",
      "Map:  71%|███████   | 26000/36718 [00:06<00:02, 4425.57 examples/s]\n",
      "Map:  74%|███████▎  | 27000/36718 [00:06<00:02, 4170.93 examples/s]\n",
      "Map:  76%|███████▋  | 28000/36718 [00:06<00:02, 3999.16 examples/s]\n",
      "Map:  79%|███████▉  | 29000/36718 [00:06<00:01, 4247.98 examples/s]\n",
      "Map:  82%|████████▏ | 30000/36718 [00:07<00:01, 4183.68 examples/s]\n",
      "Map:  84%|████████▍ | 31000/36718 [00:07<00:01, 3941.07 examples/s]\n",
      "Map:  87%|████████▋ | 32000/36718 [00:07<00:01, 3962.91 examples/s]\n",
      "Map:  90%|████████▉ | 33000/36718 [00:08<00:00, 3904.97 examples/s]\n",
      "Map:  93%|█████████▎| 34000/36718 [00:08<00:00, 3874.85 examples/s]\n",
      "Map:  95%|█████████▌| 35000/36718 [00:08<00:00, 4020.16 examples/s]\n",
      "Map:  98%|█████████▊| 36000/36718 [00:08<00:00, 3938.73 examples/s]\n",
      "Map: 100%|██████████| 36718/36718 [00:08<00:00, 4229.15 examples/s]\n",
      "Map:   0%|          | 0/3760 [00:00<?, ? examples/s]               \n",
      "Map:  27%|██▋       | 1000/3760 [00:00<00:00, 4595.28 examples/s]\n",
      "Map:  53%|█████▎    | 2000/3760 [00:00<00:00, 3977.48 examples/s]\n",
      "Map:  80%|███████▉  | 3000/3760 [00:00<00:00, 3808.57 examples/s]\n",
      "                                                                 \n",
      "\u001b[2m\u001b[36m(TrainTrainable pid=176, ip=10.85.48.134)\u001b[0m 2023-05-06 07:50:28,775\tWARNING datastream.py:251 -- \u001b[33m[IMPORTANT]: Ray Data strict mode is on by default in Ray 2.5. When in strict mode, data schemas are required, standalone Python objects are no longer supported, and the default batch format changes to `numpy` from `pandas`. To disable strict mode temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes. Strict mode will not be possible to disable in future releases.\n",
      "\u001b[2m\u001b[36m(TrainTrainable pid=176, ip=10.85.48.134)\u001b[0m \n",
      "\u001b[2m\u001b[36m(TrainTrainable pid=176, ip=10.85.48.134)\u001b[0m Learn more here: https://docs.ray.io/en/master/data/faq.html#what-is-strict-mode\u001b[0m\n",
      "\u001b[2m\u001b[36m(TrainTrainable pid=176, ip=10.85.48.134)\u001b[0m 2023-05-06 07:50:30,252\tINFO trainable.py:172 -- Trainable.setup took 19.951 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.\n",
      "\u001b[2m\u001b[36m(HuggingFaceTrainer pid=176, ip=10.85.48.134)\u001b[0m 2023-05-06 07:50:31,760\tINFO backend_executor.py:128 -- Starting distributed worker processes: ['271 (10.85.48.134)', '247 (10.85.55.90)', '248 (10.85.55.90)']\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=271, ip=10.85.48.134)\u001b[0m 2023-05-06 07:50:33,027\tINFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=3]\n",
      "\u001b[2m\u001b[36m(HuggingFaceTrainer pid=176, ip=10.85.48.134)\u001b[0m 2023-05-06 07:50:33,197\tINFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[RandomizeBlockOrder]\n",
      "\u001b[2m\u001b[36m(HuggingFaceTrainer pid=176, ip=10.85.48.134)\u001b[0m 2023-05-06 07:50:33,197\tINFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n",
      "\u001b[2m\u001b[36m(HuggingFaceTrainer pid=176, ip=10.85.48.134)\u001b[0m 2023-05-06 07:50:33,197\tINFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n",
      "\u001b[2m\u001b[36m(HuggingFaceTrainer pid=176, ip=10.85.48.134)\u001b[0m 2023-05-06 07:50:33,208\tINFO streaming_executor.py:149 -- Shutting down <StreamingExecutor(Thread-5, stopped daemon 140525826995968)>.\n",
      "Downloading builder script: 100%|██████████| 8.48k/8.48k [00:00<00:00, 13.0MB/s]\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=271, ip=10.85.48.134)\u001b[0m Found cached dataset wikitext (/home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)\n",
      "100%|██████████| 3/3 [00:00<00:00, 1041.29it/s]34)\u001b[0m \n",
      "Downloading metadata: 100%|██████████| 6.84k/6.84k [00:00<00:00, 11.9MB/s]\n",
      "Downloading builder script: 100%|██████████| 8.48k/8.48k [00:00<00:00, 12.9MB/s]\n",
      "Downloading readme:   0%|          | 0.00/9.25k [00:00<?, ?B/s]\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=271, ip=10.85.48.134)\u001b[0m Loading cached processed dataset at /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-5c8dd88d71bb2d99.arrow\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=271, ip=10.85.48.134)\u001b[0m Loading cached processed dataset at /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-74437d7c87fbab78.arrow\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=271, ip=10.85.48.134)\u001b[0m Loading cached processed dataset at /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-4bde759af7353b26.arrow\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=271, ip=10.85.48.134)\u001b[0m Loading cached processed dataset at /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-5e93c765ae22bb1c.arrow\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=271, ip=10.85.48.134)\u001b[0m Loading cached processed dataset at /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-ac91e1da5e5aa5f6.arrow\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=271, ip=10.85.48.134)\u001b[0m Loading cached processed dataset at /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-a1cf333f1d657df0.arrow\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[2m\u001b[36m(RayTrainWorker pid=248, ip=10.85.55.90)\u001b[0m Downloading and preparing dataset wikitext/wikitext-2-raw-v1 to /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading readme: 100%|██████████| 9.25k/9.25k [00:00<00:00, 15.4MB/s]\n",
      "Downloading readme: 100%|██████████| 9.25k/9.25k [00:00<00:00, 12.7MB/s]\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=271, ip=10.85.48.134)\u001b[0m 2023-05-06 07:50:36,890\tWARNING datastream.py:251 -- \u001b[33m[IMPORTANT]: Ray Data strict mode is on by default in Ray 2.5. When in strict mode, data schemas are required, standalone Python objects are no longer supported, and the default batch format changes to `numpy` from `pandas`. To disable strict mode temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes. Strict mode will not be possible to disable in future releases.\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=271, ip=10.85.48.134)\u001b[0m \n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=271, ip=10.85.48.134)\u001b[0m Learn more here: https://docs.ray.io/en/master/data/faq.html#what-is-strict-mode\u001b[0m\n",
      "Downloading data:   0%|          | 0.00/4.72M [00:00<?, ?B/s]\n",
      "Downloading data: 100%|██████████| 4.72M/4.72M [00:00<00:00, 101MB/s]\n",
      "Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]\n",
      "Generating train split:  21%|██        | 7674/36718 [00:00<00:00, 76644.32 examples/s]\n",
      "Generating train split:  52%|█████▏    | 19000/36718 [00:00<00:00, 75607.56 examples/s]\n",
      "Generating train split:  73%|███████▎  | 26684/36718 [00:00<00:00, 76088.75 examples/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[2m\u001b[36m(RayTrainWorker pid=248, ip=10.85.55.90)\u001b[0m Dataset wikitext downloaded and prepared to /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 3/3 [00:00<00:00, 1101.25it/s]                                        \n",
      "Downloading (…)okenizer_config.json: 100%|██████████| 236/236 [00:00<00:00, 39.4kB/s]\n",
      "Downloading (…)olve/main/vocab.json: 100%|██████████| 396k/396k [00:00<00:00, 45.4MB/s]\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=247, ip=10.85.55.90)\u001b[0m Found cached dataset wikitext (/home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)\n",
      "100%|██████████| 3/3 [00:00<00:00, 996.51it/s]90)\u001b[0m \n",
      "Downloading (…)olve/main/merges.txt: 100%|██████████| 232k/232k [00:00<00:00, 23.5MB/s]\n",
      "Downloading (…)/main/tokenizer.json: 100%|██████████| 678k/678k [00:00<00:00, 76.8MB/s]\n",
      "Map:   0%|          | 0/4358 [00:00<?, ? examples/s]m \n",
      "Downloading (…)cial_tokens_map.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 69.4kB/s]\n",
      "Map:   0%|          | 0/4358 [00:00<?, ? examples/s]m \n",
      "Map:  23%|██▎       | 1000/4358 [00:00<00:00, 7016.96 examples/s]\n",
      "Map:  23%|██▎       | 1000/4358 [00:00<00:00, 6662.41 examples/s]\n",
      "Map:  46%|████▌     | 2000/4358 [00:00<00:00, 7076.47 examples/s]\n",
      "Map:  46%|████▌     | 2000/4358 [00:00<00:00, 6155.06 examples/s]\n",
      "Map:  69%|██████▉   | 3000/4358 [00:00<00:00, 5677.45 examples/s]\n",
      "Map:  69%|██████▉   | 3000/4358 [00:00<00:00, 6057.58 examples/s]\n",
      "Map:   0%|          | 0/36718 [00:00<?, ? examples/s]            \n",
      "Map:   0%|          | 0/36718 [00:00<?, ? examples/s]            \n",
      "Map:   3%|▎         | 1000/36718 [00:00<00:04, 7395.67 examples/s]\n",
      "Map:   3%|▎         | 1000/36718 [00:00<00:06, 5628.49 examples/s]\n",
      "Map:   5%|▌         | 2000/36718 [00:00<00:06, 5352.98 examples/s]\n",
      "Map:   5%|▌         | 2000/36718 [00:00<00:06, 5768.76 examples/s]\n",
      "Map:   8%|▊         | 3000/36718 [00:00<00:05, 5703.38 examples/s]\n",
      "Map:   8%|▊         | 3000/36718 [00:00<00:05, 6268.37 examples/s]\n",
      "Map:  11%|█         | 4000/36718 [00:00<00:05, 5598.05 examples/s]\n",
      "Map:  11%|█         | 4000/36718 [00:00<00:05, 5907.88 examples/s]\n",
      "Map:  14%|█▎        | 5000/36718 [00:00<00:04, 6596.04 examples/s]\n",
      "Map:  14%|█▎        | 5000/36718 [00:00<00:05, 6272.06 examples/s]\n",
      "Map:  16%|█▋        | 6000/36718 [00:00<00:04, 6316.69 examples/s]\n",
      "Map:  16%|█▋        | 6000/36718 [00:00<00:05, 6054.43 examples/s]\n",
      "Map:  19%|█▉        | 7000/36718 [00:01<00:05, 5833.46 examples/s]\n",
      "Map:  19%|█▉        | 7000/36718 [00:01<00:05, 5730.37 examples/s]\n",
      "Map:  22%|██▏       | 8000/36718 [00:01<00:05, 5660.36 examples/s]\n",
      "Map:  22%|██▏       | 8000/36718 [00:01<00:04, 6082.13 examples/s]\n",
      "Map:  25%|██▍       | 9000/36718 [00:01<00:04, 5804.77 examples/s]\n",
      "Map:  25%|██▍       | 9000/36718 [00:01<00:04, 6081.22 examples/s]\n",
      "Map:  27%|██▋       | 10000/36718 [00:01<00:04, 5794.84 examples/s]\n",
      "Map:  27%|██▋       | 10000/36718 [00:01<00:04, 5779.32 examples/s]\n",
      "Map:  30%|██▉       | 11000/36718 [00:01<00:04, 5612.70 examples/s]\n",
      "Map:  30%|██▉       | 11000/36718 [00:01<00:04, 6084.44 examples/s]\n",
      "Map:  33%|███▎      | 12000/36718 [00:02<00:04, 5963.63 examples/s]\n",
      "Map:  33%|███▎      | 12000/36718 [00:02<00:03, 6182.96 examples/s]\n",
      "Map:  35%|███▌      | 13000/36718 [00:02<00:04, 5887.47 examples/s]\n",
      "Map:  35%|███▌      | 13000/36718 [00:02<00:04, 5741.75 examples/s]\n",
      "Map:  38%|███▊      | 14000/36718 [00:02<00:03, 6172.62 examples/s]\n",
      "Map:  38%|███▊      | 14000/36718 [00:02<00:03, 5789.20 examples/s]\n",
      "Map:  41%|████      | 15000/36718 [00:02<00:03, 5986.10 examples/s]\n",
      "Map:  41%|████      | 15000/36718 [00:02<00:03, 5723.73 examples/s]\n",
      "Map:  44%|████▎     | 16000/36718 [00:02<00:03, 6010.39 examples/s]\n",
      "Map:  44%|████▎     | 16000/36718 [00:02<00:04, 4379.99 examples/s]\n",
      "Map:  46%|████▋     | 17000/36718 [00:02<00:03, 5870.11 examples/s]\n",
      "Map:  49%|████▉     | 18000/36718 [00:02<00:02, 6418.62 examples/s]\n",
      "Map:  46%|████▋     | 17000/36718 [00:03<00:03, 4958.37 examples/s]\n",
      "Map:  52%|█████▏    | 19000/36718 [00:03<00:02, 6231.12 examples/s]\n",
      "Map:  49%|████▉     | 18000/36718 [00:03<00:03, 5094.94 examples/s]\n",
      "Map:  54%|█████▍    | 20000/36718 [00:03<00:02, 6512.07 examples/s]\n",
      "Map:  52%|█████▏    | 19000/36718 [00:03<00:03, 5741.13 examples/s]\n",
      "Map:  57%|█████▋    | 21000/36718 [00:03<00:02, 6205.68 examples/s]\n",
      "Map:  54%|█████▍    | 20000/36718 [00:03<00:02, 5653.83 examples/s]\n",
      "Map:  60%|█████▉    | 22000/36718 [00:03<00:02, 5872.97 examples/s]\n",
      "Map:  57%|█████▋    | 21000/36718 [00:03<00:02, 5500.76 examples/s]\n",
      "Map:  63%|██████▎   | 23000/36718 [00:03<00:02, 5839.64 examples/s]\n",
      "Map:  60%|█████▉    | 22000/36718 [00:03<00:02, 5456.31 examples/s]\n",
      "Map:  65%|██████▌   | 24000/36718 [00:03<00:02, 6323.67 examples/s]\n",
      "Map:  63%|██████▎   | 23000/36718 [00:04<00:02, 5868.76 examples/s]\n",
      "Map:  65%|██████▌   | 24000/36718 [00:04<00:02, 6023.35 examples/s]\n",
      "Map:  68%|██████▊   | 25000/36718 [00:04<00:01, 6665.28 examples/s]\n",
      "Map:  68%|██████▊   | 25000/36718 [00:04<00:02, 4796.26 examples/s]\n",
      "Map:  71%|███████   | 26000/36718 [00:04<00:02, 5048.73 examples/s]\n",
      "Map:  71%|███████   | 26000/36718 [00:04<00:01, 6212.77 examples/s]\n",
      "Map:  74%|███████▎  | 27000/36718 [00:04<00:01, 5065.81 examples/s]\n",
      "Map:  74%|███████▎  | 27000/36718 [00:04<00:01, 6333.29 examples/s]\n",
      "Map:  76%|███████▋  | 28000/36718 [00:04<00:01, 5118.86 examples/s]\n",
      "Map:  76%|███████▋  | 28000/36718 [00:04<00:01, 5968.66 examples/s]\n",
      "Map:  79%|███████▉  | 29000/36718 [00:04<00:01, 5749.06 examples/s]\n",
      "Map:  79%|███████▉  | 29000/36718 [00:05<00:01, 5915.70 examples/s]\n",
      "Map:  82%|████████▏ | 30000/36718 [00:05<00:01, 5655.29 examples/s]\n",
      "Map:  82%|████████▏ | 30000/36718 [00:05<00:01, 5778.83 examples/s]\n",
      "Map:  84%|████████▍ | 31000/36718 [00:05<00:01, 5476.08 examples/s]\n",
      "Map:  84%|████████▍ | 31000/36718 [00:05<00:01, 5522.06 examples/s]\n",
      "Map:  87%|████████▋ | 32000/36718 [00:05<00:00, 5413.16 examples/s]\n",
      "Map:  87%|████████▋ | 32000/36718 [00:05<00:00, 5844.37 examples/s]\n",
      "Map:  90%|████████▉ | 33000/36718 [00:05<00:00, 5809.50 examples/s]\n",
      "Map:  90%|████████▉ | 33000/36718 [00:05<00:00, 5593.37 examples/s]\n",
      "Map:  93%|█████████▎| 34000/36718 [00:05<00:00, 5663.20 examples/s]\n",
      "Map:  93%|█████████▎| 34000/36718 [00:05<00:00, 5553.75 examples/s]\n",
      "Map:  95%|█████████▌| 35000/36718 [00:06<00:00, 5627.72 examples/s]\n",
      "Map:  95%|█████████▌| 35000/36718 [00:06<00:00, 5561.34 examples/s]\n",
      "Map:  98%|█████████▊| 36000/36718 [00:06<00:00, 5858.12 examples/s]\n",
      "Map:  98%|█████████▊| 36000/36718 [00:06<00:00, 5476.50 examples/s]\n",
      "Map:   0%|          | 0/3760 [00:00<?, ? examples/s]               \n",
      "Map:   0%|          | 0/3760 [00:00<?, ? examples/s]               \n",
      "Map:  27%|██▋       | 1000/3760 [00:00<00:00, 7401.40 examples/s]\n",
      "Map:  27%|██▋       | 1000/3760 [00:00<00:00, 6050.25 examples/s]\n",
      "Map:  53%|█████▎    | 2000/3760 [00:00<00:00, 5449.78 examples/s]\n",
      "Map:  53%|█████▎    | 2000/3760 [00:00<00:00, 5901.37 examples/s]\n",
      "Map:  80%|███████▉  | 3000/3760 [00:00<00:00, 5377.33 examples/s]\n",
      "Map:  80%|███████▉  | 3000/3760 [00:00<00:00, 5536.29 examples/s]\n",
      "Map: 100%|██████████| 3760/3760 [00:00<00:00, 5687.10 examples/s]\n",
      "                                                                 \n",
      "Map:   0%|          | 0/4358 [00:00<?, ? examples/s]m \n",
      "Map:   0%|          | 0/4358 [00:00<?, ? examples/s]             \n",
      "Map:  23%|██▎       | 1000/4358 [00:00<00:01, 3337.16 examples/s]\n",
      "Map:  23%|██▎       | 1000/4358 [00:00<00:00, 3431.18 examples/s]\n",
      "Map:  46%|████▌     | 2000/4358 [00:00<00:00, 3682.05 examples/s]\n",
      "Map:  46%|████▌     | 2000/4358 [00:00<00:00, 3688.32 examples/s]\n",
      "Map:  69%|██████▉   | 3000/4358 [00:00<00:00, 3711.49 examples/s]\n",
      "Map:  69%|██████▉   | 3000/4358 [00:00<00:00, 3697.55 examples/s]\n",
      "Map:  92%|█████████▏| 4000/4358 [00:01<00:00, 4009.48 examples/s]\n",
      "Map:   0%|          | 0/36718 [00:00<?, ? examples/s]            \n",
      "                                                                 \n",
      "Map:   0%|          | 0/36718 [00:00<?, ? examples/s] \n",
      "Map:   3%|▎         | 1000/36718 [00:00<00:07, 4517.93 examples/s]\n",
      "Map:   3%|▎         | 1000/36718 [00:00<00:07, 4528.69 examples/s]\n",
      "Map:   5%|▌         | 2000/36718 [00:00<00:08, 3939.69 examples/s]\n",
      "Map:   5%|▌         | 2000/36718 [00:00<00:08, 3939.65 examples/s]\n",
      "Map:   8%|▊         | 3000/36718 [00:00<00:08, 3981.73 examples/s]\n",
      "Map:   8%|▊         | 3000/36718 [00:00<00:08, 3950.54 examples/s]\n",
      "Map:  11%|█         | 4000/36718 [00:00<00:08, 3965.95 examples/s]\n",
      "Map:  11%|█         | 4000/36718 [00:01<00:08, 3894.84 examples/s]\n",
      "Map:  14%|█▎        | 5000/36718 [00:01<00:07, 4265.58 examples/s]\n",
      "Map:  14%|█▎        | 5000/36718 [00:01<00:07, 4229.76 examples/s]\n",
      "Map:  16%|█▋        | 6000/36718 [00:01<00:07, 4336.29 examples/s]\n",
      "Map:  16%|█▋        | 6000/36718 [00:01<00:08, 3706.95 examples/s]\n",
      "Map:  19%|█▉        | 7000/36718 [00:01<00:07, 3942.15 examples/s]\n",
      "Map:  19%|█▉        | 7000/36718 [00:01<00:08, 3579.12 examples/s]\n",
      "Map:  22%|██▏       | 8000/36718 [00:01<00:07, 3959.13 examples/s]\n",
      "Map:  22%|██▏       | 8000/36718 [00:02<00:07, 3747.00 examples/s]\n",
      "Map:  25%|██▍       | 9000/36718 [00:02<00:06, 4004.27 examples/s]\n",
      "Map:  25%|██▍       | 9000/36718 [00:02<00:07, 3851.95 examples/s]\n",
      "Map:  27%|██▋       | 10000/36718 [00:02<00:06, 3942.89 examples/s]\n",
      "Map:  27%|██▋       | 10000/36718 [00:02<00:06, 3913.87 examples/s]\n",
      "Map:  30%|██▉       | 11000/36718 [00:02<00:06, 3946.42 examples/s]\n",
      "Map:  30%|██▉       | 11000/36718 [00:02<00:06, 3934.97 examples/s]\n",
      "Map:  33%|███▎      | 12000/36718 [00:02<00:06, 3930.83 examples/s]\n",
      "Map:  33%|███▎      | 12000/36718 [00:03<00:06, 3950.59 examples/s]\n",
      "Map:  35%|███▌      | 13000/36718 [00:03<00:06, 3902.39 examples/s]\n",
      "Map:  35%|███▌      | 13000/36718 [00:03<00:06, 3858.84 examples/s]\n",
      "Map:  38%|███▊      | 14000/36718 [00:03<00:05, 3961.49 examples/s]\n",
      "Map:  38%|███▊      | 14000/36718 [00:03<00:05, 3986.22 examples/s]\n",
      "Map:  41%|████      | 15000/36718 [00:03<00:05, 4013.56 examples/s]\n",
      "Map:  41%|████      | 15000/36718 [00:03<00:05, 4056.53 examples/s]\n",
      "Map:  44%|████▎     | 16000/36718 [00:04<00:05, 3911.27 examples/s]\n",
      "Map:  44%|████▎     | 16000/36718 [00:04<00:05, 3978.57 examples/s]\n",
      "Map:  46%|████▋     | 17000/36718 [00:04<00:04, 3958.99 examples/s]\n",
      "Map:  46%|████▋     | 17000/36718 [00:04<00:05, 3907.55 examples/s]\n",
      "Map:  49%|████▉     | 18000/36718 [00:04<00:04, 3993.79 examples/s]\n",
      "Map:  49%|████▉     | 18000/36718 [00:04<00:04, 4015.82 examples/s]\n",
      "Map:  52%|█████▏    | 19000/36718 [00:04<00:04, 4025.61 examples/s]\n",
      "Map:  52%|█████▏    | 19000/36718 [00:04<00:04, 4035.79 examples/s]\n",
      "Map:  54%|█████▍    | 20000/36718 [00:05<00:04, 4157.83 examples/s]\n",
      "Map:  54%|█████▍    | 20000/36718 [00:04<00:04, 4109.83 examples/s]\n",
      "Map:  57%|█████▋    | 21000/36718 [00:05<00:03, 3992.81 examples/s]\n",
      "Map:  57%|█████▋    | 21000/36718 [00:05<00:03, 4035.90 examples/s]\n",
      "Map:  60%|█████▉    | 22000/36718 [00:05<00:03, 3883.74 examples/s]\n",
      "Map:  60%|█████▉    | 22000/36718 [00:05<00:03, 3879.64 examples/s]\n",
      "Map:  63%|██████▎   | 23000/36718 [00:05<00:03, 3976.90 examples/s]\n",
      "Map:  63%|██████▎   | 23000/36718 [00:05<00:03, 3995.93 examples/s]\n",
      "Map:  65%|██████▌   | 24000/36718 [00:06<00:03, 4116.36 examples/s]\n",
      "Map:  65%|██████▌   | 24000/36718 [00:05<00:03, 4087.75 examples/s]\n",
      "Map:  68%|██████▊   | 25000/36718 [00:06<00:02, 4260.91 examples/s]\n",
      "Map:  68%|██████▊   | 25000/36718 [00:06<00:02, 4267.89 examples/s]\n",
      "Map:  71%|███████   | 26000/36718 [00:06<00:02, 4293.16 examples/s]\n",
      "Map:  71%|███████   | 26000/36718 [00:06<00:02, 4305.15 examples/s]\n",
      "Map:  74%|███████▎  | 27000/36718 [00:06<00:02, 4096.98 examples/s]\n",
      "Map:  74%|███████▎  | 27000/36718 [00:06<00:02, 4051.45 examples/s]\n",
      "Map:  76%|███████▋  | 28000/36718 [00:07<00:02, 3932.01 examples/s]\n",
      "Map:  76%|███████▋  | 28000/36718 [00:06<00:02, 3867.04 examples/s]\n",
      "Map:  79%|███████▉  | 29000/36718 [00:07<00:01, 4190.23 examples/s]\n",
      "Map:  79%|███████▉  | 29000/36718 [00:07<00:01, 4118.42 examples/s]\n",
      "Map:  82%|████████▏ | 30000/36718 [00:07<00:01, 4076.24 examples/s]\n",
      "Map:  82%|████████▏ | 30000/36718 [00:07<00:01, 4088.15 examples/s]\n",
      "Map:  84%|████████▍ | 31000/36718 [00:07<00:01, 3858.97 examples/s]\n",
      "Map:  84%|████████▍ | 31000/36718 [00:07<00:01, 3817.97 examples/s]\n",
      "Map:  87%|████████▋ | 32000/36718 [00:08<00:01, 3880.24 examples/s]\n",
      "Map:  87%|████████▋ | 32000/36718 [00:08<00:01, 3844.86 examples/s]\n",
      "Map:  90%|████████▉ | 33000/36718 [00:08<00:00, 3793.07 examples/s]\n",
      "Map:  90%|████████▉ | 33000/36718 [00:08<00:00, 3848.69 examples/s]\n",
      "Map:  93%|█████████▎| 34000/36718 [00:08<00:00, 3767.57 examples/s]\n",
      "Map:  93%|█████████▎| 34000/36718 [00:08<00:00, 3771.67 examples/s]\n",
      "Map:  95%|█████████▌| 35000/36718 [00:08<00:00, 3855.77 examples/s]\n",
      "Map:  95%|█████████▌| 35000/36718 [00:08<00:00, 3906.56 examples/s]\n",
      "Map:  98%|█████████▊| 36000/36718 [00:09<00:00, 3837.94 examples/s]\n",
      "Map:  98%|█████████▊| 36000/36718 [00:09<00:00, 3783.04 examples/s]\n",
      "Map: 100%|██████████| 36718/36718 [00:09<00:00, 4072.96 examples/s]\n",
      "Map: 100%|██████████| 36718/36718 [00:09<00:00, 4131.43 examples/s]\n",
      "Map:   0%|          | 0/3760 [00:00<?, ? examples/s]               \n",
      "Map:   0%|          | 0/3760 [00:00<?, ? examples/s]               \n",
      "Map:  27%|██▋       | 1000/3760 [00:00<00:00, 4595.91 examples/s]\n",
      "Map:  27%|██▋       | 1000/3760 [00:00<00:00, 4562.18 examples/s]\n",
      "Map:  53%|█████▎    | 2000/3760 [00:00<00:00, 3949.92 examples/s]\n",
      "Map:  53%|█████▎    | 2000/3760 [00:00<00:00, 3988.31 examples/s]\n",
      "Map:  80%|███████▉  | 3000/3760 [00:00<00:00, 3822.24 examples/s]\n",
      "Map:  80%|███████▉  | 3000/3760 [00:00<00:00, 3733.57 examples/s]\n",
      "                                                                 \n",
      "                                                                 \n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=248, ip=10.85.55.90)\u001b[0m 2023-05-06 07:50:57,288\tWARNING datastream.py:251 -- \u001b[33m[IMPORTANT]: Ray Data strict mode is on by default in Ray 2.5. When in strict mode, data schemas are required, standalone Python objects are no longer supported, and the default batch format changes to `numpy` from `pandas`. To disable strict mode temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes. Strict mode will not be possible to disable in future releases.\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=248, ip=10.85.55.90)\u001b[0m \n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=248, ip=10.85.55.90)\u001b[0m Learn more here: https://docs.ray.io/en/master/data/faq.html#what-is-strict-mode\u001b[0m\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=271, ip=10.85.48.134)\u001b[0m 2023-05-06 07:50:58,818\tINFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[RandomizeBlockOrder]\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=271, ip=10.85.48.134)\u001b[0m 2023-05-06 07:50:58,819\tINFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=271, ip=10.85.48.134)\u001b[0m 2023-05-06 07:50:58,819\tINFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=271, ip=10.85.48.134)\u001b[0m 2023-05-06 07:50:58,830\tINFO streaming_executor.py:149 -- Shutting down <StreamingExecutor(Thread-4, stopped daemon 140162344408832)>.\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=248, ip=10.85.55.90)\u001b[0m 2023-05-06 07:50:58,819\tINFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[RandomizeBlockOrder]\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=248, ip=10.85.55.90)\u001b[0m 2023-05-06 07:50:58,819\tINFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=248, ip=10.85.55.90)\u001b[0m 2023-05-06 07:50:58,819\tINFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=248, ip=10.85.55.90)\u001b[0m 2023-05-06 07:50:58,830\tINFO streaming_executor.py:149 -- Shutting down <StreamingExecutor(Thread-4, stopped daemon 139878494893824)>.\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=247, ip=10.85.55.90)\u001b[0m 2023-05-06 07:50:58,742\tWARNING datastream.py:251 -- \u001b[33m[IMPORTANT]: Ray Data strict mode is on by default in Ray 2.5. When in strict mode, data schemas are required, standalone Python objects are no longer supported, and the default batch format changes to `numpy` from `pandas`. To disable strict mode temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes. Strict mode will not be possible to disable in future releases.\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=247, ip=10.85.55.90)\u001b[0m \n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=247, ip=10.85.55.90)\u001b[0m Learn more here: https://docs.ray.io/en/master/data/faq.html#what-is-strict-mode\u001b[0m\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=247, ip=10.85.55.90)\u001b[0m 2023-05-06 07:50:58,818\tINFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[RandomizeBlockOrder]\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=247, ip=10.85.55.90)\u001b[0m 2023-05-06 07:50:58,818\tINFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=247, ip=10.85.55.90)\u001b[0m 2023-05-06 07:50:58,819\tINFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=247, ip=10.85.55.90)\u001b[0m 2023-05-06 07:50:58,830\tINFO streaming_executor.py:149 -- Shutting down <StreamingExecutor(Thread-4, stopped daemon 140514367334144)>.\n",
      "Downloading (…)lve/main/config.json: 100%|██████████| 665/665 [00:00<00:00, 110kB/s]\n",
      "Downloading (…)lve/main/config.json: 100%|██████████| 665/665 [00:00<00:00, 119kB/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[2m\u001b[36m(RayTrainWorker pid=248, ip=10.85.55.90)\u001b[0m huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=248, ip=10.85.55.90)\u001b[0m To disable this warning, you can either:\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=248, ip=10.85.55.90)\u001b[0m \t- Avoid using `tokenizers` before the fork if possible\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=248, ip=10.85.55.90)\u001b[0m \t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=248, ip=10.85.55.90)\u001b[0m huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=248, ip=10.85.55.90)\u001b[0m To disable this warning, you can either:\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=248, ip=10.85.55.90)\u001b[0m \t- Avoid using `tokenizers` before the fork if possible\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=248, ip=10.85.55.90)\u001b[0m \t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=248, ip=10.85.55.90)\u001b[0m huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=248, ip=10.85.55.90)\u001b[0m To disable this warning, you can either:\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=248, ip=10.85.55.90)\u001b[0m \t- Avoid using `tokenizers` before the fork if possible\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=248, ip=10.85.55.90)\u001b[0m \t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[2m\u001b[36m(RayTrainWorker pid=271, ip=10.85.48.134)\u001b[0m /home/ray/anaconda3/lib/python3.8/site-packages/transformers/optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=271, ip=10.85.48.134)\u001b[0m   warnings.warn(\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=248, ip=10.85.55.90)\u001b[0m /home/ray/anaconda3/lib/python3.8/site-packages/transformers/optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=248, ip=10.85.55.90)\u001b[0m   warnings.warn(\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m 2023-05-06 07:51:02,084\tERROR tune_controller.py:857 -- Trial task failed\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m Traceback (most recent call last):\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/air/execution/_internal/event_manager.py\", line 110, in resolve_future\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m     result = ray.get(future)\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/auto_init_hook.py\", line 18, in auto_init_wrapper\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m     return fn(*args, **kwargs)\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/client_mode_hook.py\", line 99, in wrapper\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m     return func(*args, **kwargs)\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/worker.py\", line 2534, in get\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m     raise value.as_instanceof_cause()\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m ray.exceptions.RayTaskError(TypeError): \u001b[36mray::_Inner.train()\u001b[39m (pid=176, ip=10.85.48.134, actor_id=2e9fb4aae1cf2e5e746925b301000000, repr=HuggingFaceTrainer)\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py\", line 388, in train\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m     raise skipped from exception_cause(skipped)\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/utils.py\", line 54, in check_for_failure\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m     ray.get(object_ref)\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m ray.exceptions.RayTaskError(TypeError): \u001b[36mray::_RayTrainWorker__execute.get_next()\u001b[39m (pid=248, ip=10.85.55.90, actor_id=0ec6cf9eaa07e60a0fb54a2f01000000, repr=<ray.train._internal.worker_group.RayTrainWorker object at 0x7f38f978b400>)\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/worker_group.py\", line 32, in __execute\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m     raise skipped from exception_cause(skipped)\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/utils.py\", line 129, in discard_return_wrapper\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m     train_func(*args, **kwargs)\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/huggingface/huggingface_trainer.py\", line 470, in _huggingface_train_loop_per_worker\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m     trainer.train()\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/transformers/trainer.py\", line 1662, in train\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m     return inner_training_loop(\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/transformers/trainer.py\", line 1740, in _inner_training_loop\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m     self.create_optimizer_and_scheduler(num_training_steps=max_steps)\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/transformers/trainer.py\", line 1053, in create_optimizer_and_scheduler\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m     self.create_optimizer()\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/transformers/trainer.py\", line 1097, in create_optimizer\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m     self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/transformers/optimization.py\", line 398, in __init__\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m     if lr < 0.0:\n",
      "\u001b[2m\u001b[36m(TunerInternal pid=754)\u001b[0m TypeError: '<' not supported between instances of 'NoneType' and 'float'\n",
      "\u001b[2m\u001b[36m(TrainTrainable pid=577, ip=10.85.48.134)\u001b[0m Found cached dataset wikitext (/home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)\n",
      "100%|██████████| 3/3 [00:00<00:00, 1065.17it/s]34)\u001b[0m \n",
      "\u001b[2m\u001b[36m(TrainTrainable pid=577, ip=10.85.48.134)\u001b[0m Loading cached processed dataset at /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-5c8dd88d71bb2d99.arrow\n",
      "\u001b[2m\u001b[36m(TrainTrainable pid=577, ip=10.85.48.134)\u001b[0m Loading cached processed dataset at /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-74437d7c87fbab78.arrow\n",
      "\u001b[2m\u001b[36m(TrainTrainable pid=577, ip=10.85.48.134)\u001b[0m Loading cached processed dataset at /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-4bde759af7353b26.arrow\n",
      "\u001b[2m\u001b[36m(TrainTrainable pid=577, ip=10.85.48.134)\u001b[0m Loading cached processed dataset at /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-5e93c765ae22bb1c.arrow\n",
      "\u001b[2m\u001b[36m(TrainTrainable pid=577, ip=10.85.48.134)\u001b[0m Loading cached processed dataset at /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-ac91e1da5e5aa5f6.arrow\n",
      "\u001b[2m\u001b[36m(TrainTrainable pid=577, ip=10.85.48.134)\u001b[0m Loading cached processed dataset at /home/ray/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-a1cf333f1d657df0.arrow\n",
      "\u001b[2m\u001b[36m(TrainTrainable pid=577, ip=10.85.48.134)\u001b[0m 2023-05-06 07:51:09,009\tWARNING datastream.py:251 -- \u001b[33m[IMPORTANT]: Ray Data strict mode is on by default in Ray 2.5. When in strict mode, data schemas are required, standalone Python objects are no longer supported, and the default batch format changes to `numpy` from `pandas`. To disable strict mode temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes. Strict mode will not be possible to disable in future releases.\n",
      "\u001b[2m\u001b[36m(TrainTrainable pid=577, ip=10.85.48.134)\u001b[0m \n",
      "\u001b[2m\u001b[36m(TrainTrainable pid=577, ip=10.85.48.134)\u001b[0m Learn more here: https://docs.ray.io/en/master/data/faq.html#what-is-strict-mode\u001b[0m\n",
      "\u001b[2m\u001b[36m(HuggingFaceTrainer pid=577, ip=10.85.48.134)\u001b[0m 2023-05-06 07:51:10,583\tINFO backend_executor.py:128 -- Starting distributed worker processes: ['669 (10.85.48.134)', '719 (10.85.55.90)', '720 (10.85.55.90)']\n",
      "\u001b[2m\u001b[36m(RayTrainWorker pid=669, ip=10.85.48.134)\u001b[0m 2023-05-06 07:51:11,854\tINFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=3]\n",
      "\u001b[2m\u001b[36m(HuggingFaceTrainer pid=577, ip=10.85.48.134)\u001b[0m 2023-05-06 07:51:11,979\tINFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[RandomizeBlockOrder]\n",
      "\u001b[2m\u001b[36m(HuggingFaceTrainer pid=577, ip=10.85.48.134)\u001b[0m 2023-05-06 07:51:11,979\tINFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n",
      "\u001b[2m\u001b[36m(HuggingFaceTrainer pid=577, ip=10.85.48.134)\u001b[0m 2023-05-06 07:51:11,980\tINFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n",
      "\u001b[2m\u001b[36m(HuggingFaceTrainer pid=577, ip=10.85.48.134)\u001b[0m 2023-05-06 07:51:11,991\tINFO streaming_executor.py:149 -- Shutting down <StreamingExecutor(Thread-5, stopped daemon 139844100441856)>.\n"
     ]
    }
   ],
   "source": [
    "tuner.get_results()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f06f6985-0b96-42db-aac5-956accf34ede",
   "metadata": {},
   "outputs": [],
   "source": [
    "job_id = client.submit_job(\n",
    "    entrypoint=(\n",
    "        'cat /home/ray/ray_results/experiment-1/'\n",
    "        'HuggingFaceTrainer_45442_00000_0_2023-05-06_07-49-58/error.txt'\n",
    "    ),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "3af2d4f1-c60f-489b-b9d4-fcb1e919cd4a",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failure # 1 (occurred at 2023-05-06_07-51-02)\n",
      "\u001b[36mray::_Inner.train()\u001b[39m (pid=176, ip=10.85.48.134, actor_id=2e9fb4aae1cf2e5e746925b301000000, repr=HuggingFaceTrainer)\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py\", line 388, in train\n",
      "    raise skipped from exception_cause(skipped)\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/utils.py\", line 54, in check_for_failure\n",
      "    ray.get(object_ref)\n",
      "ray.exceptions.RayTaskError(TypeError): \u001b[36mray::_RayTrainWorker__execute.get_next()\u001b[39m (pid=248, ip=10.85.55.90, actor_id=0ec6cf9eaa07e60a0fb54a2f01000000, repr=<ray.train._internal.worker_group.RayTrainWorker object at 0x7f38f978b400>)\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/worker_group.py\", line 32, in __execute\n",
      "    raise skipped from exception_cause(skipped)\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/utils.py\", line 129, in discard_return_wrapper\n",
      "    train_func(*args, **kwargs)\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/huggingface/huggingface_trainer.py\", line 470, in _huggingface_train_loop_per_worker\n",
      "    trainer.train()\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/transformers/trainer.py\", line 1662, in train\n",
      "    return inner_training_loop(\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/transformers/trainer.py\", line 1740, in _inner_training_loop\n",
      "    self.create_optimizer_and_scheduler(num_training_steps=max_steps)\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/transformers/trainer.py\", line 1053, in create_optimizer_and_scheduler\n",
      "    self.create_optimizer()\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/transformers/trainer.py\", line 1097, in create_optimizer\n",
      "    self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/transformers/optimization.py\", line 398, in __init__\n",
      "    if lr < 0.0:\n",
      "TypeError: '<' not supported between instances of 'NoneType' and 'float'\n",
      "Failure # 2 (occurred at 2023-05-06_07-51-18)\n",
      "\u001b[36mray::_Inner.train()\u001b[39m (pid=577, ip=10.85.48.134, actor_id=16f6b060614ce9e7b17514e701000000, repr=HuggingFaceTrainer)\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py\", line 388, in train\n",
      "    raise skipped from exception_cause(skipped)\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/utils.py\", line 54, in check_for_failure\n",
      "    ray.get(object_ref)\n",
      "ray.exceptions.RayTaskError(TypeError): \u001b[36mray::_RayTrainWorker__execute.get_next()\u001b[39m (pid=669, ip=10.85.48.134, actor_id=5cc6209abac8f04077bed79b01000000, repr=<ray.train._internal.worker_group.RayTrainWorker object at 0x7fc71b2fd460>)\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/worker_group.py\", line 32, in __execute\n",
      "    raise skipped from exception_cause(skipped)\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/utils.py\", line 129, in discard_return_wrapper\n",
      "    train_func(*args, **kwargs)\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/huggingface/huggingface_trainer.py\", line 470, in _huggingface_train_loop_per_worker\n",
      "    trainer.train()\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/transformers/trainer.py\", line 1662, in train\n",
      "    return inner_training_loop(\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/transformers/trainer.py\", line 1740, in _inner_training_loop\n",
      "    self.create_optimizer_and_scheduler(num_training_steps=max_steps)\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/transformers/trainer.py\", line 1053, in create_optimizer_and_scheduler\n",
      "    self.create_optimizer()\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/transformers/trainer.py\", line 1097, in create_optimizer\n",
      "    self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)\n",
      "  File \"/home/ray/anaconda3/lib/python3.8/site-packages/transformers/optimization.py\", line 398, in __init__\n",
      "    if lr < 0.0:\n",
      "TypeError: '<' not supported between instances of 'NoneType' and 'float'\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(client.get_job_logs(job_id))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4301c6d0-cc76-42a8-8edd-6db1677aeccf",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
	from datasets import load_dataset
	import transformers
	from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

	import ray
	from ray import tune, air
	from ray.train.huggingface import HuggingFaceTrainer
	from ray.air.config import ScalingConfig
	import os

	# If using GPUs, set this to True.
	# The flag is passed to ScalingConfig(use_gpu=...) and, negated, to the
	# no_cuda training argument.
	use_gpu = False

	# Checkpoint names used for the model configuration and the tokenizer.
	model_checkpoint = "gpt2"
	tokenizer_checkpoint = "sgugger/gpt2-like-tokenizer"
	# Length (in tokens) of each chunk produced by group_texts.
	block_size = 128


	# Load the "wikitext-2-raw-v1" configuration of the "wikitext" dataset
	# and the tokenizer; both run as soon as this module is executed.
	datasets = load_dataset("wikitext", "wikitext-2-raw-v1")
	tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

	def tokenize_function(examples):
	    """Run the module-level tokenizer over the ``"text"`` column of a batch."""
	    raw_texts = examples["text"]
	    encoded = tokenizer(raw_texts)
	    return encoded

	# Tokenize the loaded dataset in batches; the raw "text" column is
	# removed from the result.
	tokenized_datasets = datasets.map(
	    tokenize_function,
	    batched=True,
	    num_proc=1,
	    remove_columns=["text"],
	)


	def group_texts(examples):
	    """Concatenate a batch of tokenized rows and re-split it into blocks.

	    Every column of ``examples`` (a mapping of column name to a list of
	    token lists) is flattened into one long list and then cut into
	    consecutive chunks of ``block_size`` items. The trailing remainder
	    shorter than ``block_size`` is dropped.

	    Args:
	        examples: Batch mapping each column name to a list of lists.
	            Must contain an ``"input_ids"`` column.

	    Returns:
	        A dict with the same columns, each holding the list of chunks,
	        plus a ``"labels"`` column that is a shallow copy of the
	        ``"input_ids"`` chunk list.
	    """
	    # Function-scope import so the block does not depend on a new
	    # file-level import.
	    from itertools import chain

	    # Concatenate all texts. chain.from_iterable flattens in linear
	    # time, whereas sum(rows, []) re-copies the accumulator for every
	    # row and is quadratic in the batch size.
	    concatenated_examples = {
	        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
	    }
	    total_length = len(concatenated_examples[list(examples.keys())[0]])
	    # We drop the small remainder, we could add padding if the model
	    # supported it.
	    # Instead of this drop, you can customize this part to your needs.
	    total_length = (total_length // block_size) * block_size
	    # Split by chunks of block_size.
	    result = {
	        k: [
	            t[i : i + block_size]
	            for i in range(0, total_length, block_size)
	        ]
	        for k, t in concatenated_examples.items()
	    }
	    result["labels"] = result["input_ids"].copy()
	    return result

	# Regroup the tokenized rows into fixed-size blocks, handing
	# group_texts 1000 rows per batch.
	lm_datasets = tokenized_datasets.map(
	    group_texts, batched=True, batch_size=1000, num_proc=1
	)

	# Convert the "train" and "validation" splits into Ray datasets.
	ray_train_ds = ray.data.from_huggingface(lm_datasets["train"])
	ray_evaluation_ds = ray.data.from_huggingface(lm_datasets["validation"])

	def trainer_init_per_worker(train_dataset, eval_dataset, **config):
	    """Build the ``transformers.Trainer`` for one training worker.

	    Args:
	        train_dataset: Training dataset forwarded to ``Trainer``.
	        eval_dataset: Evaluation dataset forwarded to ``Trainer``.
	        **config: Optional hyperparameters. Only ``learning_rate`` and
	            ``weight_decay`` are read; a missing or ``None`` value
	            leaves the corresponding ``TrainingArguments`` default in
	            place.

	    Returns:
	        A ``transformers.Trainer`` wrapping a causal LM instantiated
	        from the ``model_checkpoint`` configuration via ``from_config``.
	    """
	    model_config = AutoConfig.from_pretrained(model_checkpoint)
	    model = AutoModelForCausalLM.from_config(model_config)

	    # Forward only the hyperparameters that were actually supplied.
	    # Passing learning_rate=None straight through makes optimizer
	    # creation fail with "'<' not supported between instances of
	    # 'NoneType' and 'float'"; omitting the keyword keeps the
	    # TrainingArguments default instead.
	    tuned_hyperparams = {
	        key: config[key]
	        for key in ("learning_rate", "weight_decay")
	        if config.get(key) is not None
	    }

	    args = transformers.TrainingArguments(
	        output_dir=f"/tmp/{model_checkpoint}-wikitext2",

	        # evaluation_strategy="epoch",
	        # save_strategy="epoch",
	        # logging_strategy="epoch",

	        save_steps=2,
	        logging_steps=2,
	        metric_for_best_model='loss',
	        save_total_limit=1,

	        # NOTE(review): per the transformers docs a positive max_steps
	        # takes precedence over num_train_epochs -- confirm intent.
	        max_steps=30,
	        num_train_epochs=3,
	        no_cuda=(not use_gpu),
	        **tuned_hyperparams,
	    )
	    return transformers.Trainer(
	        model=model,
	        args=args,
	        train_dataset=train_dataset,
	        eval_dataset=eval_dataset,
	    )

	# Three training workers; GPU usage follows the module-level use_gpu.
	scaling_config = ScalingConfig(use_gpu=use_gpu, num_workers=3)

	# Ray AIR trainer that calls trainer_init_per_worker on each worker.
	trainer = HuggingFaceTrainer(
	    datasets={"train": ray_train_ds, "evaluation": ray_evaluation_ds},
	    scaling_config=scaling_config,
	    trainer_init_per_worker=trainer_init_per_worker,
	)


	if __name__ == '__main__':
	    # All three variables are required; a missing one raises KeyError.
	    S3_BUCKET = os.environ['S3_BUCKET']
	    upload_dir = os.environ['UPLOAD_DIR']
	    name = os.environ['EXPERIMENT_NAME']

	    # Grid over weight decay x learning rate, handed to
	    # trainer_init_per_worker through 'trainer_init_config'.
	    param_space = {
	        'trainer_init_config': {
	            'weight_decay': tune.grid_search([0.01, 0.02]),
	            'learning_rate': tune.grid_search([2e-5, 2e-4]),
	        },
	    }
	    tune_config = tune.TuneConfig(num_samples=1, max_concurrent_trials=20)

	    # Results go to a local directory and are synced to upload_dir.
	    # The two lowest-'loss' checkpoints are kept, and max_failures is 1.
	    sync_config = tune.SyncConfig(upload_dir=upload_dir)
	    checkpoint_config = air.CheckpointConfig(
	        num_to_keep=2,
	        checkpoint_score_attribute='loss',
	        checkpoint_score_order='min',
	    )
	    failure_config = air.FailureConfig(max_failures=1)
	    run_config = air.RunConfig(
	        name=name,
	        local_dir='/tmp/experiment_dir',
	        sync_config=sync_config,
	        checkpoint_config=checkpoint_config,
	        failure_config=failure_config,
	    )

	    tuner = tune.Tuner(
	        trainer,
	        param_space=param_space,
	        tune_config=tune_config,
	        run_config=run_config,
	    )

	    # Run the sweep and print the config of the lowest-loss trial.
	    results = tuner.fit()
	    best_result = results.get_best_result(metric="loss", mode="min")
	    print(best_result.config)