@davidefiocco
Last active November 5, 2020 17:57
CoLAfinetuning.ipynb
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "CoLAfinetuning.ipynb",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyMA6YmhqIGGbIUZST+EiLWu",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/davidefiocco/3bbe492033b5675ab03405019a71f9ce/colafinetuning.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "EJwUFltM4JPE"
},
"source": [
"Reproduce problem in logging losses in https://discuss.huggingface.co/t/how-to-monitor-both-train-and-validation-metrics-at-the-same-step/1301 "
]
},
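{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch, assuming transformers 3.3.0, of how the same monitoring could be wired up directly with `Trainer` instead of the `run_glue.py` script; `model`, `train_dataset` and `eval_dataset` are placeholders for the objects that the script builds. With `evaluation_strategy=\"steps\"` and `eval_steps` equal to `logging_steps`, training and validation metrics should be reported at the same global steps, which is what the linked thread asks for.\n",
"\n",
"```python\n",
"from transformers import Trainer, TrainingArguments\n",
"\n",
"args = TrainingArguments(\n",
"    output_dir=\"output\",\n",
"    per_device_train_batch_size=32,\n",
"    learning_rate=2e-5,\n",
"    num_train_epochs=3.0,\n",
"    evaluation_strategy=\"steps\",  # evaluate periodically during training\n",
"    logging_steps=8,              # log the training loss every 8 steps\n",
"    eval_steps=8,                 # align evaluation with the logging steps\n",
")\n",
"\n",
"trainer = Trainer(\n",
"    model=model,                  # placeholder: the BERT model being fine-tuned\n",
"    args=args,\n",
"    train_dataset=train_dataset,  # placeholder: CoLA train split\n",
"    eval_dataset=eval_dataset,    # placeholder: CoLA dev split\n",
")\n",
"trainer.train()\n",
"```"
]
},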
{
"cell_type": "code",
"metadata": {
"id": "BxOApI-Kj3SZ",
"outputId": "a86f6127-8c47-4e14-bd35-e261ca2cbb78",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 54
}
},
"source": [
"!pip install transformers==3.3.0 --quiet "
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"\u001b[K |████████████████████████████████| 1.1MB 4.8MB/s \n",
"\u001b[K |████████████████████████████████| 3.0MB 13.9MB/s \n",
"\u001b[?25h"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "ivN47u8ltj5I",
"outputId": "f7a43c5a-442e-474a-ab43-4a1eff8d2330",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 220
}
},
"source": [
"!wget https://raw.githubusercontent.com/huggingface/transformers/master/utils/download_glue_data.py"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"--2020-10-21 07:44:49-- https://raw.githubusercontent.com/huggingface/transformers/master/utils/download_glue_data.py\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 8209 (8.0K) [text/plain]\n",
"Saving to: ‘download_glue_data.py.1’\n",
"\n",
"\rdownload_glue_data. 0%[ ] 0 --.-KB/s \rdownload_glue_data. 100%[===================>] 8.02K --.-KB/s in 0s \n",
"\n",
"2020-10-21 07:44:49 (51.5 MB/s) - ‘download_glue_data.py.1’ saved [8209/8209]\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "emgqvSkyuLbo",
"outputId": "38eaf588-34ec-4a9f-ad6d-7f4859c1e83c",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 220
}
},
"source": [
"!wget https://raw.githubusercontent.com/huggingface/transformers/master/examples/text-classification/run_glue.py"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"--2020-10-21 07:44:50-- https://raw.githubusercontent.com/huggingface/transformers/master/examples/text-classification/run_glue.py\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 9517 (9.3K) [text/plain]\n",
"Saving to: ‘run_glue.py.1’\n",
"\n",
"\rrun_glue.py.1 0%[ ] 0 --.-KB/s \rrun_glue.py.1 100%[===================>] 9.29K --.-KB/s in 0s \n",
"\n",
"2020-10-21 07:44:50 (123 MB/s) - ‘run_glue.py.1’ saved [9517/9517]\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "cu-ec-RFtoN8",
"outputId": "1d7c59f9-fc0c-4d58-c075-e62b859ce293",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 54
}
},
"source": [
"!python download_glue_data.py --tasks CoLA"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Downloading and extracting CoLA...\n",
"\tCompleted!\n"
],
"name": "stdout"
}
]
},
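{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check, assuming the download placed the data under `./glue_data/CoLA`: the CoLA training file is a headerless TSV whose second column holds the 0/1 acceptability label and whose last column holds the sentence (the column names below are just illustrative).\n",
"\n",
"```python\n",
"import pandas as pd\n",
"\n",
"# Headerless, tab-separated file; quoting=3 (QUOTE_NONE) avoids mis-parsing quotes in sentences.\n",
"cola = pd.read_csv(\"glue_data/CoLA/train.tsv\", sep=\"\\t\", header=None, quoting=3,\n",
"                   names=[\"source\", \"label\", \"annotation\", \"sentence\"])\n",
"print(cola.shape)\n",
"print(cola.head())\n",
"```"
]
},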
{
"cell_type": "code",
"metadata": {
"id": "n6xIp8YDkAt5",
"outputId": "70b30d33-589d-44cc-d28d-e065e376d264",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
}
},
"source": [
"!python run_glue.py --model_name_or_path bert-base-cased \\\n",
" --task_name CoLA \\\n",
" --do_train \\\n",
" --do_eval \\\n",
" --data_dir ./glue_data/CoLA \\\n",
" --max_seq_length 128 \\\n",
" --per_device_train_batch_size 32 \\\n",
" --learning_rate 2e-5 \\\n",
" --num_train_epochs 3.0 \\\n",
" --output_dir output \\\n",
" --evaluation_strategy steps \\\n",
" --logging_steps 8 \\\n",
" --eval_steps 4"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"2020-10-21 07:44:53.654854: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1\n",
"/usr/local/lib/python3.6/dist-packages/transformers/training_args.py:299: FutureWarning: The `evaluate_during_training` argument is deprecated in favor of `evaluation_strategy` (which has more options)\n",
" FutureWarning,\n",
"10/21/2020 07:44:55 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: False\n",
"10/21/2020 07:44:55 - INFO - __main__ - Training/evaluation parameters TrainingArguments(output_dir='output', overwrite_output_dir=False, do_train=True, do_eval=True, do_predict=False, evaluate_during_training=False, evaluation_strategy=<EvaluationStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=32, per_device_eval_batch_size=8, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, learning_rate=2e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, warmup_steps=0, logging_dir='runs/Oct21_07-44-55_aeaa2f6389f6', logging_first_step=False, logging_steps=8, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=4, dataloader_num_workers=0, past_index=-1, run_name=None, disable_tqdm=False, remove_unused_columns=True, label_names=None)\n",
"Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']\n",
"- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).\n",
"- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"10/21/2020 07:45:00 - INFO - filelock - Lock 140317385421600 acquired on ./glue_data/CoLA/cached_train_BertTokenizer_128_cola.lock\n",
"10/21/2020 07:45:00 - INFO - filelock - Lock 140317385421600 released on ./glue_data/CoLA/cached_train_BertTokenizer_128_cola.lock\n",
"10/21/2020 07:45:00 - INFO - filelock - Lock 140317385421488 acquired on ./glue_data/CoLA/cached_dev_BertTokenizer_128_cola.lock\n",
"10/21/2020 07:45:00 - INFO - filelock - Lock 140317385421488 released on ./glue_data/CoLA/cached_dev_BertTokenizer_128_cola.lock\n",
"Epoch: 0% 0/3 [00:00<?, ?it/s]\n",
"Iteration: 0% 0/268 [00:00<?, ?it/s]\u001b[A\n",
"Iteration: 0% 1/268 [00:00<01:51, 2.39it/s]\u001b[A\n",
"Iteration: 1% 2/268 [00:00<01:48, 2.45it/s]\u001b[A\n",
"Iteration: 1% 3/268 [00:01<01:46, 2.49it/s]\u001b[A\n",
"Iteration: 1% 4/268 [00:01<01:44, 2.52it/s]\u001b[A\n",
"Iteration: 2% 5/268 [00:01<01:43, 2.54it/s]\u001b[A\n",
"Iteration: 2% 6/268 [00:02<01:42, 2.56it/s]\u001b[A\n",
"Iteration: 3% 7/268 [00:02<01:41, 2.57it/s]\u001b[A{'loss': 0.6263242363929749, 'learning_rate': 1.9800995024875625e-05, 'epoch': 0.029850746268656716, 'total_flos': 21294968340480, 'step': 8}\n",
"\n",
"Iteration: 3% 8/268 [00:03<01:40, 2.58it/s]\u001b[A\n",
"Iteration: 3% 9/268 [00:03<01:40, 2.58it/s]\u001b[A\n",
"Iteration: 4% 10/268 [00:03<01:39, 2.58it/s]\u001b[A\n",
"Iteration: 4% 11/268 [00:04<01:39, 2.59it/s]\u001b[A\n",
"Iteration: 4% 12/268 [00:04<01:38, 2.59it/s]\u001b[A\n",
"Iteration: 5% 13/268 [00:05<01:38, 2.59it/s]\u001b[A\n",
"Iteration: 5% 14/268 [00:05<01:37, 2.59it/s]\u001b[A\n",
"Iteration: 6% 15/268 [00:05<01:37, 2.59it/s]\u001b[A{'loss': 0.5656715035438538, 'learning_rate': 1.9601990049751245e-05, 'epoch': 0.05970149253731343, 'total_flos': 42589936680960, 'step': 16}\n",
"\n",
"Iteration: 6% 16/268 [00:06<01:37, 2.59it/s]\u001b[A\n",
"Iteration: 6% 17/268 [00:06<01:37, 2.58it/s]\u001b[A\n",
"Iteration: 7% 18/268 [00:06<01:36, 2.59it/s]\u001b[A\n",
"Iteration: 7% 19/268 [00:07<01:36, 2.59it/s]\u001b[A\n",
"Iteration: 7% 20/268 [00:07<01:35, 2.59it/s]\u001b[A\n",
"Iteration: 8% 21/268 [00:08<01:35, 2.59it/s]\u001b[A\n",
"Iteration: 8% 22/268 [00:08<01:34, 2.59it/s]\u001b[A\n",
"Iteration: 9% 23/268 [00:08<01:34, 2.59it/s]\u001b[A{'loss': 0.5884823799133301, 'learning_rate': 1.9402985074626868e-05, 'epoch': 0.08955223880597014, 'total_flos': 63884905021440, 'step': 24}\n",
"\n",
"Iteration: 9% 24/268 [00:09<01:34, 2.58it/s]\u001b[A\n",
"Iteration: 9% 25/268 [00:09<01:33, 2.59it/s]\u001b[A\n",
"Iteration: 10% 26/268 [00:10<01:33, 2.59it/s]\u001b[A\n",
"Iteration: 10% 27/268 [00:10<01:33, 2.59it/s]\u001b[A\n",
"Iteration: 10% 28/268 [00:10<01:32, 2.59it/s]\u001b[A\n",
"Iteration: 11% 29/268 [00:11<01:32, 2.59it/s]\u001b[A\n",
"Iteration: 11% 30/268 [00:11<01:31, 2.59it/s]\u001b[A\n",
"Iteration: 12% 31/268 [00:11<01:31, 2.59it/s]\u001b[A{'loss': 0.6690729856491089, 'learning_rate': 1.9203980099502488e-05, 'epoch': 0.11940298507462686, 'total_flos': 85179873361920, 'step': 32}\n",
"\n",
"Iteration: 12% 32/268 [00:12<01:31, 2.59it/s]\u001b[A\n",
"Iteration: 12% 33/268 [00:12<01:30, 2.59it/s]\u001b[A\n",
"Iteration: 13% 34/268 [00:13<01:30, 2.59it/s]\u001b[A\n",
"Iteration: 13% 35/268 [00:13<01:29, 2.59it/s]\u001b[A\n",
"Iteration: 13% 36/268 [00:13<01:29, 2.59it/s]\u001b[A\n",
"Iteration: 14% 37/268 [00:14<01:29, 2.60it/s]\u001b[A\n",
"Iteration: 14% 38/268 [00:14<01:28, 2.59it/s]\u001b[A\n",
"Iteration: 15% 39/268 [00:15<01:28, 2.59it/s]\u001b[A{'loss': 0.5895624160766602, 'learning_rate': 1.900497512437811e-05, 'epoch': 0.14925373134328357, 'total_flos': 106474841702400, 'step': 40}\n",
"\n",
"Iteration: 15% 40/268 [00:15<01:28, 2.59it/s]\u001b[A\n",
"Iteration: 15% 41/268 [00:15<01:27, 2.59it/s]\u001b[A\n",
"Iteration: 16% 42/268 [00:16<01:27, 2.59it/s]\u001b[A\n",
"Iteration: 16% 43/268 [00:16<01:26, 2.59it/s]\u001b[A\n",
"Iteration: 16% 44/268 [00:17<01:26, 2.59it/s]\u001b[A\n",
"Iteration: 17% 45/268 [00:17<01:25, 2.59it/s]\u001b[A\n",
"Iteration: 17% 46/268 [00:17<01:25, 2.60it/s]\u001b[A\n",
"Iteration: 18% 47/268 [00:18<01:25, 2.59it/s]\u001b[A{'loss': 0.6092815399169922, 'learning_rate': 1.8805970149253735e-05, 'epoch': 0.1791044776119403, 'total_flos': 127769810042880, 'step': 48}\n",
"\n",
"Iteration: 18% 48/268 [00:18<01:24, 2.59it/s]\u001b[A\n",
"Iteration: 18% 49/268 [00:18<01:24, 2.59it/s]\u001b[A\n",
"Iteration: 19% 50/268 [00:19<01:24, 2.59it/s]\u001b[A\n",
"Iteration: 19% 51/268 [00:19<01:23, 2.59it/s]\u001b[A\n",
"Iteration: 19% 52/268 [00:20<01:23, 2.59it/s]\u001b[A\n",
"Iteration: 20% 53/268 [00:20<01:22, 2.59it/s]\u001b[A\n",
"Iteration: 20% 54/268 [00:20<01:22, 2.60it/s]\u001b[A\n",
"Iteration: 21% 55/268 [00:21<01:22, 2.60it/s]\u001b[A{'loss': 0.5546050071716309, 'learning_rate': 1.8606965174129355e-05, 'epoch': 0.208955223880597, 'total_flos': 149064778383360, 'step': 56}\n",
"\n",
"Iteration: 21% 56/268 [00:21<01:21, 2.59it/s]\u001b[A\n",
"Iteration: 21% 57/268 [00:22<01:21, 2.59it/s]\u001b[A\n",
"Iteration: 22% 58/268 [00:22<01:20, 2.59it/s]\u001b[A\n",
"Iteration: 22% 59/268 [00:22<01:20, 2.59it/s]\u001b[A\n",
"Iteration: 22% 60/268 [00:23<01:20, 2.59it/s]\u001b[A\n",
"Iteration: 23% 61/268 [00:23<01:19, 2.60it/s]\u001b[A\n",
"Iteration: 23% 62/268 [00:23<01:19, 2.59it/s]\u001b[A\n",
"Iteration: 24% 63/268 [00:24<01:19, 2.59it/s]\u001b[A{'loss': 0.5408511161804199, 'learning_rate': 1.8407960199004978e-05, 'epoch': 0.23880597014925373, 'total_flos': 170359746723840, 'step': 64}\n",
"\n",
"Iteration: 24% 64/268 [00:24<01:18, 2.59it/s]\u001b[A\n",
"Iteration: 24% 65/268 [00:25<01:18, 2.59it/s]\u001b[A\n",
"Iteration: 25% 66/268 [00:25<01:17, 2.59it/s]\u001b[A\n",
"Iteration: 25% 67/268 [00:25<01:17, 2.60it/s]\u001b[A\n",
"Iteration: 25% 68/268 [00:26<01:17, 2.59it/s]\u001b[A\n",
"Iteration: 26% 69/268 [00:26<01:16, 2.60it/s]\u001b[A\n",
"Iteration: 26% 70/268 [00:27<01:16, 2.59it/s]\u001b[A\n",
"Iteration: 26% 71/268 [00:27<01:16, 2.59it/s]\u001b[A{'loss': 0.5076775550842285, 'learning_rate': 1.8208955223880598e-05, 'epoch': 0.26865671641791045, 'total_flos': 191654715064320, 'step': 72}\n",
"\n",
"Iteration: 27% 72/268 [00:27<01:15, 2.59it/s]\u001b[A\n",
"Iteration: 27% 73/268 [00:28<01:15, 2.59it/s]\u001b[A\n",
"Iteration: 28% 74/268 [00:28<01:14, 2.59it/s]\u001b[A\n",
"Iteration: 28% 75/268 [00:28<01:14, 2.59it/s]\u001b[A\n",
"Iteration: 28% 76/268 [00:29<01:14, 2.59it/s]\u001b[A\n",
"Iteration: 29% 77/268 [00:29<01:13, 2.59it/s]\u001b[A\n",
"Iteration: 29% 78/268 [00:30<01:13, 2.59it/s]\u001b[A\n",
"Iteration: 29% 79/268 [00:30<01:12, 2.60it/s]\u001b[A{'loss': 0.508697509765625, 'learning_rate': 1.800995024875622e-05, 'epoch': 0.29850746268656714, 'total_flos': 212949683404800, 'step': 80}\n",
"\n",
"Iteration: 30% 80/268 [00:30<01:12, 2.59it/s]\u001b[A\n",
"Iteration: 30% 81/268 [00:31<01:12, 2.59it/s]\u001b[A\n",
"Iteration: 31% 82/268 [00:31<01:11, 2.59it/s]\u001b[A\n",
"Iteration: 31% 83/268 [00:32<01:11, 2.59it/s]\u001b[A\n",
"Iteration: 31% 84/268 [00:32<01:11, 2.59it/s]\u001b[A\n",
"Iteration: 32% 85/268 [00:32<01:10, 2.59it/s]\u001b[A\n",
"Iteration: 32% 86/268 [00:33<01:10, 2.59it/s]\u001b[A\n",
"Iteration: 32% 87/268 [00:33<01:09, 2.59it/s]\u001b[A{'loss': 0.4803638458251953, 'learning_rate': 1.7810945273631844e-05, 'epoch': 0.3283582089552239, 'total_flos': 234244651745280, 'step': 88}\n",
"\n",
"Iteration: 33% 88/268 [00:33<01:09, 2.59it/s]\u001b[A\n",
"Iteration: 33% 89/268 [00:34<01:09, 2.59it/s]\u001b[A\n",
"Iteration: 34% 90/268 [00:34<01:08, 2.59it/s]\u001b[A\n",
"Iteration: 34% 91/268 [00:35<01:08, 2.59it/s]\u001b[A\n",
"Iteration: 34% 92/268 [00:35<01:07, 2.59it/s]\u001b[A\n",
"Iteration: 35% 93/268 [00:35<01:07, 2.59it/s]\u001b[A\n",
"Iteration: 35% 94/268 [00:36<01:07, 2.59it/s]\u001b[A\n",
"Iteration: 35% 95/268 [00:36<01:06, 2.59it/s]\u001b[A{'loss': 0.48297643661499023, 'learning_rate': 1.7611940298507464e-05, 'epoch': 0.3582089552238806, 'total_flos': 255539620085760, 'step': 96}\n",
"\n",
"Iteration: 36% 96/268 [00:37<01:06, 2.59it/s]\u001b[A\n",
"Iteration: 36% 97/268 [00:37<01:06, 2.59it/s]\u001b[A\n",
"Iteration: 37% 98/268 [00:37<01:05, 2.59it/s]\u001b[A\n",
"Iteration: 37% 99/268 [00:38<01:05, 2.59it/s]\u001b[A\n",
"Iteration: 37% 100/268 [00:38<01:04, 2.59it/s]\u001b[A\n",
"Iteration: 38% 101/268 [00:38<01:04, 2.60it/s]\u001b[A\n",
"Iteration: 38% 102/268 [00:39<01:03, 2.59it/s]\u001b[A\n",
"Iteration: 38% 103/268 [00:39<01:03, 2.60it/s]\u001b[A{'loss': 0.5067496299743652, 'learning_rate': 1.7412935323383088e-05, 'epoch': 0.3880597014925373, 'total_flos': 276834588426240, 'step': 104}\n",
"\n",
"Iteration: 39% 104/268 [00:40<01:03, 2.59it/s]\u001b[A\n",
"Iteration: 39% 105/268 [00:40<01:02, 2.60it/s]\u001b[A\n",
"Iteration: 40% 106/268 [00:40<01:02, 2.59it/s]\u001b[A\n",
"Iteration: 40% 107/268 [00:41<01:02, 2.59it/s]\u001b[A\n",
"Iteration: 40% 108/268 [00:41<01:01, 2.59it/s]\u001b[A\n",
"Iteration: 41% 109/268 [00:42<01:01, 2.58it/s]\u001b[A\n",
"Iteration: 41% 110/268 [00:42<01:01, 2.58it/s]\u001b[A\n",
"Iteration: 41% 111/268 [00:42<01:00, 2.59it/s]\u001b[A{'loss': 0.4952225685119629, 'learning_rate': 1.7213930348258708e-05, 'epoch': 0.417910447761194, 'total_flos': 298129556766720, 'step': 112}\n",
"\n",
"Iteration: 42% 112/268 [00:43<01:00, 2.59it/s]\u001b[A\n",
"Iteration: 42% 113/268 [00:43<00:59, 2.59it/s]\u001b[A\n",
"Iteration: 43% 114/268 [00:44<00:59, 2.58it/s]\u001b[A\n",
"Iteration: 43% 115/268 [00:44<00:59, 2.59it/s]\u001b[A\n",
"Iteration: 43% 116/268 [00:44<00:58, 2.59it/s]\u001b[A\n",
"Iteration: 44% 117/268 [00:45<00:58, 2.58it/s]\u001b[A\n",
"Iteration: 44% 118/268 [00:45<00:58, 2.59it/s]\u001b[A\n",
"Iteration: 44% 119/268 [00:45<00:57, 2.59it/s]\u001b[A{'loss': 0.4740309715270996, 'learning_rate': 1.701492537313433e-05, 'epoch': 0.44776119402985076, 'total_flos': 319424525107200, 'step': 120}\n",
"\n",
"Iteration: 45% 120/268 [00:46<00:57, 2.59it/s]\u001b[A\n",
"Iteration: 45% 121/268 [00:46<00:56, 2.59it/s]\u001b[A\n",
"Iteration: 46% 122/268 [00:47<00:56, 2.59it/s]\u001b[A\n",
"Iteration: 46% 123/268 [00:47<00:55, 2.59it/s]\u001b[A\n",
"Iteration: 46% 124/268 [00:47<00:55, 2.60it/s]\u001b[A\n",
"Iteration: 47% 125/268 [00:48<00:55, 2.60it/s]\u001b[A\n",
"Iteration: 47% 126/268 [00:48<00:54, 2.60it/s]\u001b[A\n",
"Iteration: 47% 127/268 [00:49<00:54, 2.60it/s]\u001b[A{'loss': 0.4732046127319336, 'learning_rate': 1.681592039800995e-05, 'epoch': 0.47761194029850745, 'total_flos': 340719493447680, 'step': 128}\n",
"\n",
"Iteration: 48% 128/268 [00:49<00:53, 2.60it/s]\u001b[A\n",
"Iteration: 48% 129/268 [00:49<00:53, 2.60it/s]\u001b[A\n",
"Iteration: 49% 130/268 [00:50<00:53, 2.60it/s]\u001b[A\n",
"Iteration: 49% 131/268 [00:50<00:52, 2.59it/s]\u001b[A\n",
"Iteration: 49% 132/268 [00:50<00:52, 2.60it/s]\u001b[A\n",
"Iteration: 50% 133/268 [00:51<00:52, 2.59it/s]\u001b[A\n",
"Iteration: 50% 134/268 [00:51<00:51, 2.59it/s]\u001b[A\n",
"Iteration: 50% 135/268 [00:52<00:51, 2.57it/s]\u001b[A{'loss': 0.40677738189697266, 'learning_rate': 1.6616915422885574e-05, 'epoch': 0.5074626865671642, 'total_flos': 362014461788160, 'step': 136}\n",
"\n",
"Iteration: 51% 136/268 [00:52<00:51, 2.57it/s]\u001b[A\n",
"Iteration: 51% 137/268 [00:52<00:50, 2.58it/s]\u001b[A\n",
"Iteration: 51% 138/268 [00:53<00:50, 2.58it/s]\u001b[A\n",
"Iteration: 52% 139/268 [00:53<00:49, 2.59it/s]\u001b[A\n",
"Iteration: 52% 140/268 [00:54<00:49, 2.59it/s]\u001b[A\n",
"Iteration: 53% 141/268 [00:54<00:49, 2.59it/s]\u001b[A\n",
"Iteration: 53% 142/268 [00:54<00:48, 2.59it/s]\u001b[A\n",
"Iteration: 53% 143/268 [00:55<00:48, 2.59it/s]\u001b[A{'loss': 0.3689250946044922, 'learning_rate': 1.6417910447761197e-05, 'epoch': 0.5373134328358209, 'total_flos': 383309430128640, 'step': 144}\n",
"\n",
"Iteration: 54% 144/268 [00:55<00:47, 2.59it/s]\u001b[A\n",
"Iteration: 54% 145/268 [00:55<00:47, 2.59it/s]\u001b[A\n",
"Iteration: 54% 146/268 [00:56<00:47, 2.59it/s]\u001b[A\n",
"Iteration: 55% 147/268 [00:56<00:46, 2.59it/s]\u001b[A\n",
"Iteration: 55% 148/268 [00:57<00:46, 2.59it/s]\u001b[A\n",
"Iteration: 56% 149/268 [00:57<00:45, 2.59it/s]\u001b[A\n",
"Iteration: 56% 150/268 [00:57<00:45, 2.60it/s]\u001b[A\n",
"Iteration: 56% 151/268 [00:58<00:45, 2.60it/s]\u001b[A{'loss': 0.46918773651123047, 'learning_rate': 1.6218905472636817e-05, 'epoch': 0.5671641791044776, 'total_flos': 404604398469120, 'step': 152}\n",
"\n",
"Iteration: 57% 152/268 [00:58<00:44, 2.60it/s]\u001b[A\n",
"Iteration: 57% 153/268 [00:59<00:44, 2.59it/s]\u001b[A\n",
"Iteration: 57% 154/268 [00:59<00:43, 2.60it/s]\u001b[A\n",
"Iteration: 58% 155/268 [00:59<00:43, 2.60it/s]\u001b[A\n",
"Iteration: 58% 156/268 [01:00<00:43, 2.58it/s]\u001b[A\n",
"Iteration: 59% 157/268 [01:00<00:42, 2.59it/s]\u001b[A\n",
"Iteration: 59% 158/268 [01:01<00:42, 2.59it/s]\u001b[A\n",
"Iteration: 59% 159/268 [01:01<00:42, 2.59it/s]\u001b[A{'loss': 0.4825115203857422, 'learning_rate': 1.601990049751244e-05, 'epoch': 0.5970149253731343, 'total_flos': 425899366809600, 'step': 160}\n",
"\n",
"Iteration: 60% 160/268 [01:01<00:41, 2.59it/s]\u001b[A\n",
"Iteration: 60% 161/268 [01:02<00:41, 2.59it/s]\u001b[A\n",
"Iteration: 60% 162/268 [01:02<00:40, 2.59it/s]\u001b[A\n",
"Iteration: 61% 163/268 [01:02<00:40, 2.60it/s]\u001b[A\n",
"Iteration: 61% 164/268 [01:03<00:40, 2.60it/s]\u001b[A\n",
"Iteration: 62% 165/268 [01:03<00:39, 2.60it/s]\u001b[A\n",
"Iteration: 62% 166/268 [01:04<00:39, 2.60it/s]\u001b[A\n",
"Iteration: 62% 167/268 [01:04<00:38, 2.59it/s]\u001b[A{'loss': 0.5030183792114258, 'learning_rate': 1.582089552238806e-05, 'epoch': 0.6268656716417911, 'total_flos': 447194335150080, 'step': 168}\n",
"\n",
"Iteration: 63% 168/268 [01:04<00:38, 2.58it/s]\u001b[A\n",
"Iteration: 63% 169/268 [01:05<00:38, 2.59it/s]\u001b[A\n",
"Iteration: 63% 170/268 [01:05<00:37, 2.59it/s]\u001b[A\n",
"Iteration: 64% 171/268 [01:06<00:37, 2.59it/s]\u001b[A\n",
"Iteration: 64% 172/268 [01:06<00:36, 2.60it/s]\u001b[A\n",
"Iteration: 65% 173/268 [01:06<00:36, 2.59it/s]\u001b[A\n",
"Iteration: 65% 174/268 [01:07<00:36, 2.59it/s]\u001b[A\n",
"Iteration: 65% 175/268 [01:07<00:35, 2.59it/s]\u001b[A{'loss': 0.4476757049560547, 'learning_rate': 1.5621890547263684e-05, 'epoch': 0.6567164179104478, 'total_flos': 468489303490560, 'step': 176}\n",
"\n",
"Iteration: 66% 176/268 [01:07<00:35, 2.59it/s]\u001b[A\n",
"Iteration: 66% 177/268 [01:08<00:35, 2.58it/s]\u001b[A\n",
"Iteration: 66% 178/268 [01:08<00:34, 2.59it/s]\u001b[A\n",
"Iteration: 67% 179/268 [01:09<00:34, 2.59it/s]\u001b[A\n",
"Iteration: 67% 180/268 [01:09<00:33, 2.59it/s]\u001b[A\n",
"Iteration: 68% 181/268 [01:09<00:33, 2.59it/s]\u001b[A\n",
"Iteration: 68% 182/268 [01:10<00:33, 2.59it/s]\u001b[A\n",
"Iteration: 68% 183/268 [01:10<00:32, 2.60it/s]\u001b[A{'loss': 0.41637325286865234, 'learning_rate': 1.5422885572139307e-05, 'epoch': 0.6865671641791045, 'total_flos': 489784271831040, 'step': 184}\n",
"\n",
"Iteration: 69% 184/268 [01:11<00:32, 2.59it/s]\u001b[A\n",
"Iteration: 69% 185/268 [01:11<00:31, 2.60it/s]\u001b[A\n",
"Iteration: 69% 186/268 [01:11<00:31, 2.59it/s]\u001b[A\n",
"Iteration: 70% 187/268 [01:12<00:31, 2.59it/s]\u001b[A\n",
"Iteration: 70% 188/268 [01:12<00:30, 2.59it/s]\u001b[A\n",
"Iteration: 71% 189/268 [01:12<00:30, 2.59it/s]\u001b[A\n",
"Iteration: 71% 190/268 [01:13<00:30, 2.59it/s]\u001b[A\n",
"Iteration: 71% 191/268 [01:13<00:29, 2.60it/s]\u001b[A{'loss': 0.41908836364746094, 'learning_rate': 1.5223880597014925e-05, 'epoch': 0.7164179104477612, 'total_flos': 511079240171520, 'step': 192}\n",
"\n",
"Iteration: 72% 192/268 [01:14<00:29, 2.59it/s]\u001b[A\n",
"Iteration: 72% 193/268 [01:14<00:28, 2.59it/s]\u001b[A\n",
"Iteration: 72% 194/268 [01:14<00:28, 2.59it/s]\u001b[A\n",
"Iteration: 73% 195/268 [01:15<00:28, 2.60it/s]\u001b[A\n",
"Iteration: 73% 196/268 [01:15<00:27, 2.60it/s]\u001b[A\n",
"Iteration: 74% 197/268 [01:16<00:27, 2.59it/s]\u001b[A\n",
"Iteration: 74% 198/268 [01:16<00:27, 2.58it/s]\u001b[A\n",
"Iteration: 74% 199/268 [01:16<00:26, 2.58it/s]\u001b[A{'loss': 0.5190162658691406, 'learning_rate': 1.5024875621890549e-05, 'epoch': 0.746268656716418, 'total_flos': 532374208512000, 'step': 200}\n",
"\n",
"Iteration: 75% 200/268 [01:17<00:26, 2.56it/s]\u001b[A\n",
"Iteration: 75% 201/268 [01:17<00:26, 2.57it/s]\u001b[A\n",
"Iteration: 75% 202/268 [01:17<00:25, 2.58it/s]\u001b[A\n",
"Iteration: 76% 203/268 [01:18<00:25, 2.57it/s]\u001b[A\n",
"Iteration: 76% 204/268 [01:18<00:24, 2.58it/s]\u001b[A\n",
"Iteration: 76% 205/268 [01:19<00:24, 2.58it/s]\u001b[A\n",
"Iteration: 77% 206/268 [01:19<00:23, 2.58it/s]\u001b[A\n",
"Iteration: 77% 207/268 [01:19<00:23, 2.59it/s]\u001b[A{'loss': 0.4448719024658203, 'learning_rate': 1.4825870646766169e-05, 'epoch': 0.7761194029850746, 'total_flos': 553669176852480, 'step': 208}\n",
"\n",
"Iteration: 78% 208/268 [01:20<00:23, 2.59it/s]\u001b[A\n",
"Iteration: 78% 209/268 [01:20<00:22, 2.58it/s]\u001b[A\n",
"Iteration: 78% 210/268 [01:21<00:22, 2.59it/s]\u001b[A\n",
"Iteration: 79% 211/268 [01:21<00:22, 2.58it/s]\u001b[A\n",
"Iteration: 79% 212/268 [01:21<00:21, 2.58it/s]\u001b[A\n",
"Iteration: 79% 213/268 [01:22<00:21, 2.59it/s]\u001b[A\n",
"Iteration: 80% 214/268 [01:22<00:20, 2.59it/s]\u001b[A\n",
"Iteration: 80% 215/268 [01:23<00:20, 2.59it/s]\u001b[A{'loss': 0.4260139465332031, 'learning_rate': 1.4626865671641792e-05, 'epoch': 0.8059701492537313, 'total_flos': 574964145192960, 'step': 216}\n",
"\n",
"Iteration: 81% 216/268 [01:23<00:20, 2.59it/s]\u001b[A\n",
"Iteration: 81% 217/268 [01:23<00:19, 2.59it/s]\u001b[A\n",
"Iteration: 81% 218/268 [01:24<00:19, 2.59it/s]\u001b[A\n",
"Iteration: 82% 219/268 [01:24<00:18, 2.59it/s]\u001b[A\n",
"Iteration: 82% 220/268 [01:24<00:18, 2.59it/s]\u001b[A\n",
"Iteration: 82% 221/268 [01:25<00:18, 2.59it/s]\u001b[A\n",
"Iteration: 83% 222/268 [01:25<00:17, 2.59it/s]\u001b[A\n",
"Iteration: 83% 223/268 [01:26<00:17, 2.59it/s]\u001b[A{'loss': 0.4600038528442383, 'learning_rate': 1.4427860696517415e-05, 'epoch': 0.835820895522388, 'total_flos': 596259113533440, 'step': 224}\n",
"\n",
"Iteration: 84% 224/268 [01:26<00:16, 2.59it/s]\u001b[A\n",
"Iteration: 84% 225/268 [01:26<00:16, 2.59it/s]\u001b[A\n",
"Iteration: 84% 226/268 [01:27<00:16, 2.59it/s]\u001b[A\n",
"Iteration: 85% 227/268 [01:27<00:15, 2.57it/s]\u001b[A\n",
"Iteration: 85% 228/268 [01:28<00:15, 2.57it/s]\u001b[A\n",
"Iteration: 85% 229/268 [01:28<00:15, 2.58it/s]\u001b[A\n",
"Iteration: 86% 230/268 [01:28<00:14, 2.58it/s]\u001b[A\n",
"Iteration: 86% 231/268 [01:29<00:14, 2.58it/s]\u001b[A{'loss': 0.4007720947265625, 'learning_rate': 1.4228855721393035e-05, 'epoch': 0.8656716417910447, 'total_flos': 617554081873920, 'step': 232}\n",
"\n",
"Iteration: 87% 232/268 [01:29<00:13, 2.58it/s]\u001b[A\n",
"Iteration: 87% 233/268 [01:29<00:13, 2.58it/s]\u001b[A\n",
"Iteration: 87% 234/268 [01:30<00:13, 2.58it/s]\u001b[A\n",
"Iteration: 88% 235/268 [01:30<00:12, 2.58it/s]\u001b[A\n",
"Iteration: 88% 236/268 [01:31<00:12, 2.58it/s]\u001b[A\n",
"Iteration: 88% 237/268 [01:31<00:11, 2.58it/s]\u001b[A\n",
"Iteration: 89% 238/268 [01:31<00:11, 2.59it/s]\u001b[A\n",
"Iteration: 89% 239/268 [01:32<00:11, 2.59it/s]\u001b[A{'loss': 0.4433774948120117, 'learning_rate': 1.4029850746268658e-05, 'epoch': 0.8955223880597015, 'total_flos': 638849050214400, 'step': 240}\n",
"\n",
"Iteration: 90% 240/268 [01:32<00:10, 2.59it/s]\u001b[A\n",
"Iteration: 90% 241/268 [01:33<00:10, 2.59it/s]\u001b[A\n",
"Iteration: 90% 242/268 [01:33<00:10, 2.59it/s]\u001b[A\n",
"Iteration: 91% 243/268 [01:33<00:09, 2.59it/s]\u001b[A\n",
"Iteration: 91% 244/268 [01:34<00:09, 2.59it/s]\u001b[A\n",
"Iteration: 91% 245/268 [01:34<00:08, 2.58it/s]\u001b[A\n",
"Iteration: 92% 246/268 [01:35<00:08, 2.58it/s]\u001b[A\n",
"Iteration: 92% 247/268 [01:35<00:08, 2.58it/s]\u001b[A{'loss': 0.41600608825683594, 'learning_rate': 1.3830845771144278e-05, 'epoch': 0.9253731343283582, 'total_flos': 660144018554880, 'step': 248}\n",
"\n",
"Iteration: 93% 248/268 [01:35<00:07, 2.58it/s]\u001b[A\n",
"Iteration: 93% 249/268 [01:36<00:07, 2.58it/s]\u001b[A\n",
"Iteration: 93% 250/268 [01:36<00:06, 2.58it/s]\u001b[A\n",
"Iteration: 94% 251/268 [01:36<00:06, 2.58it/s]\u001b[A\n",
"Iteration: 94% 252/268 [01:37<00:06, 2.58it/s]\u001b[A\n",
"Iteration: 94% 253/268 [01:37<00:05, 2.59it/s]\u001b[A\n",
"Iteration: 95% 254/268 [01:38<00:05, 2.59it/s]\u001b[A\n",
"Iteration: 95% 255/268 [01:38<00:05, 2.59it/s]\u001b[A{'loss': 0.42204761505126953, 'learning_rate': 1.3631840796019902e-05, 'epoch': 0.9552238805970149, 'total_flos': 681438986895360, 'step': 256}\n",
"\n",
"Iteration: 96% 256/268 [01:38<00:04, 2.59it/s]\u001b[A\n",
"Iteration: 96% 257/268 [01:39<00:04, 2.59it/s]\u001b[A\n",
"Iteration: 96% 258/268 [01:39<00:03, 2.59it/s]\u001b[A\n",
"Iteration: 97% 259/268 [01:40<00:03, 2.59it/s]\u001b[A\n",
"Iteration: 97% 260/268 [01:40<00:03, 2.59it/s]\u001b[A\n",
"Iteration: 97% 261/268 [01:40<00:02, 2.59it/s]\u001b[A\n",
"Iteration: 98% 262/268 [01:41<00:02, 2.59it/s]\u001b[A\n",
"Iteration: 98% 263/268 [01:41<00:01, 2.59it/s]\u001b[A{'loss': 0.4416160583496094, 'learning_rate': 1.3432835820895525e-05, 'epoch': 0.9850746268656716, 'total_flos': 702733955235840, 'step': 264}\n",
"\n",
"Iteration: 99% 264/268 [01:41<00:01, 2.59it/s]\u001b[A\n",
"Iteration: 99% 265/268 [01:42<00:01, 2.59it/s]\u001b[A\n",
"Iteration: 99% 266/268 [01:42<00:00, 2.58it/s]\u001b[A\n",
"Iteration: 100% 267/268 [01:43<00:00, 2.58it/s]\u001b[A\n",
"Iteration: 100% 268/268 [01:43<00:00, 2.60it/s]\n",
"Epoch: 33% 1/3 [01:43<03:26, 103.25s/it]\n",
"Iteration: 0% 0/268 [00:00<?, ?it/s]\u001b[A\n",
"Iteration: 0% 1/268 [00:00<01:43, 2.58it/s]\u001b[A\n",
"Iteration: 1% 2/268 [00:00<01:42, 2.58it/s]\u001b[A\n",
"Iteration: 1% 3/268 [00:01<01:42, 2.59it/s]\u001b[A{'loss': 0.3595905303955078, 'learning_rate': 1.3233830845771145e-05, 'epoch': 1.0149253731343284, 'total_flos': 721949336824320, 'step': 272}\n",
"\n",
"Iteration: 1% 4/268 [00:01<01:41, 2.59it/s]\u001b[A\n",
"Iteration: 2% 5/268 [00:01<01:41, 2.59it/s]\u001b[A\n",
"Iteration: 2% 6/268 [00:02<01:41, 2.59it/s]\u001b[A\n",
"Iteration: 3% 7/268 [00:02<01:40, 2.59it/s]\u001b[A\n",
"Iteration: 3% 8/268 [00:03<01:40, 2.59it/s]\u001b[A\n",
"Iteration: 3% 9/268 [00:03<01:39, 2.59it/s]\u001b[A\n",
"Iteration: 4% 10/268 [00:03<01:39, 2.59it/s]\u001b[A\n",
"Iteration: 4% 11/268 [00:04<01:38, 2.60it/s]\u001b[A{'loss': 0.2794647216796875, 'learning_rate': 1.3034825870646768e-05, 'epoch': 1.044776119402985, 'total_flos': 743244305164800, 'step': 280}\n",
"\n",
"Iteration: 4% 12/268 [00:04<01:38, 2.59it/s]\u001b[A\n",
"Iteration: 5% 13/268 [00:05<01:38, 2.59it/s]\u001b[A\n",
"Iteration: 5% 14/268 [00:05<01:38, 2.59it/s]\u001b[A\n",
"Iteration: 6% 15/268 [00:05<01:37, 2.59it/s]\u001b[A\n",
"Iteration: 6% 16/268 [00:06<01:37, 2.59it/s]\u001b[A\n",
"Iteration: 6% 17/268 [00:06<01:37, 2.58it/s]\u001b[A\n",
"Iteration: 7% 18/268 [00:06<01:36, 2.58it/s]\u001b[A\n",
"Iteration: 7% 19/268 [00:07<01:36, 2.59it/s]\u001b[A{'loss': 0.30727195739746094, 'learning_rate': 1.2835820895522388e-05, 'epoch': 1.0746268656716418, 'total_flos': 764539273505280, 'step': 288}\n",
"\n",
"Iteration: 7% 20/268 [00:07<01:35, 2.58it/s]\u001b[A\n",
"Iteration: 8% 21/268 [00:08<01:35, 2.58it/s]\u001b[A\n",
"Iteration: 8% 22/268 [00:08<01:35, 2.58it/s]\u001b[A\n",
"Iteration: 9% 23/268 [00:08<01:34, 2.59it/s]\u001b[A\n",
"Iteration: 9% 24/268 [00:09<01:34, 2.59it/s]\u001b[A\n",
"Iteration: 9% 25/268 [00:09<01:34, 2.58it/s]\u001b[A\n",
"Iteration: 10% 26/268 [00:10<01:33, 2.58it/s]\u001b[A\n",
"Iteration: 10% 27/268 [00:10<01:33, 2.58it/s]\u001b[A{'loss': 0.3145313262939453, 'learning_rate': 1.2636815920398011e-05, 'epoch': 1.1044776119402986, 'total_flos': 785834241845760, 'step': 296}\n",
"\n",
"Iteration: 10% 28/268 [00:10<01:32, 2.58it/s]\u001b[A\n",
"Iteration: 11% 29/268 [00:11<01:32, 2.58it/s]\u001b[A\n",
"Iteration: 11% 30/268 [00:11<01:32, 2.58it/s]\u001b[A\n",
"Iteration: 12% 31/268 [00:11<01:31, 2.58it/s]\u001b[A\n",
"Iteration: 12% 32/268 [00:12<01:31, 2.58it/s]\u001b[A\n",
"Iteration: 12% 33/268 [00:12<01:31, 2.58it/s]\u001b[A\n",
"Iteration: 13% 34/268 [00:13<01:30, 2.58it/s]\u001b[A\n",
"Iteration: 13% 35/268 [00:13<01:30, 2.59it/s]\u001b[A{'loss': 0.29607582092285156, 'learning_rate': 1.2437810945273631e-05, 'epoch': 1.1343283582089552, 'total_flos': 807129210186240, 'step': 304}\n",
"\n",
"Iteration: 13% 36/268 [00:13<01:29, 2.59it/s]\u001b[A\n",
"Iteration: 14% 37/268 [00:14<01:29, 2.58it/s]\u001b[A\n",
"Iteration: 14% 38/268 [00:14<01:28, 2.59it/s]\u001b[A\n",
"Iteration: 15% 39/268 [00:15<01:28, 2.59it/s]\u001b[A\n",
"Iteration: 15% 40/268 [00:15<01:28, 2.59it/s]\u001b[A\n",
"Iteration: 15% 41/268 [00:15<01:27, 2.59it/s]\u001b[A\n",
"Iteration: 16% 42/268 [00:16<01:27, 2.59it/s]\u001b[A\n",
"Iteration: 16% 43/268 [00:16<01:26, 2.59it/s]\u001b[A{'loss': 0.2753257751464844, 'learning_rate': 1.2238805970149255e-05, 'epoch': 1.164179104477612, 'total_flos': 828424178526720, 'step': 312}\n",
"\n",
"Iteration: 16% 44/268 [00:17<01:26, 2.59it/s]\u001b[A\n",
"Iteration: 17% 45/268 [00:17<01:26, 2.58it/s]\u001b[A\n",
"Iteration: 17% 46/268 [00:17<01:25, 2.59it/s]\u001b[A\n",
"Iteration: 18% 47/268 [00:18<01:25, 2.59it/s]\u001b[A\n",
"Iteration: 18% 48/268 [00:18<01:24, 2.59it/s]\u001b[A\n",
"Iteration: 18% 49/268 [00:18<01:24, 2.59it/s]\u001b[A\n",
"Iteration: 19% 50/268 [00:19<01:24, 2.58it/s]\u001b[A\n",
"Iteration: 19% 51/268 [00:19<01:24, 2.57it/s]\u001b[A{'loss': 0.28547096252441406, 'learning_rate': 1.2039800995024878e-05, 'epoch': 1.1940298507462686, 'total_flos': 849719146867200, 'step': 320}\n",
"\n",
"Iteration: 19% 52/268 [00:20<01:24, 2.57it/s]\u001b[A\n",
"Iteration: 20% 53/268 [00:20<01:23, 2.57it/s]\u001b[A\n",
"Iteration: 20% 54/268 [00:20<01:23, 2.57it/s]\u001b[A\n",
"Iteration: 21% 55/268 [00:21<01:22, 2.57it/s]\u001b[A\n",
"Iteration: 21% 56/268 [00:21<01:22, 2.58it/s]\u001b[A\n",
"Iteration: 21% 57/268 [00:22<01:21, 2.59it/s]\u001b[A\n",
"Iteration: 22% 58/268 [00:22<01:21, 2.59it/s]\u001b[A\n",
"Iteration: 22% 59/268 [00:22<01:20, 2.59it/s]\u001b[A{'loss': 0.29924583435058594, 'learning_rate': 1.1840796019900498e-05, 'epoch': 1.2238805970149254, 'total_flos': 871014115207680, 'step': 328}\n",
"\n",
"Iteration: 22% 60/268 [00:23<01:20, 2.59it/s]\u001b[A\n",
"Iteration: 23% 61/268 [00:23<01:20, 2.59it/s]\u001b[A\n",
"Iteration: 23% 62/268 [00:23<01:19, 2.58it/s]\u001b[A\n",
"Iteration: 24% 63/268 [00:24<01:19, 2.58it/s]\u001b[A\n",
"Iteration: 24% 64/268 [00:24<01:19, 2.58it/s]\u001b[A\n",
"Iteration: 24% 65/268 [00:25<01:18, 2.58it/s]\u001b[A\n",
"Iteration: 25% 66/268 [00:25<01:18, 2.58it/s]\u001b[A\n",
"Iteration: 25% 67/268 [00:25<01:17, 2.58it/s]\u001b[A{'loss': 0.31764984130859375, 'learning_rate': 1.1641791044776121e-05, 'epoch': 1.2537313432835822, 'total_flos': 892309083548160, 'step': 336}\n",
"\n",
"Iteration: 25% 68/268 [00:26<01:17, 2.58it/s]\u001b[A\n",
"Iteration: 26% 69/268 [00:26<01:16, 2.59it/s]\u001b[A\n",
"Iteration: 26% 70/268 [00:27<01:16, 2.58it/s]\u001b[A\n",
"Iteration: 26% 71/268 [00:27<01:16, 2.58it/s]\u001b[A\n",
"Iteration: 27% 72/268 [00:27<01:15, 2.59it/s]\u001b[A\n",
"Iteration: 27% 73/268 [00:28<01:15, 2.58it/s]\u001b[A\n",
"Iteration: 28% 74/268 [00:28<01:15, 2.58it/s]\u001b[A\n",
"Iteration: 28% 75/268 [00:29<01:14, 2.58it/s]\u001b[A{'loss': 0.28099822998046875, 'learning_rate': 1.1442786069651741e-05, 'epoch': 1.2835820895522387, 'total_flos': 913604051888640, 'step': 344}\n",
"\n",
"Iteration: 28% 76/268 [00:29<01:14, 2.58it/s]\u001b[A\n",
"Iteration: 29% 77/268 [00:29<01:13, 2.58it/s]\u001b[A\n",
"Iteration: 29% 78/268 [00:30<01:13, 2.59it/s]\u001b[A\n",
"Iteration: 29% 79/268 [00:30<01:13, 2.59it/s]\u001b[A\n",
"Iteration: 30% 80/268 [00:30<01:12, 2.59it/s]\u001b[A\n",
"Iteration: 30% 81/268 [00:31<01:12, 2.59it/s]\u001b[A\n",
"Iteration: 31% 82/268 [00:31<01:11, 2.59it/s]\u001b[A\n",
"Iteration: 31% 83/268 [00:32<01:11, 2.58it/s]\u001b[A{'loss': 0.28821563720703125, 'learning_rate': 1.1243781094527364e-05, 'epoch': 1.3134328358208955, 'total_flos': 934899020229120, 'step': 352}\n",
"\n",
"Iteration: 31% 84/268 [00:32<01:11, 2.58it/s]\u001b[A\n",
"Iteration: 32% 85/268 [00:32<01:11, 2.57it/s]\u001b[A\n",
"Iteration: 32% 86/268 [00:33<01:10, 2.58it/s]\u001b[A\n",
"Iteration: 32% 87/268 [00:33<01:10, 2.58it/s]\u001b[A\n",
"Iteration: 33% 88/268 [00:34<01:09, 2.58it/s]\u001b[A\n",
"Iteration: 33% 89/268 [00:34<01:09, 2.59it/s]\u001b[A\n",
"Iteration: 34% 90/268 [00:34<01:08, 2.59it/s]\u001b[A\n",
"Iteration: 34% 91/268 [00:35<01:08, 2.59it/s]\u001b[A{'loss': 0.25145530700683594, 'learning_rate': 1.1044776119402986e-05, 'epoch': 1.3432835820895521, 'total_flos': 956193988569600, 'step': 360}\n",
"\n",
"Iteration: 34% 92/268 [00:35<01:08, 2.59it/s]\u001b[A\n",
"Iteration: 35% 93/268 [00:35<01:07, 2.59it/s]\u001b[A\n",
"Iteration: 35% 94/268 [00:36<01:07, 2.59it/s]\u001b[A\n",
"Iteration: 35% 95/268 [00:36<01:06, 2.59it/s]\u001b[A\n",
"Iteration: 36% 96/268 [00:37<01:06, 2.59it/s]\u001b[A\n",
"Iteration: 36% 97/268 [00:37<01:06, 2.59it/s]\u001b[A\n",
"Iteration: 37% 98/268 [00:37<01:05, 2.59it/s]\u001b[A\n",
"Iteration: 37% 99/268 [00:38<01:05, 2.59it/s]\u001b[A{'loss': 0.25258827209472656, 'learning_rate': 1.0845771144278608e-05, 'epoch': 1.373134328358209, 'total_flos': 977488956910080, 'step': 368}\n",
"\n",
"Iteration: 37% 100/268 [00:38<01:05, 2.58it/s]\u001b[A\n",
"Iteration: 38% 101/268 [00:39<01:04, 2.59it/s]\u001b[A\n",
"Iteration: 38% 102/268 [00:39<01:04, 2.59it/s]\u001b[A\n",
"Iteration: 38% 103/268 [00:39<01:03, 2.59it/s]\u001b[A\n",
"Iteration: 39% 104/268 [00:40<01:03, 2.59it/s]\u001b[A\n",
"Iteration: 39% 105/268 [00:40<01:03, 2.58it/s]\u001b[A\n",
"Iteration: 40% 106/268 [00:41<01:02, 2.58it/s]\u001b[A\n",
"Iteration: 40% 107/268 [00:41<01:02, 2.59it/s]\u001b[A{'loss': 0.3570365905761719, 'learning_rate': 1.064676616915423e-05, 'epoch': 1.4029850746268657, 'total_flos': 998783925250560, 'step': 376}\n",
"\n",
"Iteration: 40% 108/268 [00:41<01:01, 2.59it/s]\u001b[A\n",
"Iteration: 41% 109/268 [00:42<01:01, 2.59it/s]\u001b[A\n",
"Iteration: 41% 110/268 [00:42<01:01, 2.59it/s]\u001b[A\n",
"Iteration: 41% 111/268 [00:42<01:00, 2.58it/s]\u001b[A\n",
"Iteration: 42% 112/268 [00:43<01:00, 2.58it/s]\u001b[A\n",
"Iteration: 42% 113/268 [00:43<00:59, 2.58it/s]\u001b[A\n",
"Iteration: 43% 114/268 [00:44<00:59, 2.59it/s]\u001b[A\n",
"Iteration: 43% 115/268 [00:44<00:59, 2.59it/s]\u001b[A{'loss': 0.3033332824707031, 'learning_rate': 1.0447761194029851e-05, 'epoch': 1.4328358208955223, 'total_flos': 1020078893591040, 'step': 384}\n",
"\n",
"Iteration: 43% 116/268 [00:44<00:58, 2.59it/s]\u001b[A\n",
"Iteration: 44% 117/268 [00:45<00:58, 2.59it/s]\u001b[A\n",
"Iteration: 44% 118/268 [00:45<00:57, 2.59it/s]\u001b[A\n",
"Iteration: 44% 119/268 [00:46<00:57, 2.57it/s]\u001b[A\n",
"Iteration: 45% 120/268 [00:46<00:57, 2.58it/s]\u001b[A\n",
"Iteration: 45% 121/268 [00:46<00:56, 2.58it/s]\u001b[A\n",
"Iteration: 46% 122/268 [00:47<00:56, 2.58it/s]\u001b[A\n",
"Iteration: 46% 123/268 [00:47<00:56, 2.59it/s]\u001b[A{'loss': 0.2525367736816406, 'learning_rate': 1.0248756218905474e-05, 'epoch': 1.462686567164179, 'total_flos': 1041373861931520, 'step': 392}\n",
"\n",
"Iteration: 46% 124/268 [00:47<00:55, 2.59it/s]\u001b[A\n",
"Iteration: 47% 125/268 [00:48<00:55, 2.59it/s]\u001b[A\n",
"Iteration: 47% 126/268 [00:48<00:54, 2.59it/s]\u001b[A\n",
"Iteration: 47% 127/268 [00:49<00:54, 2.59it/s]\u001b[A\n",
"Iteration: 48% 128/268 [00:49<00:54, 2.59it/s]\u001b[A\n",
"Iteration: 48% 129/268 [00:49<00:53, 2.59it/s]\u001b[A\n",
"Iteration: 49% 130/268 [00:50<00:53, 2.59it/s]\u001b[A\n",
"Iteration: 49% 131/268 [00:50<00:52, 2.59it/s]\u001b[A{'loss': 0.32231712341308594, 'learning_rate': 1.0049751243781096e-05, 'epoch': 1.4925373134328357, 'total_flos': 1062668830272000, 'step': 400}\n",
"\n",
"Iteration: 49% 132/268 [00:51<00:52, 2.59it/s]\u001b[A\n",
"Iteration: 50% 133/268 [00:51<00:52, 2.58it/s]\u001b[A\n",
"Iteration: 50% 134/268 [00:51<00:51, 2.58it/s]\u001b[A\n",
"Iteration: 50% 135/268 [00:52<00:51, 2.58it/s]\u001b[A\n",
"Iteration: 51% 136/268 [00:52<00:51, 2.59it/s]\u001b[A\n",
"Iteration: 51% 137/268 [00:52<00:50, 2.59it/s]\u001b[A\n",
"Iteration: 51% 138/268 [00:53<00:50, 2.59it/s]\u001b[A\n",
"Iteration: 52% 139/268 [00:53<00:49, 2.59it/s]\u001b[A{'loss': 0.25180816650390625, 'learning_rate': 9.850746268656717e-06, 'epoch': 1.5223880597014925, 'total_flos': 1083963798612480, 'step': 408}\n",
"\n",
"Iteration: 52% 140/268 [00:54<00:49, 2.58it/s]\u001b[A\n",
"Iteration: 53% 141/268 [00:54<00:49, 2.57it/s]\u001b[A\n",
"Iteration: 53% 142/268 [00:54<00:48, 2.58it/s]\u001b[A\n",
"Iteration: 53% 143/268 [00:55<00:48, 2.58it/s]\u001b[A\n",
"Iteration: 54% 144/268 [00:55<00:47, 2.59it/s]\u001b[A\n",
"Iteration: 54% 145/268 [00:56<00:47, 2.59it/s]\u001b[A\n",
"Iteration: 54% 146/268 [00:56<00:47, 2.59it/s]\u001b[A\n",
"Iteration: 55% 147/268 [00:56<00:46, 2.59it/s]\u001b[A{'loss': 0.3343639373779297, 'learning_rate': 9.651741293532339e-06, 'epoch': 1.5522388059701493, 'total_flos': 1105258766952960, 'step': 416}\n",
"\n",
"Iteration: 55% 148/268 [00:57<00:46, 2.59it/s]\u001b[A\n",
"Iteration: 56% 149/268 [00:57<00:45, 2.59it/s]\u001b[A\n",
"Iteration: 56% 150/268 [00:58<00:45, 2.59it/s]\u001b[A\n",
"Iteration: 56% 151/268 [00:58<00:45, 2.59it/s]\u001b[A\n",
"Iteration: 57% 152/268 [00:58<00:44, 2.59it/s]\u001b[A\n",
"Iteration: 57% 153/268 [00:59<00:44, 2.59it/s]\u001b[A\n",
"Iteration: 57% 154/268 [00:59<00:44, 2.59it/s]\u001b[A\n",
"Iteration: 58% 155/268 [00:59<00:43, 2.59it/s]\u001b[A{'loss': 0.24284744262695312, 'learning_rate': 9.45273631840796e-06, 'epoch': 1.582089552238806, 'total_flos': 1126553735293440, 'step': 424}\n",
"\n",
"Iteration: 58% 156/268 [01:00<00:43, 2.59it/s]\u001b[A\n",
"Iteration: 59% 157/268 [01:00<00:42, 2.59it/s]\u001b[A\n",
"Iteration: 59% 158/268 [01:01<00:42, 2.59it/s]\u001b[A\n",
"Iteration: 59% 159/268 [01:01<00:42, 2.59it/s]\u001b[A\n",
"Iteration: 60% 160/268 [01:01<00:41, 2.59it/s]\u001b[A\n",
"Iteration: 60% 161/268 [01:02<00:41, 2.59it/s]\u001b[A\n",
"Iteration: 60% 162/268 [01:02<00:40, 2.59it/s]\u001b[A\n",
"Iteration: 61% 163/268 [01:03<00:40, 2.59it/s]\u001b[A{'loss': 0.3274555206298828, 'learning_rate': 9.253731343283582e-06, 'epoch': 1.6119402985074627, 'total_flos': 1147848703633920, 'step': 432}\n",
"\n",
"Iteration: 61% 164/268 [01:03<00:40, 2.59it/s]\u001b[A\n",
"Iteration: 62% 165/268 [01:03<00:39, 2.59it/s]\u001b[A\n",
"Iteration: 62% 166/268 [01:04<00:39, 2.59it/s]\u001b[A\n",
"Iteration: 62% 167/268 [01:04<00:38, 2.59it/s]\u001b[A\n",
"Iteration: 63% 168/268 [01:04<00:38, 2.60it/s]\u001b[A\n",
"Iteration: 63% 169/268 [01:05<00:38, 2.59it/s]\u001b[A\n",
"Iteration: 63% 170/268 [01:05<00:37, 2.59it/s]\u001b[A\n",
"Iteration: 64% 171/268 [01:06<00:37, 2.59it/s]\u001b[A{'loss': 0.2362499237060547, 'learning_rate': 9.054726368159204e-06, 'epoch': 1.6417910447761193, 'total_flos': 1169143671974400, 'step': 440}\n",
"\n",
"Iteration: 64% 172/268 [01:06<00:37, 2.59it/s]\u001b[A\n",
"Iteration: 65% 173/268 [01:06<00:36, 2.59it/s]\u001b[A\n",
"Iteration: 65% 174/268 [01:07<00:36, 2.58it/s]\u001b[A\n",
"Iteration: 65% 175/268 [01:07<00:35, 2.58it/s]\u001b[A\n",
"Iteration: 66% 176/268 [01:08<00:35, 2.58it/s]\u001b[A\n",
"Iteration: 66% 177/268 [01:08<00:35, 2.59it/s]\u001b[A\n",
"Iteration: 66% 178/268 [01:08<00:34, 2.59it/s]\u001b[A\n",
"Iteration: 67% 179/268 [01:09<00:34, 2.58it/s]\u001b[A{'loss': 0.21753692626953125, 'learning_rate': 8.855721393034826e-06, 'epoch': 1.671641791044776, 'total_flos': 1190438640314880, 'step': 448}\n",
"\n",
"Iteration: 67% 180/268 [01:09<00:34, 2.58it/s]\u001b[A\n",
"Iteration: 68% 181/268 [01:09<00:33, 2.59it/s]\u001b[A\n",
"Iteration: 68% 182/268 [01:10<00:33, 2.59it/s]\u001b[A\n",
"Iteration: 68% 183/268 [01:10<00:32, 2.59it/s]\u001b[A\n",
"Iteration: 69% 184/268 [01:11<00:32, 2.59it/s]\u001b[A\n",
"Iteration: 69% 185/268 [01:11<00:32, 2.59it/s]\u001b[A\n",
"Iteration: 69% 186/268 [01:11<00:31, 2.59it/s]\u001b[A\n",
"Iteration: 70% 187/268 [01:12<00:31, 2.59it/s]\u001b[A{'loss': 0.28411102294921875, 'learning_rate': 8.656716417910447e-06, 'epoch': 1.7014925373134329, 'total_flos': 1211733608655360, 'step': 456}\n",
"\n",
"Iteration: 70% 188/268 [01:12<00:30, 2.59it/s]\u001b[A\n",
"Iteration: 71% 189/268 [01:13<00:30, 2.59it/s]\u001b[A\n",
"Iteration: 71% 190/268 [01:13<00:30, 2.59it/s]\u001b[A\n",
"Iteration: 71% 191/268 [01:13<00:29, 2.59it/s]\u001b[A\n",
"Iteration: 72% 192/268 [01:14<00:29, 2.59it/s]\u001b[A\n",
"Iteration: 72% 193/268 [01:14<00:29, 2.58it/s]\u001b[A\n",
"Iteration: 72% 194/268 [01:15<00:28, 2.58it/s]\u001b[A\n",
"Iteration: 73% 195/268 [01:15<00:28, 2.58it/s]\u001b[A{'loss': 0.2508563995361328, 'learning_rate': 8.45771144278607e-06, 'epoch': 1.7313432835820897, 'total_flos': 1233028576995840, 'step': 464}\n",
"\n",
"Iteration: 73% 196/268 [01:15<00:27, 2.59it/s]\u001b[A\n",
"Iteration: 74% 197/268 [01:16<00:27, 2.59it/s]\u001b[A\n",
"Iteration: 74% 198/268 [01:16<00:27, 2.59it/s]\u001b[A\n",
"Iteration: 74% 199/268 [01:16<00:26, 2.59it/s]\u001b[A\n",
"Iteration: 75% 200/268 [01:17<00:26, 2.59it/s]\u001b[A\n",
"Iteration: 75% 201/268 [01:17<00:25, 2.59it/s]\u001b[A\n",
"Iteration: 75% 202/268 [01:18<00:25, 2.59it/s]\u001b[A\n",
"Iteration: 76% 203/268 [01:18<00:25, 2.59it/s]\u001b[A{'loss': 0.23903846740722656, 'learning_rate': 8.258706467661692e-06, 'epoch': 1.7611940298507462, 'total_flos': 1254323545336320, 'step': 472}\n",
"\n",
"Iteration: 76% 204/268 [01:18<00:24, 2.59it/s]\u001b[A\n",
"Iteration: 76% 205/268 [01:19<00:24, 2.59it/s]\u001b[A\n",
"Iteration: 77% 206/268 [01:19<00:23, 2.59it/s]\u001b[A\n",
"Iteration: 77% 207/268 [01:20<00:23, 2.59it/s]\u001b[A\n",
"Iteration: 78% 208/268 [01:20<00:23, 2.59it/s]\u001b[A\n",
"Iteration: 78% 209/268 [01:20<00:22, 2.59it/s]\u001b[A\n",
"Iteration: 78% 210/268 [01:21<00:22, 2.59it/s]\u001b[A\n",
"Iteration: 79% 211/268 [01:21<00:21, 2.59it/s]\u001b[A{'loss': 0.2828254699707031, 'learning_rate': 8.059701492537314e-06, 'epoch': 1.7910447761194028, 'total_flos': 1275618513676800, 'step': 480}\n",
"\n",
"Iteration: 79% 212/268 [01:21<00:21, 2.59it/s]\u001b[A\n",
"Iteration: 79% 213/268 [01:22<00:21, 2.59it/s]\u001b[A\n",
"Iteration: 80% 214/268 [01:22<00:20, 2.59it/s]\u001b[A\n",
"Iteration: 80% 215/268 [01:23<00:20, 2.59it/s]\u001b[A\n",
"Iteration: 81% 216/268 [01:23<00:20, 2.59it/s]\u001b[A\n",
"Iteration: 81% 217/268 [01:23<00:19, 2.59it/s]\u001b[A\n",
"Iteration: 81% 218/268 [01:24<00:19, 2.59it/s]\u001b[A\n",
"Iteration: 82% 219/268 [01:24<00:18, 2.59it/s]\u001b[A{'loss': 0.33939552307128906, 'learning_rate': 7.860696517412935e-06, 'epoch': 1.8208955223880596, 'total_flos': 1296913482017280, 'step': 488}\n",
"\n",
"Iteration: 82% 220/268 [01:25<00:18, 2.59it/s]\u001b[A\n",
"Iteration: 82% 221/268 [01:25<00:18, 2.59it/s]\u001b[A\n",
"Iteration: 83% 222/268 [01:25<00:17, 2.58it/s]\u001b[A\n",
"Iteration: 83% 223/268 [01:26<00:17, 2.59it/s]\u001b[A\n",
"Iteration: 84% 224/268 [01:26<00:16, 2.59it/s]\u001b[A\n",
"Iteration: 84% 225/268 [01:26<00:16, 2.59it/s]\u001b[A\n",
"Iteration: 84% 226/268 [01:27<00:16, 2.59it/s]\u001b[A\n",
"Iteration: 85% 227/268 [01:27<00:15, 2.59it/s]\u001b[A{'loss': 0.25545310974121094, 'learning_rate': 7.661691542288557e-06, 'epoch': 1.8507462686567164, 'total_flos': 1318208450357760, 'step': 496}\n",
"\n",
"Iteration: 85% 228/268 [01:28<00:15, 2.59it/s]\u001b[A\n",
"Iteration: 85% 229/268 [01:28<00:15, 2.58it/s]\u001b[A\n",
"Iteration: 86% 230/268 [01:28<00:14, 2.58it/s]\u001b[A\n",
"Iteration: 86% 231/268 [01:29<00:14, 2.58it/s]\u001b[A\n",
"Iteration: 87% 232/268 [01:34<01:04, 1.78s/it]\u001b[A\n",
"Iteration: 87% 233/268 [01:34<00:48, 1.37s/it]\u001b[A\n",
"Iteration: 87% 234/268 [01:35<00:36, 1.08s/it]\u001b[A\n",
"Iteration: 88% 235/268 [01:35<00:28, 1.15it/s]\u001b[A{'loss': 0.24697303771972656, 'learning_rate': 7.46268656716418e-06, 'epoch': 1.8805970149253732, 'total_flos': 1339503418698240, 'step': 504}\n",
"\n",
"Iteration: 88% 236/268 [01:35<00:23, 1.38it/s]\u001b[A\n",
"Iteration: 88% 237/268 [01:36<00:19, 1.61it/s]\u001b[A\n",
"Iteration: 89% 238/268 [01:36<00:16, 1.81it/s]\u001b[A\n",
"Iteration: 89% 239/268 [01:37<00:14, 1.99it/s]\u001b[A\n",
"Iteration: 90% 240/268 [01:37<00:13, 2.14it/s]\u001b[A\n",
"Iteration: 90% 241/268 [01:37<00:11, 2.26it/s]\u001b[A\n",
"Iteration: 90% 242/268 [01:38<00:11, 2.35it/s]\u001b[A\n",
"Iteration: 91% 243/268 [01:38<00:10, 2.41it/s]\u001b[A{'loss': 0.2710132598876953, 'learning_rate': 7.263681592039802e-06, 'epoch': 1.9104477611940298, 'total_flos': 1360798387038720, 'step': 512}\n",
"\n",
"Iteration: 91% 244/268 [01:39<00:09, 2.46it/s]\u001b[A\n",
"Iteration: 91% 245/268 [01:39<00:09, 2.50it/s]\u001b[A\n",
"Iteration: 92% 246/268 [01:39<00:08, 2.53it/s]\u001b[A\n",
"Iteration: 92% 247/268 [01:40<00:08, 2.54it/s]\u001b[A\n",
"Iteration: 93% 248/268 [01:40<00:07, 2.56it/s]\u001b[A\n",
"Iteration: 93% 249/268 [01:40<00:07, 2.57it/s]\u001b[A\n",
"Iteration: 93% 250/268 [01:41<00:06, 2.58it/s]\u001b[A\n",
"Iteration: 94% 251/268 [01:41<00:06, 2.57it/s]\u001b[A{'loss': 0.3086738586425781, 'learning_rate': 7.064676616915423e-06, 'epoch': 1.9402985074626866, 'total_flos': 1382093355379200, 'step': 520}\n",
"\n",
"Iteration: 94% 252/268 [01:42<00:06, 2.57it/s]\u001b[A\n",
"Iteration: 94% 253/268 [01:42<00:05, 2.58it/s]\u001b[A\n",
"Iteration: 95% 254/268 [01:42<00:05, 2.58it/s]\u001b[A\n",
"Iteration: 95% 255/268 [01:43<00:05, 2.59it/s]\u001b[A\n",
"Iteration: 96% 256/268 [01:43<00:04, 2.59it/s]\u001b[A\n",
"Iteration: 96% 257/268 [01:44<00:04, 2.59it/s]\u001b[A\n",
"Iteration: 96% 258/268 [01:44<00:03, 2.59it/s]\u001b[A\n",
"Iteration: 97% 259/268 [01:44<00:03, 2.59it/s]\u001b[A{'loss': 0.3468170166015625, 'learning_rate': 6.865671641791045e-06, 'epoch': 1.9701492537313432, 'total_flos': 1403388323719680, 'step': 528}\n",
"\n",
"Iteration: 97% 260/268 [01:45<00:03, 2.59it/s]\u001b[A\n",
"Iteration: 97% 261/268 [01:45<00:02, 2.59it/s]\u001b[A\n",
"Iteration: 98% 262/268 [01:45<00:02, 2.59it/s]\u001b[A\n",
"Iteration: 98% 263/268 [01:46<00:01, 2.60it/s]\u001b[A\n",
"Iteration: 99% 264/268 [01:46<00:01, 2.60it/s]\u001b[A\n",
"Iteration: 99% 265/268 [01:47<00:01, 2.59it/s]\u001b[A\n",
"Iteration: 99% 266/268 [01:47<00:00, 2.59it/s]\u001b[A\n",
"Iteration: 100% 267/268 [01:47<00:00, 2.59it/s]\u001b[A{'loss': 0.23921585083007812, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0, 'total_flos': 1422603705308160, 'step': 536}\n",
"\n",
"Iteration: 100% 268/268 [01:48<00:00, 2.48it/s]\n",
"Epoch: 67% 2/3 [03:31<01:44, 104.67s/it]\n",
"Iteration: 0% 0/268 [00:00<?, ?it/s]\u001b[A\n",
"Iteration: 0% 1/268 [00:00<01:42, 2.59it/s]\u001b[A\n",
"Iteration: 1% 2/268 [00:00<01:42, 2.58it/s]\u001b[A\n",
"Iteration: 1% 3/268 [00:01<01:42, 2.59it/s]\u001b[A\n",
"Iteration: 1% 4/268 [00:01<01:41, 2.59it/s]\u001b[A\n",
"Iteration: 2% 5/268 [00:01<01:41, 2.59it/s]\u001b[A\n",
"Iteration: 2% 6/268 [00:02<01:41, 2.59it/s]\u001b[A\n",
"Iteration: 3% 7/268 [00:02<01:40, 2.59it/s]\u001b[A{'loss': 0.15148353576660156, 'learning_rate': 6.46766169154229e-06, 'epoch': 2.029850746268657, 'total_flos': 1443898673648640, 'step': 544}\n",
"\n",
"Iteration: 3% 8/268 [00:03<01:40, 2.58it/s]\u001b[A\n",
"Iteration: 3% 9/268 [00:03<01:40, 2.59it/s]\u001b[A\n",
"Iteration: 4% 10/268 [00:03<01:39, 2.59it/s]\u001b[A\n",
"Iteration: 4% 11/268 [00:04<01:39, 2.59it/s]\u001b[A\n",
"Iteration: 4% 12/268 [00:04<01:38, 2.59it/s]\u001b[A\n",
"Iteration: 5% 13/268 [00:05<01:38, 2.59it/s]\u001b[A\n",
"Iteration: 5% 14/268 [00:05<01:38, 2.59it/s]\u001b[A\n",
"Iteration: 6% 15/268 [00:05<01:37, 2.59it/s]\u001b[A{'loss': 0.162750244140625, 'learning_rate': 6.2686567164179116e-06, 'epoch': 2.0597014925373136, 'total_flos': 1465193641989120, 'step': 552}\n",
"\n",
"Iteration: 6% 16/268 [00:06<01:37, 2.59it/s]\u001b[A\n",
"Iteration: 6% 17/268 [00:06<01:36, 2.59it/s]\u001b[A\n",
"Iteration: 7% 18/268 [00:06<01:36, 2.58it/s]\u001b[A\n",
"Iteration: 7% 19/268 [00:07<01:36, 2.58it/s]\u001b[A\n",
"Iteration: 7% 20/268 [00:07<01:35, 2.59it/s]\u001b[A\n",
"Iteration: 8% 21/268 [00:08<01:35, 2.59it/s]\u001b[A\n",
"Iteration: 8% 22/268 [00:08<01:35, 2.59it/s]\u001b[A\n",
"Iteration: 9% 23/268 [00:08<01:34, 2.59it/s]\u001b[A{'loss': 0.16520118713378906, 'learning_rate': 6.069651741293533e-06, 'epoch': 2.08955223880597, 'total_flos': 1486488610329600, 'step': 560}\n",
"\n",
"Iteration: 9% 24/268 [00:09<01:34, 2.59it/s]\u001b[A\n",
"Iteration: 9% 25/268 [00:09<01:33, 2.59it/s]\u001b[A\n",
"Iteration: 10% 26/268 [00:10<01:33, 2.59it/s]\u001b[A\n",
"Iteration: 10% 27/268 [00:10<01:32, 2.60it/s]\u001b[A\n",
"Iteration: 10% 28/268 [00:10<01:32, 2.60it/s]\u001b[A\n",
"Iteration: 11% 29/268 [00:11<01:32, 2.59it/s]\u001b[A\n",
"Iteration: 11% 30/268 [00:11<01:31, 2.59it/s]\u001b[A\n",
"Iteration: 12% 31/268 [00:11<01:31, 2.59it/s]\u001b[A{'loss': 0.1896533966064453, 'learning_rate': 5.870646766169155e-06, 'epoch': 2.1194029850746268, 'total_flos': 1507783578670080, 'step': 568}\n",
"\n",
"Iteration: 12% 32/268 [00:12<01:31, 2.58it/s]\u001b[A\n",
"Iteration: 12% 33/268 [00:12<01:30, 2.58it/s]\u001b[A\n",
"Iteration: 13% 34/268 [00:13<01:30, 2.58it/s]\u001b[A\n",
"Iteration: 13% 35/268 [00:13<01:30, 2.58it/s]\u001b[A\n",
"Iteration: 13% 36/268 [00:13<01:30, 2.58it/s]\u001b[A\n",
"Iteration: 14% 37/268 [00:14<01:29, 2.58it/s]\u001b[A\n",
"Iteration: 14% 38/268 [00:14<01:29, 2.58it/s]\u001b[A\n",
"Iteration: 15% 39/268 [00:15<01:28, 2.58it/s]\u001b[A{'loss': 0.14014434814453125, 'learning_rate': 5.671641791044776e-06, 'epoch': 2.1492537313432836, 'total_flos': 1529078547010560, 'step': 576}\n",
"\n",
"Iteration: 15% 40/268 [00:15<01:28, 2.58it/s]\u001b[A\n",
"Iteration: 15% 41/268 [00:15<01:27, 2.58it/s]\u001b[A\n",
"Iteration: 16% 42/268 [00:16<01:27, 2.58it/s]\u001b[A\n",
"Iteration: 16% 43/268 [00:16<01:26, 2.59it/s]\u001b[A\n",
"Iteration: 16% 44/268 [00:17<01:26, 2.59it/s]\u001b[A\n",
"Iteration: 17% 45/268 [00:17<01:26, 2.59it/s]\u001b[A\n",
"Iteration: 17% 46/268 [00:17<01:25, 2.59it/s]\u001b[A\n",
"Iteration: 18% 47/268 [00:18<01:25, 2.59it/s]\u001b[A{'loss': 0.2625617980957031, 'learning_rate': 5.472636815920398e-06, 'epoch': 2.1791044776119404, 'total_flos': 1550373515351040, 'step': 584}\n",
"\n",
"Iteration: 18% 48/268 [00:18<01:24, 2.59it/s]\u001b[A\n",
"Iteration: 18% 49/268 [00:18<01:24, 2.59it/s]\u001b[A\n",
"Iteration: 19% 50/268 [00:19<01:24, 2.59it/s]\u001b[A\n",
"Iteration: 19% 51/268 [00:19<01:23, 2.59it/s]\u001b[A\n",
"Iteration: 19% 52/268 [00:20<01:23, 2.59it/s]\u001b[A\n",
"Iteration: 20% 53/268 [00:20<01:23, 2.59it/s]\u001b[A\n",
"Iteration: 20% 54/268 [00:20<01:22, 2.58it/s]\u001b[A\n",
"Iteration: 21% 55/268 [00:21<01:22, 2.59it/s]\u001b[A{'loss': 0.1728382110595703, 'learning_rate': 5.2736318407960205e-06, 'epoch': 2.208955223880597, 'total_flos': 1571668483691520, 'step': 592}\n",
"\n",
"Iteration: 21% 56/268 [00:21<01:21, 2.59it/s]\u001b[A\n",
"Iteration: 21% 57/268 [00:22<01:21, 2.58it/s]\u001b[A\n",
"Iteration: 22% 58/268 [00:22<01:21, 2.58it/s]\u001b[A\n",
"Iteration: 22% 59/268 [00:22<01:20, 2.58it/s]\u001b[A\n",
"Iteration: 22% 60/268 [00:23<01:20, 2.59it/s]\u001b[A\n",
"Iteration: 23% 61/268 [00:23<01:19, 2.59it/s]\u001b[A\n",
"Iteration: 23% 62/268 [00:23<01:19, 2.59it/s]\u001b[A\n",
"Iteration: 24% 63/268 [00:24<01:19, 2.59it/s]\u001b[A{'loss': 0.19240570068359375, 'learning_rate': 5.074626865671642e-06, 'epoch': 2.2388059701492535, 'total_flos': 1592963452032000, 'step': 600}\n",
"\n",
"Iteration: 24% 64/268 [00:24<01:18, 2.59it/s]\u001b[A\n",
"Iteration: 24% 65/268 [00:25<01:18, 2.59it/s]\u001b[A\n",
"Iteration: 25% 66/268 [00:25<01:18, 2.59it/s]\u001b[A\n",
"Iteration: 25% 67/268 [00:25<01:17, 2.59it/s]\u001b[A\n",
"Iteration: 25% 68/268 [00:26<01:17, 2.59it/s]\u001b[A\n",
"Iteration: 26% 69/268 [00:26<01:16, 2.59it/s]\u001b[A\n",
"Iteration: 26% 70/268 [00:27<01:16, 2.59it/s]\u001b[A\n",
"Iteration: 26% 71/268 [00:27<01:15, 2.59it/s]\u001b[A{'loss': 0.2273120880126953, 'learning_rate': 4.875621890547264e-06, 'epoch': 2.2686567164179103, 'total_flos': 1614258420372480, 'step': 608}\n",
"\n",
"Iteration: 27% 72/268 [00:27<01:15, 2.59it/s]\u001b[A\n",
"Iteration: 27% 73/268 [00:28<01:15, 2.59it/s]\u001b[A\n",
"Iteration: 28% 74/268 [00:28<01:14, 2.59it/s]\u001b[A\n",
"Iteration: 28% 75/268 [00:28<01:14, 2.59it/s]\u001b[A\n",
"Iteration: 28% 76/268 [00:29<01:14, 2.59it/s]\u001b[A\n",
"Iteration: 29% 77/268 [00:29<01:13, 2.59it/s]\u001b[A\n",
"Iteration: 29% 78/268 [00:30<01:13, 2.59it/s]\u001b[A\n",
"Iteration: 29% 79/268 [00:30<01:13, 2.58it/s]\u001b[A{'loss': 0.17682456970214844, 'learning_rate': 4.676616915422886e-06, 'epoch': 2.298507462686567, 'total_flos': 1635553388712960, 'step': 616}\n",
"\n",
"Iteration: 30% 80/268 [00:30<01:12, 2.58it/s]\u001b[A\n",
"Iteration: 30% 81/268 [00:31<01:12, 2.58it/s]\u001b[A\n",
"Iteration: 31% 82/268 [00:31<01:11, 2.59it/s]\u001b[A\n",
"Iteration: 31% 83/268 [00:32<01:11, 2.59it/s]\u001b[A\n",
"Iteration: 31% 84/268 [00:32<01:11, 2.58it/s]\u001b[A\n",
"Iteration: 32% 85/268 [00:32<01:10, 2.59it/s]\u001b[A\n",
"Iteration: 32% 86/268 [00:33<01:10, 2.59it/s]\u001b[A\n",
"Iteration: 32% 87/268 [00:33<01:09, 2.59it/s]\u001b[A{'loss': 0.1505413055419922, 'learning_rate': 4.477611940298508e-06, 'epoch': 2.328358208955224, 'total_flos': 1656848357053440, 'step': 624}\n",
"\n",
"Iteration: 33% 88/268 [00:34<01:09, 2.59it/s]\u001b[A\n",
"Iteration: 33% 89/268 [00:34<01:09, 2.59it/s]\u001b[A\n",
"Iteration: 34% 90/268 [00:34<01:08, 2.59it/s]\u001b[A\n",
"Iteration: 34% 91/268 [00:35<01:08, 2.58it/s]\u001b[A\n",
"Iteration: 34% 92/268 [00:35<01:08, 2.58it/s]\u001b[A\n",
"Iteration: 35% 93/268 [00:35<01:07, 2.58it/s]\u001b[A\n",
"Iteration: 35% 94/268 [00:36<01:07, 2.58it/s]\u001b[A\n",
"Iteration: 35% 95/268 [00:36<01:06, 2.58it/s]\u001b[A{'loss': 0.1272563934326172, 'learning_rate': 4.278606965174129e-06, 'epoch': 2.3582089552238807, 'total_flos': 1678143325393920, 'step': 632}\n",
"\n",
"Iteration: 36% 96/268 [00:37<01:06, 2.58it/s]\u001b[A\n",
"Iteration: 36% 97/268 [00:37<01:06, 2.59it/s]\u001b[A\n",
"Iteration: 37% 98/268 [00:37<01:05, 2.58it/s]\u001b[A\n",
"Iteration: 37% 99/268 [00:38<01:05, 2.58it/s]\u001b[A\n",
"Iteration: 37% 100/268 [00:38<01:04, 2.59it/s]\u001b[A\n",
"Iteration: 38% 101/268 [00:39<01:04, 2.59it/s]\u001b[A\n",
"Iteration: 38% 102/268 [00:39<01:04, 2.59it/s]\u001b[A\n",
"Iteration: 38% 103/268 [00:39<01:03, 2.59it/s]\u001b[A{'loss': 0.21704673767089844, 'learning_rate': 4.079601990049751e-06, 'epoch': 2.388059701492537, 'total_flos': 1699438293734400, 'step': 640}\n",
"\n",
"Iteration: 39% 104/268 [00:40<01:03, 2.59it/s]\u001b[A\n",
"Iteration: 39% 105/268 [00:40<01:02, 2.59it/s]\u001b[A\n",
"Iteration: 40% 106/268 [00:40<01:02, 2.59it/s]\u001b[A\n",
"Iteration: 40% 107/268 [00:41<01:02, 2.59it/s]\u001b[A\n",
"Iteration: 40% 108/268 [00:41<01:01, 2.59it/s]\u001b[A\n",
"Iteration: 41% 109/268 [00:42<01:01, 2.59it/s]\u001b[A\n",
"Iteration: 41% 110/268 [00:42<01:01, 2.59it/s]\u001b[A\n",
"Iteration: 41% 111/268 [00:42<01:00, 2.59it/s]\u001b[A{'loss': 0.20648956298828125, 'learning_rate': 3.8805970149253735e-06, 'epoch': 2.417910447761194, 'total_flos': 1720733262074880, 'step': 648}\n",
"\n",
"Iteration: 42% 112/268 [00:43<01:00, 2.58it/s]\u001b[A\n",
"Iteration: 42% 113/268 [00:43<01:00, 2.58it/s]\u001b[A\n",
"Iteration: 43% 114/268 [00:44<00:59, 2.59it/s]\u001b[A\n",
"Iteration: 43% 115/268 [00:44<00:59, 2.59it/s]\u001b[A\n",
"Iteration: 43% 116/268 [00:44<00:58, 2.59it/s]\u001b[A\n",
"Iteration: 44% 117/268 [00:45<00:58, 2.59it/s]\u001b[A\n",
"Iteration: 44% 118/268 [00:45<00:57, 2.59it/s]\u001b[A\n",
"Iteration: 44% 119/268 [00:45<00:57, 2.59it/s]\u001b[A{'loss': 0.13833045959472656, 'learning_rate': 3.681592039800995e-06, 'epoch': 2.4477611940298507, 'total_flos': 1742028230415360, 'step': 656}\n",
"\n",
"Iteration: 45% 120/268 [00:46<00:57, 2.59it/s]\u001b[A\n",
"Iteration: 45% 121/268 [00:46<00:56, 2.59it/s]\u001b[A\n",
"Iteration: 46% 122/268 [00:47<00:56, 2.58it/s]\u001b[A\n",
"Iteration: 46% 123/268 [00:47<00:56, 2.59it/s]\u001b[A\n",
"Iteration: 46% 124/268 [00:47<00:55, 2.59it/s]\u001b[A\n",
"Iteration: 47% 125/268 [00:48<00:55, 2.59it/s]\u001b[A\n",
"Iteration: 47% 126/268 [00:48<00:54, 2.59it/s]\u001b[A\n",
"Iteration: 47% 127/268 [00:49<00:54, 2.59it/s]\u001b[A{'loss': 0.21320152282714844, 'learning_rate': 3.4825870646766175e-06, 'epoch': 2.4776119402985075, 'total_flos': 1763323198755840, 'step': 664}\n",
"\n",
"Iteration: 48% 128/268 [00:49<00:54, 2.59it/s]\u001b[A\n",
"Iteration: 48% 129/268 [00:49<00:53, 2.58it/s]\u001b[A\n",
"Iteration: 49% 130/268 [00:50<00:53, 2.58it/s]\u001b[A\n",
"Iteration: 49% 131/268 [00:50<00:53, 2.58it/s]\u001b[A\n",
"Iteration: 49% 132/268 [00:51<00:52, 2.58it/s]\u001b[A\n",
"Iteration: 50% 133/268 [00:51<00:52, 2.58it/s]\u001b[A\n",
"Iteration: 50% 134/268 [00:51<00:51, 2.58it/s]\u001b[A\n",
"Iteration: 50% 135/268 [00:52<00:51, 2.58it/s]\u001b[A{'loss': 0.16924476623535156, 'learning_rate': 3.283582089552239e-06, 'epoch': 2.5074626865671643, 'total_flos': 1784618167096320, 'step': 672}\n",
"\n",
"Iteration: 51% 136/268 [00:52<00:51, 2.59it/s]\u001b[A\n",
"Iteration: 51% 137/268 [00:52<00:50, 2.59it/s]\u001b[A\n",
"Iteration: 51% 138/268 [00:53<00:50, 2.59it/s]\u001b[A\n",
"Iteration: 52% 139/268 [00:53<00:49, 2.59it/s]\u001b[A\n",
"Iteration: 52% 140/268 [00:54<00:49, 2.59it/s]\u001b[A\n",
"Iteration: 53% 141/268 [00:54<00:49, 2.59it/s]\u001b[A\n",
"Iteration: 53% 142/268 [00:54<00:48, 2.59it/s]\u001b[A\n",
"Iteration: 53% 143/268 [00:55<00:48, 2.59it/s]\u001b[A{'loss': 0.14482688903808594, 'learning_rate': 3.0845771144278608e-06, 'epoch': 2.5373134328358207, 'total_flos': 1805913135436800, 'step': 680}\n",
"\n",
"Iteration: 54% 144/268 [00:55<00:47, 2.59it/s]\u001b[A\n",
"Iteration: 54% 145/268 [00:56<00:47, 2.59it/s]\u001b[A\n",
"Iteration: 54% 146/268 [00:56<00:47, 2.57it/s]\u001b[A\n",
"Iteration: 55% 147/268 [00:56<00:47, 2.57it/s]\u001b[A\n",
"Iteration: 55% 148/268 [00:57<00:46, 2.57it/s]\u001b[A\n",
"Iteration: 56% 149/268 [00:57<00:46, 2.58it/s]\u001b[A\n",
"Iteration: 56% 150/268 [00:57<00:45, 2.58it/s]\u001b[A\n",
"Iteration: 56% 151/268 [00:58<00:45, 2.58it/s]\u001b[A{'loss': 0.1535816192626953, 'learning_rate': 2.885572139303483e-06, 'epoch': 2.5671641791044775, 'total_flos': 1827208103777280, 'step': 688}\n",
"\n",
"Iteration: 57% 152/268 [00:58<00:44, 2.58it/s]\u001b[A\n",
"Iteration: 57% 153/268 [00:59<00:44, 2.59it/s]\u001b[A\n",
"Iteration: 57% 154/268 [00:59<00:44, 2.59it/s]\u001b[A\n",
"Iteration: 58% 155/268 [00:59<00:43, 2.59it/s]\u001b[A\n",
"Iteration: 58% 156/268 [01:00<00:43, 2.59it/s]\u001b[A\n",
"Iteration: 59% 157/268 [01:00<00:42, 2.59it/s]\u001b[A\n",
"Iteration: 59% 158/268 [01:01<00:42, 2.58it/s]\u001b[A\n",
"Iteration: 59% 159/268 [01:01<00:42, 2.58it/s]\u001b[A{'loss': 0.17783164978027344, 'learning_rate': 2.686567164179105e-06, 'epoch': 2.5970149253731343, 'total_flos': 1848503072117760, 'step': 696}\n",
"\n",
"Iteration: 60% 160/268 [01:01<00:41, 2.58it/s]\u001b[A\n",
"Iteration: 60% 161/268 [01:02<00:41, 2.58it/s]\u001b[A\n",
"Iteration: 60% 162/268 [01:02<00:40, 2.59it/s]\u001b[A\n",
"Iteration: 61% 163/268 [01:03<00:40, 2.59it/s]\u001b[A\n",
"Iteration: 61% 164/268 [01:03<00:40, 2.59it/s]\u001b[A\n",
"Iteration: 62% 165/268 [01:03<00:39, 2.59it/s]\u001b[A\n",
"Iteration: 62% 166/268 [01:04<00:39, 2.59it/s]\u001b[A\n",
"Iteration: 62% 167/268 [01:04<00:38, 2.59it/s]\u001b[A{'loss': 0.17455673217773438, 'learning_rate': 2.4875621890547264e-06, 'epoch': 2.626865671641791, 'total_flos': 1869798040458240, 'step': 704}\n",
"\n",
"Iteration: 63% 168/268 [01:04<00:38, 2.59it/s]\u001b[A\n",
"Iteration: 63% 169/268 [01:05<00:38, 2.59it/s]\u001b[A\n",
"Iteration: 63% 170/268 [01:05<00:37, 2.59it/s]\u001b[A\n",
"Iteration: 64% 171/268 [01:06<00:37, 2.59it/s]\u001b[A\n",
"Iteration: 64% 172/268 [01:06<00:37, 2.59it/s]\u001b[A\n",
"Iteration: 65% 173/268 [01:06<00:36, 2.59it/s]\u001b[A\n",
"Iteration: 65% 174/268 [01:07<00:36, 2.59it/s]\u001b[A\n",
"Iteration: 65% 175/268 [01:07<00:35, 2.59it/s]\u001b[A{'loss': 0.166412353515625, 'learning_rate': 2.2885572139303485e-06, 'epoch': 2.656716417910448, 'total_flos': 1891093008798720, 'step': 712}\n",
"\n",
"Iteration: 66% 176/268 [01:08<00:35, 2.59it/s]\u001b[A\n",
"Iteration: 66% 177/268 [01:08<00:35, 2.59it/s]\u001b[A\n",
"Iteration: 66% 178/268 [01:08<00:34, 2.59it/s]\u001b[A\n",
"Iteration: 67% 179/268 [01:09<00:34, 2.59it/s]\u001b[A\n",
"Iteration: 67% 180/268 [01:09<00:34, 2.58it/s]\u001b[A\n",
"Iteration: 68% 181/268 [01:09<00:33, 2.58it/s]\u001b[A\n",
"Iteration: 68% 182/268 [01:10<00:33, 2.58it/s]\u001b[A\n",
"Iteration: 68% 183/268 [01:10<00:32, 2.59it/s]\u001b[A{'loss': 0.12270927429199219, 'learning_rate': 2.08955223880597e-06, 'epoch': 2.6865671641791042, 'total_flos': 1912387977139200, 'step': 720}\n",
"\n",
"Iteration: 69% 184/268 [01:11<00:32, 2.59it/s]\u001b[A\n",
"Iteration: 69% 185/268 [01:11<00:32, 2.59it/s]\u001b[A\n",
"Iteration: 69% 186/268 [01:11<00:31, 2.59it/s]\u001b[A\n",
"Iteration: 70% 187/268 [01:12<00:31, 2.59it/s]\u001b[A\n",
"Iteration: 70% 188/268 [01:12<00:30, 2.60it/s]\u001b[A\n",
"Iteration: 71% 189/268 [01:13<00:30, 2.59it/s]\u001b[A\n",
"Iteration: 71% 190/268 [01:13<00:30, 2.59it/s]\u001b[A\n",
"Iteration: 71% 191/268 [01:13<00:29, 2.59it/s]\u001b[A{'loss': 0.25324440002441406, 'learning_rate': 1.8905472636815921e-06, 'epoch': 2.716417910447761, 'total_flos': 1933682945479680, 'step': 728}\n",
"\n",
"Iteration: 72% 192/268 [01:14<00:29, 2.59it/s]\u001b[A\n",
"Iteration: 72% 193/268 [01:14<00:29, 2.58it/s]\u001b[A\n",
"Iteration: 72% 194/268 [01:14<00:28, 2.59it/s]\u001b[A\n",
"Iteration: 73% 195/268 [01:15<00:28, 2.59it/s]\u001b[A\n",
"Iteration: 73% 196/268 [01:15<00:27, 2.59it/s]\u001b[A\n",
"Iteration: 74% 197/268 [01:16<00:27, 2.59it/s]\u001b[A\n",
"Iteration: 74% 198/268 [01:16<00:27, 2.59it/s]\u001b[A\n",
"Iteration: 74% 199/268 [01:16<00:26, 2.60it/s]\u001b[A{'loss': 0.13264083862304688, 'learning_rate': 1.6915422885572142e-06, 'epoch': 2.746268656716418, 'total_flos': 1954977913820160, 'step': 736}\n",
"\n",
"Iteration: 75% 200/268 [01:17<00:26, 2.60it/s]\u001b[A\n",
"Iteration: 75% 201/268 [01:17<00:25, 2.59it/s]\u001b[A\n",
"Iteration: 75% 202/268 [01:18<00:25, 2.59it/s]\u001b[A\n",
"Iteration: 76% 203/268 [01:18<00:25, 2.60it/s]\u001b[A\n",
"Iteration: 76% 204/268 [01:18<00:24, 2.60it/s]\u001b[A\n",
"Iteration: 76% 205/268 [01:19<00:24, 2.59it/s]\u001b[A\n",
"Iteration: 77% 206/268 [01:19<00:23, 2.59it/s]\u001b[A\n",
"Iteration: 77% 207/268 [01:20<00:23, 2.59it/s]\u001b[A{'loss': 0.13996315002441406, 'learning_rate': 1.4925373134328358e-06, 'epoch': 2.7761194029850746, 'total_flos': 1976272882160640, 'step': 744}\n",
"\n",
"Iteration: 78% 208/268 [01:20<00:23, 2.59it/s]\u001b[A\n",
"Iteration: 78% 209/268 [01:20<00:22, 2.59it/s]\u001b[A\n",
"Iteration: 78% 210/268 [01:21<00:22, 2.59it/s]\u001b[A\n",
"Iteration: 79% 211/268 [01:21<00:21, 2.59it/s]\u001b[A\n",
"Iteration: 79% 212/268 [01:21<00:21, 2.60it/s]\u001b[A\n",
"Iteration: 79% 213/268 [01:22<00:21, 2.60it/s]\u001b[A\n",
"Iteration: 80% 214/268 [01:22<00:20, 2.59it/s]\u001b[A\n",
"Iteration: 80% 215/268 [01:23<00:20, 2.59it/s]\u001b[A{'loss': 0.1657257080078125, 'learning_rate': 1.2935323383084578e-06, 'epoch': 2.8059701492537314, 'total_flos': 1997567850501120, 'step': 752}\n",
"\n",
"Iteration: 81% 216/268 [01:23<00:20, 2.59it/s]\u001b[A\n",
"Iteration: 81% 217/268 [01:23<00:19, 2.59it/s]\u001b[A\n",
"Iteration: 81% 218/268 [01:24<00:19, 2.59it/s]\u001b[A\n",
"Iteration: 82% 219/268 [01:24<00:18, 2.59it/s]\u001b[A\n",
"Iteration: 82% 220/268 [01:25<00:18, 2.59it/s]\u001b[A\n",
"Iteration: 82% 221/268 [01:25<00:18, 2.59it/s]\u001b[A\n",
"Iteration: 83% 222/268 [01:25<00:17, 2.59it/s]\u001b[A\n",
"Iteration: 83% 223/268 [01:26<00:17, 2.59it/s]\u001b[A{'loss': 0.1683349609375, 'learning_rate': 1.0945273631840796e-06, 'epoch': 2.835820895522388, 'total_flos': 2018862818841600, 'step': 760}\n",
"\n",
"Iteration: 84% 224/268 [01:26<00:16, 2.59it/s]\u001b[A\n",
"Iteration: 84% 225/268 [01:26<00:16, 2.58it/s]\u001b[A\n",
"Iteration: 84% 226/268 [01:27<00:16, 2.59it/s]\u001b[A\n",
"Iteration: 85% 227/268 [01:27<00:15, 2.59it/s]\u001b[A\n",
"Iteration: 85% 228/268 [01:28<00:15, 2.59it/s]\u001b[A\n",
"Iteration: 85% 229/268 [01:28<00:15, 2.59it/s]\u001b[A\n",
"Iteration: 86% 230/268 [01:28<00:14, 2.60it/s]\u001b[A\n",
"Iteration: 86% 231/268 [01:29<00:14, 2.60it/s]\u001b[A{'loss': 0.17348670959472656, 'learning_rate': 8.955223880597015e-07, 'epoch': 2.8656716417910446, 'total_flos': 2040157787182080, 'step': 768}\n",
"\n",
"Iteration: 87% 232/268 [01:29<00:13, 2.60it/s]\u001b[A\n",
"Iteration: 87% 233/268 [01:30<00:13, 2.59it/s]\u001b[A\n",
"Iteration: 87% 234/268 [01:30<00:13, 2.59it/s]\u001b[A\n",
"Iteration: 88% 235/268 [01:30<00:12, 2.59it/s]\u001b[A\n",
"Iteration: 88% 236/268 [01:31<00:12, 2.59it/s]\u001b[A\n",
"Iteration: 88% 237/268 [01:31<00:11, 2.59it/s]\u001b[A\n",
"Iteration: 89% 238/268 [01:31<00:11, 2.59it/s]\u001b[A\n",
"Iteration: 89% 239/268 [01:32<00:11, 2.59it/s]\u001b[A{'loss': 0.214385986328125, 'learning_rate': 6.965174129353235e-07, 'epoch': 2.8955223880597014, 'total_flos': 2061452755522560, 'step': 776}\n",
"\n",
"Iteration: 90% 240/268 [01:32<00:10, 2.59it/s]\u001b[A\n",
"Iteration: 90% 241/268 [01:33<00:10, 2.59it/s]\u001b[A\n",
"Iteration: 90% 242/268 [01:33<00:10, 2.59it/s]\u001b[A\n",
"Iteration: 91% 243/268 [01:33<00:09, 2.59it/s]\u001b[A\n",
"Iteration: 91% 244/268 [01:34<00:09, 2.60it/s]\u001b[A\n",
"Iteration: 91% 245/268 [01:34<00:08, 2.59it/s]\u001b[A\n",
"Iteration: 92% 246/268 [01:35<00:08, 2.59it/s]\u001b[A\n",
"Iteration: 92% 247/268 [01:35<00:08, 2.59it/s]\u001b[A{'loss': 0.18590545654296875, 'learning_rate': 4.975124378109453e-07, 'epoch': 2.925373134328358, 'total_flos': 2082747723863040, 'step': 784}\n",
"\n",
"Iteration: 93% 248/268 [01:35<00:07, 2.59it/s]\u001b[A\n",
"Iteration: 93% 249/268 [01:36<00:07, 2.59it/s]\u001b[A\n",
"Iteration: 93% 250/268 [01:36<00:06, 2.59it/s]\u001b[A\n",
"Iteration: 94% 251/268 [01:36<00:06, 2.59it/s]\u001b[A\n",
"Iteration: 94% 252/268 [01:37<00:06, 2.59it/s]\u001b[A\n",
"Iteration: 94% 253/268 [01:37<00:05, 2.59it/s]\u001b[A\n",
"Iteration: 95% 254/268 [01:38<00:05, 2.59it/s]\u001b[A\n",
"Iteration: 95% 255/268 [01:38<00:05, 2.59it/s]\u001b[A{'loss': 0.1844921112060547, 'learning_rate': 2.9850746268656716e-07, 'epoch': 2.955223880597015, 'total_flos': 2104042692203520, 'step': 792}\n",
"\n",
"Iteration: 96% 256/268 [01:38<00:04, 2.58it/s]\u001b[A\n",
"Iteration: 96% 257/268 [01:39<00:04, 2.58it/s]\u001b[A\n",
"Iteration: 96% 258/268 [01:39<00:03, 2.59it/s]\u001b[A\n",
"Iteration: 97% 259/268 [01:40<00:03, 2.59it/s]\u001b[A\n",
"Iteration: 97% 260/268 [01:40<00:03, 2.59it/s]\u001b[A\n",
"Iteration: 97% 261/268 [01:40<00:02, 2.59it/s]\u001b[A\n",
"Iteration: 98% 262/268 [01:41<00:02, 2.60it/s]\u001b[A\n",
"Iteration: 98% 263/268 [01:41<00:01, 2.59it/s]\u001b[A{'loss': 0.17237091064453125, 'learning_rate': 9.950248756218906e-08, 'epoch': 2.9850746268656714, 'total_flos': 2125337660544000, 'step': 800}\n",
"\n",
"Iteration: 99% 264/268 [01:41<00:01, 2.59it/s]\u001b[A\n",
"Iteration: 99% 265/268 [01:42<00:01, 2.59it/s]\u001b[A\n",
"Iteration: 99% 266/268 [01:42<00:00, 2.59it/s]\u001b[A\n",
"Iteration: 100% 267/268 [01:43<00:00, 2.59it/s]\u001b[A\n",
"Iteration: 100% 268/268 [01:43<00:00, 2.60it/s]\n",
"Epoch: 100% 3/3 [05:14<00:00, 104.84s/it]\n",
"/usr/local/lib/python3.6/dist-packages/transformers/trainer.py:1129: FutureWarning: This method is deprecated, use `Trainer.is_world_process_zero()` instead.\n",
" warnings.warn(\"This method is deprecated, use `Trainer.is_world_process_zero()` instead.\", FutureWarning)\n",
"10/21/2020 07:50:20 - INFO - __main__ - *** Evaluate ***\n",
"Evaluation: 100% 131/131 [00:04<00:00, 28.99it/s]\n",
"{'eval_loss': 0.5057335923982147, 'eval_mcc': 0.6107363335829461, 'epoch': 3.0, 'total_flos': 2133905557962240, 'step': 804}\n",
"10/21/2020 07:50:25 - INFO - __main__ - ***** Eval results cola *****\n",
"10/21/2020 07:50:25 - INFO - __main__ - eval_loss = 0.5057335923982147\n",
"10/21/2020 07:50:25 - INFO - __main__ - eval_mcc = 0.6107363335829461\n",
"10/21/2020 07:50:25 - INFO - __main__ - epoch = 3.0\n",
"10/21/2020 07:50:25 - INFO - __main__ - total_flos = 2133905557962240\n"
],
"name": "stdout"
}
]
},
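  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In the run above the training loss is logged every few steps, while `eval_loss`/`eval_mcc` only appear once, after training has finished, so train and validation metrics never share a step. The cell below is a minimal, self-contained sketch (not the exact `run_glue.py` setup used above) of how both could be logged at the same cadence through the `Trainer` API. It assumes this transformers release accepts `evaluation_strategy=\"steps\"` in `TrainingArguments` (older releases exposed `evaluate_during_training=True` instead), and it uses a hypothetical model name plus a tiny toy dataset in place of the real CoLA data; the full script additionally wires up a `compute_metrics` function so that `eval_mcc` is reported alongside `eval_loss`."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Minimal sketch: log the training loss and run evaluation at the same steps.\n",
    "# Assumptions: evaluation_strategy=\"steps\" is available in this transformers\n",
    "# release (older releases used evaluate_during_training=True instead); the model\n",
    "# name and toy dataset below are placeholders, not the run_glue.py setup above.\n",
    "from transformers import (\n",
    "    AutoModelForSequenceClassification,\n",
    "    AutoTokenizer,\n",
    "    Trainer,\n",
    "    TrainingArguments,\n",
    ")\n",
    "\n",
    "model_name = \"distilbert-base-uncased\"  # hypothetical; swap in the model used above\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)\n",
    "\n",
    "# Tiny stand-in for the tokenized CoLA splits (acceptable vs. unacceptable sentences).\n",
    "texts = [\"The cat sat on the mat.\", \"Cat the on mat sat the.\"]\n",
    "labels = [1, 0]\n",
    "enc = tokenizer(texts, truncation=True, padding=True)\n",
    "toy_dataset = [\n",
    "    {\"input_ids\": enc[\"input_ids\"][i],\n",
    "     \"attention_mask\": enc[\"attention_mask\"][i],\n",
    "     \"label\": labels[i]}\n",
    "    for i in range(len(texts))\n",
    "]\n",
    "\n",
    "training_args = TrainingArguments(\n",
    "    output_dir=\"./cola-logging-demo\",\n",
    "    num_train_epochs=1,\n",
    "    per_device_train_batch_size=2,\n",
    "    evaluation_strategy=\"steps\",  # evaluate while training...\n",
    "    eval_steps=8,                 # ...at the same cadence as the loss logging\n",
    "    logging_steps=8,\n",
    ")\n",
    "\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=toy_dataset,\n",
    "    eval_dataset=toy_dataset,  # the real run would pass the tokenized CoLA dev set here\n",
    ")\n",
    "\n",
    "trainer.train()"
   ],
   "execution_count": null,
   "outputs": []
  },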
{
"cell_type": "code",
"metadata": {
"id": "7jPVVVfWn3Uq"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}