ceshine/quest-public.ipynb Secret

## quest-public.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "quest-public.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "authorship_tag": "ABX9TyNHvdkFfvCQLr8F+fTSgxtj",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "accelerator": "TPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/ceshine/752c77742973a013320a9f20384528a1/quest-public.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "7BI0qPAAyWFS",
        "colab_type": "text"
      },
      "source": [
        "## Prepare environment"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "39MSBo8QzdIE",
        "colab_type": "code",
        "outputId": "a51f0a08-07bf-472a-e6a4-415290e3d1a5",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "%tensorflow_version 2.x"
      ],
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "TensorFlow 2.x selected.\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "_R5grSggQIBk",
        "colab_type": "code",
        "outputId": "52024d31-29bf-470b-f70c-d081986f790d",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "import tensorflow as tf\n",
        "tf.__version__"
      ],
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'2.1.0'"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 2
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "r6FwWWP9zmKc",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Run this cell once, restart the runtime, and then skip it on later runs.\n",
        "# Only transformers is pinned (2.3.0); `-U` upgrades the other listed packages.\n",
        "!pip install -U wandb fire transformers==2.3.0 python-telegram-bot kaggle ipykernel"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "m5DJi5zI-nJb",
        "colab_type": "code",
        "colab": {},
        "cellView": "both"
      },
      "source": [
        "#@title Secrets (Leave TG constants empty if you don't want Telegram notification)\n",
        "TG_TOKEN = ''  #@param {type: \"string\"}\n",
        "TG_CHAT_ID = ''  #@param {type: \"string\"}\n",
        "KAGGLE_API_KEY = 'api key used for uploading the dataset'  #@param {type: \"string\"}"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "wDjoS9NEzqmK",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Authenticate to read from Google Cloud Storage\n",
        "from google.colab import auth\n",
        "auth.authenticate_user()"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Ma7tIEOpzvZ1",
        "colab_type": "code",
        "outputId": "29a6e280-b1b5-4c99-bcad-7eb1abbe8756",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 86
        }
      },
      "source": [
        "!gsutil cp gs://ceshine-colab-tmp-2/quest/*.whl ."
      ],
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Copying gs://ceshine-colab-tmp-2/quest/quest-0.0.1-py3-none-any.whl...\n",
            "/ [0 files][    0.0 B/ 18.0 KiB]                                                \r/ [1 files][ 18.0 KiB/ 18.0 KiB]                                                \rCopying gs://ceshine-colab-tmp-2/quest/tf_helper_bot-0.0.1-py3-none-any.whl...\n",
            "/ [1 files][ 18.0 KiB/ 31.2 KiB]                                                \r/ [2 files][ 31.2 KiB/ 31.2 KiB]                                                \r\n",
            "Operation completed over 2 objects/31.2 KiB.                                     \n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "MOxYSAxD2csb",
        "colab_type": "code",
        "outputId": "023784ea-4f39-4d8a-f73c-2ccf1f9df387",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 121
        }
      },
      "source": [
        "!pip install --force-reinstall tf_helper_bot-0.0.1-py3-none-any.whl\n",
        "!pip install --force-reinstall quest-0.0.1-py3-none-any.whl"
      ],
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Processing ./tf_helper_bot-0.0.1-py3-none-any.whl\n",
            "Installing collected packages: tf-helper-bot\n",
            "Successfully installed tf-helper-bot-0.0.1\n",
            "Processing ./quest-0.0.1-py3-none-any.whl\n",
            "Installing collected packages: quest\n",
            "Successfully installed quest-0.0.1\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "JhZ9gEFx9l1j",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import json\n",
        "import os\n",
        "\n",
        "# Write the Kaggle credentials from Python rather than through `echo`, so the\n",
        "# key is JSON-escaped and never interpolated into a shell command line.\n",
        "kaggle_dir = os.path.expanduser('~/.kaggle')\n",
        "os.makedirs(kaggle_dir, exist_ok=True)\n",
        "kaggle_json_path = os.path.join(kaggle_dir, 'kaggle.json')\n",
        "# Create the file with owner-only permissions so the key is never world-readable.\n",
        "fd = os.open(kaggle_json_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)\n",
        "with os.fdopen(fd, 'w') as fout:\n",
        "    json.dump({'username': 'ceshine', 'key': KAGGLE_API_KEY}, fout)\n",
        "# Tighten permissions when the file already existed (same effect as `chmod 600`).\n",
        "os.chmod(kaggle_json_path, 0o600)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "x7IyXDDq2jWd",
        "colab_type": "code",
        "outputId": "5f6f0c3d-7a95-4857-e023-3d5aee9ab8a7",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 243
        }
      },
      "source": [
        "!pip install -U --force-reinstall --no-deps kaggle"
      ],
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Collecting kaggle\n",
            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/62/ab/bb20f9b9e24f9a6250f95a432f8d9a7d745f8d24039d7a5a6eaadb7783ba/kaggle-1.5.6.tar.gz (58kB)\n",
            "\r\u001b[K     |█████▋                          | 10kB 18.4MB/s eta 0:00:01\r\u001b[K     |███████████▎                    | 20kB 2.2MB/s eta 0:00:01\r\u001b[K     |█████████████████               | 30kB 3.1MB/s eta 0:00:01\r\u001b[K     |██████████████████████▌         | 40kB 2.1MB/s eta 0:00:01\r\u001b[K     |████████████████████████████▏   | 51kB 2.5MB/s eta 0:00:01\r\u001b[K     |████████████████████████████████| 61kB 2.4MB/s \n",
            "\u001b[?25hBuilding wheels for collected packages: kaggle\n",
            "  Building wheel for kaggle (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for kaggle: filename=kaggle-1.5.6-cp36-none-any.whl size=72859 sha256=6f91a26c57e5844d3334549065a3f2a05392b2894c90e41132fe88097c461c1b\n",
            "  Stored in directory: /root/.cache/pip/wheels/57/4e/e8/bb28d035162fb8f17f8ca5d42c3230e284c6aa565b42b72674\n",
            "Successfully built kaggle\n",
            "Installing collected packages: kaggle\n",
            "  Found existing installation: kaggle 1.5.6\n",
            "    Uninstalling kaggle-1.5.6:\n",
            "      Successfully uninstalled kaggle-1.5.6\n",
            "Successfully installed kaggle-1.5.6\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "U-6lZ5Fg9xiK",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Staging directory for the files that get uploaded as a Kaggle dataset.\n",
        "!mkdir -p cache/kaggleds\n",
        "# Disabled command, kept for reference only. It used to sit behind a shell `#`\n",
        "# after a `!cd`, which runs in a throwaway subshell and therefore did nothing:\n",
        "#   !cd cache/kaggleds && kaggle datasets metadata quest-models"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "_opEfdnN2_Oq",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "!echo '{ \\\n",
        "  \"subtitle\": \"\", \\\n",
        "  \"description\": \"\", \\\n",
        "  \"title\": \"Quest Models\", \\\n",
        "  \"keywords\": [], \\\n",
        "  \"id\": \"ceshine/quest-models-public\"\\\n",
        "}' > cache/kaggleds/dataset-metadata.json"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "c2txSWpZ90QF",
        "colab_type": "code",
        "outputId": "a64967ab-2ba7-4ec0-eda2-4c2b087509e8",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 243
        }
      },
      "source": [
        "!gsutil cp -r gs://ceshine-colab-tmp-2/quest/tokenizer* cache/kaggleds/"
      ],
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Copying gs://ceshine-colab-tmp-2/quest/tokenizer_roberta-base/added_tokens.json...\n",
            "Copying gs://ceshine-colab-tmp-2/quest/tokenizer_roberta-base/merges.txt...\n",
            "Copying gs://ceshine-colab-tmp-2/quest/tokenizer_roberta-base/special_tokens_map.json...\n",
            "Copying gs://ceshine-colab-tmp-2/quest/tokenizer_roberta-base/tokenizer_config.json...\n",
            "/ [4 files][445.8 KiB/445.8 KiB]                                                \n",
            "==> NOTE: You are performing a sequence of gsutil operations that may\n",
            "run significantly faster if you instead use gsutil -m cp ... Please\n",
            "see the -m section under \"gsutil help options\" for further information\n",
            "about when gsutil -m can be advantageous.\n",
            "\n",
            "Copying gs://ceshine-colab-tmp-2/quest/tokenizer_roberta-base/vocab.json...\n",
            "/ [5 files][  1.3 MiB/  1.3 MiB]                                                \n",
            "Operation completed over 5 objects/1.3 MiB.                                      \n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0Mmnav9C_WiU",
        "colab_type": "code",
        "outputId": "289732a7-c7a9-4a75-fb02-018c45bddf3e",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "!ls cache/kaggleds"
      ],
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "dataset-metadata.json  tokenizer_roberta-base\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "p6zDyKKQFMl8",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import shlex\n",
        "\n",
        "# Build the `export` line that the training cell loads with `source env.sh`.\n",
        "# The Telegram settings are shell-quoted so special characters or empty values\n",
        "# cannot break (or inject into) the sourced script.\n",
        "content = 'export TG_TOKEN={} TG_CHAT_ID={} TF_CPP_MIN_LOG_LEVEL=3'.format(\n",
        "    shlex.quote(str(TG_TOKEN)), shlex.quote(str(TG_CHAT_ID)))\n",
        "# Write the file from Python so the secrets never appear on a shell command line.\n",
        "with open('env.sh', 'w') as fout:\n",
        "    fout.write(content + '\\n')"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "KZJW7utQypyq",
        "colab_type": "text"
      },
      "source": [
        "## Train models"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "emqTCElpHCMF",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "!rm -rf cache/kaggleds/*fold*"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "V-Qa9agMpJdS",
        "colab_type": "code",
        "outputId": "7cbf7f6e-c531-401a-89b6-ad4f81f56f36",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        }
      },
      "source": [
        "!source env.sh && \\\n",
        "   python -m quest.train_folds --model-name roberta-base --batch-size 2 \\\n",
        "  --train-path-pattern \"gs://ceshine-colab-tmp-2/quest/train-%d-*.tfrec\" \\\n",
        "  --valid-path-pattern \"gs://ceshine-colab-tmp-2/quest/valid-%d-*.tfrec\" \\\n",
        "  --max_lr 1e-4 -n-folds 5 --freeze 3 \\\n",
        "  --steps 1500 --checkpoint-interval 300 --log-interval 150 \\\n",
        "  --output_path_pattern \"cache/kaggleds/roberta-base-fold-%d\""
      ],
      "execution_count": 15,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "====================\n",
            "Training Fold 1\n",
            "====================\n",
            "Running on TPU  ['10.7.137.26:8470']\n",
            "INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0\n",
            "REPLICAS:  8\n",
            "cnt: 4863 batch size: 16\n",
            "cnt: 1216 batch size: 32\n",
            "Model: \"dual_roberta_model\"\n",
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "roberta_question (RobertaEnc multiple                  124645632 \n",
            "_________________________________________________________________\n",
            "dropout_38 (Dropout)         multiple                  0         \n",
            "_________________________________________________________________\n",
            "q_classifier (Dense)         multiple                  16149     \n",
            "_________________________________________________________________\n",
            "a_classifier (Dense)         multiple                  3845      \n",
            "_________________________________________________________________\n",
            "j_classifier (Dense)         multiple                  9220      \n",
            "_________________________________________________________________\n",
            "se_layer (SELayer)           multiple                  295872    \n",
            "_________________________________________________________________\n",
            "se_layer_1 (SELayer)         multiple                  295872    \n",
            "_________________________________________________________________\n",
            "se_layer_2 (SELayer)         multiple                  2657088   \n",
            "=================================================================\n",
            "Total params: 127,923,678\n",
            "Trainable params: 3,278,046\n",
            "Non-trainable params: 124,645,632\n",
            "_________________________________________________________________\n",
            "None\n",
            "Train for 912 steps\n",
            "912/912 [==============================] - 118s 129ms/step - loss: 0.4473\n",
            "Model: \"dual_roberta_model\"\n",
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "roberta_question (RobertaEnc multiple                  124645632 \n",
            "_________________________________________________________________\n",
            "dropout_38 (Dropout)         multiple                  0         \n",
            "_________________________________________________________________\n",
            "q_classifier (Dense)         multiple                  16149     \n",
            "_________________________________________________________________\n",
            "a_classifier (Dense)         multiple                  3845      \n",
            "_________________________________________________________________\n",
            "j_classifier (Dense)         multiple                  9220      \n",
            "_________________________________________________________________\n",
            "se_layer (SELayer)           multiple                  295872    \n",
            "_________________________________________________________________\n",
            "se_layer_1 (SELayer)         multiple                  295872    \n",
            "_________________________________________________________________\n",
            "se_layer_2 (SELayer)         multiple                  2657088   \n",
            "=================================================================\n",
            "Total params: 127,923,678\n",
            "Trainable params: 127,923,678\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n",
            "None\n",
            "Steps per epoch: 304 | 38\n",
            "WARNING:tensorflow:Gradients do not exist for variables ['roberta_question/roberta/pooler/dense/kernel:0', 'roberta_question/roberta/pooler/dense/bias:0'] when minimizing the loss.\n",
            "WARNING:tensorflow:Gradients do not exist for variables ['roberta_question/roberta/pooler/dense/kernel:0', 'roberta_question/roberta/pooler/dense/bias:0'] when minimizing the loss.\n",
            "/tensorflow-2.1.0/python3.6/tensorflow_core/python/framework/indexed_slices.py:433: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
            "  \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n",
            "WARNING:tensorflow:Gradients do not exist for variables ['roberta_question/roberta/pooler/dense/kernel:0', 'roberta_question/roberta/pooler/dense/bias:0'] when minimizing the loss.\n",
            "WARNING:tensorflow:Gradients do not exist for variables ['roberta_question/roberta/pooler/dense/kernel:0', 'roberta_question/roberta/pooler/dense/bias:0'] when minimizing the loss.\n",
            "[INFO][02/12/2020 12:30:08] Step   150 | loss 0.3948 | lr 1.00e-04 | 0.933s per step\n",
            "[INFO][02/12/2020 12:30:42] Step   300 | loss 0.3839 | lr 9.70e-05 | 0.228s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:20<00:00,  6.68it/s]\n",
            "[INFO][02/12/2020 12:31:03] Metrics at step 300:\n",
            "[INFO][02/12/2020 12:31:03] loss: 0.3749\n",
            "[INFO][02/12/2020 12:31:03] spearman: 38.21\n",
            "[INFO][02/12/2020 12:31:40] Step   450 | loss 0.3719 | lr 8.83e-05 | 0.386s per step\n",
            "[INFO][02/12/2020 12:32:15] Step   600 | loss 0.3676 | lr 7.50e-05 | 0.230s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:06<00:00,  6.73it/s]\n",
            "[INFO][02/12/2020 12:32:21] Metrics at step 600:\n",
            "[INFO][02/12/2020 12:32:21] loss: 0.3716\n",
            "[INFO][02/12/2020 12:32:21] spearman: 39.09\n",
            "[INFO][02/12/2020 12:32:58] Step   750 | loss 0.3611 | lr 5.87e-05 | 0.291s per step\n",
            "[INFO][02/12/2020 12:33:33] Step   900 | loss 0.3588 | lr 4.13e-05 | 0.231s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:05<00:00,  6.55it/s]\n",
            "[INFO][02/12/2020 12:33:39] Metrics at step 900:\n",
            "[INFO][02/12/2020 12:33:39] loss: 0.3683\n",
            "[INFO][02/12/2020 12:33:39] spearman: 40.49\n",
            "[INFO][02/12/2020 12:34:16] Step  1050 | loss 0.3496 | lr 2.50e-05 | 0.290s per step\n",
            "[INFO][02/12/2020 12:34:51] Step  1200 | loss 0.3484 | lr 1.17e-05 | 0.230s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:06<00:00,  6.75it/s]\n",
            "[INFO][02/12/2020 12:34:57] Metrics at step 1200:\n",
            "[INFO][02/12/2020 12:34:57] loss: 0.3681\n",
            "[INFO][02/12/2020 12:34:57] spearman: 40.60\n",
            "[INFO][02/12/2020 12:35:35] Step  1350 | loss 0.3424 | lr 3.03e-06 | 0.291s per step\n",
            "[INFO][02/12/2020 12:36:09] Step  1500 | loss 0.3402 | lr 1.00e-08 | 0.229s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:06<00:00,  6.58it/s]\n",
            "[INFO][02/12/2020 12:36:15] Metrics at step 1500:\n",
            "[INFO][02/12/2020 12:36:15] loss: 0.3690\n",
            "[INFO][02/12/2020 12:36:15] spearman: 40.55\n",
            "[INFO][02/12/2020 12:36:15] Training finished. Best step(s):\n",
            "[INFO][02/12/2020 12:36:15] loss: 0.3681 @ step 1200\n",
            "[INFO][02/12/2020 12:36:15] spearman: 40.60 @ step 1200\n",
            "====================\n",
            "Training Fold 2\n",
            "====================\n",
            "Running on TPU  ['10.7.137.26:8470']\n",
            "WARNING:tensorflow:TPU system 10.7.137.26:8470 has already been initialized. Reinitializing the TPU can cause previously created variables on TPU to be lost.\n",
            "WARNING:tensorflow:TPU system 10.7.137.26:8470 has already been initialized. Reinitializing the TPU can cause previously created variables on TPU to be lost.\n",
            "REPLICAS:  8\n",
            "cnt: 4863 batch size: 16\n",
            "cnt: 1216 batch size: 32\n",
            "Model: \"dual_roberta_model_1\"\n",
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "roberta_question (RobertaEnc multiple                  124645632 \n",
            "_________________________________________________________________\n",
            "dropout_77 (Dropout)         multiple                  0         \n",
            "_________________________________________________________________\n",
            "q_classifier (Dense)         multiple                  16149     \n",
            "_________________________________________________________________\n",
            "a_classifier (Dense)         multiple                  3845      \n",
            "_________________________________________________________________\n",
            "j_classifier (Dense)         multiple                  9220      \n",
            "_________________________________________________________________\n",
            "se_layer_3 (SELayer)         multiple                  295872    \n",
            "_________________________________________________________________\n",
            "se_layer_4 (SELayer)         multiple                  295872    \n",
            "_________________________________________________________________\n",
            "se_layer_5 (SELayer)         multiple                  2657088   \n",
            "=================================================================\n",
            "Total params: 127,923,678\n",
            "Trainable params: 3,278,046\n",
            "Non-trainable params: 124,645,632\n",
            "_________________________________________________________________\n",
            "None\n",
            "Train for 912 steps\n",
            "912/912 [==============================] - 123s 134ms/step - loss: 0.4428\n",
            "Model: \"dual_roberta_model_1\"\n",
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "roberta_question (RobertaEnc multiple                  124645632 \n",
            "_________________________________________________________________\n",
            "dropout_77 (Dropout)         multiple                  0         \n",
            "_________________________________________________________________\n",
            "q_classifier (Dense)         multiple                  16149     \n",
            "_________________________________________________________________\n",
            "a_classifier (Dense)         multiple                  3845      \n",
            "_________________________________________________________________\n",
            "j_classifier (Dense)         multiple                  9220      \n",
            "_________________________________________________________________\n",
            "se_layer_3 (SELayer)         multiple                  295872    \n",
            "_________________________________________________________________\n",
            "se_layer_4 (SELayer)         multiple                  295872    \n",
            "_________________________________________________________________\n",
            "se_layer_5 (SELayer)         multiple                  2657088   \n",
            "=================================================================\n",
            "Total params: 127,923,678\n",
            "Trainable params: 127,923,678\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n",
            "None\n",
            "Steps per epoch: 304 | 38\n",
            "WARNING:tensorflow:Gradients do not exist for variables ['roberta_question/roberta/pooler/dense/kernel:0', 'roberta_question/roberta/pooler/dense/bias:0'] when minimizing the loss.\n",
            "WARNING:tensorflow:Gradients do not exist for variables ['roberta_question/roberta/pooler/dense/kernel:0', 'roberta_question/roberta/pooler/dense/bias:0'] when minimizing the loss.\n",
            "/tensorflow-2.1.0/python3.6/tensorflow_core/python/framework/indexed_slices.py:433: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
            "  \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n",
            "WARNING:tensorflow:Gradients do not exist for variables ['roberta_question/roberta/pooler/dense/kernel:0', 'roberta_question/roberta/pooler/dense/bias:0'] when minimizing the loss.\n",
            "WARNING:tensorflow:Gradients do not exist for variables ['roberta_question/roberta/pooler/dense/kernel:0', 'roberta_question/roberta/pooler/dense/bias:0'] when minimizing the loss.\n",
            "[INFO][02/12/2020 12:41:36] Step   150 | loss 0.3958 | lr 1.00e-04 | 0.933s per step\n",
            "[INFO][02/12/2020 12:42:11] Step   300 | loss 0.3848 | lr 9.70e-05 | 0.228s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:20<00:00,  6.54it/s]\n",
            "[INFO][02/12/2020 12:42:31] Metrics at step 300:\n",
            "[INFO][02/12/2020 12:42:31] loss: 0.3772\n",
            "[INFO][02/12/2020 12:42:31] spearman: 36.65\n",
            "[INFO][02/12/2020 12:43:09] Step   450 | loss 0.3718 | lr 8.83e-05 | 0.386s per step\n",
            "[INFO][02/12/2020 12:43:44] Step   600 | loss 0.3681 | lr 7.50e-05 | 0.234s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:06<00:00,  6.80it/s]\n",
            "[INFO][02/12/2020 12:43:50] Metrics at step 600:\n",
            "[INFO][02/12/2020 12:43:50] loss: 0.3712\n",
            "[INFO][02/12/2020 12:43:50] spearman: 38.33\n",
            "[INFO][02/12/2020 12:44:28] Step   750 | loss 0.3627 | lr 5.87e-05 | 0.294s per step\n",
            "[INFO][02/12/2020 12:45:02] Step   900 | loss 0.3574 | lr 4.13e-05 | 0.232s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:06<00:00,  6.56it/s]\n",
            "[INFO][02/12/2020 12:45:09] Metrics at step 900:\n",
            "[INFO][02/12/2020 12:45:09] loss: 0.3697\n",
            "[INFO][02/12/2020 12:45:09] spearman: 38.91\n",
            "[INFO][02/12/2020 12:45:46] Step  1050 | loss 0.3528 | lr 2.50e-05 | 0.293s per step\n",
            "[INFO][02/12/2020 12:46:21] Step  1200 | loss 0.3459 | lr 1.17e-05 | 0.233s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:06<00:00,  6.57it/s]\n",
            "[INFO][02/12/2020 12:46:27] Metrics at step 1200:\n",
            "[INFO][02/12/2020 12:46:27] loss: 0.3664\n",
            "[INFO][02/12/2020 12:46:27] spearman: 39.43\n",
            "[INFO][02/12/2020 12:47:05] Step  1350 | loss 0.3429 | lr 3.03e-06 | 0.290s per step\n",
            "[INFO][02/12/2020 12:47:40] Step  1500 | loss 0.3392 | lr 1.00e-08 | 0.231s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:06<00:00,  6.66it/s]\n",
            "[INFO][02/12/2020 12:47:46] Metrics at step 1500:\n",
            "[INFO][02/12/2020 12:47:46] loss: 0.3671\n",
            "[INFO][02/12/2020 12:47:46] spearman: 39.63\n",
            "[INFO][02/12/2020 12:47:49] Training finished. Best step(s):\n",
            "[INFO][02/12/2020 12:47:49] loss: 0.3664 @ step 1200\n",
            "[INFO][02/12/2020 12:47:49] spearman: 39.63 @ step 1500\n",
            "====================\n",
            "Training Fold 3\n",
            "====================\n",
            "Running on TPU  ['10.7.137.26:8470']\n",
            "WARNING:tensorflow:TPU system 10.7.137.26:8470 has already been initialized. Reinitializing the TPU can cause previously created variables on TPU to be lost.\n",
            "WARNING:tensorflow:TPU system 10.7.137.26:8470 has already been initialized. Reinitializing the TPU can cause previously created variables on TPU to be lost.\n",
            "REPLICAS:  8\n",
            "cnt: 4863 batch size: 16\n",
            "cnt: 1216 batch size: 32\n",
            "Model: \"dual_roberta_model_2\"\n",
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "roberta_question (RobertaEnc multiple                  124645632 \n",
            "_________________________________________________________________\n",
            "dropout_116 (Dropout)        multiple                  0         \n",
            "_________________________________________________________________\n",
            "q_classifier (Dense)         multiple                  16149     \n",
            "_________________________________________________________________\n",
            "a_classifier (Dense)         multiple                  3845      \n",
            "_________________________________________________________________\n",
            "j_classifier (Dense)         multiple                  9220      \n",
            "_________________________________________________________________\n",
            "se_layer_6 (SELayer)         multiple                  295872    \n",
            "_________________________________________________________________\n",
            "se_layer_7 (SELayer)         multiple                  295872    \n",
            "_________________________________________________________________\n",
            "se_layer_8 (SELayer)         multiple                  2657088   \n",
            "=================================================================\n",
            "Total params: 127,923,678\n",
            "Trainable params: 3,278,046\n",
            "Non-trainable params: 124,645,632\n",
            "_________________________________________________________________\n",
            "None\n",
            "Train for 912 steps\n",
            "912/912 [==============================] - 121s 133ms/step - loss: 0.4483\n",
            "Model: \"dual_roberta_model_2\"\n",
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "roberta_question (RobertaEnc multiple                  124645632 \n",
            "_________________________________________________________________\n",
            "dropout_116 (Dropout)        multiple                  0         \n",
            "_________________________________________________________________\n",
            "q_classifier (Dense)         multiple                  16149     \n",
            "_________________________________________________________________\n",
            "a_classifier (Dense)         multiple                  3845      \n",
            "_________________________________________________________________\n",
            "j_classifier (Dense)         multiple                  9220      \n",
            "_________________________________________________________________\n",
            "se_layer_6 (SELayer)         multiple                  295872    \n",
            "_________________________________________________________________\n",
            "se_layer_7 (SELayer)         multiple                  295872    \n",
            "_________________________________________________________________\n",
            "se_layer_8 (SELayer)         multiple                  2657088   \n",
            "=================================================================\n",
            "Total params: 127,923,678\n",
            "Trainable params: 127,923,678\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n",
            "None\n",
            "Steps per epoch: 304 | 38\n",
            "WARNING:tensorflow:Gradients do not exist for variables ['roberta_question/roberta/pooler/dense/kernel:0', 'roberta_question/roberta/pooler/dense/bias:0'] when minimizing the loss.\n",
            "WARNING:tensorflow:Gradients do not exist for variables ['roberta_question/roberta/pooler/dense/kernel:0', 'roberta_question/roberta/pooler/dense/bias:0'] when minimizing the loss.\n",
            "/tensorflow-2.1.0/python3.6/tensorflow_core/python/framework/indexed_slices.py:433: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
            "  \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n",
            "WARNING:tensorflow:Gradients do not exist for variables ['roberta_question/roberta/pooler/dense/kernel:0', 'roberta_question/roberta/pooler/dense/bias:0'] when minimizing the loss.\n",
            "WARNING:tensorflow:Gradients do not exist for variables ['roberta_question/roberta/pooler/dense/kernel:0', 'roberta_question/roberta/pooler/dense/bias:0'] when minimizing the loss.\n",
            "[INFO][02/12/2020 12:53:12] Step   150 | loss 0.3958 | lr 1.00e-04 | 0.937s per step\n",
            "[INFO][02/12/2020 12:53:46] Step   300 | loss 0.3854 | lr 9.70e-05 | 0.229s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:21<00:00,  6.47it/s]\n",
            "[INFO][02/12/2020 12:54:07] Metrics at step 300:\n",
            "[INFO][02/12/2020 12:54:07] loss: 0.3714\n",
            "[INFO][02/12/2020 12:54:07] spearman: 37.81\n",
            "[INFO][02/12/2020 12:54:45] Step   450 | loss 0.3736 | lr 8.83e-05 | 0.394s per step\n",
            "[INFO][02/12/2020 12:55:19] Step   600 | loss 0.3682 | lr 7.50e-05 | 0.229s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:06<00:00,  6.46it/s]\n",
            "[INFO][02/12/2020 12:55:26] Metrics at step 600:\n",
            "[INFO][02/12/2020 12:55:26] loss: 0.3689\n",
            "[INFO][02/12/2020 12:55:26] spearman: 39.83\n",
            "[INFO][02/12/2020 12:56:03] Step   750 | loss 0.3611 | lr 5.87e-05 | 0.294s per step\n",
            "[INFO][02/12/2020 12:56:38] Step   900 | loss 0.3580 | lr 4.13e-05 | 0.232s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:06<00:00,  6.39it/s]\n",
            "[INFO][02/12/2020 12:56:45] Metrics at step 900:\n",
            "[INFO][02/12/2020 12:56:45] loss: 0.3641\n",
            "[INFO][02/12/2020 12:56:45] spearman: 40.62\n",
            "[INFO][02/12/2020 12:57:23] Step  1050 | loss 0.3512 | lr 2.50e-05 | 0.298s per step\n",
            "[INFO][02/12/2020 12:57:58] Step  1200 | loss 0.3466 | lr 1.17e-05 | 0.233s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:06<00:00,  6.49it/s]\n",
            "[INFO][02/12/2020 12:58:04] Metrics at step 1200:\n",
            "[INFO][02/12/2020 12:58:04] loss: 0.3640\n",
            "[INFO][02/12/2020 12:58:04] spearman: 41.06\n",
            "[INFO][02/12/2020 12:58:42] Step  1350 | loss 0.3410 | lr 3.03e-06 | 0.294s per step\n",
            "[INFO][02/12/2020 12:59:17] Step  1500 | loss 0.3386 | lr 1.00e-08 | 0.236s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:06<00:00,  6.60it/s]\n",
            "[INFO][02/12/2020 12:59:23] Metrics at step 1500:\n",
            "[INFO][02/12/2020 12:59:23] loss: 0.3650\n",
            "[INFO][02/12/2020 12:59:23] spearman: 40.91\n",
            "[INFO][02/12/2020 12:59:23] Training finished. Best step(s):\n",
            "[INFO][02/12/2020 12:59:23] loss: 0.3640 @ step 1200\n",
            "[INFO][02/12/2020 12:59:23] spearman: 41.06 @ step 1200\n",
            "====================\n",
            "Training Fold 4\n",
            "====================\n",
            "Running on TPU  ['10.7.137.26:8470']\n",
            "WARNING:tensorflow:TPU system 10.7.137.26:8470 has already been initialized. Reinitializing the TPU can cause previously created variables on TPU to be lost.\n",
            "WARNING:tensorflow:TPU system 10.7.137.26:8470 has already been initialized. Reinitializing the TPU can cause previously created variables on TPU to be lost.\n",
            "REPLICAS:  8\n",
            "cnt: 4863 batch size: 16\n",
            "cnt: 1216 batch size: 32\n",
            "Model: \"dual_roberta_model_3\"\n",
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "roberta_question (RobertaEnc multiple                  124645632 \n",
            "_________________________________________________________________\n",
            "dropout_155 (Dropout)        multiple                  0         \n",
            "_________________________________________________________________\n",
            "q_classifier (Dense)         multiple                  16149     \n",
            "_________________________________________________________________\n",
            "a_classifier (Dense)         multiple                  3845      \n",
            "_________________________________________________________________\n",
            "j_classifier (Dense)         multiple                  9220      \n",
            "_________________________________________________________________\n",
            "se_layer_9 (SELayer)         multiple                  295872    \n",
            "_________________________________________________________________\n",
            "se_layer_10 (SELayer)        multiple                  295872    \n",
            "_________________________________________________________________\n",
            "se_layer_11 (SELayer)        multiple                  2657088   \n",
            "=================================================================\n",
            "Total params: 127,923,678\n",
            "Trainable params: 3,278,046\n",
            "Non-trainable params: 124,645,632\n",
            "_________________________________________________________________\n",
            "None\n",
            "Train for 912 steps\n",
            "912/912 [==============================] - 124s 136ms/step - loss: 0.4514\n",
            "Model: \"dual_roberta_model_3\"\n",
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "roberta_question (RobertaEnc multiple                  124645632 \n",
            "_________________________________________________________________\n",
            "dropout_155 (Dropout)        multiple                  0         \n",
            "_________________________________________________________________\n",
            "q_classifier (Dense)         multiple                  16149     \n",
            "_________________________________________________________________\n",
            "a_classifier (Dense)         multiple                  3845      \n",
            "_________________________________________________________________\n",
            "j_classifier (Dense)         multiple                  9220      \n",
            "_________________________________________________________________\n",
            "se_layer_9 (SELayer)         multiple                  295872    \n",
            "_________________________________________________________________\n",
            "se_layer_10 (SELayer)        multiple                  295872    \n",
            "_________________________________________________________________\n",
            "se_layer_11 (SELayer)        multiple                  2657088   \n",
            "=================================================================\n",
            "Total params: 127,923,678\n",
            "Trainable params: 127,923,678\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n",
            "None\n",
            "Steps per epoch: 304 | 38\n",
            "WARNING:tensorflow:Gradients do not exist for variables ['roberta_question/roberta/pooler/dense/kernel:0', 'roberta_question/roberta/pooler/dense/bias:0'] when minimizing the loss.\n",
            "WARNING:tensorflow:Gradients do not exist for variables ['roberta_question/roberta/pooler/dense/kernel:0', 'roberta_question/roberta/pooler/dense/bias:0'] when minimizing the loss.\n",
            "/tensorflow-2.1.0/python3.6/tensorflow_core/python/framework/indexed_slices.py:433: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
            "  \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n",
            "WARNING:tensorflow:Gradients do not exist for variables ['roberta_question/roberta/pooler/dense/kernel:0', 'roberta_question/roberta/pooler/dense/bias:0'] when minimizing the loss.\n",
            "WARNING:tensorflow:Gradients do not exist for variables ['roberta_question/roberta/pooler/dense/kernel:0', 'roberta_question/roberta/pooler/dense/bias:0'] when minimizing the loss.\n",
            "[INFO][02/12/2020 13:04:47] Step   150 | loss 0.3966 | lr 1.00e-04 | 0.953s per step\n",
            "[INFO][02/12/2020 13:05:21] Step   300 | loss 0.3836 | lr 9.70e-05 | 0.231s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:20<00:00,  6.53it/s]\n",
            "[INFO][02/12/2020 13:05:42] Metrics at step 300:\n",
            "[INFO][02/12/2020 13:05:42] loss: 0.3709\n",
            "[INFO][02/12/2020 13:05:42] spearman: 36.84\n",
            "[INFO][02/12/2020 13:06:20] Step   450 | loss 0.3744 | lr 8.83e-05 | 0.390s per step\n",
            "[INFO][02/12/2020 13:06:54] Step   600 | loss 0.3697 | lr 7.50e-05 | 0.230s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:06<00:00,  6.55it/s]\n",
            "[INFO][02/12/2020 13:07:01] Metrics at step 600:\n",
            "[INFO][02/12/2020 13:07:01] loss: 0.3685\n",
            "[INFO][02/12/2020 13:07:01] spearman: 38.79\n",
            "[INFO][02/12/2020 13:07:38] Step   750 | loss 0.3640 | lr 5.87e-05 | 0.293s per step\n",
            "[INFO][02/12/2020 13:08:14] Step   900 | loss 0.3591 | lr 4.13e-05 | 0.236s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:06<00:00,  6.40it/s]\n",
            "[INFO][02/12/2020 13:08:20] Metrics at step 900:\n",
            "[INFO][02/12/2020 13:08:20] loss: 0.3633\n",
            "[INFO][02/12/2020 13:08:20] spearman: 39.45\n",
            "[INFO][02/12/2020 13:08:58] Step  1050 | loss 0.3544 | lr 2.50e-05 | 0.294s per step\n",
            "[INFO][02/12/2020 13:09:33] Step  1200 | loss 0.3488 | lr 1.17e-05 | 0.232s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:06<00:00,  6.41it/s]\n",
            "[INFO][02/12/2020 13:09:39] Metrics at step 1200:\n",
            "[INFO][02/12/2020 13:09:39] loss: 0.3622\n",
            "[INFO][02/12/2020 13:09:39] spearman: 40.40\n",
            "[INFO][02/12/2020 13:10:17] Step  1350 | loss 0.3452 | lr 3.03e-06 | 0.292s per step\n",
            "[INFO][02/12/2020 13:10:52] Step  1500 | loss 0.3415 | lr 1.00e-08 | 0.234s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:06<00:00,  6.55it/s]\n",
            "[INFO][02/12/2020 13:10:58] Metrics at step 1500:\n",
            "[INFO][02/12/2020 13:10:58] loss: 0.3619\n",
            "[INFO][02/12/2020 13:10:58] spearman: 40.46\n",
            "[INFO][02/12/2020 13:11:01] Training finished. Best step(s):\n",
            "[INFO][02/12/2020 13:11:01] loss: 0.3619 @ step 1500\n",
            "[INFO][02/12/2020 13:11:01] spearman: 40.46 @ step 1500\n",
            "====================\n",
            "Training Fold 5\n",
            "====================\n",
            "Running on TPU  ['10.7.137.26:8470']\n",
            "WARNING:tensorflow:TPU system 10.7.137.26:8470 has already been initialized. Reinitializing the TPU can cause previously created variables on TPU to be lost.\n",
            "WARNING:tensorflow:TPU system 10.7.137.26:8470 has already been initialized. Reinitializing the TPU can cause previously created variables on TPU to be lost.\n",
            "REPLICAS:  8\n",
            "cnt: 4864 batch size: 16\n",
            "cnt: 1215 batch size: 32\n",
            "Model: \"dual_roberta_model_4\"\n",
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "roberta_question (RobertaEnc multiple                  124645632 \n",
            "_________________________________________________________________\n",
            "dropout_194 (Dropout)        multiple                  0         \n",
            "_________________________________________________________________\n",
            "q_classifier (Dense)         multiple                  16149     \n",
            "_________________________________________________________________\n",
            "a_classifier (Dense)         multiple                  3845      \n",
            "_________________________________________________________________\n",
            "j_classifier (Dense)         multiple                  9220      \n",
            "_________________________________________________________________\n",
            "se_layer_12 (SELayer)        multiple                  295872    \n",
            "_________________________________________________________________\n",
            "se_layer_13 (SELayer)        multiple                  295872    \n",
            "_________________________________________________________________\n",
            "se_layer_14 (SELayer)        multiple                  2657088   \n",
            "=================================================================\n",
            "Total params: 127,923,678\n",
            "Trainable params: 3,278,046\n",
            "Non-trainable params: 124,645,632\n",
            "_________________________________________________________________\n",
            "None\n",
            "Train for 912 steps\n",
            "912/912 [==============================] - 125s 138ms/step - loss: 0.4479\n",
            "Model: \"dual_roberta_model_4\"\n",
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "roberta_question (RobertaEnc multiple                  124645632 \n",
            "_________________________________________________________________\n",
            "dropout_194 (Dropout)        multiple                  0         \n",
            "_________________________________________________________________\n",
            "q_classifier (Dense)         multiple                  16149     \n",
            "_________________________________________________________________\n",
            "a_classifier (Dense)         multiple                  3845      \n",
            "_________________________________________________________________\n",
            "j_classifier (Dense)         multiple                  9220      \n",
            "_________________________________________________________________\n",
            "se_layer_12 (SELayer)        multiple                  295872    \n",
            "_________________________________________________________________\n",
            "se_layer_13 (SELayer)        multiple                  295872    \n",
            "_________________________________________________________________\n",
            "se_layer_14 (SELayer)        multiple                  2657088   \n",
            "=================================================================\n",
            "Total params: 127,923,678\n",
            "Trainable params: 127,923,678\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n",
            "None\n",
            "Steps per epoch: 304 | 38\n",
            "WARNING:tensorflow:Gradients do not exist for variables ['roberta_question/roberta/pooler/dense/kernel:0', 'roberta_question/roberta/pooler/dense/bias:0'] when minimizing the loss.\n",
            "WARNING:tensorflow:Gradients do not exist for variables ['roberta_question/roberta/pooler/dense/kernel:0', 'roberta_question/roberta/pooler/dense/bias:0'] when minimizing the loss.\n",
            "/tensorflow-2.1.0/python3.6/tensorflow_core/python/framework/indexed_slices.py:433: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
            "  \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n",
            "WARNING:tensorflow:Gradients do not exist for variables ['roberta_question/roberta/pooler/dense/kernel:0', 'roberta_question/roberta/pooler/dense/bias:0'] when minimizing the loss.\n",
            "WARNING:tensorflow:Gradients do not exist for variables ['roberta_question/roberta/pooler/dense/kernel:0', 'roberta_question/roberta/pooler/dense/bias:0'] when minimizing the loss.\n",
            "[INFO][02/12/2020 13:16:25] Step   150 | loss 0.3952 | lr 1.00e-04 | 0.942s per step\n",
            "[INFO][02/12/2020 13:17:00] Step   300 | loss 0.3841 | lr 9.70e-05 | 0.232s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:39<00:00,  5.66s/it]\n",
            "[INFO][02/12/2020 13:17:39] Metrics at step 300:\n",
            "[INFO][02/12/2020 13:17:39] loss: 0.3764\n",
            "[INFO][02/12/2020 13:17:39] spearman: 35.93\n",
            "[INFO][02/12/2020 13:18:18] Step   450 | loss 0.3748 | lr 8.83e-05 | 0.518s per step\n",
            "[INFO][02/12/2020 13:18:53] Step   600 | loss 0.3694 | lr 7.50e-05 | 0.234s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:06<00:00,  6.41it/s]\n",
            "[INFO][02/12/2020 13:18:59] Metrics at step 600:\n",
            "[INFO][02/12/2020 13:18:59] loss: 0.3676\n",
            "[INFO][02/12/2020 13:18:59] spearman: 37.72\n",
            "[INFO][02/12/2020 13:19:37] Step   750 | loss 0.3620 | lr 5.87e-05 | 0.295s per step\n",
            "[INFO][02/12/2020 13:20:12] Step   900 | loss 0.3578 | lr 4.13e-05 | 0.237s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:06<00:00,  6.66it/s]\n",
            "[INFO][02/12/2020 13:20:19] Metrics at step 900:\n",
            "[INFO][02/12/2020 13:20:19] loss: 0.3675\n",
            "[INFO][02/12/2020 13:20:19] spearman: 38.94\n",
            "[INFO][02/12/2020 13:20:57] Step  1050 | loss 0.3533 | lr 2.50e-05 | 0.294s per step\n",
            "[INFO][02/12/2020 13:21:32] Step  1200 | loss 0.3480 | lr 1.17e-05 | 0.235s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:06<00:00,  6.50it/s]\n",
            "[INFO][02/12/2020 13:21:38] Metrics at step 1200:\n",
            "[INFO][02/12/2020 13:21:38] loss: 0.3664\n",
            "[INFO][02/12/2020 13:21:38] spearman: 39.21\n",
            "[INFO][02/12/2020 13:22:16] Step  1350 | loss 0.3440 | lr 3.03e-06 | 0.293s per step\n",
            "[INFO][02/12/2020 13:22:51] Step  1500 | loss 0.3395 | lr 1.00e-08 | 0.234s per step\n",
            "100%|███████████████████████████████████████████████████████████████| 38/38 [00:06<00:00,  6.42it/s]\n",
            "[INFO][02/12/2020 13:22:57] Metrics at step 1500:\n",
            "[INFO][02/12/2020 13:22:57] loss: 0.3675\n",
            "[INFO][02/12/2020 13:22:57] spearman: 39.32\n",
            "[INFO][02/12/2020 13:23:00] Training finished. Best step(s):\n",
            "[INFO][02/12/2020 13:23:00] loss: 0.3664 @ step 1200\n",
            "[INFO][02/12/2020 13:23:00] spearman: 39.32 @ step 1500\n",
            "Scores: -0.402150424863349 +- 0.006430421567132591\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "poKVYv5Yyt7g",
        "colab_type": "text"
      },
      "source": [
        "## Post-processing"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "n2BCT77lBMPV",
        "colab_type": "code",
        "outputId": "ce0fb3b9-2446-4334-bac4-2a98aad16fe4",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "import sklearn\n",
        "sklearn.__version__"
      ],
      "execution_count": 16,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'0.22.1'"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 16
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "9UKyT0e1tKSl",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "!rm -f cache/oof.jl"
      ],
      "execution_count": 17,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "suwGiS83enOk",
        "colab_type": "code",
        "outputId": "4134e3c2-a429-4547-ae65-9ec8ffd01177",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 590
        }
      },
      "source": [
        "!python -m quest.eval_tpu --batch-size 8 --model-pattern \"cache/kaggleds/roberta-base-fold-%d.h5\""
      ],
      "execution_count": 18,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Running on TPU  ['10.7.137.26:8470']\n",
            "2020-02-12 13:37:08.269938: E tensorflow/stream_executor/cuda/cuda_driver.cc:351] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected\n",
            "INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0\n",
            "cnt: 1216 batch size: 64\n",
            "100%|███████████████████████████████████████████████████████████████| 19/19 [00:30<00:00,  1.32it/s]\n",
            "Raw Spearman:  40.60\n",
            "Running on TPU  ['10.7.137.26:8470']\n",
            "WARNING:tensorflow:TPU system 10.7.137.26:8470 has already been initialized. Reinitializing the TPU can cause previously created variables on TPU to be lost.\n",
            "WARNING:tensorflow:TPU system 10.7.137.26:8470 has already been initialized. Reinitializing the TPU can cause previously created variables on TPU to be lost.\n",
            "cnt: 1216 batch size: 64\n",
            "100%|███████████████████████████████████████████████████████████████| 19/19 [00:29<00:00,  1.28it/s]\n",
            "Raw Spearman:  39.63\n",
            "Running on TPU  ['10.7.137.26:8470']\n",
            "WARNING:tensorflow:TPU system 10.7.137.26:8470 has already been initialized. Reinitializing the TPU can cause previously created variables on TPU to be lost.\n",
            "WARNING:tensorflow:TPU system 10.7.137.26:8470 has already been initialized. Reinitializing the TPU can cause previously created variables on TPU to be lost.\n",
            "cnt: 1216 batch size: 64\n",
            "100%|███████████████████████████████████████████████████████████████| 19/19 [00:29<00:00,  1.28it/s]\n",
            "Raw Spearman:  41.06\n",
            "Running on TPU  ['10.7.137.26:8470']\n",
            "WARNING:tensorflow:TPU system 10.7.137.26:8470 has already been initialized. Reinitializing the TPU can cause previously created variables on TPU to be lost.\n",
            "WARNING:tensorflow:TPU system 10.7.137.26:8470 has already been initialized. Reinitializing the TPU can cause previously created variables on TPU to be lost.\n",
            "cnt: 1216 batch size: 64\n",
            "100%|███████████████████████████████████████████████████████████████| 19/19 [00:29<00:00,  1.27it/s]\n",
            "Raw Spearman:  40.46\n",
            "Running on TPU  ['10.7.137.26:8470']\n",
            "WARNING:tensorflow:TPU system 10.7.137.26:8470 has already been initialized. Reinitializing the TPU can cause previously created variables on TPU to be lost.\n",
            "WARNING:tensorflow:TPU system 10.7.137.26:8470 has already been initialized. Reinitializing the TPU can cause previously created variables on TPU to be lost.\n",
            "cnt: 1215 batch size: 64\n",
            "100%|███████████████████████████████████████████████████████████████| 19/19 [00:46<00:00,  6.10s/it]\n",
            "Raw Spearman:  39.32\n",
            "Raw Spearman:  40.15\n",
            "Optimized Spearman:  43.00\n",
            "[50, 26, 5, 24, 13, 2, 21, 18, 11, 22, 30, 4, 4, 3, 4, 5, 13, 18, 36, 2, 24, 17, 14, 46, 27, 48, 53, 22, 19, 24]\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "OVufYclvsm0s",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "!mv cache/best_bins.jl cache/kaggleds/best_bins.jl"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "xmzyPUZFywO8",
        "colab_type": "text"
      },
      "source": [
        "## Upload the model"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "8T_eh5vr_ZOF",
        "colab_type": "code",
        "outputId": "f11166a6-9a5f-47c1-8963-4ce50ba700c3",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 399
        }
      },
      "source": [
        "!cd cache/kaggleds && kaggle datasets version --dir-mode tar -m \"5fold\" -d"
      ],
      "execution_count": 20,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Starting upload for file roberta-base-fold-3.h5\n",
            "100% 488M/488M [00:05<00:00, 99.5MB/s]\n",
            "Upload successful: roberta-base-fold-3.h5 (488MB)\n",
            "Starting upload for file best_bins.jl\n",
            "100% 1.22k/1.22k [00:00<00:00, 1.58kB/s]\n",
            "Upload successful: best_bins.jl (1KB)\n",
            "Starting upload for file roberta-base-fold-0.h5\n",
            "100% 488M/488M [00:03<00:00, 133MB/s]\n",
            "Upload successful: roberta-base-fold-0.h5 (488MB)\n",
            "Starting upload for file tokenizer_roberta-base.tar\n",
            "100% 1.30M/1.30M [00:01<00:00, 1.26MB/s]\n",
            "Upload successful: tokenizer_roberta-base.tar (1MB)\n",
            "Starting upload for file roberta-base-fold-1.h5\n",
            "100% 488M/488M [00:04<00:00, 117MB/s]\n",
            "Upload successful: roberta-base-fold-1.h5 (488MB)\n",
            "Starting upload for file roberta-base-fold-2.h5\n",
            "100% 488M/488M [00:04<00:00, 123MB/s]\n",
            "Upload successful: roberta-base-fold-2.h5 (488MB)\n",
            "Starting upload for file roberta-base-fold-4.h5\n",
            "100% 488M/488M [00:03<00:00, 134MB/s]\n",
            "Upload successful: roberta-base-fold-4.h5 (488MB)\n",
            "Dataset version is being created. Please check progress at https://www.kaggle.com/ceshine/quest-models-public\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "_epkw2mZpgnW",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        ""
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}