frankShih/fine_tune_sample.ipynb

## fine_tune_sample.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Fine tune - tutorial\n",
    "---\n",
    "### Prepare data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Reusing dataset amazon_reviews_multi (/home/aifadmin/.cache/huggingface/datasets/amazon_reviews_multi/es/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2b42e355d2774adf99055c3473e17d89",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Reusing dataset amazon_reviews_multi (/home/aifadmin/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "90bc72ef184d44c0885e32da0e25eb42",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached processed dataset at /home/aifadmin/.cache/huggingface/datasets/amazon_reviews_multi/es/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-cc11caebdf3a241a.arrow\n",
      "Loading cached processed dataset at /home/aifadmin/.cache/huggingface/datasets/amazon_reviews_multi/es/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-20d771e5b07bff20.arrow\n",
      "Loading cached processed dataset at /home/aifadmin/.cache/huggingface/datasets/amazon_reviews_multi/es/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-9e35450d4c1049d8.arrow\n",
      "Loading cached processed dataset at /home/aifadmin/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-74d92a94d824b0db.arrow\n",
      "Loading cached processed dataset at /home/aifadmin/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-1995df5055b2a54e.arrow\n",
      "Loading cached processed dataset at /home/aifadmin/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-c09e99a2e203a356.arrow\n",
      "Loading cached shuffled indices for dataset at /home/aifadmin/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-f73189dad4cd3715.arrow\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "'>> Title: I'm dissapointed.'\n",
      "'>> Review: I guess I had higher expectations for this book from the reviews. I really thought I'd at least like it. The plot idea was great. I loved Ash but, it just didnt go anywhere. Most of the book was about their radio show and talking to callers. I wanted the author to dig deeper so we could really get to know the characters. All we know about Grace is that she is attractive looking, Latino and is kind of a brat. I'm dissapointed.'\n",
      "\n",
      "'>> Title: Good art, good price, poor design'\n",
      "'>> Review: I had gotten the DC Vintage calendar the past two years, but it was on backorder forever this year and I saw they had shrunk the dimensions for no good reason. This one has good art choices but the design has the fold going through the picture, so it's less aesthetically pleasing, especially if you want to keep a picture to hang. For the price, a good calendar'\n",
      "\n",
      "'>> Title: Helpful'\n",
      "'>> Review: Nearly all the tips useful and. I consider myself an intermediate to advanced user of OneNote. I would highly recommend.'\n"
     ]
    }
   ],
   "source": [
    "'''\n",
    "data cleaning\n",
    "'''\n",
    "\n",
    "from datasets import load_dataset\n",
    "\n",
    "spanish_dataset = load_dataset(\"amazon_reviews_multi\", \"es\")\n",
    "english_dataset = load_dataset(\"amazon_reviews_multi\", \"en\")\n",
    "# print(english_dataset)\n",
    "\n",
    "\n",
    "def show_samples(dataset, num_samples=3, seed=42):\n",
    "    sample = dataset[\"train\"].shuffle(seed=seed).select(range(num_samples))\n",
    "    for example in sample:\n",
    "        print(f\"\\n'>> Title: {example['review_title']}'\")\n",
    "        print(f\"'>> Review: {example['review_body']}'\")\n",
    "\n",
    "'''\n",
    "# show_samples(english_dataset)\n",
    "\n",
    "english_dataset.set_format(\"pandas\")\n",
    "english_df = english_dataset[\"train\"][:]\n",
    "# Show counts for top 20 products\n",
    "print(english_df[\"product_category\"].value_counts()[:20])\n",
    "'''\n",
    "\n",
    "def filter_books(example):\n",
    "    return (\n",
    "        example[\"product_category\"] == \"book\"\n",
    "        or example[\"product_category\"] == \"digital_ebook_purchase\"\n",
    "    )\n",
    "\n",
    "english_dataset.reset_format()\n",
    "spanish_books = spanish_dataset.filter(filter_books)\n",
    "english_books = english_dataset.filter(filter_books)\n",
    "\n",
    "show_samples(english_books)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached shuffled indices for dataset at /home/aifadmin/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-0400eebe0dcc1748.arrow\n",
      "Loading cached shuffled indices for dataset at /home/aifadmin/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-ae378fb51b519f75.arrow\n",
      "Loading cached shuffled indices for dataset at /home/aifadmin/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-23c1fe0557fd8366.arrow\n",
      "Loading cached processed dataset at /home/aifadmin/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-192197dbdbeb07e0.arrow\n",
      "Loading cached processed dataset at /home/aifadmin/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-12aad5bf8b3ba48a.arrow\n",
      "Loading cached processed dataset at /home/aifadmin/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-0fb0ba0de73126cd.arrow\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "'>> Title: Easy to follow!!!!'\n",
      "'>> Review: I loved The dash diet weight loss Solution. Never hungry. I would recommend this diet. Also the menus are well rounded. Try it. Has lots of the information need thanks.'\n",
      "\n",
      "'>> Title: PARCIALMENTE DAÑADO'\n",
      "'>> Review: Me llegó el día que tocaba, junto a otros libros que pedí, pero la caja llegó en mal estado lo cual dañó las esquinas de los libros porque venían sin protección (forro).'\n",
      "\n",
      "'>> Title: no lo he podido descargar'\n",
      "'>> Review: igual que el anterior'\n"
     ]
    }
   ],
   "source": [
    "'''\n",
    "format data into model input format\n",
    "concatenate the datasets, and shuffle\n",
    " filter out the examples with very short titles\n",
    "'''\n",
    "\n",
    "from datasets import concatenate_datasets, DatasetDict\n",
    "\n",
    "books_dataset = DatasetDict()\n",
    "\n",
    "for split in english_books.keys():\n",
    "    books_dataset[split] = concatenate_datasets(\n",
    "        [english_books[split], spanish_books[split]]\n",
    "    )\n",
    "    books_dataset[split] = books_dataset[split].shuffle(seed=42)\n",
    "    \n",
    "    \n",
    "show_samples(books_dataset)\n",
    "    \n",
    "books_dataset = books_dataset.filter(lambda x: len(x[\"review_title\"].split()) > 2)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### load model tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/anaconda/envs/azureml_py38/lib/python3.8/site-packages/transformers/convert_slow_tokenizer.py:434: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.\n",
      "  warnings.warn(\n",
      "Loading cached processed dataset at /home/aifadmin/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-ea91ca6bab46faa7.arrow\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4bd6bffa7156412183dc407ee7786fb8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached processed dataset at /home/aifadmin/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-536c2042c2cfea77.arrow\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoModelWithLMHead, AutoTokenizer, AutoModelForSeq2SeqLM, TFAutoModelForSeq2SeqLM\n",
    "\n",
    "PRETRAIN_NAME = 'google/mt5-small'\n",
    "tokenizer = AutoTokenizer.from_pretrained(PRETRAIN_NAME)\n",
    "\n",
    "'''\n",
    "inputs = tokenizer(\"I loved reading the Hunger Games!\")\n",
    "print(inputs)\n",
    "print(tokenizer.convert_ids_to_tokens(inputs.input_ids))\n",
    "'''\n",
    "\n",
    "\n",
    "max_input_length = 512\n",
    "max_target_length = 30\n",
    "\n",
    "def preprocess_function(examples):\n",
    "    model_inputs = tokenizer(\n",
    "        examples['review_body'], # DSU_原文\n",
    "        max_length=max_input_length, \n",
    "        truncation=True, \n",
    "        # padding=True,\n",
    "        # return_tensors=\"tf\" # np, pt, tf\n",
    "    )\n",
    "    # Set up the tokenizer for targets\n",
    "    with tokenizer.as_target_tokenizer():\n",
    "        labels = tokenizer(\n",
    "            examples['review_title'], # word_原文\n",
    "            max_length=max_target_length, \n",
    "            truncation=True,\n",
    "            # padding=True,\n",
    "            # return_tensors=\"tf\" # np, pt, tf\n",
    "        )\n",
    "\n",
    "    model_inputs[\"labels\"] = labels[\"input_ids\"]\n",
    "    return model_inputs\n",
    "\n",
    "\n",
    "tokenized_datasets = books_dataset.map(preprocess_function, batched=True)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### evaluation metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-09-14 12:22:54.687524: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA\n",
      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2022-09-14 12:22:58.923243: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
      "2022-09-14 12:23:05.880180: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n",
      "2022-09-14 12:23:05.880337: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n",
      "2022-09-14 12:23:05.880351: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'dict'> {'rouge1': 92.308, 'rouge2': 72.727, 'rougeL': 92.308, 'rougeLsum': 92.308}\n"
     ]
    }
   ],
   "source": [
    "import evaluate\n",
    "\n",
    "rouge_score = evaluate.load(\"rouge\")\n",
    "\n",
    "\n",
    "generated_summary = \"I absolutely loved reading the Hunger Games\"\n",
    "reference_summary = \"I loved reading the Hunger Games\"\n",
    "scores = rouge_score.compute(\n",
    "    predictions=[generated_summary], references=[reference_summary]\n",
    ")\n",
    "scores = {k: round(scores[k]*100, 3) for k in scores}\n",
    "\n",
    "print(type(scores), scores)\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to /home/aifadmin/nltk_data...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "I grew up reading Koontz, and years ago, I stopped,convinced i had \"outgrown\" him.\n",
      "Still,when a friend was looking for something suspenseful too read, I suggested Koontz.\n",
      "She found Strangers.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data]   Package punkt is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "'''\n",
    "Creating a strong baseline\n",
    "A common baseline for text summarization is to simply take the first three sentences of an article, \n",
    "often called the lead-3 baseline\n",
    "'''\n",
    "import nltk\n",
    "from nltk.tokenize import sent_tokenize\n",
    "\n",
    "\n",
    "nltk.download(\"punkt\")\n",
    "\n",
    "def three_sentence_summary(text):\n",
    "    return \"\\n\".join(sent_tokenize(text)[:3])\n",
    "\n",
    "\n",
    "print(three_sentence_summary(books_dataset[\"train\"][1][\"review_body\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'rouge1': 16.818, 'rouge2': 8.785, 'rougeL': 15.579, 'rougeLsum': 15.945}\n"
     ]
    }
   ],
   "source": [
    "def evaluate_baseline(dataset, metric):\n",
    "    summaries = [three_sentence_summary(text) for text in dataset[\"review_body\"]]\n",
    "    # print(len(summaries), len(dataset[\"review_title\"]))\n",
    "    score = metric.compute(predictions=summaries, references=dataset[\"review_title\"])\n",
    "    score = {k: round(score[k]*100, 3) for k in score}\n",
    "    return score \n",
    "\n",
    "\n",
    "score = evaluate_baseline(books_dataset[\"validation\"], rouge_score)\n",
    "# rouge_names = [\"rouge1\", \"rouge2\", \"rougeL\", \"rougeLsum\"]\n",
    "# rouge_dict = dict((rn, round(score[rn].mid.fmeasure * 100, 2)) for rn in rouge_names)\n",
    "print(score)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### fine tuning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-09-14 12:26:09.196414: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA\n",
      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2022-09-14 12:26:15.225956: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22057 MB memory:  -> device: 0, name: Tesla P40, pci bus id: 0001:00:00.0, compute capability: 6.1\n",
      "/anaconda/envs/azureml_py38/lib/python3.8/site-packages/keras/initializers/initializers_v2.py:120: UserWarning: The initializer RandomNormal is unseeded and being called multiple times, which will return identical values  each time (even if the initializer is unseeded). Please update your code to provide a seed to the initializer, or avoid using the same initalizer instance more than once.\n",
      "  warnings.warn(\n",
      "All model checkpoint layers were used when initializing TFMT5ForConditionalGeneration.\n",
      "\n",
      "All the layers of TFMT5ForConditionalGeneration were initialized from the model checkpoint at google/mt5-small.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMT5ForConditionalGeneration for predictions without further training.\n"
     ]
    }
   ],
   "source": [
    "model = TFAutoModelForSeq2SeqLM.from_pretrained(PRETRAIN_NAME)\n",
    "# model = AutoModelForSeq2SeqLM.from_pretrained(PRETRAIN_NAME)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "72d02843911e4bd5ac41e3a8486436f1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "'''\n",
    "login so that your call use push callback to upload model\n",
    "'''\n",
    "\n",
    "from huggingface_hub import notebook_login\n",
    "\n",
    "notebook_login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DatasetDict({\n",
      "    train: Dataset({\n",
      "        features: ['input_ids', 'attention_mask', 'labels'],\n",
      "        num_rows: 9672\n",
      "    })\n",
      "    validation: Dataset({\n",
      "        features: ['input_ids', 'attention_mask', 'labels'],\n",
      "        num_rows: 238\n",
      "    })\n",
      "    test: Dataset({\n",
      "        features: ['input_ids', 'attention_mask', 'labels'],\n",
      "        num_rows: 245\n",
      "    })\n",
      "})\n"
     ]
    }
   ],
   "source": [
    "'''\n",
    "dynamically pad the inputs and the labels for us\n",
    "'''\n",
    "\n",
    "from transformers import DataCollatorForSeq2Seq\n",
    "\n",
    "data_collator = DataCollatorForSeq2Seq(\n",
    "    tokenizer, \n",
    "    model=model, \n",
    "    return_tensors=\"tf\",\n",
    ")\n",
    "\n",
    "\n",
    "tokenized_datasets = tokenized_datasets.remove_columns(\n",
    "    books_dataset[\"train\"].column_names\n",
    ")\n",
    "\n",
    "print(tokenized_datasets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': <tf.Tensor: shape=(2, 188), dtype=int32, numpy=\n",
       "array([[   653,   1957,   1314,    261,   2757,   1280,    435,    259,\n",
       "         29166,    263,    269,    774,   5547,      1,      0,      0,\n",
       "             0,      0,      0,      0,      0,      0,      0,      0,\n",
       "             0,      0,      0,      0,      0,      0,      0,      0,\n",
       "             0,      0,      0,      0,      0,      0,      0,      0,\n",
       "             0,      0,      0,      0,      0,      0,      0,      0,\n",
       "             0,      0,      0,      0,      0,      0,      0,      0,\n",
       "             0,      0,      0,      0,      0,      0,      0,      0,\n",
       "             0,      0,      0,      0,      0,      0,      0,      0,\n",
       "             0,      0,      0,      0,      0,      0,      0,      0,\n",
       "             0,      0,      0,      0,      0,      0,      0,      0,\n",
       "             0,      0,      0,      0,      0,      0,      0,      0,\n",
       "             0,      0,      0,      0,      0,      0,      0,      0,\n",
       "             0,      0,      0,      0,      0,      0,      0,      0,\n",
       "             0,      0,      0,      0,      0,      0,      0,      0,\n",
       "             0,      0,      0,      0,      0,      0,      0,      0,\n",
       "             0,      0,      0,      0,      0,      0,      0,      0,\n",
       "             0,      0,      0,      0,      0,      0,      0,      0,\n",
       "             0,      0,      0,      0,      0,      0,      0,      0,\n",
       "             0,      0,      0,      0,      0,      0,      0,      0,\n",
       "             0,      0,      0,      0,      0,      0,      0,      0,\n",
       "             0,      0,      0,      0,      0,      0,      0,      0,\n",
       "             0,      0,      0,      0,      0,      0,      0,      0,\n",
       "             0,      0,      0,      0],\n",
       "       [   336,   6998,    481,   1150,  11807,   1590,  16339,    360,\n",
       "           261,    305,   3127,   2780,    261,    336,  35042,    345,\n",
       "           261,   3141, 136052,    285,    259,    266,   1425,    313,\n",
       "          2983, 106419,    272,    311,   4065,    260,  36874,    261,\n",
       "          1909,    259,    262,  22163,    639,    259,   7505,    332,\n",
       "          9066,  88398,    265,   5105,   6320,   4906,    261,    336,\n",
       "         21847,    345,   1590,  16339,    360,    260,   4630,   5897,\n",
       "           320, 146261,    260,    486,  21101,   1998,   3684,   2606,\n",
       "          2316,    609,    639,   3014,  11665,    416,    260,    336,\n",
       "           639,   7779,    259,    266,   1425,  57512,   9066,   5401,\n",
       "           260,    336,    259,  91451,    259,    262,  27613,    332,\n",
       "           259,  34779,    260,   1494,    639,    259,    262,   3005,\n",
       "           584,  62351,    288,    461,   6801,  22590,  62799,   3093,\n",
       "           259,    262,  20233,   3622,    304,    259,  11994,    259,\n",
       "         36260,  48971,    533,    418,   2725,    484,  27067,   1059,\n",
       "           260,  18646,    521,    259,   3659,   1063,   1388,    514,\n",
       "          3622,   3004,  26222,    260,    298, 109508,    276,    533,\n",
       "          1425,    288,    390,    259, 131690,    260,    336,    259,\n",
       "         25505,    609,    288,    390,    287,   3435,    336,   4906,\n",
       "          6338,    259,   6924,    259,  29426,    261,   2469,    351,\n",
       "           259, 109933,    260,    563,  15129,    261,    336,    277,\n",
       "           857,  36975,    345,  14355,    305,    259,    265, 111962,\n",
       "           288,  27039,    714,    260,    459,  25640,    259,    262,\n",
       "          5150,   4906,    260,      1]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 188), dtype=int32, numpy=\n",
       "array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
       "       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>, 'labels': <tf.Tensor: shape=(2, 7), dtype=int32, numpy=\n",
       "array([[  298,   259,  5994,   269,   774,  5547,     1],\n",
       "       [  298, 10380,   304, 13992,   291,     1,  -100]], dtype=int32)>, 'decoder_input_ids': <tf.Tensor: shape=(2, 7), dtype=int32, numpy=\n",
       "array([[    0,   298,   259,  5994,   269,   774,  5547],\n",
       "       [    0,   298, 10380,   304, 13992,   291,     1]], dtype=int32)>}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'''\n",
    "not the same as shown in tutorial ???\n",
    "'''\n",
    "features = [tokenized_datasets[\"train\"][i] for i in range(2)]\n",
    "data_collator(features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "tf_train_dataset = tokenized_datasets[\"train\"].to_tf_dataset(\n",
    "    columns=[\"input_ids\", \"attention_mask\", \"labels\"],\n",
    "    collate_fn=data_collator,\n",
    "    shuffle=True,\n",
    "    batch_size=8,\n",
    ")\n",
    "tf_eval_dataset = tokenized_datasets[\"validation\"].to_tf_dataset(\n",
    "    columns=[\"input_ids\", \"attention_mask\", \"labels\"],\n",
    "    collate_fn=data_collator,\n",
    "    shuffle=False,\n",
    "    batch_size=8,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Mixed precision compatibility check (mixed_float16): WARNING\n",
      "Your GPU may run slowly with dtype policy mixed_float16 because it does not have compute capability of at least 7.0. Your GPU:\n",
      "  Tesla P40, compute capability 6.1\n",
      "See https://developer.nvidia.com/cuda-gpus for a list of GPUs and their compute capabilities.\n",
      "If you will use compatible GPU(s) not attached to this host, e.g. by running a multi-worker model, you can ignore this warning. This message will only be logged once\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Mixed precision compatibility check (mixed_float16): WARNING\n",
      "Your GPU may run slowly with dtype policy mixed_float16 because it does not have compute capability of at least 7.0. Your GPU:\n",
      "  Tesla P40, compute capability 6.1\n",
      "See https://developer.nvidia.com/cuda-gpus for a list of GPUs and their compute capabilities.\n",
      "If you will use compatible GPU(s) not attached to this host, e.g. by running a multi-worker model, you can ignore this warning. This message will only be logged once\n"
     ]
    }
   ],
   "source": [
    "from transformers import create_optimizer\n",
    "import tensorflow as tf\n",
    "\n",
    "# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied\n",
    "# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,\n",
    "# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.\n",
    "num_train_epochs = 4\n",
    "num_train_steps = len(tf_train_dataset) * num_train_epochs\n",
    "model_name = PRETRAIN_NAME.split(\"/\")[-1]\n",
    "\n",
    "optimizer, schedule = create_optimizer(\n",
    "    init_lr=5.6e-4,\n",
    "    num_warmup_steps=0,\n",
    "    num_train_steps=num_train_steps,\n",
    "    weight_decay_rate=0.01,\n",
    ")\n",
    "\n",
    "model.compile(optimizer=optimizer)\n",
    "\n",
    "# Train in mixed-precision float16\n",
    "tf.keras.mixed_precision.set_global_policy(\"mixed_float16\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/aifadmin/notebooks/team01/mt5-small-finetuned-amazon-en-es is already a clone of https://huggingface.co/HanSSH/mt5-small-finetuned-amazon-en-es. Make sure you pull the latest changes with `repo.git_pull()`.\n",
      "WARNING:huggingface_hub.repository:/home/aifadmin/notebooks/team01/mt5-small-finetuned-amazon-en-es is already a clone of https://huggingface.co/HanSSH/mt5-small-finetuned-amazon-en-es. Make sure you pull the latest changes with `repo.git_pull()`.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/4\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-09-14 12:30:29.303782: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1209/1209 [==============================] - 398s 302ms/step - loss: 5.8144 - val_loss: 3.5283\n",
      "Epoch 2/4\n",
      "1209/1209 [==============================] - 359s 297ms/step - loss: 3.8758 - val_loss: 3.2971\n",
      "Epoch 3/4\n",
      "1209/1209 [==============================] - 357s 295ms/step - loss: 3.4741 - val_loss: 3.2452\n",
      "Epoch 4/4\n",
      "1209/1209 [==============================] - 359s 297ms/step - loss: 3.2684 - val_loss: 3.2288\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Several commits (2) will be pushed upstream.\n",
      "WARNING:huggingface_hub.repository:Several commits (2) will be pushed upstream.\n",
      "The progress bars may be unreliable.\n",
      "WARNING:huggingface_hub.repository:The progress bars may be unreliable.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a360a93eba124e078197a66a534b8861",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Upload file tf_model.h5:   0%|          | 32.0k/1.12G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "remote: Scanning LFS files for validity, may be slow...        \n",
      "remote: LFS file scan complete.        \n",
      "To https://huggingface.co/HanSSH/mt5-small-finetuned-amazon-en-es\n",
      "   b225ac5..6a1d2d3  main -> main\n",
      "\n",
      "WARNING:huggingface_hub.repository:remote: Scanning LFS files for validity, may be slow...        \n",
      "remote: LFS file scan complete.        \n",
      "To https://huggingface.co/HanSSH/mt5-small-finetuned-amazon-en-es\n",
      "   b225ac5..6a1d2d3  main -> main\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7fa255e62610>"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "from transformers.keras_callbacks import PushToHubCallback\n",
    "\n",
    "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
    "\n",
    "callback = PushToHubCallback(\n",
    "    output_dir=f\"{model_name}-finetuned-amazon-en-es\", tokenizer=tokenizer\n",
    ")\n",
    "\n",
    "model.fit(\n",
    "    tf_train_dataset, validation_data=tf_eval_dataset, callbacks=[callback], epochs=4\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 30/30 [01:45<00:00,  3.53s/it]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "import numpy as np\n",
    "\n",
    "all_preds = []\n",
    "all_labels = []\n",
    "for batch in tqdm(tf_eval_dataset):\n",
    "    predictions = model.generate(**batch)\n",
    "    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)\n",
    "    labels = batch[\"labels\"].numpy()\n",
    "    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)\n",
    "    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
    "    decoded_preds = [\"\\n\".join(sent_tokenize(pred.strip())) for pred in decoded_preds]\n",
    "    decoded_labels = [\"\\n\".join(sent_tokenize(label.strip())) for label in decoded_labels]\n",
    "    all_preds.extend(decoded_preds)\n",
    "    all_labels.extend(decoded_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'rouge1': 25.556, 'rouge2': 18.889, 'rougeL': 25.556, 'rougeLsum': 25.0}\n"
     ]
    }
   ],
   "source": [
    "result = rouge_score.compute(\n",
    "    predictions=decoded_preds, references=decoded_labels, use_stemmer=True\n",
    ")\n",
    "# print(result)\n",
    "# result = {key: value.mid.fmeasure * 100 for key, value in result.items()}\n",
    "print({k: round(v*100, 3) for k, v in result.items()})\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cute, dark, dark things around what could have been better\n",
      "This one is darker than normal.\n"
     ]
    }
   ],
   "source": [
    "print(decoded_preds[0])\n",
    "print(decoded_labels[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install evaluate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ef90fcc2355c4a22b9eea8e180899853",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading tf_model.h5:   0%|          | 0.00/1.12G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/anaconda/envs/azureml_py38/lib/python3.8/site-packages/keras/initializers/initializers_v2.py:120: UserWarning: The initializer RandomNormal is unseeded and being called multiple times, which will return identical values  each time (even if the initializer is unseeded). Please update your code to provide a seed to the initializer, or avoid using the same initalizer instance more than once.\n",
      "  warnings.warn(\n",
      "All model checkpoint layers were used when initializing TFMT5ForConditionalGeneration.\n",
      "\n",
      "All the layers of TFMT5ForConditionalGeneration were initialized from the model checkpoint at HanSSH/mt5-small-finetuned-amazon-en-es.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMT5ForConditionalGeneration for predictions without further training.\n"
     ]
    }
   ],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "hub_model_id =  \"HanSSH/mt5-small-finetuned-amazon-en-es\" #'huggingface-course/mt5-small-finetuned-amazon-en-es'\n",
    "summarizer = pipeline(\n",
    "    \"summarization\", \n",
    "    # model='./mt5-small-finetuned-amazon-en-es/',\n",
    "    model=hub_model_id,\n",
    ")\n",
    "\n",
    "def print_summary(idx):\n",
    "    review = books_dataset[\"test\"][idx][\"review_body\"]\n",
    "    title = books_dataset[\"test\"][idx][\"review_title\"]\n",
    "    summary = summarizer(review)\n",
    "    summary = summary[0][\"summary_text\"]\n",
    "    \n",
    "    print(f\"'>>> Review: {review}'\")\n",
    "    print(f\"\\n'>>> Title: {title}'\")\n",
    "    print(f\"\\n'>>> Summary: {summary}'\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-09-14 13:10:08.725150: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x563211957c80 initialized for platform Host (this does not guarantee that XLA will be used). Devices:\n",
      "2022-09-14 13:10:08.725181: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Host, Default Version\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'>>> Review: Es una trilogia que se hace muy facil de leer. Me ha gustado, no me esperaba el final para nada'\n",
      "\n",
      "'>>> Title: Buena literatura para adolescentes'\n",
      "\n",
      "'>>> Summary: '\n"
     ]
    }
   ],
   "source": [
    "print_summary(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'>>> Review: interesante y ameno de leer aunque un poco repetitivo.. por ahora no me he hecho rico... funciona regular....;-)'\n",
      "\n",
      "'>>> Title: no es milagroso'\n",
      "\n",
      "'>>> Summary: '\n"
     ]
    }
   ],
   "source": [
    "print_summary(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Your max_length is set to 20, but you input_length is only 9. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'>>> Review: Muy apropiado para mis hijas'\n",
      "\n",
      "'>>> Title: Libros para el verano'\n",
      "\n",
      "'>>> Summary: '\n"
     ]
    }
   ],
   "source": [
    "print_summary(100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "azureml_py38",
   "language": "python",
   "name": "conda-env-azureml_py38-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  },
  "vscode": {
   "interpreter": {
    "hash": "81794d4967e6c3204c66dcd87b604927b115b27c00565d3d43f05ba2f3a2cb0d"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}