sayakpaul/bucket-lengths.ipynb

## bucket-lengths.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0fee8005",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "65fc8c36",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tensorflow import keras\n",
    "import tensorflow as tf\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import random\n",
    "import tqdm\n",
    "\n",
    "SEED = 42\n",
    "tf.random.set_seed(SEED)\n",
    "np.random.seed(SEED)\n",
    "random.seed(SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fe67b35a",
   "metadata": {},
   "source": [
    "## Data loading\n",
    "\n",
    "Data comes from here: https://www.kaggle.com/hijest/genre-classification-dataset-imdb."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a39fc6ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Both files use the same layout (`id ::: movie ::: genre ::: summary`) and\n",
    "# carry no header row, so the column names are supplied explicitly. The\n",
    "# multi-character separator is why pandas' Python parsing engine is selected.\n",
    "read_options = dict(\n",
    "    engine=\"python\",\n",
    "    sep=\" ::: \",\n",
    "    names=[\"id\", \"movie\", \"genre\", \"summary\"],\n",
    ")\n",
    "\n",
    "train_df = pd.read_csv(\"./data/train_data.txt\", **read_options)\n",
    "test_df = pd.read_csv(\"./data/test_data_solution.txt\", **read_options)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "4c1b5c6d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>movie</th>\n",
       "      <th>genre</th>\n",
       "      <th>summary</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Oscar et la dame rose (2009)</td>\n",
       "      <td>drama</td>\n",
       "      <td>Listening in to a conversation between his doc...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Cupid (1997)</td>\n",
       "      <td>thriller</td>\n",
       "      <td>A brother and sister with a past incestuous re...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Young, Wild and Wonderful (1980)</td>\n",
       "      <td>adult</td>\n",
       "      <td>As the bus empties the students for their fiel...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>The Secret Sin (1915)</td>\n",
       "      <td>drama</td>\n",
       "      <td>To help their unemployed father make ends meet...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>The Unrecovered (2007)</td>\n",
       "      <td>drama</td>\n",
       "      <td>The film's title refers not only to the un-rec...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id                             movie     genre  \\\n",
       "0   1      Oscar et la dame rose (2009)     drama   \n",
       "1   2                      Cupid (1997)  thriller   \n",
       "2   3  Young, Wild and Wonderful (1980)     adult   \n",
       "3   4             The Secret Sin (1915)     drama   \n",
       "4   5            The Unrecovered (2007)     drama   \n",
       "\n",
       "                                             summary  \n",
       "0  Listening in to a conversation between his doc...  \n",
       "1  A brother and sister with a past incestuous re...  \n",
       "2  As the bus empties the students for their fiel...  \n",
       "3  To help their unemployed father make ends meet...  \n",
       "4  The film's title refers not only to the un-rec...  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Viewing training data\n",
    "train_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d2754413",
   "metadata": {},
   "source": [
    "## Data splitting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "51e8d0d6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of training samples: 48792.\n",
      "Number of validation samples: 5422.\n",
      "Number of test examples: 54200.\n"
     ]
    }
   ],
   "source": [
    "# Shuffle every row of the training frame, then hold out 10% of the shuffled\n",
    "# rows as a validation set with scikit-learn's `train_test_split`.\n",
    "train_shuffled = train_df.sample(frac=1)\n",
    "splits = train_test_split(train_shuffled, test_size=0.1)\n",
    "train_df_new, val_df = splits\n",
    "\n",
    "# Report the size of each split.\n",
    "print(f\"Number of training samples: {len(train_df_new)}.\")\n",
    "print(f\"Number of validation samples: {len(val_df)}.\")\n",
    "print(f\"Number of test examples: {len(test_df)}.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "18668652",
   "metadata": {},
   "source": [
    "## Using [`bucket_by_sequence_length()`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#bucket_by_sequence_length)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "0ef030b0",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2021-12-19 15:55:28.592850: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA\n",
      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<tf.Tensor: shape=(), dtype=int32, numpy=92>"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# `train_df_new` still carries the original (shuffled) index labels, so\n",
    "# `[\"summary\"][0]` is a label lookup that raises `KeyError` whenever row 0\n",
    "# ended up in the validation split. `.iloc[0]` selects the first training row\n",
    "# by position, which is also the element a `tf.data.Dataset` built from this\n",
    "# column yields first.\n",
    "passage = train_df_new[\"summary\"].iloc[0]\n",
    "# Count space-separated tokens: the same length measure used for bucketing below.\n",
    "word_splits = tf.strings.split(passage, sep=\" \")\n",
    "tf.shape(word_splits)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "5ba03fa0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tf.Tensor(b\"On a lonely stretch of a highway, Ronit - pulls up in to a desolate pump to fill petrol. The attendant informs him that his car's fan belt is broken and a new one will only arrive in the morning. Stuck in the middle of nowhere, Ronit prepares to stake the night out in his car. When another car pulls in. The driver is a dignified, well-spoken man who lives a few miles away. He offers to house Ronit for the night, promising to drop him back in the morning. Ronit agrees, believing there is a god. But then there is also the devil.\", shape=(), dtype=string)\n"
     ]
    }
   ],
   "source": [
    "# Wrap the raw training summaries in a `tf.data.Dataset` and print its first\n",
    "# element to confirm that each element is a single scalar string tensor.\n",
    "train_summaries = train_df_new[\"summary\"]\n",
    "dataset = tf.data.Dataset.from_tensor_slices(train_summaries)\n",
    "\n",
    "first_element_only = dataset.take(1)\n",
    "for sample in first_element_only:\n",
    "    print(sample)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "e938da00",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['[UNK]',\n",
       " 'short',\n",
       " 'sci-fi',\n",
       " 'documentary',\n",
       " 'drama',\n",
       " 'thriller',\n",
       " 'comedy',\n",
       " 'adult',\n",
       " 'romance',\n",
       " 'adventure',\n",
       " 'western',\n",
       " 'family',\n",
       " 'talk-show',\n",
       " 'news',\n",
       " 'horror',\n",
       " 'history',\n",
       " 'music',\n",
       " 'sport',\n",
       " 'war',\n",
       " 'animation',\n",
       " 'game-show',\n",
       " 'action',\n",
       " 'crime',\n",
       " 'reality-tv',\n",
       " 'mystery',\n",
       " 'musical',\n",
       " 'fantasy',\n",
       " 'biography']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the genre-to-integer lookup from the distinct genres that occur in the\n",
    "# training split, then display the resulting vocabulary.\n",
    "genre_names = train_df_new[\"genre\"].unique()\n",
    "label_encoder = keras.layers.StringLookup(vocabulary=genre_names)\n",
    "label_encoder.get_vocabulary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "841e7e00",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/z_/d29z43w90kz6f4kbzv5c9m9r0000gn/T/ipykernel_60182/1668209276.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  train_df_new[\"total_words\"] = train_df_new[\"summary\"].str.split().str.len()\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "1829"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Assigning a column in place on the frame returned by `train_test_split`\n",
    "# triggers pandas' `SettingWithCopyWarning`. `.assign` returns a new frame\n",
    "# instead, which avoids the warning and keeps this cell safe to re-run.\n",
    "train_df_new = train_df_new.assign(\n",
    "    total_words=train_df_new[\"summary\"].str.split().str.len()\n",
    ")\n",
    "# NOTE: despite its name, `vocabulary_size` is the word count of the longest\n",
    "# training summary, not the number of distinct tokens.\n",
    "vocabulary_size = train_df_new[\"total_words\"].max()\n",
    "vocabulary_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "bf3addc3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# TF-IDF features over n-grams up to size 2.\n",
    "# NOTE(review): `max_tokens` is left unset; the batches printed at the end of\n",
    "# this notebook are 1,702,942 columns wide -- confirm that this is intended.\n",
    "text_vectorizer = keras.layers.TextVectorization(ngrams=2, output_mode=\"tf_idf\")\n",
    "\n",
    "# The layer has to be adapted before use; only the training summaries are fed\n",
    "# to it, and the genre labels are dropped by the `map` below.\n",
    "train_summaries = train_df_new[\"summary\"].values\n",
    "train_genres = train_df_new[\"genre\"].values\n",
    "dataset_ = tf.data.Dataset.from_tensor_slices((train_summaries, train_genres))\n",
    "\n",
    "# Adaptation is pinned to the CPU.\n",
    "with tf.device(\"/CPU:0\"):\n",
    "    summaries_only = dataset_.map(lambda text, label: text)\n",
    "    text_vectorizer.adapt(summaries_only)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "32fa0dd0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess_batch(summary, label):\n",
    "    \"\"\"Turn a batch of raw summaries and genre strings into model inputs.\n",
    "\n",
    "    Applies the notebook-level `text_vectorizer` to `summary` and the\n",
    "    notebook-level `label_encoder` to `label`, and returns the two results\n",
    "    as a `(summary, label)` tuple.\n",
    "    \"\"\"\n",
    "    return text_vectorizer(summary), label_encoder(label)\n",
    "\n",
    "\n",
    "def prepare_dataset(dataframe, bucket_boundaries=(512,), batch_size=32):\n",
    "    \"\"\"Build a length-bucketed, vectorized `tf.data.Dataset` from a dataframe.\n",
    "\n",
    "    Args:\n",
    "        dataframe: Frame with a `summary` (text) and a `genre` (label) column.\n",
    "        bucket_boundaries: Increasing word-count upper bounds of the buckets.\n",
    "            The default `(512,)` gives two buckets: under 512 words, and\n",
    "            512 words or more.\n",
    "        batch_size: Batch size applied to every bucket.\n",
    "\n",
    "    Returns:\n",
    "        A prefetched dataset of `(summary, label)` batches produced by\n",
    "        `preprocess_batch`.\n",
    "    \"\"\"\n",
    "    boundaries = list(bucket_boundaries)\n",
    "    dataset = tf.data.Dataset.from_tensor_slices(\n",
    "        (dataframe[\"summary\"].values, dataframe[\"genre\"].values)\n",
    "    )\n",
    "    dataset = dataset.bucket_by_sequence_length(\n",
    "        # Sequence length = number of space-separated tokens in the raw summary.\n",
    "        element_length_func=lambda elem, label: tf.shape(tf.strings.split(elem, sep=\" \"))[0],\n",
    "        bucket_boundaries=boundaries,\n",
    "        # One batch size per bucket; there is one more bucket than boundaries.\n",
    "        bucket_batch_sizes=[batch_size] * (len(boundaries) + 1),\n",
    "    )\n",
    "    # Vectorize text and encode labels after batching, one whole batch at a time.\n",
    "    dataset = dataset.map(preprocess_batch, num_parallel_calls=tf.data.AUTOTUNE)\n",
    "    return dataset.prefetch(tf.data.AUTOTUNE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "e62347f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "training_dataset = prepare_dataset(train_df_new)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "20b0ee53",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(32, 1702942)\n",
      "(32,)\n",
      "(32, 1702942)\n",
      "(32,)\n",
      "(32, 1702942)\n",
      "(32,)\n",
      "(32, 1702942)\n",
      "(32,)\n",
      "(32, 1702942)\n",
      "(32,)\n",
      "(32, 1702942)\n",
      "(32,)\n",
      "(32, 1702942)\n",
      "(32,)\n",
      "(32, 1702942)\n",
      "(32,)\n",
      "(32, 1702942)\n",
      "(32,)\n",
      "(32, 1702942)\n",
      "(32,)\n"
     ]
    }
   ],
   "source": [
    "# Print the feature and label shapes of the first ten batches.\n",
    "for sample_batch in training_dataset.take(10):\n",
    "    features, labels = sample_batch\n",
    "    print(features.shape)\n",
    "    print(labels.shape)"
   ]
  }
 ],
 "metadata": {
  "environment": {
   "name": "tf2-gpu.2-6.m81",
   "type": "gcloud",
   "uri": "gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m81"
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"id": "0fee8005",
	"metadata": {},
	"source": [
	"## Setup"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "65fc8c36",
	"metadata": {},
	"outputs": [],
	"source": [
	"from tensorflow import keras\n",
	"import tensorflow as tf\n",
	"\n",
	"from sklearn.model_selection import train_test_split\n",
	"import pandas as pd\n",
	"import numpy as np\n",
	"import random\n",
	"import tqdm\n",
	"\n",
	"SEED = 42\n",
	"tf.random.set_seed(SEED)\n",
	"np.random.seed(SEED)\n",
	"random.seed(SEED)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "fe67b35a",
	"metadata": {},
	"source": [
	"## Data loading\n",
	"\n",
	"Data comes from here: https://www.kaggle.com/hijest/genre-classification-dataset-imdb."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "a39fc6ff",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Both files use the same layout (`id ::: movie ::: genre ::: summary`) and\n",
	"# carry no header row, so the column names are supplied explicitly. The\n",
	"# multi-character separator is why pandas' Python parsing engine is selected.\n",
	"read_options = dict(\n",
	"    engine=\"python\",\n",
	"    sep=\" ::: \",\n",
	"    names=[\"id\", \"movie\", \"genre\", \"summary\"],\n",
	")\n",
	"\n",
	"train_df = pd.read_csv(\"./data/train_data.txt\", **read_options)\n",
	"test_df = pd.read_csv(\"./data/test_data_solution.txt\", **read_options)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "4c1b5c6d",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>id</th>\n",
	" <th>movie</th>\n",
	" <th>genre</th>\n",
	" <th>summary</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>1</td>\n",
	" <td>Oscar et la dame rose (2009)</td>\n",
	" <td>drama</td>\n",
	" <td>Listening in to a conversation between his doc...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>2</td>\n",
	" <td>Cupid (1997)</td>\n",
	" <td>thriller</td>\n",
	" <td>A brother and sister with a past incestuous re...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>3</td>\n",
	" <td>Young, Wild and Wonderful (1980)</td>\n",
	" <td>adult</td>\n",
	" <td>As the bus empties the students for their fiel...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>4</td>\n",
	" <td>The Secret Sin (1915)</td>\n",
	" <td>drama</td>\n",
	" <td>To help their unemployed father make ends meet...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>5</td>\n",
	" <td>The Unrecovered (2007)</td>\n",
	" <td>drama</td>\n",
	" <td>The film's title refers not only to the un-rec...</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" id movie genre \\\n",
	"0 1 Oscar et la dame rose (2009) drama \n",
	"1 2 Cupid (1997) thriller \n",
	"2 3 Young, Wild and Wonderful (1980) adult \n",
	"3 4 The Secret Sin (1915) drama \n",
	"4 5 The Unrecovered (2007) drama \n",
	"\n",
	" summary \n",
	"0 Listening in to a conversation between his doc... \n",
	"1 A brother and sister with a past incestuous re... \n",
	"2 As the bus empties the students for their fiel... \n",
	"3 To help their unemployed father make ends meet... \n",
	"4 The film's title refers not only to the un-rec... "
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Viewing training data\n",
	"train_df.head()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "d2754413",
	"metadata": {},
	"source": [
	"## Data splitting"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"id": "51e8d0d6",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Number of training samples: 48792.\n",
	"Number of validation samples: 5422.\n",
	"Number of test examples: 54200.\n"
	]
	}
	],
	"source": [
	"# Shuffle every row of the training frame, then hold out 10% of the shuffled\n",
	"# rows as a validation set with scikit-learn's `train_test_split`.\n",
	"train_shuffled = train_df.sample(frac=1)\n",
	"splits = train_test_split(train_shuffled, test_size=0.1)\n",
	"train_df_new, val_df = splits\n",
	"\n",
	"# Report the size of each split.\n",
	"print(f\"Number of training samples: {len(train_df_new)}.\")\n",
	"print(f\"Number of validation samples: {len(val_df)}.\")\n",
	"print(f\"Number of test examples: {len(test_df)}.\")"
	]
	},
	{
	"cell_type": "markdown",
	"id": "18668652",
	"metadata": {},
	"source": [
	"## Using [`bucket_by_sequence_length()`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#bucket_by_sequence_length)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"id": "0ef030b0",
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"2021-12-19 15:55:28.592850: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
	"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"<tf.Tensor: shape=(), dtype=int32, numpy=92>"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# `train_df_new` still carries the original (shuffled) index labels, so\n",
	"# `[\"summary\"][0]` is a label lookup that raises `KeyError` whenever row 0\n",
	"# ended up in the validation split. `.iloc[0]` selects the first training row\n",
	"# by position, which is also the element a `tf.data.Dataset` built from this\n",
	"# column yields first.\n",
	"passage = train_df_new[\"summary\"].iloc[0]\n",
	"# Count space-separated tokens: the same length measure used for bucketing below.\n",
	"word_splits = tf.strings.split(passage, sep=\" \")\n",
	"tf.shape(word_splits)[0]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"id": "5ba03fa0",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"tf.Tensor(b\"On a lonely stretch of a highway, Ronit - pulls up in to a desolate pump to fill petrol. The attendant informs him that his car's fan belt is broken and a new one will only arrive in the morning. Stuck in the middle of nowhere, Ronit prepares to stake the night out in his car. When another car pulls in. The driver is a dignified, well-spoken man who lives a few miles away. He offers to house Ronit for the night, promising to drop him back in the morning. Ronit agrees, believing there is a god. But then there is also the devil.\", shape=(), dtype=string)\n"
	]
	}
	],
	"source": [
	"# Wrap the raw training summaries in a `tf.data.Dataset` and print its first\n",
	"# element to confirm that each element is a single scalar string tensor.\n",
	"train_summaries = train_df_new[\"summary\"]\n",
	"dataset = tf.data.Dataset.from_tensor_slices(train_summaries)\n",
	"\n",
	"first_element_only = dataset.take(1)\n",
	"for sample in first_element_only:\n",
	"    print(sample)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"id": "e938da00",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"['[UNK]',\n",
	" 'short',\n",
	" 'sci-fi',\n",
	" 'documentary',\n",
	" 'drama',\n",
	" 'thriller',\n",
	" 'comedy',\n",
	" 'adult',\n",
	" 'romance',\n",
	" 'adventure',\n",
	" 'western',\n",
	" 'family',\n",
	" 'talk-show',\n",
	" 'news',\n",
	" 'horror',\n",
	" 'history',\n",
	" 'music',\n",
	" 'sport',\n",
	" 'war',\n",
	" 'animation',\n",
	" 'game-show',\n",
	" 'action',\n",
	" 'crime',\n",
	" 'reality-tv',\n",
	" 'mystery',\n",
	" 'musical',\n",
	" 'fantasy',\n",
	" 'biography']"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Build the genre-to-integer lookup from the distinct genres that occur in the\n",
	"# training split, then display the resulting vocabulary.\n",
	"genre_names = train_df_new[\"genre\"].unique()\n",
	"label_encoder = keras.layers.StringLookup(vocabulary=genre_names)\n",
	"label_encoder.get_vocabulary()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"id": "841e7e00",
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/var/folders/z_/d29z43w90kz6f4kbzv5c9m9r0000gn/T/ipykernel_60182/1668209276.py:1: SettingWithCopyWarning: \n",
	"A value is trying to be set on a copy of a slice from a DataFrame.\n",
	"Try using .loc[row_indexer,col_indexer] = value instead\n",
	"\n",
	"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
	" train_df_new[\"total_words\"] = train_df_new[\"summary\"].str.split().str.len()\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"1829"
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Assigning a column in place on the frame returned by `train_test_split`\n",
	"# triggers pandas' `SettingWithCopyWarning`. `.assign` returns a new frame\n",
	"# instead, which avoids the warning and keeps this cell safe to re-run.\n",
	"train_df_new = train_df_new.assign(\n",
	"    total_words=train_df_new[\"summary\"].str.split().str.len()\n",
	")\n",
	"# NOTE: despite its name, `vocabulary_size` is the word count of the longest\n",
	"# training summary, not the number of distinct tokens.\n",
	"vocabulary_size = train_df_new[\"total_words\"].max()\n",
	"vocabulary_size"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"id": "bf3addc3",
	"metadata": {},
	"outputs": [],
	"source": [
	"# TF-IDF features over n-grams up to size 2.\n",
	"# NOTE(review): `max_tokens` is left unset; the batches printed at the end of\n",
	"# this notebook are 1,702,942 columns wide -- confirm that this is intended.\n",
	"text_vectorizer = keras.layers.TextVectorization(ngrams=2, output_mode=\"tf_idf\")\n",
	"\n",
	"# The layer has to be adapted before use; only the training summaries are fed\n",
	"# to it, and the genre labels are dropped by the `map` below.\n",
	"train_summaries = train_df_new[\"summary\"].values\n",
	"train_genres = train_df_new[\"genre\"].values\n",
	"dataset_ = tf.data.Dataset.from_tensor_slices((train_summaries, train_genres))\n",
	"\n",
	"# Adaptation is pinned to the CPU.\n",
	"with tf.device(\"/CPU:0\"):\n",
	"    summaries_only = dataset_.map(lambda text, label: text)\n",
	"    text_vectorizer.adapt(summaries_only)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"id": "32fa0dd0",
	"metadata": {},
	"outputs": [],
	"source": [
	"def preprocess_batch(summary, label):\n",
	"    \"\"\"Turn a batch of raw summaries and genre strings into model inputs.\n",
	"\n",
	"    Applies the notebook-level `text_vectorizer` to `summary` and the\n",
	"    notebook-level `label_encoder` to `label`, and returns the two results\n",
	"    as a `(summary, label)` tuple.\n",
	"    \"\"\"\n",
	"    return text_vectorizer(summary), label_encoder(label)\n",
	"\n",
	"\n",
	"def prepare_dataset(dataframe, bucket_boundaries=(512,), batch_size=32):\n",
	"    \"\"\"Build a length-bucketed, vectorized `tf.data.Dataset` from a dataframe.\n",
	"\n",
	"    Args:\n",
	"        dataframe: Frame with a `summary` (text) and a `genre` (label) column.\n",
	"        bucket_boundaries: Increasing word-count upper bounds of the buckets.\n",
	"            The default `(512,)` gives two buckets: under 512 words, and\n",
	"            512 words or more.\n",
	"        batch_size: Batch size applied to every bucket.\n",
	"\n",
	"    Returns:\n",
	"        A prefetched dataset of `(summary, label)` batches produced by\n",
	"        `preprocess_batch`.\n",
	"    \"\"\"\n",
	"    boundaries = list(bucket_boundaries)\n",
	"    dataset = tf.data.Dataset.from_tensor_slices(\n",
	"        (dataframe[\"summary\"].values, dataframe[\"genre\"].values)\n",
	"    )\n",
	"    dataset = dataset.bucket_by_sequence_length(\n",
	"        # Sequence length = number of space-separated tokens in the raw summary.\n",
	"        element_length_func=lambda elem, label: tf.shape(tf.strings.split(elem, sep=\" \"))[0],\n",
	"        bucket_boundaries=boundaries,\n",
	"        # One batch size per bucket; there is one more bucket than boundaries.\n",
	"        bucket_batch_sizes=[batch_size] * (len(boundaries) + 1),\n",
	"    )\n",
	"    # Vectorize text and encode labels after batching, one whole batch at a time.\n",
	"    dataset = dataset.map(preprocess_batch, num_parallel_calls=tf.data.AUTOTUNE)\n",
	"    return dataset.prefetch(tf.data.AUTOTUNE)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"id": "e62347f1",
	"metadata": {},
	"outputs": [],
	"source": [
	"training_dataset = prepare_dataset(train_df_new)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"id": "20b0ee53",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"(32, 1702942)\n",
	"(32,)\n",
	"(32, 1702942)\n",
	"(32,)\n",
	"(32, 1702942)\n",
	"(32,)\n",
	"(32, 1702942)\n",
	"(32,)\n",
	"(32, 1702942)\n",
	"(32,)\n",
	"(32, 1702942)\n",
	"(32,)\n",
	"(32, 1702942)\n",
	"(32,)\n",
	"(32, 1702942)\n",
	"(32,)\n",
	"(32, 1702942)\n",
	"(32,)\n",
	"(32, 1702942)\n",
	"(32,)\n"
	]
	}
	],
	"source": [
	"# Print the feature and label shapes of the first ten batches.\n",
	"for sample_batch in training_dataset.take(10):\n",
	"    features, labels = sample_batch\n",
	"    print(features.shape)\n",
	"    print(labels.shape)"
	]
	}
	],
	"metadata": {
	"environment": {
	"name": "tf2-gpu.2-6.m81",
	"type": "gcloud",
	"uri": "gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m81"
	},
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.8.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}