Skip to content

Instantly share code, notes, and snippets.

@praateekmahajan
Last active December 16, 2020 05:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save praateekmahajan/bc2ace6477d04927038a7a5acaf30349 to your computer and use it in GitHub Desktop.
Save praateekmahajan/bc2ace6477d04927038a7a5acaf30349 to your computer and use it in GitHub Desktop.
pack_padded_sequence is slower than running the RNN on the raw padded (unpacked) sequence
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"from torch.utils.data import Dataset, DataLoader\n",
"from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence\n",
"import torch.nn as nn\n",
"import torch\n",
"import numpy as np\n",
"import pandas as pd\n",
"import time\n",
"from tqdm.notebook import tqdm\n",
"\n",
"num_rows = 100000\n",
"max_length = 100\n",
"\n",
"df = pd.DataFrame(\n",
" [\n",
" {\"data\": np.random.randn(np.random.randint(1, max_length))}\n",
" for _ in range(num_rows)\n",
" ]\n",
")\n",
"df[\"size\"] = df[\"data\"].apply(len)\n",
"\n",
"\n",
"class DummyDataset(Dataset):\n",
" def __init__(self, file_name_or_df, max_seq_len=50):\n",
" self.max_seq_len = max_seq_len\n",
" if isinstance(file_name_or_df, str):\n",
" self.data = pd.read_json(file_name_or_df)\n",
" else:\n",
" self.data = file_name_or_df\n",
"\n",
" def __len__(self):\n",
" return len(self.data)\n",
"\n",
" def __getitem__(self, idx):\n",
" sample = self.data.iloc[idx]\n",
" seq_len = sample[\"size\"]\n",
" seq = sample[\"data\"]\n",
" if seq_len > self.max_seq_len:\n",
" seq = np.asarray(seq[-self.max_seq_len :])\n",
" else:\n",
" seq = np.pad(\n",
" seq, pad_width=(0, self.max_seq_len - seq_len), constant_values=0\n",
" )\n",
"\n",
" return {\n",
" \"data\": torch.tensor(seq, dtype=torch.float),\n",
" \"size\": min(seq_len, self.max_seq_len),\n",
" }\n",
"\n",
"\n",
"class DummyModel(nn.Module):\n",
" def __init__(self, should_pack):\n",
" super(DummyModel, self).__init__()\n",
"\n",
" self.rnn_in_dim = 1\n",
" self.rnn_out_dim = 3\n",
" self.rnn_num_layers = 1\n",
"\n",
" self.should_pack = should_pack\n",
"\n",
" self.rnn = nn.RNN(\n",
" input_size=self.rnn_in_dim,\n",
" hidden_size=self.rnn_out_dim,\n",
" num_layers=self.rnn_num_layers,\n",
" batch_first=True,\n",
" )\n",
"\n",
" self.fc = nn.Linear(self.rnn_out_dim, self.num_classes)\n",
"\n",
" def forward(self, batch):\n",
"\n",
" input_rnn = batch[\"data\"].unsqueeze(-1)\n",
" if self.should_pack:\n",
" packed_input = pack_padded_sequence(\n",
" input_rnn, batch[\"size\"], batch_first=True, enforce_sorted=False\n",
" )\n",
" packed_rnn_out, _ = self.rnn(packed_input)\n",
" rnn_out, _ = pad_packed_sequence(packed_rnn_out, batch_first=True)\n",
" else:\n",
" rnn_out, _ = self.rnn(input_rnn)\n",
"\n",
" return rnn_out"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "81b65336788c4025b46205c8087e233b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3125.0), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Time taken when packing is enabled : 22.701847791671753\n"
]
}
],
"source": [
"ds = DummyDataset(df)\n",
"dl = DataLoader(ds, batch_size=32, shuffle=True)\n",
"\n",
"model = DummyModel(should_pack=True)\n",
"start_time = time.time()\n",
"for batch_idx, batch in enumerate(tqdm(dl)):\n",
" model(batch)\n",
"print(f\"Time taken when packing is enabled : {time.time() - start_time}\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ffb9dbb8b7b549ed83f3496f51a348d5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3125.0), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Time taken when packing is disabled : 19.904962062835693\n"
]
}
],
"source": [
"ds = DummyDataset(df)\n",
"dl = DataLoader(ds, batch_size=32, shuffle=True)\n",
"\n",
"model = DummyModel(should_pack=False)\n",
"start_time = time.time()\n",
"for batch_idx, batch in enumerate(tqdm(dl)):\n",
" model(batch)\n",
"print(f\"Time taken when packing is disabled : {time.time() - start_time}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment