{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"clip 1.0\n",
"gsheets 0.6.1\n",
"imageio 2.34.0\n",
"matplotlib 3.8.4\n",
"numpy 1.26.4\n",
"opencv-python 4.9.0.80\n",
"pandas 2.2.1\n",
"pycocoevalcap 1.2\n",
"pytorch-lightning 1.6.0\n",
"scipy 1.13.0\n",
"spacy 3.0.0\n",
"torch 1.11.0+cu113\n",
"torchinfo 1.8.0\n",
"torchvision 0.12.0+cu113\n",
"wandb 0.16.6\n"
]
}
],
"source": [
"# print installed packages versions for those listed in requirements.txt\n",
"import pkg_resources\n",
"with open('requirements.txt') as f:\n",
" reqs = [line.split('==')[0] for line in f.read().splitlines()]\n",
"installed_packages = {pkg.key: pkg.version for pkg in pkg_resources.working_set if pkg.key in reqs}\n",
"for k,v in installed_packages.items():\n",
" print(k,v)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# CVCL-VIT\n",
"- Clone from https://github.com/wkvong/multimodal-baby and put this notebook at the root\n",
"- Testd on the commit ['1dcc72e6f37fabcbac5a04235a3489d7304e644c'](https://github.com/wkvong/multimodal-baby/tree/1dcc72e6f37fabcbac5a04235a3489d7304e644c)"
]
},
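{
"cell_type": "markdown",
"metadata": {},
"source": [
"The setup steps above can be scripted. A minimal sketch (my addition, not part of the original gist), assuming `git` is available and the notebook is run from the directory that should contain the repo:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch of the setup described above: clone the repo and pin the tested commit.\n",
"# This notebook is then placed at the repo root, as stated above.\n",
"!git clone https://github.com/wkvong/multimodal-baby\n",
"%cd multimodal-baby\n",
"!git checkout 1dcc72e6f37fabcbac5a04235a3489d7304e644c"
]
},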
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/localstorage/miniconda3/envs/baby/lib/python3.9/site-packages/pytorch_lightning/utilities/parsing.py:244: UserWarning: Attribute 'vision_encoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['vision_encoder'])`.\n",
" rank_zero_warn(\n",
"/home/localstorage/miniconda3/envs/baby/lib/python3.9/site-packages/pytorch_lightning/utilities/parsing.py:244: UserWarning: Attribute 'text_encoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['text_encoder'])`.\n",
" rank_zero_warn(\n"
]
},
{
"ename": "RuntimeError",
"evalue": "The size of tensor a (27) must match the size of tensor b (25) at non-singleton dimension 0",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[43], line 26\u001b[0m\n\u001b[1;32m 24\u001b[0m texts, texts_len \u001b[38;5;241m=\u001b[39m cvcl\u001b[38;5;241m.\u001b[39mtokenize(texts)\n\u001b[1;32m 25\u001b[0m texts, texts_len \u001b[38;5;241m=\u001b[39m texts\u001b[38;5;241m.\u001b[39mto(device), texts_len\u001b[38;5;241m.\u001b[39mto(device)\n\u001b[0;32m---> 26\u001b[0m texts_features \u001b[38;5;241m=\u001b[39m \u001b[43mcvcl\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode_text\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtexts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtexts_len\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/home/ssd_satoshi/projects/cvclvit/multimodal/multimodal_lit.py:158\u001b[0m, in \u001b[0;36mMultiModalLitModel.encode_text\u001b[0;34m(self, y, y_len)\u001b[0m\n\u001b[1;32m 156\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mencode_text\u001b[39m(\u001b[38;5;28mself\u001b[39m, y, y_len\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 157\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Encode text to obtain text features\"\"\"\u001b[39;00m\n\u001b[0;32m--> 158\u001b[0m text_features, _ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode_text\u001b[49m\u001b[43m(\u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_len\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 159\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m text_features\n",
"File \u001b[0;32m/home/ssd_satoshi/projects/cvclvit/multimodal/multimodal.py:740\u001b[0m, in \u001b[0;36mMultiModalModel.encode_text\u001b[0;34m(self, text, text_length)\u001b[0m\n\u001b[1;32m 739\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mencode_text\u001b[39m(\u001b[38;5;28mself\u001b[39m, text, text_length):\n\u001b[0;32m--> 740\u001b[0m text_features, text_outputs, attns \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtext_embed\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtext_length\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 741\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnormalize_features:\n\u001b[1;32m 742\u001b[0m \u001b[38;5;66;03m# normalize text features\u001b[39;00m\n\u001b[1;32m 743\u001b[0m text_features \u001b[38;5;241m=\u001b[39m F\u001b[38;5;241m.\u001b[39mnormalize(text_features, p\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m, dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)\n",
"File \u001b[0;32m/home/localstorage/miniconda3/envs/baby/lib/python3.9/site-packages/torch/nn/modules/module.py:1110\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 1106\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1107\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1108\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1109\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1110\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1111\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1112\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
"File \u001b[0;32m/home/ssd_satoshi/projects/cvclvit/multimodal/multimodal.py:563\u001b[0m, in \u001b[0;36mTextEncoder.forward\u001b[0;34m(self, x, x_len, image_features, image_feature_map)\u001b[0m\n\u001b[1;32m 561\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpos_embed_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msinusoidal\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpos_embed_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlearned\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 562\u001b[0m pos_embed \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpos_embed[:embedding\u001b[38;5;241m.\u001b[39msize(\u001b[38;5;241m0\u001b[39m), :, :]\n\u001b[0;32m--> 563\u001b[0m embedding \u001b[38;5;241m=\u001b[39m \u001b[43membedding\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mpos_embed\u001b[49m\n\u001b[1;32m 565\u001b[0m raw_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtransformer_encoder(embedding, src_key_padding_mask\u001b[38;5;241m=\u001b[39msrc_key_padding_mask)\n\u001b[1;32m 567\u001b[0m \u001b[38;5;66;03m# transpose back to (B, L, E)\u001b[39;00m\n",
"\u001b[0;31mRuntimeError\u001b[0m: The size of tensor a (27) must match the size of tensor b (25) at non-singleton dimension 0"
]
}
],
"source": [
"import torch\n",
"from multimodal.multimodal_lit import MultiModalLitModel\n",
"from huggingface_hub import hf_hub_download\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"from torchvision import transforms\n",
"preprocess = transforms.Compose([\n",
" transforms.Resize((224, 224)),\n",
" transforms.ToTensor(),\n",
" transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])\n",
"\n",
"checkpoint_name = \"cvcl_s_dino_vit_embedding\"\n",
"checkpoint = hf_hub_download(repo_id=\"wkvong/\"+checkpoint_name, filename=checkpoint_name+\".ckpt\")\n",
"cvcl = MultiModalLitModel.load_from_checkpoint(checkpoint_path=checkpoint)\n",
"cvcl = cvcl.to(device)\n",
"cvcl.eval()\n",
"\n",
"# create random image to encode\n",
"images = torch.rand(4, 3, 224, 224).to(device)\n",
"image_features = cvcl.encode_image(images)\n",
"\n",
"# create texts to encode\n",
"texts = [\"ball\", \"puzzle\", \"car\"]\n",
"texts, texts_len = cvcl.tokenize(texts)\n",
"texts, texts_len = texts.to(device), texts_len.to(device)\n",
"texts_features = cvcl.encode_text(texts, texts_len)\n"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"texts.shape torch.Size([3, 27])\n",
"texts_len tensor([6, 8, 5], device='cuda:0')\n",
"cvcl.text_encoder.pos_embed.shape torch.Size([25, 1, 512])\n"
]
}
],
"source": [
"print(\"texts.shape\",texts.shape)\n",
"print(\"texts_len\",texts_len)\n",
"print(\"cvcl.text_encoder.pos_embed.shape\",cvcl.text_encoder.pos_embed.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Questions\n",
"- Why is `texts.shape` 27 despite the positional encoding being with a length of 25?\n",
"- Why is `texts_len` `[6, 8, 5]`? Shouldn't it be `[3, 3, 3]` because each should be `<eos> word <eos>`?\n",
"\n",
"## Suggested Fix\n",
"- Modify [`tokenize(self, texts)`](https://github.com/wkvong/multimodal-baby/blob/1dcc72e6f37fabcbac5a04235a3489d7304e644c/multimodal/multimodal_lit.py#L161-L178) function in `MultiModalLitModel` to the following:"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"def tokenize_modified(self,texts):\n",
" \"\"\"Tokenize texts to obtain tokens and token lengths\"\"\"\n",
" max_seq_len = 23 # hold out the 2 of <sos> and <eos>\n",
"\n",
" if isinstance(texts, str):\n",
" texts = [texts]\n",
"\n",
" all_tokens = []\n",
" token_lengths = []\n",
" for text in texts:\n",
" doc = self.nlp(text)\n",
" tokens = [token.text for token in doc]\n",
" # TODO: might\n",
" tokens = [self.vocab[\"<sos>\"]] + [self.vocab.get(token, self.vocab[\"<unk>\"]) for token in tokens] + [self.vocab[\"<eos>\"]] + [self.vocab[\"<pad>\"]] * (max_seq_len - len(tokens))\n",
" all_tokens.append(tokens)\n",
" token_lengths.append(len(tokens))\n",
"\n",
" tokens = torch.tensor(all_tokens, dtype=torch.long)\n",
" token_lengths = torch.tensor(token_lengths, dtype=torch.long)\n",
" return tokens, token_lengths"
]
},
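{
"cell_type": "markdown",
"metadata": {},
"source": [
"- As a quick sanity check (my addition, not part of the original gist), the cell below assumes `cvcl` is already loaded and verifies that the modified tokenizer pads every sequence to length 25, matching `cvcl.text_encoder.pos_embed`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check for the modified tokenizer: with max_seq_len = 23 plus <sos> and <eos>,\n",
"# every tokenized sequence should have length 25, matching pos_embed (25, 1, 512).\n",
"tokens_check, lengths_check = tokenize_modified(cvcl, [\"ball\", \"puzzle\", \"car\"])\n",
"print(\"tokens_check.shape\", tokens_check.shape)\n",
"print(\"cvcl.text_encoder.pos_embed.shape\", cvcl.text_encoder.pos_embed.shape)"
]
},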
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- then the following code works!"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"checkpoint_name = \"cvcl_s_dino_vit_embedding\"\n",
"checkpoint = hf_hub_download(repo_id=\"wkvong/\"+checkpoint_name, filename=checkpoint_name+\".ckpt\")\n",
"cvcl = MultiModalLitModel.load_from_checkpoint(checkpoint_path=checkpoint)\n",
"cvcl = cvcl.to(device)\n",
"cvcl.eval()\n",
"\n",
"# create random image to encode\n",
"images = torch.rand(4, 3, 224, 224).to(device)\n",
"image_features = cvcl.encode_image(images)\n",
"\n",
"# create texts to encode\n",
"texts = [\"ball\", \"puzzle\", \"car\"]\n",
"texts, texts_len = tokenize_modified(cvcl,texts)\n",
"texts, texts_len = texts.to(device), texts_len.to(device)\n",
"texts_features = cvcl.encode_text(texts, texts_len)"
]
}
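,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- Once both encoders run, the features can be compared CLIP-style. The cell below is a minimal sketch (my addition, not part of the original gist); it assumes `image_features` and `texts_features` are 2-D tensors sharing the same embedding dimension"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: cosine-similarity matrix between the 4 random images and the 3 texts.\n",
"# Features are re-normalized here in case they are not already unit-length.\n",
"import torch.nn.functional as F\n",
"\n",
"with torch.no_grad():\n",
"    img = F.normalize(image_features, p=2, dim=-1)\n",
"    txt = F.normalize(texts_features, p=2, dim=-1)\n",
"    similarity = img @ txt.t()  # shape (4, 3): one row per image, one column per text\n",
"print(similarity)"
]
}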
],
"metadata": {
"kernelspec": {
"display_name": "prsclip",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 2
}