{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"clip 1.0\n",
"gsheets 0.6.1\n",
"imageio 2.34.0\n",
"matplotlib 3.8.4\n",
"numpy 1.26.4\n",
"opencv-python 4.9.0.80\n",
"pandas 2.2.1\n",
"pycocoevalcap 1.2\n",
"pytorch-lightning 1.6.0\n",
"scipy 1.13.0\n",
"spacy 3.0.0\n",
"torch 1.11.0+cu113\n",
"torchinfo 1.8.0\n",
"torchvision 0.12.0+cu113\n",
"wandb 0.16.6\n"
]
}
],
"source": [
"# print installed packages versions for those listed in requirements.txt\n",
"import pkg_resources\n",
"with open('requirements.txt') as f:\n",
" reqs = [line.split('==')[0] for line in f.read().splitlines()]\n",
"installed_packages = {pkg.key: pkg.version for pkg in pkg_resources.working_set if pkg.key in reqs}\n",
"for k,v in installed_packages.items():\n",
" print(k,v)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# CVCL-VIT\n",
"- Clone from https://github.com/wkvong/multimodal-baby and put this notebook at the root\n",
"- Testd on the commit ['1dcc72e6f37fabcbac5a04235a3489d7304e644c'](https://github.com/wkvong/multimodal-baby/tree/1dcc72e6f37fabcbac5a04235a3489d7304e644c)"
]
},
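{
"cell_type": "markdown",
"metadata": {},
"source": [
"The setup steps above can be scripted. A minimal sketch (my addition, not part of the original gist), assuming `git` is available and the notebook is run from the directory that should contain the repo:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch of the setup described above: clone the repo and pin the tested commit.\n",
"# This notebook is then placed at the repo root, as stated above.\n",
"!git clone https://github.com/wkvong/multimodal-baby\n",
"%cd multimodal-baby\n",
"!git checkout 1dcc72e6f37fabcbac5a04235a3489d7304e644c"
]
},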
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/localstorage/miniconda3/envs/baby/lib/python3.9/site-packages/pytorch_lightning/utilities/parsing.py:244: UserWarning: Attribute 'vision_encoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['vision_encoder'])`.\n",
" rank_zero_warn(\n",
"/home/localstorage/miniconda3/envs/baby/lib/python3.9/site-packages/pytorch_lightning/utilities/parsing.py:244: UserWarning: Attribute 'text_encoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['text_encoder'])`.\n",
" rank_zero_warn(\n"
]
},
{
"ename": "RuntimeError",
"evalue": "The size of tensor a (27) must match the size of tensor b (25) at non-singleton dimension 0",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[43], line 26\u001b[0m\n\u001b[1;32m 24\u001b[0m texts, texts_len \u001b[38;5;241m=\u001b[39m cvcl\u001b[38;5;241m.\u001b[39mtokenize(texts)\n\u001b[1;32m 25\u001b[0m texts, texts_len \u001b[38;5;241m=\u001b[39m texts\u001b[38;5;241m.\u001b[39mto(device), texts_len\u001b[38;5;241m.\u001b[39mto(device)\n\u001b[0;32m---> 26\u001b[0m texts_features \u001b[38;5;241m=\u001b[39m \u001b[43mcvcl\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode_text\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtexts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtexts_len\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/home/ssd_satoshi/projects/cvclvit/multimodal/multimodal_lit.py:158\u001b[0m, in \u001b[0;36mMultiModalLitModel.encode_text\u001b[0;34m(self, y, y_len)\u001b[0m\n\u001b[1;32m 156\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mencode_text\u001b[39m(\u001b[38;5;28mself\u001b[39m, y, y_len\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 157\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Encode text to obtain text features\"\"\"\u001b[39;00m\n\u001b[0;32m--> 158\u001b[0m text_features, _ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode_text\u001b[49m\u001b[43m(\u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_len\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 159\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m text_features\n",
"File \u001b[0;32m/home/ssd_satoshi/projects/cvclvit/multimodal/multimodal.py:740\u001b[0m, in \u001b[0;36mMultiModalModel.encode_text\u001b[0;34m(self, text, text_length)\u001b[0m\n\u001b[1;32m 739\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mencode_text\u001b[39m(\u001b[38;5;28mself\u001b[39m, text, text_length):\n\u001b[0;32m--> 740\u001b[0m text_features, text_outputs, attns \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtext_embed\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtext_length\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 741\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnormalize_features:\n\u001b[1;32m 742\u001b[0m \u001b[38;5;66;03m# normalize text features\u001b[39;00m\n\u001b[1;32m 743\u001b[0m text_features \u001b[38;5;241m=\u001b[39m F\u001b[38;5;241m.\u001b[39mnormalize(text_features, p\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m, dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)\n",
"File \u001b[0;32m/home/localstorage/miniconda3/envs/baby/lib/python3.9/site-packages/torch/nn/modules/module.py:1110\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 1106\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1107\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1108\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1109\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1110\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1111\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1112\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
"File \u001b[0;32m/home/ssd_satoshi/projects/cvclvit/multimodal/multimodal.py:563\u001b[0m, in \u001b[0;36mTextEncoder.forward\u001b[0;34m(self, x, x_len, image_features, image_feature_map)\u001b[0m\n\u001b[1;32m 561\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpos_embed_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msinusoidal\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpos_embed_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlearned\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 562\u001b[0m pos_embed \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpos_embed[:embedding\u001b[38;5;241m.\u001b[39msize(\u001b[38;5;241m0\u001b[39m), :, :]\n\u001b[0;32m--> 563\u001b[0m embedding \u001b[38;5;241m=\u001b[39m \u001b[43membedding\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mpos_embed\u001b[49m\n\u001b[1;32m 565\u001b[0m raw_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtransformer_encoder(embedding, src_key_padding_mask\u001b[38;5;241m=\u001b[39msrc_key_padding_mask)\n\u001b[1;32m 567\u001b[0m \u001b[38;5;66;03m# transpose back to (B, L, E)\u001b[39;00m\n",
"\u001b[0;31mRuntimeError\u001b[0m: The size of tensor a (27) must match the size of tensor b (25) at non-singleton dimension 0"
]
}
],
"source": [
"import torch\n",
"from multimodal.multimodal_lit import MultiModalLitModel\n",
"from huggingface_hub import hf_hub_download\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"from torchvision import transforms\n",
"preprocess = transforms.Compose([\n",
" transforms.Resize((224, 224)),\n",
" transforms.ToTensor(),\n",
" transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])\n",
"\n",
"checkpoint_name = \"cvcl_s_dino_vit_embedding\"\n",
"checkpoint = hf_hub_download(repo_id=\"wkvong/\"+checkpoint_name, filename=checkpoint_name+\".ckpt\")\n",
"cvcl = MultiModalLitModel.load_from_checkpoint(checkpoint_path=checkpoint)\n",
"cvcl = cvcl.to(device)\n",
"cvcl.eval()\n",
"\n",
"# create random image to encode\n",
"images = torch.rand(4, 3, 224, 224).to(device)\n",
"image_features = cvcl.encode_image(images)\n",
"\n",
"# create texts to encode\n",
"texts = [\"ball\", \"puzzle\", \"car\"]\n",
"texts, texts_len = cvcl.tokenize(texts)\n",
"texts, texts_len = texts.to(device), texts_len.to(device)\n",
"texts_features = cvcl.encode_text(texts, texts_len)\n"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"texts.shape torch.Size([3, 27])\n",
"texts_len tensor([6, 8, 5], device='cuda:0')\n",
"cvcl.text_encoder.pos_embed.shape torch.Size([25, 1, 512])\n"
]
}
],
"source": [
"print(\"texts.shape\",texts.shape)\n",
"print(\"texts_len\",texts_len)\n",
"print(\"cvcl.text_encoder.pos_embed.shape\",cvcl.text_encoder.pos_embed.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Questions\n",
"- Why is `texts.shape` 27 despite the positional encoding being with a length of 25?\n",
"- Why is `texts_len` `[6, 8, 5]`? Shouldn't it be `[3, 3, 3]` because each should be `<eos> word <eos>`?\n",
"\n",
"## Suggested Fix\n",
"- Modify [`tokenize(self, texts)`](https://github.com/wkvong/multimodal-baby/blob/1dcc72e6f37fabcbac5a04235a3489d7304e644c/multimodal/multimodal_lit.py#L161-L178) function in `MultiModalLitModel` to the following:"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"def tokenize_modified(self,texts):\n",
" \"\"\"Tokenize texts to obtain tokens and token lengths\"\"\"\n",
" max_seq_len = 23 # hold out the 2 of <sos> and <eos>\n",
"\n",
" if isinstance(texts, str):\n",
" texts = [texts]\n",
"\n",
" all_tokens = []\n",
" token_lengths = []\n",
" for text in texts:\n",
" doc = self.nlp(text)\n",
" tokens = [token.text for token in doc]\n",
" # TODO: might\n",
" tokens = [self.vocab[\"<sos>\"]] + [self.vocab.get(token, self.vocab[\"<unk>\"]) for token in tokens] + [self.vocab[\"<eos>\"]] + [self.vocab[\"<pad>\"]] * (max_seq_len - len(tokens))\n",
" all_tokens.append(tokens)\n",
" token_lengths.append(len(tokens))\n",
"\n",
" tokens = torch.tensor(all_tokens, dtype=torch.long)\n",
" token_lengths = torch.tensor(token_lengths, dtype=torch.long)\n",
" return tokens, token_lengths"
]
},
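{
"cell_type": "markdown",
"metadata": {},
"source": [
"- As a quick sanity check (my addition, not part of the original gist), the cell below assumes `cvcl` is already loaded and verifies that the modified tokenizer pads every sequence to length 25, matching `cvcl.text_encoder.pos_embed`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check for the modified tokenizer: with max_seq_len = 23 plus <sos> and <eos>,\n",
"# every tokenized sequence should have length 25, matching pos_embed (25, 1, 512).\n",
"tokens_check, lengths_check = tokenize_modified(cvcl, [\"ball\", \"puzzle\", \"car\"])\n",
"print(\"tokens_check.shape\", tokens_check.shape)\n",
"print(\"cvcl.text_encoder.pos_embed.shape\", cvcl.text_encoder.pos_embed.shape)"
]
},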
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- then the following code works!"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"checkpoint_name = \"cvcl_s_dino_vit_embedding\"\n",
"checkpoint = hf_hub_download(repo_id=\"wkvong/\"+checkpoint_name, filename=checkpoint_name+\".ckpt\")\n",
"cvcl = MultiModalLitModel.load_from_checkpoint(checkpoint_path=checkpoint)\n",
"cvcl = cvcl.to(device)\n",
"cvcl.eval()\n",
"\n",
"# create random image to encode\n",
"images = torch.rand(4, 3, 224, 224).to(device)\n",
"image_features = cvcl.encode_image(images)\n",
"\n",
"# create texts to encode\n",
"texts = [\"ball\", \"puzzle\", \"car\"]\n",
"texts, texts_len = tokenize_modified(cvcl,texts)\n",
"texts, texts_len = texts.to(device), texts_len.to(device)\n",
"texts_features = cvcl.encode_text(texts, texts_len)"
]
}
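,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- Once both encoders run, the features can be compared CLIP-style. The cell below is a minimal sketch (my addition, not part of the original gist); it assumes `image_features` and `texts_features` are 2-D tensors sharing the same embedding dimension"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: cosine-similarity matrix between the 4 random images and the 3 texts.\n",
"# Features are re-normalized here in case they are not already unit-length.\n",
"import torch.nn.functional as F\n",
"\n",
"with torch.no_grad():\n",
"    img = F.normalize(image_features, p=2, dim=-1)\n",
"    txt = F.normalize(texts_features, p=2, dim=-1)\n",
"    similarity = img @ txt.t()  # shape (4, 3): one row per image, one column per text\n",
"print(similarity)"
]
}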
],
"metadata": {
"kernelspec": {
"display_name": "prsclip",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 2
}