@sayakpaul
Created January 31, 2024 10:34
Randomly samples 30k image-caption pairs from the COCO 2014 validation set.
from datasets import Dataset, Features
from datasets import Image as ImageFeature
from datasets import Value
import pandas as pd
import os

# CSV comes from the notebook above.
df = pd.read_csv("coco_30k_randomly_sampled_2014_val.csv")
root_path = "val2014"


def gen_fn():
    for i, row in df.iterrows():
        path = os.path.join(root_path, row["file_name"])
        caption = row["caption"]
        yield {"image": path, "caption": caption}


if __name__ == "__main__":
    ds = Dataset.from_generator(
        gen_fn,
        features=Features(image=ImageFeature(), caption=Value("string")),
    )
    ds_id = "sayakpaul/coco-30-val-2014"  # Change this.
    # To be able to push, you need to run `huggingface-cli login`.
    ds.push_to_hub(ds_id)

First, download the COCO 2014 val split and its annotations:

wget http://images.cocodataset.org/zips/val2014.zip
wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip
unzip val2014.zip
unzip annotations_trainval2014.zip
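
If extraction worked, the val2014 directory should contain 40,504 JPEGs (the same count the notebook below reports). A quick sanity check in Python:

import glob

# Expect 40504 files in the extracted val2014/ directory.
print(len(glob.glob("val2014/*.jpg")))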

Then run the notebook (included below) to get a CSV file containing 30k randomly selected image filenames and their captions. Then, optionally, run `python coco_30k_hf_datasets.py` to push the dataset to the HF Hub 🤗
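
Before pushing, it can help to build the dataset locally and inspect a row. Below is a minimal sketch, assuming the CSV and the val2014 directory sit next to the script so that gen_fn can be imported from it (the push itself stays behind the script's __main__ guard):

from datasets import Dataset, Features, Value
from datasets import Image as ImageFeature
from coco_30k_hf_datasets import gen_fn  # importing also reads the CSV at module level

ds = Dataset.from_generator(
    gen_fn,
    features=Features(image=ImageFeature(), caption=Value("string")),
)
print(ds)                # 30,000 rows with "image" and "caption" columns
print(ds[0]["caption"])  # the caption string
print(ds[0]["image"])    # decoded into a PIL.Image.Image by the Image feature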

Once the dataset is pushed, it can be loaded with two lines of code using the 🤗 Datasets library:

from datasets import load_dataset 

dataset = load_dataset("sayakpaul/coco-30-val-2014", split="train")
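
Each example then contains the decoded image and its caption; for instance (a minimal usage sketch):

example = dataset[0]
print(example["caption"])     # the caption string
print(example["image"].size)  # "image" is a PIL.Image.Image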
The notebook referenced above (raw JSON):
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "149344cc-4db6-4e53-9984-8c46dc25ac75",
"metadata": {},
"outputs": [],
"source": [
"import glob\n",
"import pandas as pd\n",
"import json\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "54314881-dda9-465d-b2d7-ffd52074839f",
"metadata": {},
"outputs": [],
"source": [
"img_path = \"val2014\"\n",
"annotation_path = \"annotations/captions_val2014.json\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "3907ac67-63e6-49ba-aa6b-01abb072f6fa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"40504"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_images = glob.glob(f\"{img_path}/*.jpg\")\n",
"len(all_images)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "3a7e2c6d-caa2-482e-b877-269347f1ccf2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"40504"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"with open(annotation_path, \"r\") as f:\n",
" val_captions = json.load(f)\n",
"len(val_captions[\"images\"])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "5c89b3fd-6756-486b-86a2-4f14a61a60c6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'image_id': 203564, 'id': 37, 'caption': 'A bicycle replica with a clock as the front wheel.'}\n",
"{'license': 3, 'file_name': 'COCO_val2014_000000391895.jpg', 'coco_url': 'http://images.cocodataset.org/val2014/COCO_val2014_000000391895.jpg', 'height': 360, 'width': 640, 'date_captured': '2013-11-14 11:18:45', 'flickr_url': 'http://farm9.staticflickr.com/8186/8119368305_4e622c8349_z.jpg', 'id': 391895}\n"
]
}
],
"source": [
"print(val_captions[\"annotations\"][0])\n",
"print(val_captions[\"images\"][0])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "a2f44d49-8a21-4af2-bb92-f08113967c6d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"40504"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_dict = {}\n",
"\n",
"# Create a mapping of image IDs to captions\n",
"image_id_to_caption = {annotation[\"image_id\"]: annotation[\"caption\"] for annotation in val_captions[\"annotations\"]}\n",
"\n",
"# Build data_dict using the mapping\n",
"for image in val_captions[\"images\"]:\n",
" image_id = image[\"id\"]\n",
" if image_id in image_id_to_caption:\n",
" if os.path.exists(os.path.join(\"val2014\", image[\"file_name\"])):\n",
" data_dict[image[\"file_name\"]] = image_id_to_caption[image_id]\n",
"\n",
"len(data_dict)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "ec579dad-e469-4a2f-a044-b32d0588afaf",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>file_name</th>\n",
" <th>caption</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>COCO_val2014_000000391895.jpg</td>\n",
" <td>A man in a red shirt and a red hat is on a mot...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>COCO_val2014_000000522418.jpg</td>\n",
" <td>A woman marking a cake with the back of a chef...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>COCO_val2014_000000184613.jpg</td>\n",
" <td>A boy holding an umbrella while standing next ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>COCO_val2014_000000318219.jpg</td>\n",
" <td>a young kid with head phones on using a computer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>COCO_val2014_000000554625.jpg</td>\n",
" <td>A small child wearing headphones plays on the ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" file_name \\\n",
"0 COCO_val2014_000000391895.jpg \n",
"1 COCO_val2014_000000522418.jpg \n",
"2 COCO_val2014_000000184613.jpg \n",
"3 COCO_val2014_000000318219.jpg \n",
"4 COCO_val2014_000000554625.jpg \n",
"\n",
" caption \n",
"0 A man in a red shirt and a red hat is on a mot... \n",
"1 A woman marking a cake with the back of a chef... \n",
"2 A boy holding an umbrella while standing next ... \n",
"3 a young kid with head phones on using a computer \n",
"4 A small child wearing headphones plays on the ... "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_dict = {\"file_name\": list(data_dict.keys()), \"caption\": list(data_dict.values())}\n",
"\n",
"data_df = pd.DataFrame(df_dict)\n",
"data_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "29228eca-e224-4174-93cc-d9e3097623c1",
"metadata": {},
"outputs": [],
"source": [
"N = 30_000\n",
"data_df = data_df.sample(N, random_state=2024)\n",
"assert len(data_df) == N\n",
"data_df.to_csv(\"coco_30k_randomly_sampled_2014_val.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "ad559b00-3a46-4440-94dd-94ddb232fdac",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>file_name</th>\n",
" <th>caption</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>38425</th>\n",
" <td>COCO_val2014_000000054123.jpg</td>\n",
" <td>A group of zebras grazing in the grass.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2528</th>\n",
" <td>COCO_val2014_000000012897.jpg</td>\n",
" <td>a number of people standing around a large gro...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15743</th>\n",
" <td>COCO_val2014_000000408863.jpg</td>\n",
" <td>A yellow commuter train traveling past some ho...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33003</th>\n",
" <td>COCO_val2014_000000274931.jpg</td>\n",
" <td>An old fashioned oxitue worth old cars on stre...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22327</th>\n",
" <td>COCO_val2014_000000563267.jpg</td>\n",
" <td>Two men are in a building with brick walls.</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" file_name \\\n",
"38425 COCO_val2014_000000054123.jpg \n",
"2528 COCO_val2014_000000012897.jpg \n",
"15743 COCO_val2014_000000408863.jpg \n",
"33003 COCO_val2014_000000274931.jpg \n",
"22327 COCO_val2014_000000563267.jpg \n",
"\n",
" caption \n",
"38425 A group of zebras grazing in the grass. \n",
"2528 a number of people standing around a large gro... \n",
"15743 A yellow commuter train traveling past some ho... \n",
"33003 An old fashioned oxitue worth old cars on stre... \n",
"22327 Two men are in a building with brick walls. "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_df.head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}