@sayakpaul
Created January 31, 2024 10:34
Randomly samples 30k image-caption pairs from the COCO 2014 validation set.
from datasets import Dataset, Features
from datasets import Image as ImageFeature
from datasets import Value
import pandas as pd
import os

# CSV comes from the notebook above.
df = pd.read_csv("coco_30k_randomly_sampled_2014_val.csv")
root_path = "val2014"


def gen_fn():
    for i, row in df.iterrows():
        path = os.path.join(root_path, row["file_name"])
        caption = row["caption"]
        yield {"image": path, "caption": caption}


if __name__ == "__main__":
    ds = Dataset.from_generator(
        gen_fn,
        features=Features(image=ImageFeature(), caption=Value("string")),
    )
    ds_id = "sayakpaul/coco-30-val-2014"  # Change this.
    # To be able to push, you need to run `huggingface-cli login`.
    ds.push_to_hub(ds_id)

First, download the COCO 2014 val split and its annotations:

wget http://images.cocodataset.org/zips/val2014.zip
wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip
unzip val2014.zip
unzip annotations_trainval2014.zip
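
If extraction worked, the val2014 directory should contain 40,504 JPEGs (the same count the notebook below reports). A quick sanity check in Python:

import glob

# Expect 40504 files in the extracted val2014/ directory.
print(len(glob.glob("val2014/*.jpg")))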

Then run the notebook (included below) to get a CSV file containing 30k randomly selected image filenames and their captions. Then, optionally, run `python coco_30k_hf_datasets.py` to push the dataset to the HF Hub 🤗
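
Before pushing, it can help to build the dataset locally and inspect a row. Below is a minimal sketch, assuming the CSV and the val2014 directory sit next to the script so that gen_fn can be imported from it (the push itself stays behind the script's __main__ guard):

from datasets import Dataset, Features, Value
from datasets import Image as ImageFeature
from coco_30k_hf_datasets import gen_fn  # importing also reads the CSV at module level

ds = Dataset.from_generator(
    gen_fn,
    features=Features(image=ImageFeature(), caption=Value("string")),
)
print(ds)                # 30,000 rows with "image" and "caption" columns
print(ds[0]["caption"])  # the caption string
print(ds[0]["image"])    # decoded into a PIL.Image.Image by the Image feature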

Once the dataset is pushed, it can be loaded with two lines of code using the 🤗 Datasets library:

from datasets import load_dataset 

dataset = load_dataset("sayakpaul/coco-30-val-2014", split="train")
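
Each example then contains the decoded image and its caption; for instance (a minimal usage sketch):

example = dataset[0]
print(example["caption"])     # the caption string
print(example["image"].size)  # "image" is a PIL.Image.Image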
The notebook referenced above (raw JSON):
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "149344cc-4db6-4e53-9984-8c46dc25ac75",
"metadata": {},
"outputs": [],
"source": [
"import glob\n",
"import pandas as pd\n",
"import json\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "54314881-dda9-465d-b2d7-ffd52074839f",
"metadata": {},
"outputs": [],
"source": [
"img_path = \"val2014\"\n",
"annotation_path = \"annotations/captions_val2014.json\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "3907ac67-63e6-49ba-aa6b-01abb072f6fa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"40504"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_images = glob.glob(f\"{img_path}/*.jpg\")\n",
"len(all_images)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "3a7e2c6d-caa2-482e-b877-269347f1ccf2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"40504"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"with open(annotation_path, \"r\") as f:\n",
" val_captions = json.load(f)\n",
"len(val_captions[\"images\"])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "5c89b3fd-6756-486b-86a2-4f14a61a60c6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'image_id': 203564, 'id': 37, 'caption': 'A bicycle replica with a clock as the front wheel.'}\n",
"{'license': 3, 'file_name': 'COCO_val2014_000000391895.jpg', 'coco_url': 'http://images.cocodataset.org/val2014/COCO_val2014_000000391895.jpg', 'height': 360, 'width': 640, 'date_captured': '2013-11-14 11:18:45', 'flickr_url': 'http://farm9.staticflickr.com/8186/8119368305_4e622c8349_z.jpg', 'id': 391895}\n"
]
}
],
"source": [
"print(val_captions[\"annotations\"][0])\n",
"print(val_captions[\"images\"][0])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "a2f44d49-8a21-4af2-bb92-f08113967c6d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"40504"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_dict = {}\n",
"\n",
"# Create a mapping of image IDs to captions\n",
"image_id_to_caption = {annotation[\"image_id\"]: annotation[\"caption\"] for annotation in val_captions[\"annotations\"]}\n",
"\n",
"# Build data_dict using the mapping\n",
"for image in val_captions[\"images\"]:\n",
" image_id = image[\"id\"]\n",
" if image_id in image_id_to_caption:\n",
" if os.path.exists(os.path.join(\"val2014\", image[\"file_name\"])):\n",
" data_dict[image[\"file_name\"]] = image_id_to_caption[image_id]\n",
"\n",
"len(data_dict)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "ec579dad-e469-4a2f-a044-b32d0588afaf",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>file_name</th>\n",
" <th>caption</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>COCO_val2014_000000391895.jpg</td>\n",
" <td>A man in a red shirt and a red hat is on a mot...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>COCO_val2014_000000522418.jpg</td>\n",
" <td>A woman marking a cake with the back of a chef...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>COCO_val2014_000000184613.jpg</td>\n",
" <td>A boy holding an umbrella while standing next ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>COCO_val2014_000000318219.jpg</td>\n",
" <td>a young kid with head phones on using a computer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>COCO_val2014_000000554625.jpg</td>\n",
" <td>A small child wearing headphones plays on the ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" file_name \\\n",
"0 COCO_val2014_000000391895.jpg \n",
"1 COCO_val2014_000000522418.jpg \n",
"2 COCO_val2014_000000184613.jpg \n",
"3 COCO_val2014_000000318219.jpg \n",
"4 COCO_val2014_000000554625.jpg \n",
"\n",
" caption \n",
"0 A man in a red shirt and a red hat is on a mot... \n",
"1 A woman marking a cake with the back of a chef... \n",
"2 A boy holding an umbrella while standing next ... \n",
"3 a young kid with head phones on using a computer \n",
"4 A small child wearing headphones plays on the ... "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_dict = {\"file_name\": list(data_dict.keys()), \"caption\": list(data_dict.values())}\n",
"\n",
"data_df = pd.DataFrame(df_dict)\n",
"data_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "29228eca-e224-4174-93cc-d9e3097623c1",
"metadata": {},
"outputs": [],
"source": [
"N = 30_000\n",
"data_df = data_df.sample(N, random_state=2024)\n",
"assert len(data_df) == N\n",
"data_df.to_csv(\"coco_30k_randomly_sampled_2014_val.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "ad559b00-3a46-4440-94dd-94ddb232fdac",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>file_name</th>\n",
" <th>caption</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>38425</th>\n",
" <td>COCO_val2014_000000054123.jpg</td>\n",
" <td>A group of zebras grazing in the grass.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2528</th>\n",
" <td>COCO_val2014_000000012897.jpg</td>\n",
" <td>a number of people standing around a large gro...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15743</th>\n",
" <td>COCO_val2014_000000408863.jpg</td>\n",
" <td>A yellow commuter train traveling past some ho...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33003</th>\n",
" <td>COCO_val2014_000000274931.jpg</td>\n",
" <td>An old fashioned oxitue worth old cars on stre...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22327</th>\n",
" <td>COCO_val2014_000000563267.jpg</td>\n",
" <td>Two men are in a building with brick walls.</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" file_name \\\n",
"38425 COCO_val2014_000000054123.jpg \n",
"2528 COCO_val2014_000000012897.jpg \n",
"15743 COCO_val2014_000000408863.jpg \n",
"33003 COCO_val2014_000000274931.jpg \n",
"22327 COCO_val2014_000000563267.jpg \n",
"\n",
" caption \n",
"38425 A group of zebras grazing in the grass. \n",
"2528 a number of people standing around a large gro... \n",
"15743 A yellow commuter train traveling past some ho... \n",
"33003 An old fashioned oxitue worth old cars on stre... \n",
"22327 Two men are in a building with brick walls. "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_df.head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}