{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "01097ef1",
"metadata": {},
"outputs": [],
"source": [
"# !pip install -q transformers datasets tensorflow pyarrow matplotlib -q "
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "cdd2b176",
"metadata": {},
"outputs": [],
"source": [
"!wget -q https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz -O flower_photos.tgz\n",
"!tar xf flower_photos.tgz"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "252a0988",
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import tensorflow as tf\n",
"import glob\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "74f5bc6f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(['flower_photos/roses/16209331331_343c899d38.jpg',\n",
" 'flower_photos/roses/5777669976_a205f61e5b.jpg',\n",
" 'flower_photos/roses/4860145119_b1c3cbaa4e_n.jpg',\n",
" 'flower_photos/roses/15011625580_7974c44bce.jpg',\n",
" 'flower_photos/roses/17953368844_be3d18cf30_m.jpg'],\n",
" 3670)"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"image_paths = glob.glob(\"flower_photos/*/*.jpg\")\n",
"image_paths[:5], len(image_paths)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "d9d885b5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['roses', 'roses', 'roses', 'roses', 'roses']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_labels = list(map(lambda x: x.split(\"/\")[1], image_paths))\n",
"all_labels[:5]"
]
},
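{
"cell_type": "markdown",
"id": "7f3a91c2",
"metadata": {},
"source": [
"The label is just the parent directory name. Splitting on \"/\" assumes POSIX-style paths; the next cell is a small, hypothetical pathlib variant (not part of the original run) that stays platform-independent."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d4b02e5",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical alternative: take the label from the parent directory name with\n",
"# pathlib instead of splitting on \"/\" (also handles Windows path separators).\n",
"from pathlib import Path\n",
"\n",
"all_labels_alt = [Path(p).parent.name for p in image_paths]\n",
"all_labels_alt[:5]"
]
},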
{
"cell_type": "code",
"execution_count": 4,
"id": "46ea492d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'daisy': 0, 'dandelion': 1, 'roses': 2, 'sunflowers': 3, 'tulips': 4}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"unique_labels = sorted(set(all_labels))\n",
"label2_id = {label: idx for idx, label in enumerate(unique_labels)}\n",
"label2_id"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "5face38d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[2, 2, 2, 2, 2]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_integer_labels = list(map(lambda x: label2_id.get(x), all_labels))\n",
"all_integer_labels[:5]\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "9266df28",
"metadata": {},
"outputs": [],
"source": [
"from pyarrow.feather import write_feather\n",
"import pyarrow as pa\n",
"import tqdm\n",
"import math"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e1537a28",
"metadata": {},
"outputs": [],
"source": [
"batch_size = 1000\n",
"chunk_size = 1000"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "4815637c",
"metadata": {},
"outputs": [],
"source": [
"# pa_type = pa.struct({\"path\": pa.string(), \"bytes\": pa.binary(), \"labels\": pa.int8()})"
]
},
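{
"cell_type": "markdown",
"id": "3c9e7a10",
"metadata": {},
"source": [
"The commented-out pa.struct above hints at packing path, bytes, and label into a single struct-typed column. The next cell is only a hypothetical sketch of what that could look like with pyarrow; the rest of the notebook keeps the separate data/labels columns."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a6f2d481",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch only: pack path, bytes, and label into one struct-typed\n",
"# Arrow column, as the commented-out pa_type above hints at. Not used below.\n",
"pa_type = pa.struct({\"path\": pa.string(), \"bytes\": pa.binary(), \"labels\": pa.int8()})\n",
"\n",
"example_rows = [\n",
"    {\n",
"        \"path\": image_paths[0],\n",
"        \"bytes\": open(image_paths[0], \"rb\").read(),\n",
"        \"labels\": all_integer_labels[0],\n",
"    }\n",
"]\n",
"struct_array = pa.array(example_rows, type=pa_type)\n",
"pa.table({\"image\": struct_array}).schema"
]
},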
{
"cell_type": "code",
"execution_count": 8,
"id": "be22173e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/z_/d29z43w90kz6f4kbzv5c9m9r0000gn/T/ipykernel_64363/1007705059.py:8: TqdmDeprecationWarning: Please use `tqdm.notebook.trange` instead of `tqdm.tnrange`\n",
" for step in tqdm.tnrange(int(math.ceil(len(image_paths) / batch_size))):\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e3f1dd16582a44949dcefff38c0ff2e0",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/4 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total images written: 1000.\n",
"Total images written: 2000.\n",
"Total images written: 3000.\n",
"Total images written: 3670.\n"
]
}
],
"source": [
"def read_image(path):\n",
" with open(path, \"rb\") as f:\n",
" return f.read()\n",
"\n",
"\n",
"total_images_written = 0\n",
"\n",
"for step in tqdm.tnrange(int(math.ceil(len(image_paths) / batch_size))):\n",
" batch_image_paths = image_paths[step * batch_size : (step + 1) * batch_size]\n",
" batch_image_labels = all_integer_labels[step * batch_size : (step + 1) * batch_size]\n",
"\n",
" data = [read_image(path) for path in batch_image_paths]\n",
" table = pa.Table.from_arrays([data, batch_image_labels], [\"data\", \"labels\"])\n",
" write_feather(table, f\"/tmp/flowers_feather_{step}.feather\", chunksize=chunk_size)\n",
" total_images_written += len(batch_image_paths)\n",
" print(f\"Total images written: {total_images_written}.\")\n",
"\n",
" del data"
]
},
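{
"cell_type": "markdown",
"id": "5b8c3e97",
"metadata": {},
"source": [
"As a hypothetical sanity check (not executed in the original run), one shard can be read back with pyarrow.feather.read_table and its first image decoded with TensorFlow to confirm the bytes round-trip."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4e0b9f7",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sanity check (not part of the original run): read one shard back\n",
"# and decode the first image to confirm the bytes round-trip correctly.\n",
"from pyarrow.feather import read_table\n",
"\n",
"shard = read_table(\"/tmp/flowers_feather_0.feather\")\n",
"print(shard.schema)\n",
"print(f\"Rows in shard: {shard.num_rows}\")\n",
"\n",
"first_bytes = shard.column(\"data\")[0].as_py()\n",
"first_label = shard.column(\"labels\")[0].as_py()\n",
"image = tf.io.decode_jpeg(first_bytes, channels=3)\n",
"print(image.shape, first_label)"
]
},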
{
"cell_type": "code",
"execution_count": 9,
"id": "4a2f1b5a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-r--r-- 1 sayakpaul wheel 64M Aug 2 09:22 /tmp/flowers_feather_0.feather\r\n",
"-rw-r--r-- 1 sayakpaul wheel 59M Aug 2 09:22 /tmp/flowers_feather_1.feather\r\n",
"-rw-r--r-- 1 sayakpaul wheel 51M Aug 2 09:22 /tmp/flowers_feather_2.feather\r\n",
"-rw-r--r-- 1 sayakpaul wheel 45M Aug 2 09:22 /tmp/flowers_feather_3.feather\r\n"
]
}
],
"source": [
"ls -lh /tmp/*.feather"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "c7673ea6",
"metadata": {},
"outputs": [],
"source": [
"# !pip install git+https://github.com/yongtang/io@feather -q --no-deps"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "16478d1d",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/sayakpaul/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/__init__.py:98: UserWarning: unable to load libtensorflow_io_plugins.so: unable to open file: libtensorflow_io_plugins.so, from paths: ['/Users/sayakpaul/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so']\n",
"caused by: [\"[Errno 2] The file to load file system plugin from does not exist.: '/Users/sayakpaul/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so'\"]\n",
" warnings.warn(f\"unable to load libtensorflow_io_plugins.so: {e}\")\n",
"/Users/sayakpaul/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/__init__.py:104: UserWarning: file system plugins are not loaded: unable to open file: libtensorflow_io.so, from paths: ['/Users/sayakpaul/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/libtensorflow_io.so']\n",
"caused by: ['dlopen(/Users/sayakpaul/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/libtensorflow_io.so, 6): image not found']\n",
" warnings.warn(f\"file system plugins are not loaded: {e}\")\n"
]
}
],
"source": [
"import tensorflow_io.arrow as arrow_io"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "449ae824",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-08-02 09:39:46.949147: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
]
},
{
"ename": "NotImplementedError",
"evalue": "unable to open file: libtensorflow_io.so, from paths: ['/Users/sayakpaul/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/libtensorflow_io.so']\ncaused by: ['dlopen(/Users/sayakpaul/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/libtensorflow_io.so, 6): image not found']",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)",
"Input \u001b[0;32mIn [14]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m dataset \u001b[38;5;241m=\u001b[39m \u001b[43marrow_io\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mArrowFeatherDataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/tmp/flowers_feather_0.feather\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_types\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstring\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mint64\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_shapes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m32\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/arrow_dataset_ops.py:375\u001b[0m, in \u001b[0;36mArrowFeatherDataset.__init__\u001b[0;34m(self, filenames, columns, output_types, output_shapes, batch_size, batch_mode)\u001b[0m\n\u001b[1;32m 353\u001b[0m \u001b[38;5;124;03m\"\"\"Create an ArrowDataset from one or more Feather file names.\u001b[39;00m\n\u001b[1;32m 354\u001b[0m \n\u001b[1;32m 355\u001b[0m \u001b[38;5;124;03mArgs:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 369\u001b[0m \u001b[38;5;124;03m \"auto\" (size to number of records in Arrow record batch)\u001b[39;00m\n\u001b[1;32m 370\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 371\u001b[0m filenames \u001b[38;5;241m=\u001b[39m tf\u001b[38;5;241m.\u001b[39mconvert_to_tensor(\n\u001b[1;32m 372\u001b[0m filenames, dtype\u001b[38;5;241m=\u001b[39mdtypes\u001b[38;5;241m.\u001b[39mstring, name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfilenames\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 373\u001b[0m )\n\u001b[1;32m 374\u001b[0m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m(\n\u001b[0;32m--> 375\u001b[0m partial(\u001b[43mcore_ops\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mio_arrow_feather_dataset\u001b[49m, filenames),\n\u001b[1;32m 376\u001b[0m columns,\n\u001b[1;32m 377\u001b[0m output_types,\n\u001b[1;32m 378\u001b[0m output_shapes,\n\u001b[1;32m 379\u001b[0m batch_size,\n\u001b[1;32m 380\u001b[0m batch_mode,\n\u001b[1;32m 381\u001b[0m )\n",
"File \u001b[0;32m~/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/__init__.py:88\u001b[0m, in \u001b[0;36mLazyLoader.__getattr__\u001b[0;34m(self, attrb)\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__getattr__\u001b[39m(\u001b[38;5;28mself\u001b[39m, attrb):\n\u001b[0;32m---> 88\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m, attrb)\n",
"File \u001b[0;32m~/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/__init__.py:84\u001b[0m, in \u001b[0;36mLazyLoader._load\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_load\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 83\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_mod \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m---> 84\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_mod \u001b[38;5;241m=\u001b[39m \u001b[43m_load_library\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_library\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 85\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_mod\n",
"File \u001b[0;32m~/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/__init__.py:69\u001b[0m, in \u001b[0;36m_load_library\u001b[0;34m(filename, lib)\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (tf\u001b[38;5;241m.\u001b[39merrors\u001b[38;5;241m.\u001b[39mNotFoundError, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 68\u001b[0m errs\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;28mstr\u001b[39m(e))\n\u001b[0;32m---> 69\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\n\u001b[1;32m 70\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124munable to open file: \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 71\u001b[0m \u001b[38;5;241m+\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfilename\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, from paths: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfilenames\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mcaused by: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00merrs\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 72\u001b[0m )\n",
"\u001b[0;31mNotImplementedError\u001b[0m: unable to open file: libtensorflow_io.so, from paths: ['/Users/sayakpaul/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/libtensorflow_io.so']\ncaused by: ['dlopen(/Users/sayakpaul/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/libtensorflow_io.so, 6): image not found']"
]
}
],
"source": [
"dataset = arrow_io.ArrowFeatherDataset(\n",
" [\"/tmp/flowers_feather_0.feather\"],\n",
" columns=(0, 1),\n",
" output_types=(tf.string, tf.int64),\n",
" output_shapes=([], []),\n",
" batch_size=32,\n",
")"
]
}
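,
{
"cell_type": "markdown",
"id": "c1d0f6a4",
"metadata": {},
"source": [
"tensorflow-io could not load its native library on this machine, so the ArrowFeatherDataset call above fails. The next cell is a hedged fallback sketch (assuming a single shard fits in memory) that reads a Feather shard with pyarrow and builds an equivalent tf.data pipeline."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e92b75d3",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical fallback (tensorflow-io could not load its native library here):\n",
"# read a Feather shard with pyarrow and feed it to tf.data directly.\n",
"# Assumes a single shard comfortably fits in memory.\n",
"from pyarrow.feather import read_table\n",
"\n",
"\n",
"def decode(image_bytes, label):\n",
"    image = tf.io.decode_jpeg(image_bytes, channels=3)\n",
"    image = tf.image.resize(image, (224, 224))\n",
"    return image, label\n",
"\n",
"\n",
"shard = read_table(\"/tmp/flowers_feather_0.feather\")\n",
"fallback_ds = (\n",
"    tf.data.Dataset.from_tensor_slices(\n",
"        (shard.column(\"data\").to_pylist(), shard.column(\"labels\").to_pylist())\n",
"    )\n",
"    .map(decode, num_parallel_calls=tf.data.AUTOTUNE)\n",
"    .batch(32)\n",
"    .prefetch(tf.data.AUTOTUNE)\n",
")"
]
}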
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}