Created
August 2, 2022 04:18
-
-
Save sayakpaul/108e1e05ad60c9bcc38edc5dd84801b3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "01097ef1", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# !pip install -q transformers datasets tensorflow pyarrow matplotlib -q " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "cdd2b176", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"!wget -q https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz -O flower_photos.tgz\n", | |
"!tar xf flower_photos.tgz" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "252a0988", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import matplotlib.pyplot as plt\n", | |
"import tensorflow as tf\n", | |
"import glob\n", | |
"import os" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "74f5bc6f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(['flower_photos/roses/16209331331_343c899d38.jpg',\n", | |
" 'flower_photos/roses/5777669976_a205f61e5b.jpg',\n", | |
" 'flower_photos/roses/4860145119_b1c3cbaa4e_n.jpg',\n", | |
" 'flower_photos/roses/15011625580_7974c44bce.jpg',\n", | |
" 'flower_photos/roses/17953368844_be3d18cf30_m.jpg'],\n", | |
" 3670)" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"image_paths = glob.glob(\"flower_photos/*/*.jpg\")\n", | |
"image_paths[:5], len(image_paths)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "d9d885b5", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['roses', 'roses', 'roses', 'roses', 'roses']" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"all_labels = list(map(lambda x: x.split(\"/\")[1], image_paths))\n", | |
"all_labels[:5]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "46ea492d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'daisy': 0, 'dandelion': 1, 'roses': 2, 'sunflowers': 3, 'tulips': 4}" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"unique_labels = sorted(set(all_labels))\n", | |
"label2_id = {label: idx for idx, label in enumerate(unique_labels)}\n", | |
"label2_id" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "5face38d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[2, 2, 2, 2, 2]" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"all_integer_labels = list(map(lambda x: label2_id.get(x), all_labels))\n", | |
"all_integer_labels[:5]\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "9266df28", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from pyarrow.feather import write_feather\n", | |
"import pyarrow as pa\n", | |
"import tqdm\n", | |
"import math" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "e1537a28", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"batch_size = 1000\n", | |
"chunk_size = 1000" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "4815637c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# pa_type = pa.struct({\"path\": pa.string(), \"bytes\": pa.binary(), \"labels\": pa.int8()})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "be22173e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/var/folders/z_/d29z43w90kz6f4kbzv5c9m9r0000gn/T/ipykernel_64363/1007705059.py:8: TqdmDeprecationWarning: Please use `tqdm.notebook.trange` instead of `tqdm.tnrange`\n", | |
" for step in tqdm.tnrange(int(math.ceil(len(image_paths) / batch_size))):\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "e3f1dd16582a44949dcefff38c0ff2e0", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
" 0%| | 0/4 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Total images written: 1000.\n", | |
"Total images written: 2000.\n", | |
"Total images written: 3000.\n", | |
"Total images written: 3670.\n" | |
] | |
} | |
], | |
"source": [ | |
"def read_image(path):\n", | |
" with open(path, \"rb\") as f:\n", | |
" return f.read()\n", | |
"\n", | |
"\n", | |
"total_images_written = 0\n", | |
"\n", | |
"for step in tqdm.tnrange(int(math.ceil(len(image_paths) / batch_size))):\n", | |
" batch_image_paths = image_paths[step * batch_size : (step + 1) * batch_size]\n", | |
" batch_image_labels = all_integer_labels[step * batch_size : (step + 1) * batch_size]\n", | |
"\n", | |
" data = [read_image(path) for path in batch_image_paths]\n", | |
" table = pa.Table.from_arrays([data, batch_image_labels], [\"data\", \"labels\"])\n", | |
" write_feather(table, f\"/tmp/flowers_feather_{step}.feather\", chunksize=chunk_size)\n", | |
" total_images_written += len(batch_image_paths)\n", | |
" print(f\"Total images written: {total_images_written}.\")\n", | |
"\n", | |
" del data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "4a2f1b5a", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"-rw-r--r-- 1 sayakpaul wheel 64M Aug 2 09:22 /tmp/flowers_feather_0.feather\r\n", | |
"-rw-r--r-- 1 sayakpaul wheel 59M Aug 2 09:22 /tmp/flowers_feather_1.feather\r\n", | |
"-rw-r--r-- 1 sayakpaul wheel 51M Aug 2 09:22 /tmp/flowers_feather_2.feather\r\n", | |
"-rw-r--r-- 1 sayakpaul wheel 45M Aug 2 09:22 /tmp/flowers_feather_3.feather\r\n" | |
] | |
} | |
], | |
"source": [ | |
"ls -lh /tmp/*.feather" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "c7673ea6", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# !pip install git+https://github.com/yongtang/io@feather -q --no-deps" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "16478d1d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/Users/sayakpaul/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/__init__.py:98: UserWarning: unable to load libtensorflow_io_plugins.so: unable to open file: libtensorflow_io_plugins.so, from paths: ['/Users/sayakpaul/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so']\n", | |
"caused by: [\"[Errno 2] The file to load file system plugin from does not exist.: '/Users/sayakpaul/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so'\"]\n", | |
" warnings.warn(f\"unable to load libtensorflow_io_plugins.so: {e}\")\n", | |
"/Users/sayakpaul/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/__init__.py:104: UserWarning: file system plugins are not loaded: unable to open file: libtensorflow_io.so, from paths: ['/Users/sayakpaul/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/libtensorflow_io.so']\n", | |
"caused by: ['dlopen(/Users/sayakpaul/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/libtensorflow_io.so, 6): image not found']\n", | |
" warnings.warn(f\"file system plugins are not loaded: {e}\")\n" | |
] | |
} | |
], | |
"source": [ | |
"import tensorflow_io.arrow as arrow_io" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "449ae824", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2022-08-02 09:39:46.949147: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", | |
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" | |
] | |
}, | |
{ | |
"ename": "NotImplementedError", | |
"evalue": "unable to open file: libtensorflow_io.so, from paths: ['/Users/sayakpaul/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/libtensorflow_io.so']\ncaused by: ['dlopen(/Users/sayakpaul/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/libtensorflow_io.so, 6): image not found']", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)", | |
"Input \u001b[0;32mIn [14]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m dataset \u001b[38;5;241m=\u001b[39m \u001b[43marrow_io\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mArrowFeatherDataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/tmp/flowers_feather_0.feather\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_types\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstring\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mint64\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_shapes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m32\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/arrow_dataset_ops.py:375\u001b[0m, in \u001b[0;36mArrowFeatherDataset.__init__\u001b[0;34m(self, filenames, columns, output_types, output_shapes, batch_size, batch_mode)\u001b[0m\n\u001b[1;32m 353\u001b[0m \u001b[38;5;124;03m\"\"\"Create an ArrowDataset from one or more Feather file names.\u001b[39;00m\n\u001b[1;32m 354\u001b[0m \n\u001b[1;32m 355\u001b[0m \u001b[38;5;124;03mArgs:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 369\u001b[0m \u001b[38;5;124;03m \"auto\" (size to number of records in Arrow record batch)\u001b[39;00m\n\u001b[1;32m 370\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 371\u001b[0m filenames \u001b[38;5;241m=\u001b[39m tf\u001b[38;5;241m.\u001b[39mconvert_to_tensor(\n\u001b[1;32m 372\u001b[0m filenames, dtype\u001b[38;5;241m=\u001b[39mdtypes\u001b[38;5;241m.\u001b[39mstring, name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfilenames\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 373\u001b[0m )\n\u001b[1;32m 374\u001b[0m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m(\n\u001b[0;32m--> 375\u001b[0m partial(\u001b[43mcore_ops\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mio_arrow_feather_dataset\u001b[49m, filenames),\n\u001b[1;32m 376\u001b[0m columns,\n\u001b[1;32m 377\u001b[0m output_types,\n\u001b[1;32m 378\u001b[0m output_shapes,\n\u001b[1;32m 379\u001b[0m batch_size,\n\u001b[1;32m 380\u001b[0m batch_mode,\n\u001b[1;32m 381\u001b[0m )\n", | |
"File \u001b[0;32m~/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/__init__.py:88\u001b[0m, in \u001b[0;36mLazyLoader.__getattr__\u001b[0;34m(self, attrb)\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__getattr__\u001b[39m(\u001b[38;5;28mself\u001b[39m, attrb):\n\u001b[0;32m---> 88\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m, attrb)\n", | |
"File \u001b[0;32m~/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/__init__.py:84\u001b[0m, in \u001b[0;36mLazyLoader._load\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_load\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 83\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_mod \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m---> 84\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_mod \u001b[38;5;241m=\u001b[39m \u001b[43m_load_library\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_library\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 85\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_mod\n", | |
"File \u001b[0;32m~/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/__init__.py:69\u001b[0m, in \u001b[0;36m_load_library\u001b[0;34m(filename, lib)\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (tf\u001b[38;5;241m.\u001b[39merrors\u001b[38;5;241m.\u001b[39mNotFoundError, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 68\u001b[0m errs\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;28mstr\u001b[39m(e))\n\u001b[0;32m---> 69\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\n\u001b[1;32m 70\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124munable to open file: \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 71\u001b[0m \u001b[38;5;241m+\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfilename\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, from paths: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfilenames\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mcaused by: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00merrs\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 72\u001b[0m )\n", | |
"\u001b[0;31mNotImplementedError\u001b[0m: unable to open file: libtensorflow_io.so, from paths: ['/Users/sayakpaul/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/libtensorflow_io.so']\ncaused by: ['dlopen(/Users/sayakpaul/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow_io/python/ops/libtensorflow_io.so, 6): image not found']" | |
] | |
} | |
], | |
"source": [ | |
"dataset = arrow_io.ArrowFeatherDataset(\n", | |
" [\"/tmp/flowers_feather_0.feather\"],\n", | |
" columns=(0, 1),\n", | |
" output_types=(tf.string, tf.int64),\n", | |
" output_shapes=([], []),\n", | |
" batch_size=32,\n", | |
")" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment