Created
August 3, 2022 17:28
-
-
Save sayakpaul/2299142d23bdacd359825d040ed75332 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "01097ef1", | |
"metadata": {}, | |
"outputs": [], | |
"source": [
    "# %pip (not !pip) installs into the running kernel's environment.\n",
    "# %pip install -q transformers datasets tensorflow pyarrow matplotlib tensorflow-io-nightly"
   ]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "cdd2b176", | |
"metadata": {}, | |
"outputs": [], | |
"source": [
    "# Download and extract the TF flowers dataset (3670 images) into ./flower_photos/.\n",
    "!wget -q https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz -O flower_photos.tgz\n",
    "!tar xf flower_photos.tgz"
   ]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "252a0988", | |
"metadata": {}, | |
"outputs": [], | |
"source": [
    "# Standard library first, then third-party.\n",
    "import glob\n",
    "import os\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import tensorflow as tf"
   ]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "74f5bc6f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(['flower_photos/roses/16209331331_343c899d38.jpg',\n", | |
" 'flower_photos/roses/5777669976_a205f61e5b.jpg',\n", | |
" 'flower_photos/roses/4860145119_b1c3cbaa4e_n.jpg',\n", | |
" 'flower_photos/roses/15011625580_7974c44bce.jpg',\n", | |
" 'flower_photos/roses/17953368844_be3d18cf30_m.jpg'],\n", | |
" 3670)" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [
    "# Build the pattern with os.path.join so it is portable across path separators.\n",
    "image_paths = glob.glob(os.path.join(\"flower_photos\", \"*\", \"*.jpg\"))\n",
    "image_paths[:5], len(image_paths)"
   ]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "d9d885b5", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['roses', 'roses', 'roses', 'roses', 'roses']" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [
    "# The label is the image's parent directory name. Using os.path instead of\n",
    "# x.split(\"/\")[1] avoids hard-coding the separator and the path depth.\n",
    "all_labels = [os.path.basename(os.path.dirname(path)) for path in image_paths]\n",
    "all_labels[:5]"
   ]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "46ea492d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'daisy': 0, 'dandelion': 1, 'roses': 2, 'sunflowers': 3, 'tulips': 4}" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [
    "# Sorted unique labels -> stable label-to-integer mapping.\n",
    "unique_labels = sorted(set(all_labels))\n",
    "label2_id = dict(zip(unique_labels, range(len(unique_labels))))\n",
    "label2_id"
   ]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "5face38d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[2, 2, 2, 2, 2]" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [
    "# Index directly (not .get) so an unknown label raises KeyError immediately\n",
    "# instead of silently producing None.\n",
    "all_integer_labels = [label2_id[label] for label in all_labels]\n",
    "all_integer_labels[:5]"
   ]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "9266df28", | |
"metadata": {}, | |
"outputs": [], | |
"source": [
    "# Standard library first, then third-party.\n",
    "import math\n",
    "\n",
    "import pyarrow as pa\n",
    "import tqdm\n",
    "from pyarrow.feather import write_feather"
   ]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "e1537a28", | |
"metadata": {}, | |
"outputs": [], | |
"source": [
    "# batch_size: images per feather shard; chunk_size: Arrow chunk size inside a shard.\n",
    "batch_size = 1000\n",
    "chunk_size = 1000"
   ]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "4815637c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [
    "# Unused: explicit Arrow schema kept for reference only.\n",
    "# pa_type = pa.struct({\"path\": pa.string(), \"bytes\": pa.binary(), \"labels\": pa.int8()})"
   ]
}, | |
{
   "cell_type": "code",
   "execution_count": 9,
   "id": "be22173e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total images written: 1000.\n",
      "Total images written: 2000.\n",
      "Total images written: 3000.\n",
      "Total images written: 3670.\n"
     ]
    }
   ],
   "source": [
    "def read_image(path):\n",
    "    \"\"\"Return the raw bytes of the image file at `path`.\"\"\"\n",
    "    with open(path, \"rb\") as f:\n",
    "        return f.read()\n",
    "\n",
    "\n",
    "total_images_written = 0\n",
    "# math.ceil already returns an int in Python 3; no int() wrapper needed.\n",
    "num_batches = math.ceil(len(image_paths) / batch_size)\n",
    "\n",
    "# tqdm.trange replaces the deprecated tqdm.tnrange (see TqdmDeprecationWarning).\n",
    "for step in tqdm.trange(num_batches):\n",
    "    batch_image_paths = image_paths[step * batch_size : (step + 1) * batch_size]\n",
    "    batch_image_labels = all_integer_labels[step * batch_size : (step + 1) * batch_size]\n",
    "\n",
    "    # NOTE: shards are written to /tmp; later cells read these exact paths.\n",
    "    data = [read_image(path) for path in batch_image_paths]\n",
    "    table = pa.Table.from_arrays([data, batch_image_labels], [\"data\", \"labels\"])\n",
    "    write_feather(table, f\"/tmp/flowers_feather_{step}.feather\", chunksize=chunk_size)\n",
    "    total_images_written += len(batch_image_paths)\n",
    "    print(f\"Total images written: {total_images_written}.\")\n",
    "\n",
    "    # Free the raw image bytes before loading the next batch.\n",
    "    del data"
   ]
  },
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "4a2f1b5a", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"-rw-r--r-- 1 sayakpaul wheel 64M Aug 3 22:50 /tmp/flowers_feather_0.feather\r\n", | |
"-rw-r--r-- 1 sayakpaul wheel 59M Aug 3 22:50 /tmp/flowers_feather_1.feather\r\n", | |
"-rw-r--r-- 1 sayakpaul wheel 51M Aug 3 22:50 /tmp/flowers_feather_2.feather\r\n", | |
"-rw-r--r-- 1 sayakpaul wheel 45M Aug 3 22:50 /tmp/flowers_feather_3.feather\r\n" | |
] | |
} | |
], | |
"source": [
    "# Explicit shell escape; the bare `ls` relied on IPython automagic.\n",
    "!ls -lh /tmp/*.feather"
   ]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "16478d1d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"2.9.1\n", | |
"0.26.0\n" | |
] | |
} | |
], | |
"source": [
    "import tensorflow_io as tfio\n",
    "import tensorflow_io.arrow as arrow_io\n",
    "\n",
    "# Record the TF / TFIO versions this notebook was run against.\n",
    "print(tf.__version__)\n",
    "print(tfio.__version__)"
   ]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "449ae824", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2022-08-03 22:52:18.995564: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", | |
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", | |
"2022-08-03 22:52:19.444750: I tensorflow_io/core/kernels/cpu_check.cc:128] Your CPU supports instructions that this TensorFlow IO binary was not compiled to use: AVX2 FMA\n", | |
"2022-08-03 22:52:19.567800: E tensorflow/core/framework/dataset.cc:580] UNIMPLEMENTED: Cannot compute input sources for dataset of type IO>ArrowFeatherDataset, because the dataset does not implement `InputDatasets`.\n", | |
"2022-08-03 22:52:19.567818: E tensorflow/core/framework/dataset.cc:584] UNIMPLEMENTED: Cannot merge options for dataset of type IO>ArrowFeatherDataset, because the dataset does not implement `InputDatasets`.\n" | |
] | |
} | |
], | |
"source": [
    "# Stream the first feather shard into tf.data: column 0 is the image bytes\n",
    "# (tf.string), column 1 the integer label (tf.int64), batched 32 at a time.\n",
    "dataset = arrow_io.ArrowFeatherDataset(\n",
    "    [\"/tmp/flowers_feather_0.feather\"],\n",
    "    columns=(0, 1),\n",
    "    output_types=(tf.string, tf.int64),\n",
    "    output_shapes=([], []),\n",
    "    batch_size=32,\n",
    ")"
   ]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "155f4331", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))\n" | |
] | |
} | |
], | |
"source": [
    "print(dataset.element_spec)"
   ]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "652c2e85", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2022-08-03 22:52:36.086910: E tensorflow/core/framework/dataset.cc:580] UNIMPLEMENTED: Cannot compute input sources for dataset of type IO>ArrowFeatherDataset, because the dataset does not implement `InputDatasets`.\n", | |
"2022-08-03 22:52:36.086938: E tensorflow/core/framework/dataset.cc:584] UNIMPLEMENTED: Cannot merge options for dataset of type IO>ArrowFeatherDataset, because the dataset does not implement `InputDatasets`.\n", | |
"2022-08-03 22:52:36.087371: E tensorflow/core/framework/dataset.cc:580] FAILED_PRECONDITION: Cannot compute input sources for dataset of type RootDataset, because sources could not be computed for input dataset of type IO>ArrowFeatherDataset\n" | |
] | |
}, | |
{ | |
"ename": "InternalError", | |
"evalue": "Invalid: INVALID_ARGUMENT: arrow data type 0x7fc728a5d3d8 is not supported: Type error: Arrow data type is not supported [Op:IteratorGetNext]", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mInternalError\u001b[0m Traceback (most recent call last)", | |
"Input \u001b[0;32mIn [15]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m sample \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43miter\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow/python/data/ops/iterator_ops.py:766\u001b[0m, in \u001b[0;36mOwnedIterator.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 764\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__next__\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 765\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 766\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_next_internal\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 767\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m errors\u001b[38;5;241m.\u001b[39mOutOfRangeError:\n\u001b[1;32m 768\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m\n", | |
"File \u001b[0;32m~/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow/python/data/ops/iterator_ops.py:749\u001b[0m, in \u001b[0;36mOwnedIterator._next_internal\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 746\u001b[0m \u001b[38;5;66;03m# TODO(b/77291417): This runs in sync mode as iterators use an error status\u001b[39;00m\n\u001b[1;32m 747\u001b[0m \u001b[38;5;66;03m# to communicate that there is no more data to iterate over.\u001b[39;00m\n\u001b[1;32m 748\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context\u001b[38;5;241m.\u001b[39mexecution_mode(context\u001b[38;5;241m.\u001b[39mSYNC):\n\u001b[0;32m--> 749\u001b[0m ret \u001b[38;5;241m=\u001b[39m \u001b[43mgen_dataset_ops\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43miterator_get_next\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 750\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_iterator_resource\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 751\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_types\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_flat_output_types\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 752\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_shapes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_flat_output_shapes\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 754\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 755\u001b[0m \u001b[38;5;66;03m# Fast path for the case `self._structure` is not a nested structure.\u001b[39;00m\n\u001b[1;32m 756\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_element_spec\u001b[38;5;241m.\u001b[39m_from_compatible_tensor_list(ret) \u001b[38;5;66;03m# pylint: disable=protected-access\u001b[39;00m\n", | |
"File \u001b[0;32m~/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow/python/ops/gen_dataset_ops.py:3017\u001b[0m, in \u001b[0;36miterator_get_next\u001b[0;34m(iterator, output_types, output_shapes, name)\u001b[0m\n\u001b[1;32m 3015\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _result\n\u001b[1;32m 3016\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m _core\u001b[38;5;241m.\u001b[39m_NotOkStatusException \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m-> 3017\u001b[0m \u001b[43m_ops\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mraise_from_not_ok_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43me\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3018\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m _core\u001b[38;5;241m.\u001b[39m_FallbackException:\n\u001b[1;32m 3019\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n", | |
"File \u001b[0;32m~/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow/python/framework/ops.py:7164\u001b[0m, in \u001b[0;36mraise_from_not_ok_status\u001b[0;34m(e, name)\u001b[0m\n\u001b[1;32m 7162\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mraise_from_not_ok_status\u001b[39m(e, name):\n\u001b[1;32m 7163\u001b[0m e\u001b[38;5;241m.\u001b[39mmessage \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m name: \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m name \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m-> 7164\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m core\u001b[38;5;241m.\u001b[39m_status_to_exception(e) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28mNone\u001b[39m\n", | |
"\u001b[0;31mInternalError\u001b[0m: Invalid: INVALID_ARGUMENT: arrow data type 0x7fc728a5d3d8 is not supported: Type error: Arrow data type is not supported [Op:IteratorGetNext]" | |
] | |
} | |
], | |
"source": [
    "# This currently fails: per the traceback above, ArrowFeatherDataset cannot\n",
    "# decode the binary (image bytes) column. The failure is the point of this\n",
    "# demo, so catch and display it instead of leaving an uncaught exception.\n",
    "try:\n",
    "    sample = next(iter(dataset))\n",
    "except Exception as err:  # tf.errors.InternalError in practice\n",
    "    print(f\"{type(err).__name__}: {err}\")"
   ]
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment