Skip to content

Instantly share code, notes, and snippets.

@sayakpaul
Created August 3, 2022 17:28
Show Gist options
  • Save sayakpaul/2299142d23bdacd359825d040ed75332 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "01097ef1",
"metadata": {},
"outputs": [],
"source": [
"# !pip install -q transformers datasets tensorflow pyarrow matplotlib tensorflow-io-nightly -q "
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "cdd2b176",
"metadata": {},
"outputs": [],
"source": [
"!wget -q https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz -O flower_photos.tgz\n",
"!tar xf flower_photos.tgz"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "252a0988",
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import tensorflow as tf\n",
"import glob\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "74f5bc6f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(['flower_photos/roses/16209331331_343c899d38.jpg',\n",
" 'flower_photos/roses/5777669976_a205f61e5b.jpg',\n",
" 'flower_photos/roses/4860145119_b1c3cbaa4e_n.jpg',\n",
" 'flower_photos/roses/15011625580_7974c44bce.jpg',\n",
" 'flower_photos/roses/17953368844_be3d18cf30_m.jpg'],\n",
" 3670)"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"image_paths = glob.glob(\"flower_photos/*/*.jpg\")\n",
"image_paths[:5], len(image_paths)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "d9d885b5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['roses', 'roses', 'roses', 'roses', 'roses']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The class name is the parent directory of each image path\n",
"# (paths look like: flower_photos/<label>/<file>.jpg).\n",
"all_labels = [path.split(\"/\")[1] for path in image_paths]\n",
"all_labels[:5]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "46ea492d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'daisy': 0, 'dandelion': 1, 'roses': 2, 'sunflowers': 3, 'tulips': 4}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Assign a stable integer id to each class name (alphabetical order).\n",
"unique_labels = sorted(set(all_labels))\n",
"label2_id = dict(zip(unique_labels, range(len(unique_labels))))\n",
"label2_id"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "5face38d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[2, 2, 2, 2, 2]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Use direct indexing rather than .get(): an unknown label should raise\n",
"# KeyError immediately instead of silently inserting None into the labels.\n",
"all_integer_labels = [label2_id[label] for label in all_labels]\n",
"all_integer_labels[:5]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "9266df28",
"metadata": {},
"outputs": [],
"source": [
"from pyarrow.feather import write_feather\n",
"import pyarrow as pa\n",
"import tqdm\n",
"import math"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e1537a28",
"metadata": {},
"outputs": [],
"source": [
"batch_size = 1000\n",
"chunk_size = 1000"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "4815637c",
"metadata": {},
"outputs": [],
"source": [
"# pa_type = pa.struct({\"path\": pa.string(), \"bytes\": pa.binary(), \"labels\": pa.int8()})"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "be22173e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/z_/d29z43w90kz6f4kbzv5c9m9r0000gn/T/ipykernel_74731/1007705059.py:8: TqdmDeprecationWarning: Please use `tqdm.notebook.trange` instead of `tqdm.tnrange`\n",
" for step in tqdm.tnrange(int(math.ceil(len(image_paths) / batch_size))):\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d925d7a64a0247de904d9b44da2fdd97",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/4 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total images written: 1000.\n",
"Total images written: 2000.\n",
"Total images written: 3000.\n",
"Total images written: 3670.\n"
]
}
],
"source": [
"def read_image(path):\n",
"    \"\"\"Return the raw (still-encoded) bytes of the image file at `path`.\"\"\"\n",
"    with open(path, \"rb\") as f:\n",
"        return f.read()\n",
"\n",
"\n",
"# `tqdm.tnrange` is deprecated (see the TqdmDeprecationWarning emitted by this\n",
"# cell); `tqdm.auto.trange` picks the notebook widget frontend when available.\n",
"from tqdm.auto import trange\n",
"\n",
"total_images_written = 0\n",
"# math.ceil already returns an int in Python 3 — no int() wrapper needed.\n",
"num_batches = math.ceil(len(image_paths) / batch_size)\n",
"\n",
"for step in trange(num_batches):\n",
"    batch_image_paths = image_paths[step * batch_size : (step + 1) * batch_size]\n",
"    batch_image_labels = all_integer_labels[step * batch_size : (step + 1) * batch_size]\n",
"\n",
"    # Serialize this batch's raw image bytes + integer labels as one Feather file.\n",
"    data = [read_image(path) for path in batch_image_paths]\n",
"    table = pa.Table.from_arrays([data, batch_image_labels], [\"data\", \"labels\"])\n",
"    write_feather(table, f\"/tmp/flowers_feather_{step}.feather\", chunksize=chunk_size)\n",
"    total_images_written += len(batch_image_paths)\n",
"    print(f\"Total images written: {total_images_written}.\")\n",
"\n",
"    # Free the batch's image bytes before loading the next batch.\n",
"    del data"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "4a2f1b5a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-r--r-- 1 sayakpaul wheel 64M Aug 3 22:50 /tmp/flowers_feather_0.feather\r\n",
"-rw-r--r-- 1 sayakpaul wheel 59M Aug 3 22:50 /tmp/flowers_feather_1.feather\r\n",
"-rw-r--r-- 1 sayakpaul wheel 51M Aug 3 22:50 /tmp/flowers_feather_2.feather\r\n",
"-rw-r--r-- 1 sayakpaul wheel 45M Aug 3 22:50 /tmp/flowers_feather_3.feather\r\n"
]
}
],
"source": [
"ls -lh /tmp/*.feather"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "16478d1d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2.9.1\n",
"0.26.0\n"
]
}
],
"source": [
"import tensorflow_io.arrow as arrow_io\n",
"import tensorflow_io as tfio\n",
"\n",
"print(tf.__version__)\n",
"print(tfio.__version__)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "449ae824",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-08-03 22:52:18.995564: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2022-08-03 22:52:19.444750: I tensorflow_io/core/kernels/cpu_check.cc:128] Your CPU supports instructions that this TensorFlow IO binary was not compiled to use: AVX2 FMA\n",
"2022-08-03 22:52:19.567800: E tensorflow/core/framework/dataset.cc:580] UNIMPLEMENTED: Cannot compute input sources for dataset of type IO>ArrowFeatherDataset, because the dataset does not implement `InputDatasets`.\n",
"2022-08-03 22:52:19.567818: E tensorflow/core/framework/dataset.cc:584] UNIMPLEMENTED: Cannot merge options for dataset of type IO>ArrowFeatherDataset, because the dataset does not implement `InputDatasets`.\n"
]
}
],
"source": [
"dataset = arrow_io.ArrowFeatherDataset(\n",
" [\"/tmp/flowers_feather_0.feather\"],\n",
" columns=(0, 1),\n",
" output_types=(tf.string, tf.int64),\n",
" output_shapes=([], []),\n",
" batch_size=32,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "155f4331",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))\n"
]
}
],
"source": [
"print(dataset.element_spec) "
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "652c2e85",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-08-03 22:52:36.086910: E tensorflow/core/framework/dataset.cc:580] UNIMPLEMENTED: Cannot compute input sources for dataset of type IO>ArrowFeatherDataset, because the dataset does not implement `InputDatasets`.\n",
"2022-08-03 22:52:36.086938: E tensorflow/core/framework/dataset.cc:584] UNIMPLEMENTED: Cannot merge options for dataset of type IO>ArrowFeatherDataset, because the dataset does not implement `InputDatasets`.\n",
"2022-08-03 22:52:36.087371: E tensorflow/core/framework/dataset.cc:580] FAILED_PRECONDITION: Cannot compute input sources for dataset of type RootDataset, because sources could not be computed for input dataset of type IO>ArrowFeatherDataset\n"
]
},
{
"ename": "InternalError",
"evalue": "Invalid: INVALID_ARGUMENT: arrow data type 0x7fc728a5d3d8 is not supported: Type error: Arrow data type is not supported [Op:IteratorGetNext]",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mInternalError\u001b[0m Traceback (most recent call last)",
"Input \u001b[0;32mIn [15]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m sample \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43miter\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow/python/data/ops/iterator_ops.py:766\u001b[0m, in \u001b[0;36mOwnedIterator.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 764\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__next__\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 765\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 766\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_next_internal\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 767\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m errors\u001b[38;5;241m.\u001b[39mOutOfRangeError:\n\u001b[1;32m 768\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m\n",
"File \u001b[0;32m~/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow/python/data/ops/iterator_ops.py:749\u001b[0m, in \u001b[0;36mOwnedIterator._next_internal\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 746\u001b[0m \u001b[38;5;66;03m# TODO(b/77291417): This runs in sync mode as iterators use an error status\u001b[39;00m\n\u001b[1;32m 747\u001b[0m \u001b[38;5;66;03m# to communicate that there is no more data to iterate over.\u001b[39;00m\n\u001b[1;32m 748\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context\u001b[38;5;241m.\u001b[39mexecution_mode(context\u001b[38;5;241m.\u001b[39mSYNC):\n\u001b[0;32m--> 749\u001b[0m ret \u001b[38;5;241m=\u001b[39m \u001b[43mgen_dataset_ops\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43miterator_get_next\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 750\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_iterator_resource\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 751\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_types\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_flat_output_types\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 752\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_shapes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_flat_output_shapes\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 754\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 755\u001b[0m \u001b[38;5;66;03m# Fast path for the case `self._structure` is not a nested structure.\u001b[39;00m\n\u001b[1;32m 756\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_element_spec\u001b[38;5;241m.\u001b[39m_from_compatible_tensor_list(ret) \u001b[38;5;66;03m# pylint: disable=protected-access\u001b[39;00m\n",
"File \u001b[0;32m~/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow/python/ops/gen_dataset_ops.py:3017\u001b[0m, in \u001b[0;36miterator_get_next\u001b[0;34m(iterator, output_types, output_shapes, name)\u001b[0m\n\u001b[1;32m 3015\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _result\n\u001b[1;32m 3016\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m _core\u001b[38;5;241m.\u001b[39m_NotOkStatusException \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m-> 3017\u001b[0m \u001b[43m_ops\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mraise_from_not_ok_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43me\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3018\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m _core\u001b[38;5;241m.\u001b[39m_FallbackException:\n\u001b[1;32m 3019\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n",
"File \u001b[0;32m~/.local/bin/.virtualenvs/hf_datasets/lib/python3.8/site-packages/tensorflow/python/framework/ops.py:7164\u001b[0m, in \u001b[0;36mraise_from_not_ok_status\u001b[0;34m(e, name)\u001b[0m\n\u001b[1;32m 7162\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mraise_from_not_ok_status\u001b[39m(e, name):\n\u001b[1;32m 7163\u001b[0m e\u001b[38;5;241m.\u001b[39mmessage \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m name: \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m name \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m-> 7164\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m core\u001b[38;5;241m.\u001b[39m_status_to_exception(e) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28mNone\u001b[39m\n",
"\u001b[0;31mInternalError\u001b[0m: Invalid: INVALID_ARGUMENT: arrow data type 0x7fc728a5d3d8 is not supported: Type error: Arrow data type is not supported [Op:IteratorGetNext]"
]
}
],
"source": [
"sample = next(iter(dataset))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment