Created
September 20, 2022 06:12
-
-
Save daniel-falk/e827be6abf4eb81c7083ec1732b5bb39 to your computer and use it in GitHub Desktop.
hub-to-tf-benchmark-coco.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"collapsed_sections": [], | |
"authorship_tag": "ABX9TyOqHFss1vAMSYuPT0N2b1ue", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/daniel-falk/e827be6abf4eb81c7083ec1732b5bb39/hub-to-tf-benchmark-coco.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "VNZJDhY--SwK", | |
"outputId": "08369483-1ba9-41d4-9cfa-b3d1939b3409" | |
}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", | |
"Collecting git+https://github.com/daniel-falk/Hub.git@improve-tensorflow-dataset-throughput\n", | |
" Cloning https://github.com/daniel-falk/Hub.git (to revision improve-tensorflow-dataset-throughput) to /tmp/pip-req-build-40ubq8wd\n", | |
" Running command git clone -q https://github.com/daniel-falk/Hub.git /tmp/pip-req-build-40ubq8wd\n", | |
" Running command git checkout -b improve-tensorflow-dataset-throughput --track origin/improve-tensorflow-dataset-throughput\n", | |
" Switched to a new branch 'improve-tensorflow-dataset-throughput'\n", | |
" Branch 'improve-tensorflow-dataset-throughput' set up to track remote branch 'improve-tensorflow-dataset-throughput' from 'origin'.\n", | |
"Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (1.21.6)\n", | |
"Requirement already satisfied: pillow in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (7.1.2)\n", | |
"Collecting boto3\n", | |
" Downloading boto3-1.24.76-py3-none-any.whl (132 kB)\n", | |
"\u001b[K |████████████████████████████████| 132 kB 8.3 MB/s \n", | |
"\u001b[?25hRequirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (7.1.2)\n", | |
"Collecting pathos\n", | |
" Downloading pathos-0.2.9-py3-none-any.whl (76 kB)\n", | |
"\u001b[K |████████████████████████████████| 76 kB 3.2 MB/s \n", | |
"\u001b[?25hCollecting humbug>=0.2.6\n", | |
" Downloading humbug-0.2.7-py3-none-any.whl (11 kB)\n", | |
"Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (4.64.1)\n", | |
"Collecting numcodecs\n", | |
" Downloading numcodecs-0.10.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)\n", | |
"\u001b[K |████████████████████████████████| 6.6 MB 8.3 MB/s \n", | |
"\u001b[?25hCollecting pyjwt\n", | |
" Downloading PyJWT-2.5.0-py3-none-any.whl (20 kB)\n", | |
"Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from humbug>=0.2.6->hub==2.8.5) (2.23.0)\n", | |
"Collecting botocore<1.28.0,>=1.27.76\n", | |
" Downloading botocore-1.27.76-py3-none-any.whl (9.1 MB)\n", | |
"\u001b[K |████████████████████████████████| 9.1 MB 55.9 MB/s \n", | |
"\u001b[?25hCollecting jmespath<2.0.0,>=0.7.1\n", | |
" Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)\n", | |
"Collecting s3transfer<0.7.0,>=0.6.0\n", | |
" Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)\n", | |
"\u001b[K |████████████████████████████████| 79 kB 7.7 MB/s \n", | |
"\u001b[?25hRequirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.7/dist-packages (from botocore<1.28.0,>=1.27.76->boto3->hub==2.8.5) (2.8.2)\n", | |
"Collecting urllib3<1.27,>=1.25.4\n", | |
" Downloading urllib3-1.26.12-py2.py3-none-any.whl (140 kB)\n", | |
"\u001b[K |████████████████████████████████| 140 kB 53.0 MB/s \n", | |
"\u001b[?25hRequirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.28.0,>=1.27.76->boto3->hub==2.8.5) (1.15.0)\n", | |
"Requirement already satisfied: entrypoints in /usr/local/lib/python3.7/dist-packages (from numcodecs->hub==2.8.5) (0.4)\n", | |
"Requirement already satisfied: typing-extensions>=3.7.4 in /usr/local/lib/python3.7/dist-packages (from numcodecs->hub==2.8.5) (4.1.1)\n", | |
"Collecting ppft>=1.7.6.5\n", | |
" Downloading ppft-1.7.6.5-py2.py3-none-any.whl (52 kB)\n", | |
"\u001b[K |████████████████████████████████| 52 kB 1.1 MB/s \n", | |
"\u001b[?25hCollecting multiprocess>=0.70.13\n", | |
" Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)\n", | |
"\u001b[K |████████████████████████████████| 115 kB 49.3 MB/s \n", | |
"\u001b[?25hCollecting pox>=0.3.1\n", | |
" Downloading pox-0.3.1-py2.py3-none-any.whl (28 kB)\n", | |
"Requirement already satisfied: dill>=0.3.5.1 in /usr/local/lib/python3.7/dist-packages (from pathos->hub==2.8.5) (0.3.5.1)\n", | |
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->humbug>=0.2.6->hub==2.8.5) (3.0.4)\n", | |
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->humbug>=0.2.6->hub==2.8.5) (2022.6.15)\n", | |
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->humbug>=0.2.6->hub==2.8.5) (2.10)\n", | |
"Collecting urllib3<1.27,>=1.25.4\n", | |
" Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)\n", | |
"\u001b[K |████████████████████████████████| 127 kB 50.4 MB/s \n", | |
"\u001b[?25hBuilding wheels for collected packages: hub\n", | |
" Building wheel for hub (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
" Created wheel for hub: filename=hub-2.8.5-py3-none-any.whl size=6519148 sha256=c1b1872b7ecff0922ffcb0c055200799b0e309e51c7c8c87bf57d86f2ff0e5e8\n", | |
" Stored in directory: /tmp/pip-ephem-wheel-cache-9rkfdpjt/wheels/1a/d1/9d/0f7c57e91a1f5bed11e78a4d2b2cde7f4e21e44bb751c001b8\n", | |
"Successfully built hub\n", | |
"Installing collected packages: urllib3, jmespath, botocore, s3transfer, ppft, pox, multiprocess, pyjwt, pathos, numcodecs, humbug, boto3, hub\n", | |
" Attempting uninstall: urllib3\n", | |
" Found existing installation: urllib3 1.24.3\n", | |
" Uninstalling urllib3-1.24.3:\n", | |
" Successfully uninstalled urllib3-1.24.3\n", | |
"Successfully installed boto3-1.24.76 botocore-1.27.76 hub-2.8.5 humbug-0.2.7 jmespath-1.0.1 multiprocess-0.70.13 numcodecs-0.10.2 pathos-0.2.9 pox-0.3.1 ppft-1.7.6.5 pyjwt-2.5.0 s3transfer-0.6.0 urllib3-1.25.11\n", | |
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", | |
"Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (4.64.1)\n" | |
] | |
} | |
], | |
"source": [ | |
"!pip install git+https://github.com/daniel-falk/Hub.git@improve-tensorflow-dataset-throughput\n", | |
"!pip install tqdm\n", | |
"\n", | |
"import hub\n", | |
"from tqdm import tqdm\n", | |
"import time" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Benchmark settings shared by every timing cell in this notebook.\n", | |
"DATASET = \"hub://activeloop/coco-test\"  # path handed to hub.load()\n", | |
"NUM_SAMPLES = 1_000  # sample-count cutoff checked inside each timing loop" | |
], | |
"metadata": { | |
"id": "HXuqTZnD-eJa" | |
}, | |
"execution_count": 5, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Benchmark: TensorFlow wrapper with fetch_chunks=True, reading only the image tensor.\n", | |
"hub_ds = hub.load(DATASET)\n", | |
"\n", | |
"# Counted explicitly so the value is defined even when the stream yields nothing.\n", | |
"num_fetched = 0\n", | |
"# perf_counter() is monotonic, so wall-clock adjustments cannot skew the interval.\n", | |
"t0 = time.perf_counter()\n", | |
"with tqdm() as progress:\n", | |
"    for sample in hub_ds.tensorflow(fetch_chunks=True, tensors=(\"images\",)):\n", | |
"        sample[\"images\"]  # access the image entry of the yielded sample\n", | |
"        num_fetched += 1\n", | |
"        progress.update(1)\n", | |
"        # Stop once exactly NUM_SAMPLES samples have been consumed.\n", | |
"        if num_fetched >= NUM_SAMPLES:\n", | |
"            break\n", | |
"tf_chunked_total_time = time.perf_counter() - t0\n", | |
"assert num_fetched > 0, \"no samples were fetched, so there is nothing to time\"\n", | |
"tf_chunked_time = tf_chunked_total_time / num_fetched  # mean seconds per sample\n", | |
"\n", | |
"print(f\"TF chunked dataset images per second: {1/tf_chunked_time}\")" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "Q6GYMV4VqU7W", | |
"outputId": "9f850cb3-64c7-4348-9ba0-ad74210f815d" | |
}, | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"hub://activeloop/coco-test loaded successfully.\n", | |
"This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/coco-test\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stderr", | |
"text": [ | |
"1000it [00:33, 29.87it/s]" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"TF chunked dataset images per second: 29.86230547618074\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stderr", | |
"text": [ | |
"\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Benchmark: TensorFlow wrapper with its default arguments, reading only the image tensor.\n", | |
"hub_ds = hub.load(DATASET)\n", | |
"\n", | |
"# Counted explicitly so the value is defined even when the stream yields nothing.\n", | |
"num_fetched = 0\n", | |
"# perf_counter() is monotonic, so wall-clock adjustments cannot skew the interval.\n", | |
"t0 = time.perf_counter()\n", | |
"with tqdm() as progress:\n", | |
"    for sample in hub_ds.tensorflow(tensors=(\"images\",)):\n", | |
"        sample[\"images\"]  # access the image entry of the yielded sample\n", | |
"        num_fetched += 1\n", | |
"        progress.update(1)\n", | |
"        # Stop once exactly NUM_SAMPLES samples have been consumed.\n", | |
"        if num_fetched >= NUM_SAMPLES:\n", | |
"            break\n", | |
"tf_total_time = time.perf_counter() - t0\n", | |
"assert num_fetched > 0, \"no samples were fetched, so there is nothing to time\"\n", | |
"tf_time = tf_total_time / num_fetched  # mean seconds per sample\n", | |
"\n", | |
"print(f\"TF (current) dataset images per second: {1/tf_time}\")" | |
], | |
"metadata": { | |
"id": "muxPU4ZZ4xWD", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "cfaeb8eb-92d0-422c-932f-0d1fc6f8a052" | |
}, | |
"execution_count": 7, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"hub://activeloop/coco-test loaded successfully.\n", | |
"This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/coco-test\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stderr", | |
"text": [ | |
"1000it [02:19, 7.19it/s]" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"TF (current) dataset images per second: 7.189705021858493\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stderr", | |
"text": [ | |
"\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Benchmark: iterate the Hub dataset directly and decode each image with fetch_chunks=True.\n", | |
"# `time`, `hub` and `tqdm` are imported by the notebook's first cell.\n", | |
"hub_ds = hub.load(DATASET)\n", | |
"\n", | |
"# Counted explicitly so the value is defined even when the dataset yields nothing.\n", | |
"num_fetched = 0\n", | |
"# perf_counter() is monotonic, so wall-clock adjustments cannot skew the interval.\n", | |
"t0 = time.perf_counter()\n", | |
"with tqdm() as progress:\n", | |
"    for sample in hub_ds:\n", | |
"        sample[\"images\"].numpy(fetch_chunks=True)\n", | |
"        num_fetched += 1\n", | |
"        progress.update(1)\n", | |
"        # Stop once exactly NUM_SAMPLES samples have been consumed.\n", | |
"        if num_fetched >= NUM_SAMPLES:\n", | |
"            break\n", | |
"hub_chunked_total_time = time.perf_counter() - t0\n", | |
"assert num_fetched > 0, \"no samples were fetched, so there is nothing to time\"\n", | |
"hub_chunked_time = hub_chunked_total_time / num_fetched  # mean seconds per sample\n", | |
"\n", | |
"print(f\"Hub chunked dataset images per second: {1/hub_chunked_time}\")" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "CNs_UV1BmbF-", | |
"outputId": "fa031705-940d-47fc-bd91-6aa8472a1112" | |
}, | |
"execution_count": 15, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"hub://activeloop/coco-test loaded successfully.\n", | |
"This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/coco-test\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stderr", | |
"text": [ | |
"1000it [00:26, 38.39it/s]" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Hub chunked dataset images per second: 38.42287724201949\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stderr", | |
"text": [ | |
"\n" | |
] | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment