Created
September 19, 2022 18:23
-
-
Save daniel-falk/188b96013e9f0cedcf555a0a30fa177d to your computer and use it in GitHub Desktop.
hub-to-tf-benchmark.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"collapsed_sections": [], | |
"authorship_tag": "ABX9TyNOYUjviNyuGHFpIVpH5/uK", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/daniel-falk/188b96013e9f0cedcf555a0a30fa177d/hub-to-tf-benchmark.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "VNZJDhY--SwK", | |
"outputId": "d439739c-15a3-450e-91e5-3d34ca595687" | |
}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", | |
"Collecting git+https://github.com/daniel-falk/Hub.git@improve-tensorflow-dataset-throughput\n", | |
" Cloning https://github.com/daniel-falk/Hub.git (to revision improve-tensorflow-dataset-throughput) to /tmp/pip-req-build-gbleo__j\n", | |
" Running command git clone -q https://github.com/daniel-falk/Hub.git /tmp/pip-req-build-gbleo__j\n", | |
" Running command git checkout -b improve-tensorflow-dataset-throughput --track origin/improve-tensorflow-dataset-throughput\n", | |
" Switched to a new branch 'improve-tensorflow-dataset-throughput'\n", | |
" Branch 'improve-tensorflow-dataset-throughput' set up to track remote branch 'improve-tensorflow-dataset-throughput' from 'origin'.\n", | |
"Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (1.21.6)\n", | |
"Requirement already satisfied: pillow in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (7.1.2)\n", | |
"Requirement already satisfied: boto3 in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (1.24.75)\n", | |
"Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (7.1.2)\n", | |
"Requirement already satisfied: pathos in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (0.2.9)\n", | |
"Requirement already satisfied: humbug>=0.2.6 in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (0.2.7)\n", | |
"Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (4.64.1)\n", | |
"Requirement already satisfied: numcodecs in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (0.10.2)\n", | |
"Requirement already satisfied: pyjwt in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (2.5.0)\n", | |
"Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from humbug>=0.2.6->hub==2.8.5) (2.23.0)\n", | |
"Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /usr/local/lib/python3.7/dist-packages (from boto3->hub==2.8.5) (1.0.1)\n", | |
"Requirement already satisfied: s3transfer<0.7.0,>=0.6.0 in /usr/local/lib/python3.7/dist-packages (from boto3->hub==2.8.5) (0.6.0)\n", | |
"Requirement already satisfied: botocore<1.28.0,>=1.27.75 in /usr/local/lib/python3.7/dist-packages (from boto3->hub==2.8.5) (1.27.75)\n", | |
"Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.7/dist-packages (from botocore<1.28.0,>=1.27.75->boto3->hub==2.8.5) (2.8.2)\n", | |
"Requirement already satisfied: urllib3<1.27,>=1.25.4 in /usr/local/lib/python3.7/dist-packages (from botocore<1.28.0,>=1.27.75->boto3->hub==2.8.5) (1.25.11)\n", | |
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.28.0,>=1.27.75->boto3->hub==2.8.5) (1.15.0)\n", | |
"Requirement already satisfied: entrypoints in /usr/local/lib/python3.7/dist-packages (from numcodecs->hub==2.8.5) (0.4)\n", | |
"Requirement already satisfied: typing-extensions>=3.7.4 in /usr/local/lib/python3.7/dist-packages (from numcodecs->hub==2.8.5) (4.1.1)\n", | |
"Requirement already satisfied: ppft>=1.7.6.5 in /usr/local/lib/python3.7/dist-packages (from pathos->hub==2.8.5) (1.7.6.5)\n", | |
"Requirement already satisfied: pox>=0.3.1 in /usr/local/lib/python3.7/dist-packages (from pathos->hub==2.8.5) (0.3.1)\n", | |
"Requirement already satisfied: multiprocess>=0.70.13 in /usr/local/lib/python3.7/dist-packages (from pathos->hub==2.8.5) (0.70.13)\n", | |
"Requirement already satisfied: dill>=0.3.5.1 in /usr/local/lib/python3.7/dist-packages (from pathos->hub==2.8.5) (0.3.5.1)\n", | |
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->humbug>=0.2.6->hub==2.8.5) (2.10)\n", | |
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->humbug>=0.2.6->hub==2.8.5) (3.0.4)\n", | |
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->humbug>=0.2.6->hub==2.8.5) (2022.6.15)\n", | |
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", | |
"Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (4.64.1)\n" | |
] | |
} | |
], | |
"source": [ | |
"# Install the Hub branch under test (and tqdm) with %pip so the packages land in the\n", | |
"# environment of the running kernel rather than whatever `pip` a subshell resolves to.\n", | |
"%pip install git+https://github.com/daniel-falk/Hub.git@improve-tensorflow-dataset-throughput\n", | |
"%pip install tqdm\n", | |
"\n", | |
"# All imports used by the benchmark cells live here: stdlib first, then third-party.\n", | |
"import time\n", | |
"\n", | |
"import hub\n", | |
"from tqdm import tqdm" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Hub dataset path; every benchmark cell below opens it with hub.load(DATASET).\n", | |
"DATASET = \"hub://activeloop/mnist-test\"" | |
], | |
"metadata": { | |
"id": "HXuqTZnD-eJa" | |
}, | |
"execution_count": 5, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Benchmark: iterate the dataset through hub_ds.tensorflow(fetch_chunks=True)\n", | |
"# and report the mean number of samples consumed per second.\n", | |
"hub_ds = hub.load(DATASET)\n", | |
"\n", | |
"# Reset the loop counter so an empty iterator cannot silently reuse an `i`\n", | |
"# left behind by another cell (which would report a bogus throughput).\n", | |
"i = -1\n", | |
"# perf_counter() is a monotonic, high-resolution clock intended for measuring\n", | |
"# durations; time.time() can jump if the system clock is adjusted mid-run.\n", | |
"t0 = time.perf_counter()\n", | |
"for i, sample in tqdm(enumerate(hub_ds.tensorflow(fetch_chunks=True))):\n", | |
"    sample[\"images\"]  # only index the sample; the value itself is discarded\n", | |
"tf_chunked_total_time = time.perf_counter() - t0\n", | |
"assert i >= 0, \"dataset yielded no samples; throughput is undefined\"\n", | |
"tf_chunked_time = tf_chunked_total_time / (i + 1)  # mean seconds per sample\n", | |
"\n", | |
"print(f\"TF chunked dataset images per second: {1/tf_chunked_time}\")" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "Q6GYMV4VqU7W", | |
"outputId": "b3621231-6d8d-4b6c-cb36-b967b4f7255e" | |
}, | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"hub://activeloop/mnist-test loaded successfully.\n", | |
"This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/mnist-test\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stderr", | |
"text": [ | |
"10000it [00:15, 659.45it/s]" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"TF chunked dataset images per second: 657.2564827721415\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stderr", | |
"text": [ | |
"\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Benchmark: iterate the dataset through hub_ds.tensorflow() with its default\n", | |
"# arguments and report the mean number of samples consumed per second.\n", | |
"hub_ds = hub.load(DATASET)\n", | |
"\n", | |
"# Reset the loop counter so an empty iterator cannot silently reuse an `i`\n", | |
"# left behind by another cell (which would report a bogus throughput).\n", | |
"i = -1\n", | |
"# perf_counter() is a monotonic, high-resolution clock intended for measuring\n", | |
"# durations; time.time() can jump if the system clock is adjusted mid-run.\n", | |
"t0 = time.perf_counter()\n", | |
"for i, sample in tqdm(enumerate(hub_ds.tensorflow())):\n", | |
"    sample[\"images\"]  # only index the sample; the value itself is discarded\n", | |
"tf_total_time = time.perf_counter() - t0\n", | |
"assert i >= 0, \"dataset yielded no samples; throughput is undefined\"\n", | |
"tf_time = tf_total_time / (i + 1)  # mean seconds per sample\n", | |
"\n", | |
"print(f\"TF (current) dataset images per second: {1/tf_time}\")" | |
], | |
"metadata": { | |
"id": "muxPU4ZZ4xWD", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "89f36855-dd0b-44d6-ee44-c8f694b2175c" | |
}, | |
"execution_count": 7, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"hub://activeloop/mnist-test loaded successfully.\n", | |
"This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/mnist-test\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stderr", | |
"text": [ | |
"10000it [06:28, 25.74it/s]" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"TF (current) dataset images per second: 25.734239888534077\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stderr", | |
"text": [ | |
"\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Benchmark: iterate the Hub dataset directly, calling\n", | |
"# .numpy(fetch_chunks=True) on each sample's \"images\" tensor, and report the\n", | |
"# mean number of samples processed per second.\n", | |
"# (`time` is already imported by the setup cell at the top of the notebook.)\n", | |
"hub_ds = hub.load(DATASET)\n", | |
"\n", | |
"# Reset the loop counter so an empty iterator cannot silently reuse an `i`\n", | |
"# left behind by another cell (which would report a bogus throughput).\n", | |
"i = -1\n", | |
"# perf_counter() is a monotonic, high-resolution clock intended for measuring\n", | |
"# durations; time.time() can jump if the system clock is adjusted mid-run.\n", | |
"t0 = time.perf_counter()\n", | |
"for i, sample in tqdm(enumerate(hub_ds)):\n", | |
"    sample[\"images\"].numpy(fetch_chunks=True)  # result discarded; the call is what is timed\n", | |
"hub_chunked_total_time = time.perf_counter() - t0\n", | |
"assert i >= 0, \"dataset yielded no samples; throughput is undefined\"\n", | |
"hub_chunked_time = hub_chunked_total_time / (i + 1)  # mean seconds per sample\n", | |
"\n", | |
"\n", | |
"print(f\"Hub chunked dataset images per second: {1/hub_chunked_time}\")" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "CNs_UV1BmbF-", | |
"outputId": "cf80b637-8355-4f67-a907-f86590f13e68" | |
}, | |
"execution_count": 9, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"hub://activeloop/mnist-test loaded successfully.\n", | |
"This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/mnist-test\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stderr", | |
"text": [ | |
"10000it [00:07, 1296.13it/s]" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Hub chunked dataset images per second: 1294.3886179085591\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stderr", | |
"text": [ | |
"\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Benchmark: iterate the Hub dataset directly, calling .numpy() with its\n", | |
"# default arguments on each sample's \"images\" tensor, and report the mean\n", | |
"# number of samples processed per second.\n", | |
"# (`time` is already imported by the setup cell at the top of the notebook.)\n", | |
"hub_ds = hub.load(DATASET)\n", | |
"\n", | |
"# Reset the loop counter so an empty iterator cannot silently reuse an `i`\n", | |
"# left behind by another cell (which would report a bogus throughput).\n", | |
"i = -1\n", | |
"# perf_counter() is a monotonic, high-resolution clock intended for measuring\n", | |
"# durations; time.time() can jump if the system clock is adjusted mid-run.\n", | |
"t0 = time.perf_counter()\n", | |
"for i, sample in tqdm(enumerate(hub_ds)):\n", | |
"    sample[\"images\"].numpy()  # result discarded; the call is what is timed\n", | |
"hub_total_time = time.perf_counter() - t0\n", | |
"assert i >= 0, \"dataset yielded no samples; throughput is undefined\"\n", | |
"hub_time = hub_total_time / (i + 1)  # mean seconds per sample\n", | |
"\n", | |
"\n", | |
"print(f\"Hub dataset images per second: {1/hub_time}\")" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "QYg_qnXC6DrF", | |
"outputId": "9103f08a-c414-4c27-fd33-c7b6443c7c73" | |
}, | |
"execution_count": 10, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"hub://activeloop/mnist-test loaded successfully.\n", | |
"This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/mnist-test\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stderr", | |
"text": [ | |
"10000it [06:00, 27.76it/s]" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Hub dataset images per second: 27.76194003511129\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stderr", | |
"text": [ | |
"\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [], | |
"metadata": { | |
"id": "k_Py5L-79_2U" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment